returnn 1.20240802.134933__tar.gz → 1.20240824.1611__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic.

Files changed (460)
  1. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/PKG-INFO +1 -1
  2. returnn-1.20240824.1611/_setup_info_generated.py +2 -0
  3. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_backend.py +9 -0
  4. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/array_.py +4 -0
  5. returnn-1.20240824.1611/returnn/frontend/conversions/hf_llama.py +246 -0
  6. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/decoder/transformer.py +60 -15
  7. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/math_.py +9 -0
  8. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/_backend.py +5 -0
  9. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/_backend.py +11 -0
  10. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/updater.py +1 -1
  11. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/debug.py +57 -27
  12. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/file_cache.py +85 -21
  13. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/math.py +5 -0
  14. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn.egg-info/PKG-INFO +1 -1
  15. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn.egg-info/SOURCES.txt +1 -0
  16. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Util.py +8 -3
  17. returnn-1.20240824.1611/tests/test_rf_decoder_transformer.py +163 -0
  18. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_math.py +16 -0
  19. returnn-1.20240802.134933/_setup_info_generated.py +0 -2
  20. returnn-1.20240802.134933/returnn/frontend/conversions/hf_llama.py +0 -56
  21. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/.editorconfig +0 -0
  22. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/.gitignore +0 -0
  23. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/.gitmodules +0 -0
  24. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/.kateconfig +0 -0
  25. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/CHANGELOG.md +0 -0
  26. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/CODEOWNERS +0 -0
  27. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/CONTRIBUTING.md +0 -0
  28. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/LICENSE +0 -0
  29. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/MANIFEST.in +0 -0
  30. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/README.rst +0 -0
  31. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/__init__.py +0 -0
  32. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/12AX.cluster_map +0 -0
  33. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/_setup_returnn_env.py +0 -0
  34. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-fwd.config +0 -0
  35. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-horovod-mpi.py +0 -0
  36. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-horovod-mpi.py.sh +0 -0
  37. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-horovod-mpi.sh +0 -0
  38. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-hyper-param-tuning.config +0 -0
  39. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-iter-dataset.py +0 -0
  40. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-list-devices.py +0 -0
  41. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-lua-torch-layer.config +0 -0
  42. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-pretrain.config +0 -0
  43. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-record-and-push-to-webserver.py +0 -0
  44. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-returnn-as-framework.py +0 -0
  45. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-rf-pt-benchmark.py +0 -0
  46. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-rf.config +0 -0
  47. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-rhn-enwik8.config +0 -0
  48. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-sprint-interface.py +0 -0
  49. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-att-copy.config +0 -0
  50. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-attention.config +0 -0
  51. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  52. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  53. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-enc-dec.config +0 -0
  54. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-hard-att-copy.config +0 -0
  55. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-lstm-benchmark.py +0 -0
  56. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  57. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  58. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-native-lstm.12ax.config +0 -0
  59. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  60. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  61. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  62. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  63. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  64. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-rec-self-att.config +0 -0
  65. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-search-compiled-graph.py +0 -0
  66. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  67. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-timit-lstm-ctc.config +0 -0
  68. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-torch.config +0 -0
  69. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  70. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo.sh +0 -0
  71. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  72. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  73. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  74. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/README.txt +0 -0
  75. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/chars.txt +0 -0
  76. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/config_demo +0 -0
  77. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/config_fwd +0 -0
  78. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/config_real +0 -0
  79. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  80. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/decode.py +0 -0
  81. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  82. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/go.sh +0 -0
  83. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/lines.txt +0 -0
  84. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/split/eval.txt +0 -0
  85. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/split/train.txt +0 -0
  86. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/split/valid.txt +0 -0
  87. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/README.md +0 -0
  88. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  89. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial/forwardconfig +0 -0
  90. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial/go.sh +0 -0
  91. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial/trainconfig +0 -0
  92. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  93. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  94. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  95. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  96. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/pyproject.toml +0 -0
  97. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/requirements.txt +0 -0
  98. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/__init__.py +0 -0
  99. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/__main__.py +0 -0
  100. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/__old_mod_loader__.py +0 -0
  101. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/__setup__.py +0 -0
  102. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/config.py +0 -0
  103. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/__init__.py +0 -0
  104. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/audio.py +0 -0
  105. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/basic.py +0 -0
  106. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/bundle_file.py +0 -0
  107. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/cached.py +0 -0
  108. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/cached2.py +0 -0
  109. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/distrib_files.py +0 -0
  110. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/generating.py +0 -0
  111. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/hdf.py +0 -0
  112. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/lm.py +0 -0
  113. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/map.py +0 -0
  114. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/meta.py +0 -0
  115. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/multi_proc.py +0 -0
  116. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/normalization_data.py +0 -0
  117. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/numpy_dump.py +0 -0
  118. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/postprocessing.py +0 -0
  119. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/raw_wav.py +0 -0
  120. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/sprint.py +0 -0
  121. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/stereo.py +0 -0
  122. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/util/__init__.py +0 -0
  123. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/util/feature_extraction.py +0 -0
  124. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/util/strings.py +0 -0
  125. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/util/vocabulary.py +0 -0
  126. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/engine/__init__.py +0 -0
  127. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/engine/base.py +0 -0
  128. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/engine/batch.py +0 -0
  129. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/__init__.py +0 -0
  130. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/__main__.py +0 -0
  131. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  132. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  133. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  134. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  135. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  136. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  137. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  138. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  139. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  140. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  141. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  142. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  143. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  144. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  145. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  146. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  147. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  148. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  149. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  150. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  151. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  152. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  153. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  154. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  155. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  156. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/__init__.py +0 -0
  157. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/README.md +0 -0
  158. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/__init__.py +0 -0
  159. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/edit.py +0 -0
  160. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/reroute.py +0 -0
  161. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/select.py +0 -0
  162. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/subgraph.py +0 -0
  163. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/transform.py +0 -0
  164. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/util.py +0 -0
  165. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/forward_iface.py +0 -0
  166. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/__init__.py +0 -0
  167. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/__init__.py +0 -0
  168. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/backend.cpp +0 -0
  169. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/backend.hpp +0 -0
  170. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/module.cpp +0 -0
  171. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/module.hpp +0 -0
  172. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/py_utils.hpp +0 -0
  173. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  174. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  175. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_numpy_backend.py +0 -0
  176. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_random_journal.py +0 -0
  177. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_utils.py +0 -0
  178. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/attention.py +0 -0
  179. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/audio/__init__.py +0 -0
  180. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/audio/mel.py +0 -0
  181. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/audio/specaugment.py +0 -0
  182. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/backend.py +0 -0
  183. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/build_from_dict.py +0 -0
  184. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/cond.py +0 -0
  185. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/const.py +0 -0
  186. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/container.py +0 -0
  187. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/control_flow_ctx.py +0 -0
  188. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/conv.py +0 -0
  189. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/conversions/__init__.py +0 -0
  190. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/decoder/__init__.py +0 -0
  191. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/device.py +0 -0
  192. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/dims.py +0 -0
  193. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/dropout.py +0 -0
  194. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/dtype.py +0 -0
  195. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/encoder/__init__.py +0 -0
  196. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/encoder/base.py +0 -0
  197. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/encoder/conformer.py +0 -0
  198. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/gradient.py +0 -0
  199. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/graph.py +0 -0
  200. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/hooks.py +0 -0
  201. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/init.py +0 -0
  202. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/label_smoothing.py +0 -0
  203. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/linear.py +0 -0
  204. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/loop.py +0 -0
  205. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/loss.py +0 -0
  206. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/matmul.py +0 -0
  207. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/module.py +0 -0
  208. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/normalization.py +0 -0
  209. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/parameter.py +0 -0
  210. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/parametrizations.py +0 -0
  211. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/parametrize.py +0 -0
  212. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/piecewise_linear.py +0 -0
  213. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/rand.py +0 -0
  214. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/rec.py +0 -0
  215. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/reduce.py +0 -0
  216. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/run_ctx.py +0 -0
  217. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/signal.py +0 -0
  218. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/state.py +0 -0
  219. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/stepwise_scheduler.py +0 -0
  220. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/tensor_array.py +0 -0
  221. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/types.py +0 -0
  222. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/import_/__init__.py +0 -0
  223. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/import_/common.py +0 -0
  224. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/import_/git.py +0 -0
  225. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/import_/import_.py +0 -0
  226. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/learning_rate_control.py +0 -0
  227. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/log.py +0 -0
  228. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/native_op.cpp +0 -0
  229. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/native_op.py +0 -0
  230. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/pretrain.py +0 -0
  231. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/__init__.py +0 -0
  232. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/cache.py +0 -0
  233. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/control.py +0 -0
  234. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/error_signals.py +0 -0
  235. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/extern_interface.py +0 -0
  236. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/interface.py +0 -0
  237. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/README.md +0 -0
  238. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/__init__.py +0 -0
  239. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/_dim_extra.py +0 -0
  240. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/_tensor_extra.py +0 -0
  241. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/_tensor_mixin_base.py +0 -0
  242. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/_tensor_op_overloads.py +0 -0
  243. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/control_flow_ctx.py +0 -0
  244. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/dim.py +0 -0
  245. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/marked_dim.py +0 -0
  246. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/tensor.py +0 -0
  247. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/tensor_dict.py +0 -0
  248. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/utils.py +0 -0
  249. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/__init__.py +0 -0
  250. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/compat.py +0 -0
  251. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/data_pipeline.py +0 -0
  252. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/distributed.py +0 -0
  253. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/engine.py +0 -0
  254. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/README.md +0 -0
  255. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/__init__.py +0 -0
  256. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/_utils.py +0 -0
  257. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/cond.py +0 -0
  258. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  259. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  260. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/dims.py +0 -0
  261. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/layer.py +0 -0
  262. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/loop.py +0 -0
  263. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/make_layer.py +0 -0
  264. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  265. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  266. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  267. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_low_level/__init__.py +0 -0
  268. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_low_level/_backend.py +0 -0
  269. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/horovod.py +0 -0
  270. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/hyper_param_tuning.py +0 -0
  271. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/__init__.py +0 -0
  272. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/base.py +0 -0
  273. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/basic.py +0 -0
  274. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/rec.py +0 -0
  275. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/segmental_model.py +0 -0
  276. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/signal_processing.py +0 -0
  277. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/variable.py +0 -0
  278. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/native_op.py +0 -0
  279. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/network.py +0 -0
  280. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/sprint.py +0 -0
  281. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/updater.py +0 -0
  282. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/__init__.py +0 -0
  283. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/basic.py +0 -0
  284. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/data.py +0 -0
  285. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/gradient_checkpoint.py +0 -0
  286. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/ken_lm.py +0 -0
  287. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/open_fst.py +0 -0
  288. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/README.md +0 -0
  289. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/__init__.py +0 -0
  290. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/__init__.py +0 -0
  291. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/extern_data.py +0 -0
  292. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/pipeline.py +0 -0
  293. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/queued_data_iter.py +0 -0
  294. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  295. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/tensor_utils.py +0 -0
  296. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/distributed.py +0 -0
  297. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/engine.py +0 -0
  298. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/__init__.py +0 -0
  299. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/_rand.py +0 -0
  300. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/bridge.py +0 -0
  301. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/raw_ops.py +0 -0
  302. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/README.md +0 -0
  303. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/__init__.py +0 -0
  304. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/array_.py +0 -0
  305. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/diagnose_gpu.py +0 -0
  306. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/gradient_checkpoint.py +0 -0
  307. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/scaled_gradient.py +0 -0
  308. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/__init__.py +0 -0
  309. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/basic.py +0 -0
  310. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/better_exchook.py +0 -0
  311. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/bpe.py +0 -0
  312. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/debug_helpers.py +0 -0
  313. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/fsa.py +0 -0
  314. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/literal_py_to_pickle.py +0 -0
  315. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
  316. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/native_code_compiler.py +0 -0
  317. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/pprint.py +0 -0
  318. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/py-to-pickle.cpp +0 -0
  319. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/py_compat.py +0 -0
  320. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/py_ext_mod_compiler.py +0 -0
  321. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/result_with_reason.py +0 -0
  322. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/sig_proc.py +0 -0
  323. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/task_system.py +0 -0
  324. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/train_proc_manager.py +0 -0
  325. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/watch_memory.py +0 -0
  326. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn.egg-info/dependency_links.txt +0 -0
  327. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn.egg-info/top_level.txt +0 -0
  328. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/rnn.py +0 -0
  329. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/setup.cfg +0 -0
  330. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/setup.py +0 -0
  331. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/DummySprintExec.py +0 -0
  332. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm-inspection-profile.xml +0 -0
  333. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/.gitignore +0 -0
  334. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/.name +0 -0
  335. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  336. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  337. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  338. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
  339. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  340. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/misc.xml +0 -0
  341. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/modules.xml +0 -0
  342. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/returnn.iml +0 -0
  343. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  344. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/_set_num_threads1.py +0 -0
  345. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/_setup_returnn_env.py +0 -0
  346. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/_setup_test_env.py +0 -0
  347. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/bpe-unicode-demo.codes +0 -0
  348. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/bpe-unicode-demo.vocab +0 -0
  349. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lexicon_opt.fst +0 -0
  350. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lexicon_opt.isyms +0 -0
  351. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lexicon_opt.jpg +0 -0
  352. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lexicon_opt.osyms +0 -0
  353. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lint_common.py +0 -0
  354. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/pycharm-inspect.py +0 -0
  355. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/pylint.py +0 -0
  356. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/returnn-as-framework.py +0 -0
  357. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/rf_utils.py +0 -0
  358. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/spelling.dic +0 -0
  359. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Config.py +0 -0
  360. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Dataset.py +0 -0
  361. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Fsa.py +0 -0
  362. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_GeneratingDataset.py +0 -0
  363. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_HDFDataset.py +0 -0
  364. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_LearningRateControl.py +0 -0
  365. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Log.py +0 -0
  366. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_MultiProcDataset.py +0 -0
  367. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Pretrain.py +0 -0
  368. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_ResNet.py +0 -0
  369. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_SprintDataset.py +0 -0
  370. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_SprintInterface.py +0 -0
  371. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFEngine.py +0 -0
  372. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFNativeOp.py +0 -0
  373. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFNetworkLayer.py +0 -0
  374. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFNetworkRecLayer.py +0 -0
  375. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFNetworkSigProcLayer.py +0 -0
  376. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFUpdater.py +0 -0
  377. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFUtil.py +0 -0
  378. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TF_determinism.py +0 -0
  379. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TaskSystem.py +0 -0
  380. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TaskSystem_SharedMem.py +0 -0
  381. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TranslationDataset.py +0 -0
  382. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_demos.py +0 -0
  383. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_fork_exec.py +0 -0
  384. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_hdf_dump.py +0 -0
  385. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_array.py +0 -0
  386. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_attention.py +0 -0
  387. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_base.py +0 -0
  388. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_cond.py +0 -0
  389. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_const.py +0 -0
  390. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_container.py +0 -0
  391. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_conv.py +0 -0
  392. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_encoder_conformer.py +0 -0
  393. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_gradient.py +0 -0
  394. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_label_smoothing.py +0 -0
  395. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_loop.py +0 -0
  396. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_normalization.py +0 -0
  397. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_piecewise_linear.py +0 -0
  398. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_rec.py +0 -0
  399. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_reduce.py +0 -0
  400. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_signal.py +0 -0
  401. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_tensor.py +0 -0
  402. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_tools.py +0 -0
  403. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_dataset.py +0 -0
  404. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_engine.py +0 -0
  405. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_frontend.py +0 -0
  406. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_internal_frontend.py +0 -0
  407. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_util.py +0 -0
  408. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/torch_utils.py +0 -0
  409. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/_setup_returnn_env.py +0 -0
  410. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/analyze-dataset-batches.py +0 -0
  411. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bliss-collect-seq-lens.py +0 -0
  412. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bliss-dump-text.py +0 -0
  413. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bliss-get-segment-names.py +0 -0
  414. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bliss-to-ogg-zip.py +0 -0
  415. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bpe-create-lexicon.py +0 -0
  416. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/calculate-word-error-rate.py +0 -0
  417. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/cleanup-old-models.py +0 -0
  418. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/collect-orth-symbols.py +0 -0
  419. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/collect-words.py +0 -0
  420. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/compile_native_op.py +0 -0
  421. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/compile_tf_graph.py +0 -0
  422. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/debug-dump-search-scores.py +0 -0
  423. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/debug-plot-search-scores.py +0 -0
  424. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-dataset-raw-strings.py +0 -0
  425. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-dataset.py +0 -0
  426. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-forward-stats.py +0 -0
  427. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-forward.py +0 -0
  428. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-network-json.py +0 -0
  429. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-pickle.py +0 -0
  430. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/extract_state_tying_from_dataset.py +0 -0
  431. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/get-attention-weights.py +0 -0
  432. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/get-best-model-epoch.py +0 -0
  433. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/hdf_dump.py +0 -0
  434. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/hdf_dump_translation_dataset.py +0 -0
  435. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/import-blocks-mt-model.py +0 -0
  436. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/import-t2t-mt-model.py +0 -0
  437. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/.gitignore +0 -0
  438. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/Makefile +0 -0
  439. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/README.md +0 -0
  440. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/README.md +0 -0
  441. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/libs_list +0 -0
  442. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  443. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  444. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  445. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/state_vars_list +0 -0
  446. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  447. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/file.h +0 -0
  448. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  449. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  450. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/main.cc +0 -0
  451. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/rescorer.h +0 -0
  452. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/vocabulary.cc +0 -0
  453. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/vocabulary.h +0 -0
  454. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/tf_avg_checkpoints.py +0 -0
  455. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/tf_inspect_checkpoint.py +0 -0
  456. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/tf_inspect_summary_log.py +0 -0
  457. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/torch_avg_checkpoints.py +0 -0
  458. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/torch_export_to_onnx.py +0 -0
  459. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/torch_inspect_checkpoint.py +0 -0
  460. {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: returnn
- Version: 1.20240802.134933
+ Version: 1.20240824.1611
  Summary: The RWTH extensible training framework for universal recurrent neural networks
  Home-page: https://github.com/rwth-i6/returnn/
  Author: Albert Zeyer
@@ -0,0 +1,2 @@
+ version = '1.20240824.001611'
+ long_version = '1.20240824.001611+git.a15f05e'
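These generated constants are what an installed copy of the package reports as its version. A minimal check (a sketch, assuming returnn exposes __version__ at the package level):

import returnn

# Expected to match the version recorded in _setup_info_generated.py for the installed
# release, i.e. 1.20240824.001611 for the newer of the two packages compared here.
print(returnn.__version__)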
@@ -1014,6 +1014,15 @@ class Backend(Generic[T]):
              return start + weight * (end - start)
          return rf.combine_bc(start, "+", rf.combine_bc(weight, "*", rf.combine_bc(end, "-", start)))

+     @staticmethod
+     def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
+         """
+         :param source:
+         :param spatial_dim:
+         :return: cumsum over spatial dim
+         """
+         raise NotImplementedError
+
      @staticmethod
      def matmul(a: Tensor[T], b: Tensor[T], *, reduce: Union[Dim, Sequence[Dim]], use_mask: bool = True) -> Tensor[T]:
          """
@@ -520,9 +520,12 @@ def masked_select(
      tensor: Tensor, *, mask: Tensor, dims: Sequence[Dim], out_dim: Optional[Dim] = None
  ) -> Tuple[Tensor, Dim]:
      """
+     This will pack the tensor based on the mask.
      In TF, this is ``boolean_mask``.
      The inverse of this is :func:`masked_scatter`.

+     Related: :func:`pack_padded`, which uses :func:`sequence_mask` as the mask.
+
      :param tensor:
      :param mask:
      :param dims: the order of the dims defines the format. those dims should be exactly the dims of the mask.
@@ -553,6 +556,7 @@ def sequence_mask(dims: Union[Dim, Sequence[Dim]], *, device: Optional[str] = No
      """
      :param dims:
      :param device:
+     :return: mask based on the sequence lengths
      """
      if isinstance(dims, Dim):
          dims = [dims]
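A small illustration of the packing semantics documented above (a hedged sketch, assuming the PyTorch backend; the mask here is an arbitrary boolean tensor rather than a sequence mask):

import returnn.frontend as rf
from returnn.tensor import Dim

rf.select_backend_torch()  # assumption: PyTorch backend
batch_dim = Dim(2, name="batch")
time_dim = Dim(3, name="time")
x = rf.random_uniform([batch_dim, time_dim])
mask = x > 0.5  # elementwise comparison -> bool Tensor over [batch, time]
packed, packed_dim = rf.masked_select(x, mask=mask, dims=[batch_dim, time_dim])
# packed has the single dim packed_dim, whose size is the number of True entries in mask.
# rf.sequence_mask(dims) builds such a mask from dynamic sequence lengths, which is what
# pack_padded uses internally.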
@@ -0,0 +1,246 @@
1
+ """
2
+ Import the parameters from the HuggingFace Llama model (PyTorch).
3
+ """
4
+
5
+ from __future__ import annotations
6
+ from typing import TYPE_CHECKING, Union
7
+ import returnn.frontend as rf
8
+ from returnn.frontend.decoder.transformer import TransformerDecoder, TransformerDecoderLayer, FeedForwardGated
9
+
10
+ if TYPE_CHECKING:
11
+ from transformers.models.llama.modeling_llama import (
12
+ LlamaModel,
13
+ LlamaForCausalLM,
14
+ LlamaDecoderLayer,
15
+ LlamaMLP,
16
+ LlamaRMSNorm,
17
+ LlamaAttention,
18
+ )
19
+
20
+
21
+ def import_params_hf_llama_to_rf_transformer_decoder(
22
+ model_hf: Union[LlamaModel, LlamaForCausalLM], model_rf: TransformerDecoder
23
+ ):
24
+ """
25
+ Import params from HF Llama model to RF :class:`TransformerDecoder`.
26
+ """
27
+ import torch
28
+ from transformers.models.llama.modeling_llama import LlamaModel, LlamaForCausalLM, LlamaDecoderLayer
29
+
30
+ print("HF Model:")
31
+ print(model_hf)
32
+ print("Parameters:")
33
+ num_params_hf = 0
34
+ for k, v in model_hf.named_parameters():
35
+ print(f"{k}: {list(v.shape)} {v.dtype}")
36
+ num_params_hf += v.numel()
37
+ print("Total number of parameters:", num_params_hf)
38
+
39
+ print("RF Model:")
40
+ print(model_rf)
41
+ print("Parameters:")
42
+ num_params_rf = 0
43
+ for k, v in model_rf.named_parameters():
44
+ print(f"{k}: {list(v.dims)} {v.dtype}")
45
+ assert isinstance(v.raw_tensor, torch.nn.Parameter)
46
+ num_params_rf += v.num_elements()
47
+ print("Total number of parameters:", num_params_rf)
48
+ # Check if the number of parameters is the same below.
49
+ # First import individual sub modules.
50
+ # We might detect any mismatches there, and this will easy the debugging.
51
+
52
+ lm_head = None
53
+ if isinstance(model_hf, LlamaForCausalLM):
54
+ lm_head = model_hf.lm_head
55
+ model_hf = model_hf.model
56
+ else:
57
+ # Exclude logits.
58
+ num_params_rf -= model_rf.logits.weight.num_elements()
59
+ assert isinstance(model_hf, LlamaModel)
60
+ assert model_hf.norm.weight.shape[0] == model_rf.model_dim.dimension
61
+
62
+ assert len(model_hf.layers) == len(model_rf.layers)
63
+ for i, (layer_hf, layer_rf) in enumerate(zip(model_hf.layers, model_rf.layers)):
64
+ assert isinstance(layer_hf, LlamaDecoderLayer)
65
+ assert isinstance(layer_rf, TransformerDecoderLayer)
66
+ import_params_hf_llama_decoder_layer_to_rf(layer_hf, layer_rf)
67
+
68
+ assert model_hf.embed_tokens.weight.shape == model_rf.input_embedding.weight.raw_tensor.shape
69
+ with torch.no_grad():
70
+ model_rf.input_embedding.weight.raw_tensor.copy_(model_hf.embed_tokens.weight) # (vocab,hidden)
71
+
72
+ assert isinstance(model_rf.final_layer_norm, rf.RMSNorm)
73
+ import_params_hf_llama_rms_norm_to_rf(model_hf.norm, model_rf.final_layer_norm)
74
+
75
+ if lm_head is not None:
76
+ assert lm_head.bias is None and model_rf.logits.bias is None # not implemented
77
+ # Torch Linear: (out,in), but RF has (in,out).
78
+ with torch.no_grad():
79
+ model_rf.logits.weight.raw_tensor.copy_(lm_head.weight.T) # (hidden,vocab)
80
+
81
+ assert num_params_rf == num_params_hf, f"missmatch num params: RF {num_params_rf} != HF {num_params_hf}"
82
+
83
+
84
+ def import_params_hf_llama_decoder_layer_to_rf(model_hf: LlamaDecoderLayer, model_rf: TransformerDecoderLayer):
85
+ """
86
+ Import the parameters from the HF Llama decoder layer.
87
+ """
88
+ import torch
89
+
90
+ assert model_hf.hidden_size == model_rf.out_dim.dimension
91
+
92
+ print("HF Model:")
93
+ print(model_hf)
94
+ print("Parameters:")
95
+ num_params_hf = 0
96
+ for k, v in model_hf.named_parameters():
97
+ print(f"{k}: {list(v.shape)} {v.dtype}")
98
+ num_params_hf += v.numel()
99
+ print("Total number of parameters:", num_params_hf)
100
+
101
+ print("RF Model:")
102
+ print(model_rf)
103
+ print("Parameters:")
104
+ num_params_rf = 0
105
+ for k, v in model_rf.named_parameters():
106
+ print(f"{k}: {list(v.dims)} {v.dtype}")
107
+ assert isinstance(v.raw_tensor, torch.nn.Parameter)
108
+ num_params_rf += v.num_elements()
109
+ print("Total number of parameters:", num_params_rf)
110
+ # Check if the number of parameters is the same below.
111
+ # First import individual sub modules.
112
+ # We might detect any mismatches there, and this will easy the debugging.
113
+
114
+ assert isinstance(model_rf.ff, FeedForwardGated), f"unexpected: {model_rf.ff}"
115
+ import_params_hf_llama_mlp_to_rf_feed_forward_gated(model_hf.mlp, model_rf.ff)
116
+
117
+ assert isinstance(model_rf.self_att, rf.RotaryPosCausalSelfAttention), f"unexpected: {model_rf.self_att}"
118
+ import_params_hf_llama_att_to_rf_rotary_att(model_hf.self_attn, model_rf.self_att)
119
+
120
+ assert isinstance(model_rf.self_att_layer_norm, rf.RMSNorm), f"unexpected: {model_rf.self_att_layer_norm}"
121
+ import_params_hf_llama_rms_norm_to_rf(model_hf.input_layernorm, model_rf.self_att_layer_norm)
122
+
123
+ assert isinstance(model_rf.ff_layer_norm, rf.RMSNorm), f"unexpected: {model_rf.ff_layer_norm}"
124
+ import_params_hf_llama_rms_norm_to_rf(model_hf.post_attention_layernorm, model_rf.ff_layer_norm)
125
+
126
+ assert num_params_rf == num_params_hf
127
+
128
+
129
+ def import_params_hf_llama_mlp_to_rf_feed_forward_gated(model_hf: LlamaMLP, model_rf: FeedForwardGated):
130
+ """
131
+ Import the parameters from the HF Llama MLP module.
132
+ """
133
+ import torch
134
+
135
+ assert model_hf.hidden_size == model_rf.out_dim.dimension == model_rf.linear_ff.in_dim.dimension
136
+
137
+ print("HF Model:")
138
+ print(model_hf)
139
+ print("Parameters:")
140
+ num_params_hf = 0
141
+ for k, v in model_hf.named_parameters():
142
+ print(f"{k}: {list(v.shape)} {v.dtype}")
143
+ num_params_hf += v.numel()
144
+ print("Total number of parameters:", num_params_hf)
145
+
146
+ print("RF Model:")
147
+ print(model_rf)
148
+ print("Parameters:")
149
+ num_params_rf = 0
150
+ for k, v in model_rf.named_parameters():
151
+ print(f"{k}: {list(v.dims)} {v.dtype}")
152
+ assert isinstance(v.raw_tensor, torch.nn.Parameter)
153
+ num_params_rf += v.num_elements()
154
+ print("Total number of parameters:", num_params_rf)
155
+ assert num_params_rf == num_params_hf
156
+
157
+ # Torch Linear: (out,in), but RF has (in,out).
158
+ w1 = model_hf.gate_proj.weight.T # (in,out)
159
+ w2 = model_hf.up_proj.weight.T # (in,out)
160
+ w3 = model_hf.down_proj.weight.T # (out,in)
161
+ assert model_hf.gate_proj.bias is None # not implemented
162
+ assert model_hf.up_proj.bias is None # not implemented
163
+ assert model_hf.down_proj.bias is None # not implemented
164
+ with torch.no_grad():
165
+ w = torch.cat((w1, w2), dim=1) # (in,out*2)
166
+ model_rf.linear_ff.weight.raw_tensor.copy_(w)
167
+ model_rf.linear_out.weight.raw_tensor.copy_(w3)
168
+
169
+
+def import_params_hf_llama_rms_norm_to_rf(model_hf: LlamaRMSNorm, model_rf: rf.RMSNorm):
+    """
+    Import the parameters from the HF Llama RMSNorm module.
+    """
+    import torch
+
+    assert model_hf.weight.shape[0] == model_rf.in_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    w = model_hf.weight  # (in,)
+    with torch.no_grad():
+        model_rf.scale.raw_tensor.copy_(w)
+
+
+def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_rf: rf.RotaryPosCausalSelfAttention):
+    """
+    Import the parameters from the HF Llama attention module.
+    """
+    import torch
+
+    assert model_hf.num_heads == model_rf.num_heads.dimension
+    assert model_hf.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.hidden_size
+    nh = model_hf.num_heads
+    hdim = dim // nh
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf, f"num params RF {num_params_rf} != params HF {num_params_hf}"
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    q = model_hf.q_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    k = model_hf.k_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    v = model_hf.v_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    q = q.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    k = k.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+    qkv = qkv.reshape(dim, 3 * dim)
+    assert model_hf.q_proj.bias is None  # not implemented
+    with torch.no_grad():
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.proj.weight.raw_tensor.copy_(model_hf.o_proj.weight.T)
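The reshape/transpose/flatten applied to q and k above is a per-head permutation of the feature axis that regroups the rotary (real, imaginary) pairs. A small standalone sketch of its effect on one head's feature axis (hypothetical size, not tied to any Llama checkpoint):

    import torch

    hdim = 8
    x = torch.arange(hdim)                                     # [0, 1, 2, 3, 4, 5, 6, 7]
    y = x.reshape(2, hdim // 2).transpose(-1, -2).flatten(-2)
    print(y)  # tensor([0, 4, 1, 5, 2, 6, 3, 7]); pair (i, i + hdim/2) becomes adjacent

So the HF Llama layout of "first all real parts, then all imaginary parts" per head is turned into interleaved pairs, which the RF rotary self-attention is assumed to expect.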
@@ -13,10 +13,12 @@ References:
 
 from __future__ import annotations
 from typing import Optional, Any, Union, Tuple, Dict, Callable, Sequence
+from types import FunctionType
 import functools
 import logging
 import copy as _copy
 from returnn.util.basic import NotSpecified, BehaviorVersion
+from returnn.util.math import ceil_div
 import returnn.frontend as rf
 from returnn.tensor import Tensor, Dim, single_step_dim
 
@@ -36,6 +38,7 @@ class TransformerDecoder(rf.Module):
         ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Union[Dim, int] = NotSpecified,
         ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
+        pos_enc: Union[None, Callable, Dict[str, Any], rf.Module] = rf.sinusoidal_positional_encoding,
         dropout: float = 0.1,
         num_heads: int = 8,
         att_dropout: float = 0.1,
@@ -57,6 +60,7 @@
         :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
+        :param pos_enc: positional encoding. Default is sinusoidal positional encoding.
         :param dropout: the dropout value for the FF block
         :param num_heads: the number of attention heads
         :param att_dropout: attention dropout value
@@ -92,10 +96,21 @@
         if embed_dim:
             self.input_embedding_proj = rf.Linear(embed_dim, model_dim, with_bias=False)
 
-        # TODO This should be configurable...
-        self.pos_enc = functools.partial(
-            rf.sinusoidal_positional_encoding, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
-        )
+        if pos_enc is None:
+            pass
+        elif isinstance(pos_enc, dict):
+            pos_enc = rf.build_from_dict(
+                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
+            )
+        elif isinstance(pos_enc, rf.Module):
+            pass
+        elif isinstance(pos_enc, FunctionType):
+            pos_enc = functools.partial(
+                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
+            )
+        else:
+            raise TypeError(f"unexpected pos_enc type {pos_enc!r}")
+        self.pos_enc = pos_enc
         if share_embedding is None:
             if BehaviorVersion.get() < 20:
                 logging.getLogger("returnn.frontend").warning(
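With this dispatch, the positional encoding becomes a constructor option instead of being hard-wired. A hedged usage sketch; the surrounding constructor arguments (num_layers, encoder_dim, vocab_dim, model_dim) and the Dim values are illustrative assumptions, not taken from this diff:

    import returnn.frontend as rf
    from returnn.frontend.decoder.transformer import TransformerDecoder
    from returnn.tensor import Dim

    vocab_dim = Dim(1000, name="vocab")
    model_dim = Dim(512, name="transformer-dec-model")

    # Default: sinusoidal absolute positional encoding, same behavior as before.
    dec_default = TransformerDecoder(num_layers=6, encoder_dim=None, vocab_dim=vocab_dim, model_dim=model_dim)

    # pos_enc=None disables the additive absolute encoding, e.g. when the
    # self-attention already handles positions (rotary / relative variants).
    dec_no_abs_pos = TransformerDecoder(
        num_layers=6, encoder_dim=None, vocab_dim=vocab_dim, model_dim=model_dim, pos_enc=None
    )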
@@ -189,7 +204,8 @@
         new_state = rf.State()
 
         decoded = self.input_embedding(source) * self.input_embedding_scale
-        decoded = decoded + self.pos_enc(spatial_dim=spatial_dim, offset=state.pos)
+        if self.pos_enc is not None:
+            decoded = decoded + self.pos_enc(spatial_dim=spatial_dim, offset=state.pos)
         decoded = rf.dropout(decoded, self.input_dropout)
         if self.input_embedding_proj is not None:
             decoded = self.input_embedding_proj(decoded)
@@ -228,7 +244,9 @@ class TransformerDecoderLayer(rf.Module):
         ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
-        self_att: Optional[Union[rf.CausalSelfAttention, rf.RelPosCausalSelfAttention, rf.Module, type, Any]] = None,
+        self_att: Optional[
+            Union[rf.CausalSelfAttention, rf.RelPosCausalSelfAttention, rf.Module, type, Dict[str, Any]]
+        ] = None,
         self_att_opts: Optional[Dict[str, Any]] = None,
         att_dropout: float = 0.1,
         norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
@@ -271,7 +289,7 @@
         self.ff = ff
         self.ff_layer_norm = _make_norm(norm, out_dim)
 
-        if self_att is None or isinstance(self_att, type):
+        if self_att is None or isinstance(self_att, type) or isinstance(self_att, dict):
             self_att_opts_ = dict(
                 in_dim=out_dim,
                 proj_dim=out_dim,
@@ -284,10 +302,16 @@
                 self_att_opts_.update(self_att_opts)
             if self_att is None:
                 self.self_att = rf.CausalSelfAttention(**self_att_opts_)
-            else:
+            elif isinstance(self_att, type):
                 self.self_att = self_att(**self_att_opts_)
+            elif isinstance(self_att, dict):
+                self.self_att = rf.build_from_dict(self_att, **self_att_opts_)
+            else:
+                raise TypeError(f"unexpected self_att type {self_att!r}")
+        elif isinstance(self_att, rf.Module):
+            self.self_att = _copy.deepcopy(self_att)
         else:
-            self.self_att = self_att
+            raise TypeError(f"unexpected self_att type {self_att!r}")
         self.self_att_layer_norm = _make_norm(norm, out_dim)
 
         self.cross_att = None
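Accepting a dict or an already constructed rf.Module for self_att makes the attention variant configurable. A hedged sketch; the encoder_dim/out_dim argument names and the {"class": ...} dict convention for rf.build_from_dict are illustrative assumptions:

    import returnn.frontend as rf
    from returnn.frontend.decoder.transformer import TransformerDecoderLayer
    from returnn.tensor import Dim

    model_dim = Dim(512, name="transformer-dec-model")

    # Pass the self-attention as a class: in_dim, proj_dim, num_heads etc. are
    # filled in from self_att_opts_ inside the layer, as in the code above.
    layer_a = TransformerDecoderLayer(encoder_dim=None, out_dim=model_dim, self_att=rf.RotaryPosCausalSelfAttention)

    # The new dict path goes through rf.build_from_dict with the same filled-in options.
    layer_b = TransformerDecoderLayer(
        encoder_dim=None, out_dim=model_dim, self_att={"class": rf.RotaryPosCausalSelfAttention}
    )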
@@ -353,12 +377,15 @@ class FeedForward(rf.Module):
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
         dropout: float = 0.1,
         activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
+        with_bias: bool = True,
     ):
         """
         :param out_dim: output feature dimension
         :param ff_dim: dimension of the feed-forward layers
         :param dropout: dropout value
         :param activation: activation function, relu by default
+        :param with_bias: whether to use bias in the linear layers.
+            True by default for compatibility, but nowadays it's common to use without bias.
         """
         super().__init__()
 
@@ -381,8 +408,8 @@
         self.dropout_broadcast = rf.dropout_broadcast_default()
         self.activation = activation
 
-        self.linear_ff = rf.Linear(out_dim, ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
+        self.linear_ff = rf.Linear(out_dim, ff_dim, with_bias=with_bias)
+        self.linear_out = rf.Linear(ff_dim, out_dim, with_bias=with_bias)
 
     def __call__(self, inp: Tensor) -> Tensor:
         """forward"""
@@ -401,6 +428,8 @@ class FeedForwardGated(rf.Module):
         f(Linear(x)) * Linear(x)
 
     This is a feed-forward block based on SwiGLU, as defined in the paper.
+
+    Alternative to :class:`FeedForward`.
     """
 
     def __init__(
@@ -410,14 +439,30 @@
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
         dropout: float = 0.1,
         activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.swish,
+        with_bias: bool = False,
     ):
+        """
+        :param out_dim:
+        :param ff_dim: intermediate dimension.
+            Unlike :class:`FeedForward`:
+            If not provided, factor 4*2/3 to keep same number of parameters as in the original :class:`FeedForward`,
+            just as in the paper, and also making it a multiple of 256.
+        :param dropout:
+        :param activation: activation function for the gating. unlike :class:`FeedForward`, default is swish.
+        :param with_bias: whether to use bias in the linear layers.
+            unlike :class:`FeedForward`, default is False.
+        """
         super().__init__()
 
         if isinstance(ff_dim, int):
             ff_dim = Dim(ff_dim, name="transformer-ff-dim")
         if ff_dim is NotSpecified or ff_dim is None:
-            # Factor 2/3 to keep same number of parameters as in the original FF block, just as in the paper.
-            ff_dim = out_dim * 2 // 3
+            # Factor 4 as usual.
+            # The additional factor 2/3 to keep same number of parameters as in the original FF block,
+            # just as in the paper.
+            ff_dim_ = out_dim.dimension * 4 * 2 // 3
+            ff_dim_ = ceil_div(ff_dim_, 256) * 256  # make multiple of 256
+            ff_dim = Dim(ff_dim_, name="transformer-ff-dim")
         if not isinstance(ff_dim, Dim):
             raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")
 
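To make the new default concrete: for an out_dim of 512, the intermediate size is 512 * 4 * 2 // 3 = 1365, rounded up to the next multiple of 256, i.e. 1536. A tiny self-contained check of that arithmetic; the local ceil_div is assumed to behave like returnn.util.math.ceil_div:

    def ceil_div(a: int, b: int) -> int:
        # assumed to match returnn.util.math.ceil_div: integer division rounding up
        return -(-a // b)

    out_dim = 512
    ff_dim = out_dim * 4 * 2 // 3         # 1365
    ff_dim = ceil_div(ff_dim, 256) * 256  # rounded up to a multiple of 256
    assert ff_dim == 1536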
@@ -434,8 +479,8 @@
         self.activation = activation
 
         # Factor 2 because we concatenate the two paths.
-        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
+        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim, with_bias=with_bias)
+        self.linear_out = rf.Linear(ff_dim, out_dim, with_bias=with_bias)
 
     def __call__(self, inp: Tensor) -> Tensor:
         """forward"""
@@ -69,6 +69,7 @@ __all__ = [
     "log_softmax",
     "gating",
     "lerp",
+    "cumsum",
 ]
 
 
@@ -612,3 +613,11 @@ def lerp(
     """
     # noinspection PyProtectedMember
    return start._raw_backend.lerp(start, end, weight, allow_broadcast_all_sources=allow_broadcast_all_sources)
+
+
+def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
+    """
+    Applies cumsum.
+    """
+    # noinspection PyProtectedMember
+    return source._raw_backend.cumsum(source, spatial_dim=spatial_dim)
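The new rf.cumsum simply dispatches to the active backend. A hedged usage sketch; rf.range_over_dim and the concrete values are illustrative assumptions:

    import returnn.frontend as rf
    from returnn.tensor import Dim

    time_dim = Dim(5, name="time")
    x = rf.range_over_dim(time_dim)         # assumed helper: [0, 1, 2, 3, 4] along time
    y = rf.cumsum(x, spatial_dim=time_dim)  # expected [0, 1, 3, 6, 10] along time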
@@ -730,6 +730,11 @@ class ReturnnLayersBackend(Backend[Layer]):
             name="clip_by_value",
         )
 
+    @staticmethod
+    def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
+        """cumsum"""
+        return rfl.make_layer({"class": "cumsum", "from": source, "axis": spatial_dim}, name="cumsum")
+
     @staticmethod
     def matmul(a: Tensor, b: Tensor, *, reduce: Union[Dim, Sequence[Dim]], use_mask: bool = True) -> Tensor:
         """matmul"""
@@ -983,6 +983,9 @@ class TorchBackend(Backend[torch.Tensor]):
         elif axis_int == 0 and source.batch_ndim == 2:
             # This is exactly what torch.embedding is intended for. Let's use that.
             out.raw_tensor = torch.embedding(source.raw_tensor, indices.raw_tensor)
+        elif indices.batch_ndim <= 1:
+            # Note: This also works when indices is on CPU and source is on GPU.
+            out.raw_tensor = source.raw_tensor[(slice(None),) * axis_int + (indices.raw_tensor,)]
         else:
             out_raw = torch.index_select(source.raw_tensor, dim=axis_int, index=indices.raw_tensor.flatten())
             out_shape = (
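The new branch handles scalar or 1-D index tensors with plain advanced indexing instead of index_select plus reshape. A small plain-PyTorch sketch of the equivalence (shapes are illustrative only):

    import torch

    source = torch.arange(24).reshape(2, 3, 4)
    indices = torch.tensor([2, 0])
    axis_int = 1

    a = source[(slice(None),) * axis_int + (indices,)]           # the new fast path
    b = torch.index_select(source, dim=axis_int, index=indices)  # the generic path
    assert torch.equal(a, b)

As the comment in the diff notes, the indexing form also works when the indices stay on CPU while the source is on GPU.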
@@ -1189,6 +1192,14 @@
         )
         return out
 
+    @staticmethod
+    def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
+        """cumsum"""
+        axis = source.get_axis_from_description(spatial_dim)
+        out = source.copy_template("cumsum")
+        out.raw_tensor = torch.cumsum(source.raw_tensor, dim=axis, dtype=source.raw_tensor.dtype)
+        return out
+
     @staticmethod
     def matmul(a: _TT, b: _TT, *, reduce: Union[Dim, Sequence[Dim]], use_mask: bool = True) -> _TT:
         """
@@ -79,7 +79,7 @@ def _get_class_init_kwargs(optim_class):
     return optim_class_init_kwargs
 
 
-class Updater(object):
+class Updater:
     """
     Wraps a torch.optim.Optimizer, and extends it by some further functionality.
     """