returnn 1.20240730.135048__tar.gz → 1.20240731.50408__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of returnn might be problematic (see the registry page for details).

Files changed (458)
  1. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/PKG-INFO +1 -1
  2. returnn-1.20240731.50408/_setup_info_generated.py +2 -0
  3. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/basic.py +1 -0
  4. returnn-1.20240731.50408/returnn/datasets/postprocessing.py +212 -0
  5. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/attention.py +69 -5
  6. returnn-1.20240731.50408/returnn/frontend/conversions/__init__.py +3 -0
  7. returnn-1.20240731.50408/returnn/frontend/conversions/hf_llama.py +56 -0
  8. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/decoder/transformer.py +104 -11
  9. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/linear.py +1 -1
  10. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/normalization.py +41 -5
  11. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/debug.py +188 -1
  12. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/PKG-INFO +1 -1
  13. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/SOURCES.txt +3 -0
  14. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm-inspection-profile.xml +2 -1
  15. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +2 -1
  16. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Dataset.py +52 -1
  17. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_attention.py +239 -0
  18. returnn-1.20240730.135048/_setup_info_generated.py +0 -2
  19. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.editorconfig +0 -0
  20. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.gitignore +0 -0
  21. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.gitmodules +0 -0
  22. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.kateconfig +0 -0
  23. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CHANGELOG.md +0 -0
  24. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CODEOWNERS +0 -0
  25. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CONTRIBUTING.md +0 -0
  26. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/LICENSE +0 -0
  27. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/MANIFEST.in +0 -0
  28. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/README.rst +0 -0
  29. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/__init__.py +0 -0
  30. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/12AX.cluster_map +0 -0
  31. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/_setup_returnn_env.py +0 -0
  32. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-fwd.config +0 -0
  33. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.py +0 -0
  34. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.py.sh +0 -0
  35. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.sh +0 -0
  36. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-hyper-param-tuning.config +0 -0
  37. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-iter-dataset.py +0 -0
  38. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-list-devices.py +0 -0
  39. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-lua-torch-layer.config +0 -0
  40. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-pretrain.config +0 -0
  41. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-record-and-push-to-webserver.py +0 -0
  42. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-returnn-as-framework.py +0 -0
  43. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rf-pt-benchmark.py +0 -0
  44. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rf.config +0 -0
  45. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rhn-enwik8.config +0 -0
  46. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-sprint-interface.py +0 -0
  47. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-att-copy.config +0 -0
  48. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-attention.config +0 -0
  49. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
  50. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
  51. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-enc-dec.config +0 -0
  52. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-hard-att-copy.config +0 -0
  53. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-lstm-benchmark.py +0 -0
  54. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
  55. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
  56. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm.12ax.config +0 -0
  57. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm2.12ax.config +0 -0
  58. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
  59. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-neural-transducer.12ax.config +0 -0
  60. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-explicit-lstm.config +0 -0
  61. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-explicit-rnn.config +0 -0
  62. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-self-att.config +0 -0
  63. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-search-compiled-graph.py +0 -0
  64. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
  65. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-timit-lstm-ctc.config +0 -0
  66. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-torch.config +0 -0
  67. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
  68. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo.sh +0 -0
  69. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
  70. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
  71. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
  72. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/README.txt +0 -0
  73. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/chars.txt +0 -0
  74. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_demo +0 -0
  75. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_fwd +0 -0
  76. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_real +0 -0
  77. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
  78. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/decode.py +0 -0
  79. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
  80. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/go.sh +0 -0
  81. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/lines.txt +0 -0
  82. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/eval.txt +0 -0
  83. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/train.txt +0 -0
  84. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/valid.txt +0 -0
  85. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/README.md +0 -0
  86. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/create_test_h5.py +0 -0
  87. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/forwardconfig +0 -0
  88. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/go.sh +0 -0
  89. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/trainconfig +0 -0
  90. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
  91. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
  92. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/go.sh +0 -0
  93. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
  94. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/pyproject.toml +0 -0
  95. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/requirements.txt +0 -0
  96. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__init__.py +0 -0
  97. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__main__.py +0 -0
  98. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__old_mod_loader__.py +0 -0
  99. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__setup__.py +0 -0
  100. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/config.py +0 -0
  101. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/__init__.py +0 -0
  102. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/audio.py +0 -0
  103. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/bundle_file.py +0 -0
  104. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/cached.py +0 -0
  105. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/cached2.py +0 -0
  106. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/distrib_files.py +0 -0
  107. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/generating.py +0 -0
  108. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/hdf.py +0 -0
  109. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/lm.py +0 -0
  110. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/map.py +0 -0
  111. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/meta.py +0 -0
  112. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/multi_proc.py +0 -0
  113. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/normalization_data.py +0 -0
  114. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/numpy_dump.py +0 -0
  115. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/raw_wav.py +0 -0
  116. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/sprint.py +0 -0
  117. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/stereo.py +0 -0
  118. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/__init__.py +0 -0
  119. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/feature_extraction.py +0 -0
  120. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/strings.py +0 -0
  121. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/vocabulary.py +0 -0
  122. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/__init__.py +0 -0
  123. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/base.py +0 -0
  124. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/batch.py +0 -0
  125. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/__init__.py +0 -0
  126. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/__main__.py +0 -0
  127. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/.git +0 -0
  128. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
  129. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
  130. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
  131. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
  132. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
  133. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
  134. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
  135. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
  136. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
  137. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
  138. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
  139. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
  140. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
  141. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
  142. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
  143. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
  144. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
  145. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
  146. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
  147. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
  148. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
  149. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
  150. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
  151. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
  152. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/__init__.py +0 -0
  153. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/README.md +0 -0
  154. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/__init__.py +0 -0
  155. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/edit.py +0 -0
  156. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/reroute.py +0 -0
  157. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/select.py +0 -0
  158. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/subgraph.py +0 -0
  159. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/transform.py +0 -0
  160. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/util.py +0 -0
  161. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/forward_iface.py +0 -0
  162. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/__init__.py +0 -0
  163. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_backend.py +0 -0
  164. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/__init__.py +0 -0
  165. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/backend.cpp +0 -0
  166. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/backend.hpp +0 -0
  167. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/module.cpp +0 -0
  168. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/module.hpp +0 -0
  169. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/py_utils.hpp +0 -0
  170. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/tensor_ops.cpp +0 -0
  171. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/tensor_ops.hpp +0 -0
  172. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_numpy_backend.py +0 -0
  173. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_random_journal.py +0 -0
  174. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_utils.py +0 -0
  175. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/array_.py +0 -0
  176. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/__init__.py +0 -0
  177. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/mel.py +0 -0
  178. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/specaugment.py +0 -0
  179. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/backend.py +0 -0
  180. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/build_from_dict.py +0 -0
  181. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/cond.py +0 -0
  182. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/const.py +0 -0
  183. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/container.py +0 -0
  184. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/control_flow_ctx.py +0 -0
  185. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/conv.py +0 -0
  186. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/decoder/__init__.py +0 -0
  187. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/device.py +0 -0
  188. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dims.py +0 -0
  189. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dropout.py +0 -0
  190. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dtype.py +0 -0
  191. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/__init__.py +0 -0
  192. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/base.py +0 -0
  193. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/conformer.py +0 -0
  194. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/gradient.py +0 -0
  195. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/graph.py +0 -0
  196. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/hooks.py +0 -0
  197. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/init.py +0 -0
  198. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/label_smoothing.py +0 -0
  199. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/loop.py +0 -0
  200. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/loss.py +0 -0
  201. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/math_.py +0 -0
  202. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/matmul.py +0 -0
  203. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/module.py +0 -0
  204. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parameter.py +0 -0
  205. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parametrizations.py +0 -0
  206. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parametrize.py +0 -0
  207. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/piecewise_linear.py +0 -0
  208. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/rand.py +0 -0
  209. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/rec.py +0 -0
  210. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/reduce.py +0 -0
  211. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/run_ctx.py +0 -0
  212. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/signal.py +0 -0
  213. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/state.py +0 -0
  214. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/stepwise_scheduler.py +0 -0
  215. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/tensor_array.py +0 -0
  216. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/types.py +0 -0
  217. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/__init__.py +0 -0
  218. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/common.py +0 -0
  219. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/git.py +0 -0
  220. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/import_.py +0 -0
  221. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/learning_rate_control.py +0 -0
  222. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/log.py +0 -0
  223. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/native_op.cpp +0 -0
  224. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/native_op.py +0 -0
  225. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/pretrain.py +0 -0
  226. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/__init__.py +0 -0
  227. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/cache.py +0 -0
  228. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/control.py +0 -0
  229. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/error_signals.py +0 -0
  230. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/extern_interface.py +0 -0
  231. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/interface.py +0 -0
  232. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/README.md +0 -0
  233. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/__init__.py +0 -0
  234. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_dim_extra.py +0 -0
  235. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_extra.py +0 -0
  236. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_mixin_base.py +0 -0
  237. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_op_overloads.py +0 -0
  238. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/control_flow_ctx.py +0 -0
  239. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/dim.py +0 -0
  240. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/marked_dim.py +0 -0
  241. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/tensor.py +0 -0
  242. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/tensor_dict.py +0 -0
  243. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/utils.py +0 -0
  244. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/__init__.py +0 -0
  245. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/compat.py +0 -0
  246. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/data_pipeline.py +0 -0
  247. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/distributed.py +0 -0
  248. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/engine.py +0 -0
  249. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/README.md +0 -0
  250. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/__init__.py +0 -0
  251. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/_backend.py +0 -0
  252. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/_utils.py +0 -0
  253. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/cond.py +0 -0
  254. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
  255. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
  256. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/dims.py +0 -0
  257. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/layer.py +0 -0
  258. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/loop.py +0 -0
  259. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/make_layer.py +0 -0
  260. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/masked_computation.py +0 -0
  261. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
  262. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
  263. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_low_level/__init__.py +0 -0
  264. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_low_level/_backend.py +0 -0
  265. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/horovod.py +0 -0
  266. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/hyper_param_tuning.py +0 -0
  267. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/__init__.py +0 -0
  268. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/base.py +0 -0
  269. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/basic.py +0 -0
  270. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/rec.py +0 -0
  271. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/segmental_model.py +0 -0
  272. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/signal_processing.py +0 -0
  273. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/variable.py +0 -0
  274. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/native_op.py +0 -0
  275. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/network.py +0 -0
  276. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/sprint.py +0 -0
  277. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/updater.py +0 -0
  278. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/__init__.py +0 -0
  279. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/basic.py +0 -0
  280. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/data.py +0 -0
  281. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/gradient_checkpoint.py +0 -0
  282. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/ken_lm.py +0 -0
  283. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/open_fst.py +0 -0
  284. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/README.md +0 -0
  285. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/__init__.py +0 -0
  286. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/__init__.py +0 -0
  287. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/extern_data.py +0 -0
  288. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/pipeline.py +0 -0
  289. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/queued_data_iter.py +0 -0
  290. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
  291. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/tensor_utils.py +0 -0
  292. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/distributed.py +0 -0
  293. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/engine.py +0 -0
  294. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/__init__.py +0 -0
  295. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/_backend.py +0 -0
  296. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/_rand.py +0 -0
  297. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/bridge.py +0 -0
  298. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/raw_ops.py +0 -0
  299. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/updater.py +0 -0
  300. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/README.md +0 -0
  301. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/__init__.py +0 -0
  302. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/array_.py +0 -0
  303. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/diagnose_gpu.py +0 -0
  304. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/gradient_checkpoint.py +0 -0
  305. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/scaled_gradient.py +0 -0
  306. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/__init__.py +0 -0
  307. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/basic.py +0 -0
  308. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/better_exchook.py +0 -0
  309. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/bpe.py +0 -0
  310. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/debug_helpers.py +0 -0
  311. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/file_cache.py +0 -0
  312. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/fsa.py +0 -0
  313. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/literal_py_to_pickle.py +0 -0
  314. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/math.py +0 -0
  315. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
  316. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/native_code_compiler.py +0 -0
  317. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/pprint.py +0 -0
  318. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py-to-pickle.cpp +0 -0
  319. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py_compat.py +0 -0
  320. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py_ext_mod_compiler.py +0 -0
  321. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/result_with_reason.py +0 -0
  322. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/sig_proc.py +0 -0
  323. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/task_system.py +0 -0
  324. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/train_proc_manager.py +0 -0
  325. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/watch_memory.py +0 -0
  326. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/dependency_links.txt +0 -0
  327. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/top_level.txt +0 -0
  328. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/rnn.py +0 -0
  329. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/setup.cfg +0 -0
  330. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/setup.py +0 -0
  331. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/DummySprintExec.py +0 -0
  332. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/.gitignore +0 -0
  333. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/.name +0 -0
  334. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
  335. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
  336. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
  337. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
  338. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/misc.xml +0 -0
  339. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/modules.xml +0 -0
  340. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/returnn.iml +0 -0
  341. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
  342. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_set_num_threads1.py +0 -0
  343. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_setup_returnn_env.py +0 -0
  344. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_setup_test_env.py +0 -0
  345. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/bpe-unicode-demo.codes +0 -0
  346. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/bpe-unicode-demo.vocab +0 -0
  347. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.fst +0 -0
  348. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.isyms +0 -0
  349. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.jpg +0 -0
  350. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.osyms +0 -0
  351. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lint_common.py +0 -0
  352. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/pycharm-inspect.py +0 -0
  353. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/pylint.py +0 -0
  354. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/returnn-as-framework.py +0 -0
  355. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/rf_utils.py +0 -0
  356. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/spelling.dic +0 -0
  357. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Config.py +0 -0
  358. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Fsa.py +0 -0
  359. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_GeneratingDataset.py +0 -0
  360. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_HDFDataset.py +0 -0
  361. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_LearningRateControl.py +0 -0
  362. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Log.py +0 -0
  363. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_MultiProcDataset.py +0 -0
  364. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Pretrain.py +0 -0
  365. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_ResNet.py +0 -0
  366. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_SprintDataset.py +0 -0
  367. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_SprintInterface.py +0 -0
  368. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFEngine.py +0 -0
  369. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNativeOp.py +0 -0
  370. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkLayer.py +0 -0
  371. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkRecLayer.py +0 -0
  372. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkSigProcLayer.py +0 -0
  373. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFUpdater.py +0 -0
  374. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFUtil.py +0 -0
  375. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TF_determinism.py +0 -0
  376. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TaskSystem.py +0 -0
  377. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TaskSystem_SharedMem.py +0 -0
  378. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TranslationDataset.py +0 -0
  379. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Util.py +0 -0
  380. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_demos.py +0 -0
  381. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_fork_exec.py +0 -0
  382. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_hdf_dump.py +0 -0
  383. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_array.py +0 -0
  384. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_base.py +0 -0
  385. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_cond.py +0 -0
  386. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_const.py +0 -0
  387. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_container.py +0 -0
  388. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_conv.py +0 -0
  389. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_encoder_conformer.py +0 -0
  390. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_gradient.py +0 -0
  391. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_label_smoothing.py +0 -0
  392. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_loop.py +0 -0
  393. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_math.py +0 -0
  394. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_normalization.py +0 -0
  395. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_piecewise_linear.py +0 -0
  396. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_rec.py +0 -0
  397. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_reduce.py +0 -0
  398. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_signal.py +0 -0
  399. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_tensor.py +0 -0
  400. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_tools.py +0 -0
  401. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_dataset.py +0 -0
  402. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_engine.py +0 -0
  403. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_frontend.py +0 -0
  404. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_internal_frontend.py +0 -0
  405. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_util.py +0 -0
  406. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/torch_utils.py +0 -0
  407. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/_setup_returnn_env.py +0 -0
  408. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/analyze-dataset-batches.py +0 -0
  409. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-collect-seq-lens.py +0 -0
  410. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-dump-text.py +0 -0
  411. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-get-segment-names.py +0 -0
  412. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-to-ogg-zip.py +0 -0
  413. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bpe-create-lexicon.py +0 -0
  414. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/calculate-word-error-rate.py +0 -0
  415. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/cleanup-old-models.py +0 -0
  416. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/collect-orth-symbols.py +0 -0
  417. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/collect-words.py +0 -0
  418. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/compile_native_op.py +0 -0
  419. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/compile_tf_graph.py +0 -0
  420. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/debug-dump-search-scores.py +0 -0
  421. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/debug-plot-search-scores.py +0 -0
  422. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-dataset-raw-strings.py +0 -0
  423. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-dataset.py +0 -0
  424. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-forward-stats.py +0 -0
  425. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-forward.py +0 -0
  426. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-network-json.py +0 -0
  427. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-pickle.py +0 -0
  428. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/extract_state_tying_from_dataset.py +0 -0
  429. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/get-attention-weights.py +0 -0
  430. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/get-best-model-epoch.py +0 -0
  431. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/hdf_dump.py +0 -0
  432. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/hdf_dump_translation_dataset.py +0 -0
  433. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/import-blocks-mt-model.py +0 -0
  434. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/import-t2t-mt-model.py +0 -0
  435. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/.gitignore +0 -0
  436. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/Makefile +0 -0
  437. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/README.md +0 -0
  438. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/README.md +0 -0
  439. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/libs_list +0 -0
  440. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
  441. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
  442. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
  443. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/state_vars_list +0 -0
  444. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/tensor_names_list +0 -0
  445. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/file.h +0 -0
  446. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
  447. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
  448. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/main.cc +0 -0
  449. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/rescorer.h +0 -0
  450. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/vocabulary.cc +0 -0
  451. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/vocabulary.h +0 -0
  452. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_avg_checkpoints.py +0 -0
  453. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_inspect_checkpoint.py +0 -0
  454. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_inspect_summary_log.py +0 -0
  455. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_avg_checkpoints.py +0 -0
  456. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_export_to_onnx.py +0 -0
  457. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_inspect_checkpoint.py +0 -0
  458. {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
{returnn-1.20240730.135048 → returnn-1.20240731.50408}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: returnn
-Version: 1.20240730.135048
+Version: 1.20240731.50408
 Summary: The RWTH extensible training framework for universal recurrent neural networks
 Home-page: https://github.com/rwth-i6/returnn/
 Author: Albert Zeyer
returnn-1.20240731.50408/_setup_info_generated.py
@@ -0,0 +1,2 @@
+version = '1.20240731.050408'
+long_version = '1.20240731.050408+git.89645c0'
{returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/basic.py
@@ -1388,6 +1388,7 @@ def get_dataset_class(name: Union[str, Type[Dataset]]) -> Optional[Type[Dataset]]:
         "map",
         "multi_proc",
         "distrib_files",
+        "postprocessing",
     ]
     for mod_name in mod_names:
         mod = import_module("returnn.datasets.%s" % mod_name)
returnn-1.20240731.50408/returnn/datasets/postprocessing.py
@@ -0,0 +1,212 @@
+"""
+Provides :class:`PostprocessingDataset`.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+
+from returnn.datasets.basic import DatasetSeq
+from returnn.datasets.util.vocabulary import Vocabulary
+from returnn.tensor import Tensor, TensorDict
+from returnn.tensor.dim import Dim
+from .basic import init_dataset
+from .cached2 import CachedDataset2
+
+__all__ = ["PostprocessingDataset"]
+
+
+class PostprocessingDataset(CachedDataset2):
+    """
+    A dataset that allows for generic post-processing of data from another dataset
+    using a function on the segment level and on the level of multiple segments via
+    an iterator.
+
+    This allows integrating various data augmentation techniques like e.g. Mixup,
+    SpecAugment or speed perturbation into the data loading pipeline.
+
+    The integration into the data loading pipeline makes it easy to distribute the
+    data processing work across multiple CPU cores using `MultiProcDataset` and in
+    turn frees the GPU from data preprocessing tasks.
+
+    Example usage::
+
+        from returnn.tensor.dim import Dim, DimTypes
+
+        time_dim = Dim(None, kind=DimTypes.Spatial)
+        new_data_dim = Dim(128)
+
+        train = {
+            "class": "PostprocessingDataset",
+            "dataset": {
+                "class": "HDFDataset",
+                "files": ["/path/to/data.hdf"],
+            },
+            # one of them, but not both:
+            "map_seq": map_seq,  # (data: TensorDict) -> TensorDict
+            "map_seq_stream": map_seqs,  # (iter: Iterator[TensorDict]) -> Iterator[TensorDict]
+            # only required when data shapes change wrt. the wrapped dataset:
+            "map_outputs": {
+                "data": {"dims": [time_dim, new_data_dim]},
+            },
+        }
+    """
+
+    def __init__(
+        self,
+        dataset: Dict[str, Any],
+        map_seq: Optional[Union[Callable[[TensorDict], TensorDict]]] = None,
+        map_seq_stream: Optional[Callable[[Iterator[TensorDict]], Iterator[TensorDict]]] = None,
+        map_outputs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        """
+        :param dataset: inner dataset to be post-processed
+        :param map_seq: post processor function operating on the single-segment level.
+            To avoid confusion on the order of how the processing functions are applied to the data, only one of
+            `map_seq` and `map_seq_stream` can be specified at a time.
+        :param map_seq_stream: post processor function operating on the multiple segment level via an iterator.
+            Allows merging multiple segments into one, or generating multiple output segments from one input segment.
+            To avoid confusion on the order of how the processing functions are applied to the data, only one of
+            `map_seq` and `map_seq_stream` can be specified at a time.
+        :param map_outputs: Type and axis specification of the outputs of the mapping functions,
+            like extern_data and model_outputs.
+            To simplify the common case when no shapes change, this value can be left unspecified. The dataset then
+            assumes the same data layout as returned by the wrapped dataset.
+            Example: `map_outputs={"data": {"dim": 42}}`
+        :param kwargs: see :class:`CachedDataset2`, :class:`Dataset`
+        """
+        super().__init__(**kwargs)
+
+        if self.seq_ordering != "default":
+            raise ValueError(f"{self}: specify seq_ordering in wrapped dataset, not in {self.__class__.__name__}")
+        if map_seq is None and map_seq_stream is None:
+            raise ValueError(f"{self}: need to either set map_seq or map_seq_stream")
+        if map_seq and map_seq_stream:
+            raise ValueError(f"{self}: cannot set both map_seq and map_seq_stream")
+
+        self._dataset_def = dataset
+        self._map_seq = map_seq
+        self._map_seq_stream = map_seq_stream
+        self._map_outputs = map_outputs
+
+        self._dataset = init_dataset(self._dataset_def, parent_dataset=self)
+        if self._map_seq_stream is None:
+            # if the stream mapper is set, the num_seqs may change and the estimation is less accurate
+            self._estimated_num_seqs = self._dataset.estimated_num_seqs
+        self._data_iter: Optional[Iterator[Tuple[int, TensorDict]]] = None
+
+        self._in_tensor_dict_template = TensorDict(
+            {name: self._make_tensor_template_from_input(name) for name in self._dataset.get_data_keys()}
+        )
+        if self._map_outputs is not None:
+            self._out_tensor_dict_template = TensorDict()
+            self._out_tensor_dict_template.update(self._map_outputs, auto_convert=True)
+        else:
+            self._out_tensor_dict_template = self._in_tensor_dict_template
+        self.num_outputs = {
+            k: (t.sparse_dim.size if t.sparse_dim else t.shape[-1] if len(t.shape) > 0 else 1, t.ndim)
+            for k, t in self._out_tensor_dict_template.data.items()
+        }
+        self._default_input = "data" if "data" in self.num_outputs else next(iter(self.num_outputs.keys()))
+        self.num_inputs = self.num_outputs[self._default_input][0]
+
+        self.labels = {}
+        for k, t in self._out_tensor_dict_template.data.items():
+            if t.vocab:
+                self.labels[k] = t.vocab.labels
+            elif t.sparse_dim:  # sparse_dim but not vocab
+                self.labels[k] = list(map(str, range(t.sparse_dim.dimension)))  # dummy labels
+
+    def init_seq_order(
+        self, epoch: Optional[int] = None, seq_list: Optional[List[str]] = None, seq_order: Optional[List[int]] = None
+    ):
+        """
+        :param epoch:
+        :param seq_list:
+        :param seq_order:
+        :return: whether the order changed (True is always safe to return)
+        """
+        super().init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
+
+        if epoch is None and seq_list is None and seq_order is None:
+            self._num_seqs = 0
+            return True
+
+        assert self._dataset is not None
+        self._dataset.init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
+        self._data_iter = enumerate(self._build_mapping_iter())
+        return True
+
+    def _collect_single_seq(self, seq_idx: int) -> Optional[DatasetSeq]:
+        while True:
+            try:
+                loaded_seq_idx, tensor_dict = next(self._data_iter)
+            except StopIteration:
+                return None
+            assert loaded_seq_idx <= seq_idx, "_collect_single_seq must be done monotonically"
+            if loaded_seq_idx != seq_idx:
+                continue
+            seq = DatasetSeq(features={k: t.raw_tensor for k, t in tensor_dict.data.items()}, seq_idx=seq_idx)
+            return seq
+
+    def _build_mapping_iter(self) -> Iterator[TensorDict]:
+        """
+        :return: an iterator applying both the segment level and across-segment transformations on the given dataset
+        """
+
+        def _validate_tensor_dict_iter(inner: Iterator[TensorDict]) -> Iterator[TensorDict]:
+            for t_dict in inner:
+                for data_key, out_t in self._out_tensor_dict_template.data.items():
+                    in_t = t_dict.data[data_key]
+                    assert (
+                        in_t.ndim == out_t.batch_ndim
+                        and in_t.dtype == out_t.dtype
+                        and all(d.dimension in (d_, None) for (d, d_) in zip(in_t.dims, out_t.shape))
+                    )
+                yield t_dict
+
+        data_iter = self._iterate_dataset()
+        if self._map_seq_stream is not None:
+            data_iter = self._map_seq_stream(data_iter)
+            assert isinstance(
+                data_iter, Iterator
+            ), f"map_seq_stream must produce an {Iterator.__name__}, but produced {type(data_iter).__name__}"
+        return _validate_tensor_dict_iter(data_iter)

+    def _iterate_dataset(self) -> Iterator[TensorDict]:
+        """
+        :return: generator providing data samples in the form of a TensorDict
+        """
+        data_keys = self._dataset.get_data_keys()
+
+        seq_index = 0
+        while self._dataset.is_less_than_num_seqs(seq_index):
+            self._dataset.load_seqs(seq_index, seq_index + 1)
+            tensor_dict = self._in_tensor_dict_template.copy_template()
+            for data_key in data_keys:
+                tensor_dict.data[data_key].raw_tensor = self._dataset.get_data(seq_index, data_key)
+            if self._map_seq is not None:
+                tensor_dict = self._map_seq(tensor_dict)
+                assert isinstance(
+                    tensor_dict, TensorDict
+                ), f"map_seq must produce a {TensorDict.__name__}, but produced {type(tensor_dict).__name__}"
+            yield tensor_dict
+            seq_index += 1
+
+    def _make_tensor_template_from_input(self, data_key: str) -> Tensor:
+        dtype = self._dataset.get_data_dtype(data_key)
+        if dtype == "string":
+            dims = []
+        else:
+            feature_dims = [
+                Dim(dimension=dim, name=f"{data_key}_dim{i + 1}")
+                for i, dim in enumerate(self._dataset.get_data_shape(data_key))
+            ]
+            dims = [Dim(dimension=None, name=f"{data_key}_frame"), *feature_dims]
+        sparse_dim = None
+        if self._dataset.is_data_sparse(data_key):
+            sparse_dim = Dim(dimension=self._dataset.get_data_dim(data_key), name=f"{data_key}_sparse")
+            if data_key in self._dataset.labels:
+                sparse_dim.vocab = Vocabulary.create_vocab_from_labels(self._dataset.labels[data_key])
+        return Tensor(data_key, dims=dims, dtype=dtype, sparse_dim=sparse_dim)
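
To illustrate the API the new dataset expects, here is a hypothetical `map_seq` post-processor (not part of this release's code): it receives one segment as a `TensorDict` whose entries carry numpy arrays in `raw_tensor`, modifies it, and returns a `TensorDict` of the same layout, so `map_outputs` can be omitted.

from returnn.tensor import TensorDict


def scale_features(data: TensorDict) -> TensorDict:
    """Hypothetical map_seq: scale the dense "data" features of one segment."""
    feat = data.data["data"]  # a Tensor; raw_tensor is a numpy array at this point
    feat.raw_tensor = (feat.raw_tensor * 0.9).astype(feat.raw_tensor.dtype)
    return data


train = {
    "class": "PostprocessingDataset",
    "dataset": {"class": "HDFDataset", "files": ["/path/to/data.hdf"]},
    "map_seq": scale_features,  # shapes unchanged, so no map_outputs needed
}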
{returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/attention.py
@@ -2,7 +2,6 @@
 Attention
 """
 
-
 from __future__ import annotations
 from typing import Tuple, Union, Optional, Sequence
 import weakref
@@ -17,6 +16,7 @@ __all__ = [
     "SelfAttention",
     "CausalSelfAttention",
     "CausalSelfAttentionState",
+    "RotaryPosCausalSelfAttention",
     "RelPosSelfAttention",
     "RelPosCausalSelfAttention",
     "CrossAttention",
@@ -264,6 +264,69 @@ class CausalSelfAttentionState(rf.State):
264
264
  self.accum_axis = accum_axis
265
265
 
266
266
 
267
+ class RotaryPosCausalSelfAttention(CausalSelfAttention):
268
+ """
269
+ Rotary positional encoding (RoPE)-based causal self attention
270
+ """
271
+
272
+ def __call__(
273
+ self,
274
+ source: Tensor,
275
+ axis: Dim,
276
+ *,
277
+ state: Optional[CausalSelfAttentionState] = None,
278
+ ) -> Tuple[Tensor, CausalSelfAttentionState]:
279
+ """forward"""
280
+ q, k, v = self.forward_qkv(source)
281
+ k, v, hist_dim, new_state = _causal_self_att_step(k, v, axis=axis, state=state, self=self)
282
+
283
+ # Apply RoPE using sinusoidal positional encoding.
284
+ # Note: base is a bit different in rf.sinusoidal_positional_encoding (like the original)
285
+ # vs how it's commonly used for RoPE.
286
+ # log(base) / (dim / 2 - 1) = log(10_000) * 2 / dim
287
+ # <=> log(base) = log(10_000) * (dim / 2 - 1) * 2 / dim = log(10_000) * (1 - 2 / dim)
288
+ # <=> base = 10_000 ** (1 - 2 / dim)
289
+ pos_enc = rf.sinusoidal_positional_encoding(
290
+ spatial_dim=hist_dim,
291
+ feat_dim=self.key_dim_per_head,
292
+ base=10_000 ** (1 - 2 / self.key_dim_per_head.dimension),
293
+ ) # [T,D]
294
+ q = _apply_rope(
295
+ q,
296
+ (
297
+ rf.gather(pos_enc, axis=hist_dim, indices=hist_dim.dyn_size_ext - 1)
298
+ if axis == single_step_dim
299
+ else rf.replace_dim(pos_enc, in_dim=hist_dim, out_dim=axis)[0]
300
+ ),
301
+ self.key_dim_per_head,
302
+ )
303
+ k = _apply_rope(k, pos_enc, self.key_dim_per_head)
304
+
305
+ output = self.attention(q, k, v, kv_axis=hist_dim)
306
+ return output, new_state
307
+
308
+
309
+ def _apply_rope(x: Tensor, pos_enc: Tensor, feat_dim: Dim) -> Tensor:
310
+ """
311
+ :param x: [...,T,D] or [...,D]
312
+ :param pos_enc: [T,D] or [D]
313
+ :param feat_dim: D
314
+ :return: [...,T,D] or [...,D]
315
+ """
316
+ feat_half_dim = feat_dim.div_left(2)
317
+ pe_imag, pe_real = rf.split(pos_enc, axis=feat_dim, out_dims=[feat_half_dim] * 2) # [T,D/2]
318
+ # pe_imag = sin, pe_real = cos
319
+ d2 = Dim(2, name="complex")
320
+ x = rf.split_dims(x, axis=feat_dim, dims=(feat_half_dim, d2)) # [...,T,D/2,2]
321
+ x_real = rf.gather(x, indices=0, axis=d2)
322
+ x_imag = rf.gather(x, indices=1, axis=d2)
323
+ x_real_ = x_real * pe_real - x_imag * pe_imag
324
+ x_imag_ = x_real * pe_imag + x_imag * pe_real
325
+ x_, _ = rf.stack((x_real_, x_imag_), out_dim=d2) # [...,T,D/2,2]
326
+ x_, _ = rf.merge_dims(x_, dims=(feat_half_dim, d2), out_dim=feat_dim) # [...,T,D]
327
+ return x_
328
+
329
+
267
330
  class RelPosSelfAttention(SelfAttentionBase):
268
331
  """
269
332
  Self-attention with relative positional encoding.
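
`_apply_rope` above treats each adjacent feature pair (x_real, x_imag) as a complex number and multiplies it by cos(θ) + i·sin(θ) from the positional encoding. A plain-PyTorch sketch of the same rotation, assuming the interleaved (real, imag) pair layout that `rf.split_dims` produces (the function name and shapes are illustrative, not part of this release)::

    import torch

    def apply_rope_ref(x: torch.Tensor, base: float) -> torch.Tensor:
        """x: [T, D] with D even; pairs (x[..., 2j], x[..., 2j+1]) = (real_j, imag_j)."""
        t, d = x.shape
        j = torch.arange(d // 2, dtype=x.dtype)
        # Same frequency table as div_term in sinusoidal_positional_encoding below.
        inv_freq = torch.exp(j * -(torch.log(torch.tensor(base, dtype=x.dtype)) / (d // 2 - 1)))
        theta = torch.arange(t, dtype=x.dtype)[:, None] * inv_freq[None, :]  # [T, D/2]
        x_real, x_imag = x[:, 0::2], x[:, 1::2]  # [T, D/2] each
        # Complex multiplication (x_real + i*x_imag) * (cos(theta) + i*sin(theta)):
        out_real = x_real * torch.cos(theta) - x_imag * torch.sin(theta)
        out_imag = x_real * torch.sin(theta) + x_imag * torch.cos(theta)
        return torch.stack((out_real, out_imag), dim=-1).flatten(-2)  # [T, D], interleaved again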
@@ -836,7 +899,7 @@ def relative_positional_encoding(
     return emb, out_spatial_dim
 
 
-_positional_encoding_cache = weakref.WeakKeyDictionary()  # run ctx -> cache key -> enc
+_sinusoidal_positional_encoding_cache = weakref.WeakKeyDictionary()  # run ctx -> cache key -> enc
 
 
 def sinusoidal_positional_encoding(
@@ -844,6 +907,7 @@ def sinusoidal_positional_encoding(
     spatial_dim: Dim,
     feat_dim: Dim,
     offset: Optional[Union[int, Tensor]] = None,
+    base: Union[int, float] = 1e4,
     dtype: Optional[str] = None,
     device: Optional[str] = None,
 ) -> Tensor:
@@ -867,8 +931,8 @@ def sinusoidal_positional_encoding(
         dtype = rf.get_default_float_dtype()
     if not device:
         device = rf.get_default_device()
-    cache = _positional_encoding_cache.setdefault(rf.get_run_ctx(), {})
-    cache_key = (spatial_dim, feat_dim, offset, dtype, device)
+    cache = _sinusoidal_positional_encoding_cache.setdefault(rf.get_run_ctx(), {})
+    cache_key = (spatial_dim, feat_dim, offset, base, dtype, device)
     if cache_key in cache:
         return cache[cache_key]
     import math
@@ -886,7 +950,7 @@ def sinusoidal_positional_encoding(
 
     feat2_dim = feat_dim.div_left(2)
     div_term = rf.exp(
-        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(1e4) / (feat2_dim.dimension - 1))
+        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(base) / (feat2_dim.dimension - 1))
     )
     arg_sin = rf.combine_bc(rf.cast(indices, dtype), "*", div_term)
     arg_cos = arg_sin + math.pi / 2.0
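
The `base` conversion used by `RotaryPosCausalSelfAttention` can be checked numerically against the common RoPE convention 10000^(-2j/D): with base = 10000^(1 - 2/D), the `div_term` above reproduces it exactly::

    import math

    d = 64  # example head dimension (illustrative)
    base = 10_000.0 ** (1 - 2 / d)  # as passed to sinusoidal_positional_encoding
    for j in range(d // 2):
        f_returnn = math.exp(j * -(math.log(base) / (d // 2 - 1)))  # div_term above
        f_rope = 10_000.0 ** (-2 * j / d)  # common RoPE inverse frequency
        assert math.isclose(f_returnn, f_rope, rel_tol=1e-9)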
@@ -0,0 +1,3 @@
+"""
+Model conversion code, to import model parameters from some external source
+"""
@@ -0,0 +1,56 @@
+"""
+Import the parameters from the HuggingFace Llama model.
+"""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import returnn.frontend as rf
+
+if TYPE_CHECKING:
+    from transformers.models.llama.modeling_llama import LlamaAttention
+
+
+def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_rf: rf.RotaryPosCausalSelfAttention):
+    """
+    Import the parameters from the HF attention module.
+    """
+    import torch
+
+    assert model_hf.num_heads == model_rf.num_heads.dimension
+    assert model_hf.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.hidden_size
+    nh = model_hf.num_heads
+    hdim = dim // nh
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    # Torch Linear stores weights as (out,in), but RF Linear uses (in,out).
+    q = model_hf.q_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    k = model_hf.k_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    v = model_hf.v_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    q = q.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # half-split -> interleaved complex pairs
+    k = k.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # half-split -> interleaved complex pairs
+    qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+    qkv = qkv.reshape(dim, 3 * dim)
+    assert model_hf.q_proj.bias is None  # bias not implemented
+    with torch.no_grad():
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.proj.weight.raw_tensor.copy_(model_hf.o_proj.weight.T)
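
A usage sketch (hypothetical sizes; the RF constructor arguments are assumed to follow `SelfAttentionBase`, and constructing `LlamaAttention` directly from a config is for illustration only)::

    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.conversions.hf_llama import import_params_hf_llama_att_to_rf_rotary_att
    from transformers import LlamaConfig
    from transformers.models.llama.modeling_llama import LlamaAttention

    config = LlamaConfig(hidden_size=512, num_attention_heads=8)  # illustrative sizes
    model_dim = Dim(512, name="model")
    att_rf = rf.RotaryPosCausalSelfAttention(
        in_dim=model_dim,
        proj_dim=model_dim,
        key_dim_total=model_dim,
        value_dim_total=model_dim,
        num_heads=8,
        with_bias=False,  # Llama q/k/v/o projections have no bias
    )
    import_params_hf_llama_att_to_rf_rotary_att(LlamaAttention(config), att_rf)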
@@ -33,11 +33,13 @@ class TransformerDecoder(rf.Module):
         model_dim: Union[Dim, int] = Dim(512, name="transformer-dec-default-model-dim"),
         *,
         num_layers: int,
+        ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Union[Dim, int] = NotSpecified,
-        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
+        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
         att_dropout: float = 0.1,
+        norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
         decoder_layer: Optional[Union[TransformerDecoderLayer, rf.Module, type, Any]] = None,
         decoder_layer_opts: Optional[Dict[str, Any]] = None,
         embed_dim: Optional[Dim] = None,
@@ -52,11 +54,13 @@ class TransformerDecoder(rf.Module):
         :param vocab_dim:
         :param model_dim: the output feature dimension
         :param num_layers: the number of decoder layers
+        :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
         :param dropout: the dropout value for the FF block
         :param num_heads: the number of attention heads
         :param att_dropout: attention dropout value
+        :param norm: pre-normalization for FF and attention blocks
         :param decoder_layer: an instance of :class:`TransformerDecoderLayer` or similar
         :param decoder_layer_opts: options for the decoder layer
         :param embed_dim: if given, will first have an embedding [vocab,embed] and then a linear [embed,model].
@@ -123,11 +127,13 @@ class TransformerDecoder(rf.Module):
             decoder_layer_opts_ = dict(
                 encoder_dim=encoder_dim,
                 out_dim=model_dim,
+                ff=ff,
                 ff_dim=ff_dim,
                 ff_activation=ff_activation,
                 dropout=dropout,
                 num_heads=num_heads,
                 att_dropout=att_dropout,
+                norm=norm,
             )
             if decoder_layer_opts:
                 decoder_layer_opts_.update(decoder_layer_opts)
@@ -140,7 +146,7 @@ class TransformerDecoder(rf.Module):
 
         self.layers = sequential(_copy.deepcopy(decoder_layer) for _ in range(num_layers))
 
-        self.final_layer_norm = rf.LayerNorm(model_dim)
+        self.final_layer_norm = _make_norm(norm, model_dim)
 
         self.logits = rf.Linear(model_dim, vocab_dim, with_bias=logits_with_bias)
 
@@ -217,17 +223,20 @@ class TransformerDecoderLayer(rf.Module):
         encoder_dim: Optional[Dim],
         out_dim: Dim = Dim(512, name="transformer-dec-default-out-dim"),
         *,
+        ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Union[Dim, int] = NotSpecified,
-        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
+        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
         self_att: Optional[Union[rf.CausalSelfAttention, rf.RelPosCausalSelfAttention, rf.Module, type, Any]] = None,
         self_att_opts: Optional[Dict[str, Any]] = None,
         att_dropout: float = 0.1,
+        norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
     ):
         """
         :param encoder_dim: for cross-attention. None if no cross-attention.
         :param out_dim: the output feature dimension
+        :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
         :param dropout: the dropout value for the FF block
@@ -235,6 +244,7 @@ class TransformerDecoderLayer(rf.Module):
         :param self_att: the self-attention layer. RelPosSelfAttention originally and default
         :param self_att_opts: options for the self-attention layer, for :class:`nn.RelPosSelfAttention`
         :param att_dropout: attention dropout value
+        :param norm: pre-normalization for FF and attention blocks
         """
         super().__init__()
 
@@ -243,8 +253,23 @@ class TransformerDecoderLayer(rf.Module):
         self.dropout_broadcast = rf.dropout_broadcast_default()
         self.out_dim = out_dim
 
-        self.ff = FeedForward(out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation)
-        self.ff_layer_norm = rf.LayerNorm(out_dim)
+        if ff is NotSpecified:
+            ff = FeedForward
+        if isinstance(ff, rf.Module):
+            ff = _copy.deepcopy(ff)
+        else:
+            ff_kwargs = dict(out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation)
+            ff_kwargs = {k: v for (k, v) in ff_kwargs.items() if v is not NotSpecified}
+            if isinstance(ff, type):
+                ff = ff(**ff_kwargs)
+            elif isinstance(ff, dict):
+                ff = rf.build_from_dict(ff, **ff_kwargs)
+            else:
+                raise TypeError(f"unexpected ff type {ff!r}")
+        assert isinstance(ff, rf.Module)
+
+        self.ff = ff
+        self.ff_layer_norm = _make_norm(norm, out_dim)
 
         if self_att is None or isinstance(self_att, type):
             self_att_opts_ = dict(
@@ -263,7 +288,7 @@ class TransformerDecoderLayer(rf.Module):
             self.self_att = self_att(**self_att_opts_)
         else:
             self.self_att = self_att
-        self.self_att_layer_norm = rf.LayerNorm(out_dim)
+        self.self_att_layer_norm = _make_norm(norm, out_dim)
 
         self.cross_att = None
         self.cross_att_layer_norm = None
@@ -277,7 +302,7 @@ class TransformerDecoderLayer(rf.Module):
                 num_heads=num_heads,
                 att_dropout=att_dropout,
             )
-            self.cross_att_layer_norm = rf.LayerNorm(out_dim)
+            self.cross_att_layer_norm = _make_norm(norm, out_dim)
 
     def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State:
         """default initial state"""
@@ -326,14 +351,14 @@ class FeedForward(rf.Module):
         out_dim: Dim,
         *,
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
-        dropout: float,
-        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module],
+        dropout: float = 0.1,
+        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
     ):
         """
         :param out_dim: output feature dimension
         :param ff_dim: dimension of the feed-forward layers
         :param dropout: dropout value
-        :param activation: activation function
+        :param activation: activation function, relu by default
         """
         super().__init__()
 
@@ -344,7 +369,9 @@ class FeedForward(rf.Module):
         if not isinstance(ff_dim, Dim):
             raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")
 
-        if isinstance(activation, dict):
+        if activation is NotSpecified:
+            activation = rf.relu
+        elif isinstance(activation, dict):
             activation = rf.build_from_dict(activation)
         elif not callable(activation):
             raise TypeError(f"{self}: unexpected activation type {activation!r}")
@@ -364,3 +391,69 @@ class FeedForward(rf.Module):
         x_drop = rf.dropout(x_act, self.dropout, axis=self.dropout_broadcast and self.linear_ff.out_dim)
         x_ff2 = self.linear_out(x_drop)
         return x_ff2
+
+
+class FeedForwardGated(rf.Module):
+    """
+    Gated feed-forward block. E.g. with f=swish=silu, this is SwiGLU,
+    from `GLU Variants Improve Transformer <https://arxiv.org/abs/2002.05202>`__::
+
+        f(Linear(x)) * Linear(x)
+    """
+
+    def __init__(
+        self,
+        out_dim: Dim,
+        *,
+        ff_dim: Optional[Union[Dim, int]] = NotSpecified,
+        dropout: float = 0.1,
+        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.swish,
+    ):
+        super().__init__()
+
+        if isinstance(ff_dim, int):
+            ff_dim = Dim(ff_dim, name="transformer-ff-dim")
+        if ff_dim is NotSpecified or ff_dim is None:
+            # Factor 2/3 on the usual 4*out_dim, to keep the same number of parameters
+            # as in the original FF block, just as in the paper.
+            ff_dim = out_dim * 4 * 2 // 3
+        if not isinstance(ff_dim, Dim):
+            raise TypeError(f"Transformer FeedForwardGated: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")
+
+        if activation is NotSpecified:
+            activation = rf.swish
+        elif isinstance(activation, dict):
+            activation = rf.build_from_dict(activation)
+        elif not callable(activation):
+            raise TypeError(f"{self}: unexpected activation type {activation!r}")
+
+        self.out_dim = out_dim
+        self.dropout = dropout
+        self.dropout_broadcast = rf.dropout_broadcast_default()
+        self.activation = activation
+
+        # Factor 2 because we concatenate the two paths (gate and value).
+        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim)
+        self.linear_out = rf.Linear(ff_dim, out_dim)
+
+    def __call__(self, inp: Tensor) -> Tensor:
+        """forward"""
+        x_ff1 = self.linear_ff(inp)
+        x_ff1a, x_ff1b = rf.split(x_ff1, axis=self.linear_ff.out_dim, out_dims=[self.linear_out.in_dim] * 2)
+        x_act = self.activation(x_ff1a) * x_ff1b
+        x_drop = rf.dropout(x_act, self.dropout, axis=self.dropout_broadcast and self.linear_out.in_dim)
+        x_ff2 = self.linear_out(x_drop)
+        return x_ff2
+
+
+def _make_norm(norm: Union[type, Dict[str, Any], rf.Module, Callable], out_dim: Dim) -> Union[rf.Module, Callable]:
+    if isinstance(norm, type):
+        norm = norm(out_dim)
+    elif isinstance(norm, dict):
+        norm = rf.build_from_dict(norm, out_dim)
+    elif isinstance(norm, rf.Module):
+        norm = _copy.deepcopy(norm)
+    if not callable(norm):
+        raise TypeError(f"unexpected norm type {norm!r}")
+    return norm
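
Taken together, the new `ff` and `norm` hooks plus `RotaryPosCausalSelfAttention` allow configuring a Llama-style decoder without subclassing. A sketch, under the assumptions that `FeedForwardGated` is importable from this module and that `rf.RMSNorm` exists (cf. the normalization.py changes in this release)::

    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import TransformerDecoder, FeedForwardGated

    vocab_dim = Dim(32_000, name="vocab")  # illustrative vocab size
    decoder = TransformerDecoder(
        vocab_dim,
        model_dim=512,
        num_layers=6,
        ff=FeedForwardGated,  # gated (SwiGLU) FF block instead of the default FeedForward
        norm=rf.RMSNorm,  # norm class, instantiated once per block via _make_norm
        decoder_layer_opts=dict(self_att=rf.RotaryPosCausalSelfAttention),
    )

On the default `ff_dim` of the gated block: it has three weight matrices of size ff_dim x model_dim (two in `linear_ff`, one in `linear_out`), so with ff_dim = (2/3) * 4 * d the parameter count matches the ungated block: 3 * (8/3) * d^2 = 2 * 4 * d^2 = 8 * d^2.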
@@ -15,7 +15,7 @@ class Linear(rf.Module):
     Linear transformation.
     """
 
-    def __init__(self, in_dim: Dim, out_dim: Dim, *, with_bias=True):
+    def __init__(self, in_dim: Dim, out_dim: Dim, *, with_bias: bool = True):
         super().__init__()
         assert isinstance(in_dim, Dim) and isinstance(out_dim, Dim)
         self.in_dim = in_dim