returnn 1.20240730.135048.tar.gz → 1.20240731.50408.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn might be problematic.
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/PKG-INFO +1 -1
- returnn-1.20240731.50408/_setup_info_generated.py +2 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/basic.py +1 -0
- returnn-1.20240731.50408/returnn/datasets/postprocessing.py +212 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/attention.py +69 -5
- returnn-1.20240731.50408/returnn/frontend/conversions/__init__.py +3 -0
- returnn-1.20240731.50408/returnn/frontend/conversions/hf_llama.py +56 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/decoder/transformer.py +104 -11
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/linear.py +1 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/normalization.py +41 -5
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/debug.py +188 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/SOURCES.txt +3 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm-inspection-profile.xml +2 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +2 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Dataset.py +52 -1
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_attention.py +239 -0
- returnn-1.20240730.135048/_setup_info_generated.py +0 -2
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.editorconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.gitignore +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.gitmodules +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/.kateconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CHANGELOG.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CODEOWNERS +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/CONTRIBUTING.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/LICENSE +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/MANIFEST.in +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/README.rst +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-fwd.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rf.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-torch.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/demo.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/pyproject.toml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/requirements.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__main__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/__setup__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/config.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/map.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/engine/batch.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/forward_iface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/const.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/container.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/device.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/init.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/module.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/state.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/types.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/common.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/git.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/import_/import_.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/log.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/native_op.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/native_op.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/pretrain.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/control.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/compat.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/engine.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/network.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/updater.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/engine.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/updater.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/__init__.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/basic.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/bpe.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/file_cache.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/fsa.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/math.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/pprint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/task_system.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/rnn.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/setup.cfg +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/setup.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/lint_common.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/pylint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/rf_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/spelling.dic +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Config.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Fsa.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Log.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_ResNet.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_Util.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_demos.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_array.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_base.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_const.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_container.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_math.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_tensor.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_tools.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/test_torch_util.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tests/torch_utils.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/collect-words.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/compile_native_op.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-forward.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-network-json.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/dump-pickle.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/hdf_dump.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240730.135048 → returnn-1.20240731.50408}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
returnn/datasets/basic.py

@@ -1388,6 +1388,7 @@ def get_dataset_class(name: Union[str, Type[Dataset]]) -> Optional[Type[Dataset]]
         "map",
         "multi_proc",
         "distrib_files",
+        "postprocessing",
     ]
     for mod_name in mod_names:
         mod = import_module("returnn.datasets.%s" % mod_name)
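The hunk above registers the new "postprocessing" module in the list of dataset modules that get_dataset_class() scans, so the usual class-name lookup used by RETURNN configs can resolve the new dataset. A minimal sketch of that lookup, assuming a RETURNN checkout that already contains this change:

    from returnn.datasets.basic import get_dataset_class

    # get_dataset_class() imports returnn.datasets.<mod> for every registered module
    # and matches the class by name, so the new dataset is now discoverable.
    cls = get_dataset_class("PostprocessingDataset")
    assert cls is not None and cls.__name__ == "PostprocessingDataset"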
returnn/datasets/postprocessing.py

@@ -0,0 +1,212 @@
+"""
+Provides :class:`PostprocessingDataset`.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union
+
+from returnn.datasets.basic import DatasetSeq
+from returnn.datasets.util.vocabulary import Vocabulary
+from returnn.tensor import Tensor, TensorDict
+from returnn.tensor.dim import Dim
+from .basic import init_dataset
+from .cached2 import CachedDataset2
+
+__all__ = ["PostprocessingDataset"]
+
+
+class PostprocessingDataset(CachedDataset2):
+    """
+    A dataset that allows for generic post-processing of data from another dataset
+    using a function on the segment level and on the level of multiple segments via
+    an iterator.
+
+    This allows integrating various data augmentation techniques like e.g. Mixup,
+    SpecAugment or speed perturbation into the data loading pipeline.
+
+    The integration into the data loading pipeline makes it easy to distribute the
+    data processing work across multiple CPU cores using `MultiProcDataset` and in
+    turn frees the GPU from data preprocessing tasks.
+
+    Example usage::
+
+        from returnn.tensor.dim import Dim, DimTypes
+
+        time_dim = Dim(None, kind=DimTypes.Spatial)
+        new_data_dim = Dim(128)
+
+        train = {
+            "class": "PostprocessingDataset",
+            "dataset": {
+                "class": "HDFDataset",
+                "files": ["/path/to/data.hdf"],
+            },
+            # one of them, but not both:
+            "map_seq": map_seq,  # (data: TensorDict) -> TensorDict
+            "map_seq_stream": map_seqs,  # (iter: Iterator[TensorDict]) -> Iterator[TensorDict]
+            # only required when data shapes change wrt. the wrapped dataset:
+            "map_outputs": {
+                "data": {"dims": [time_dim, new_data_dim]},
+            },
+        }
+    """
+
+    def __init__(
+        self,
+        dataset: Dict[str, Any],
+        map_seq: Optional[Union[Callable[[TensorDict], TensorDict]]] = None,
+        map_seq_stream: Optional[Callable[[Iterator[TensorDict]], Iterator[TensorDict]]] = None,
+        map_outputs: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        """
+        :param dataset: inner dataset to be post-processed
+        :param map_seq: post processor function operating on the single-segment level.
+            To avoid confusion on the order of how the processing functions are applied to the data, only one of
+            `map_seq` and `map_seq_stream` can be specified at a time.
+        :param map_seq_stream: post processor function operating on the multiple segment level via an iterator.
+            Allows merging multiple segments into one, or generating multiple output segments from one input segment.
+            To avoid confusion on the order of how the processing functions are applied to the data, only one of
+            `map_seq` and `map_seq_stream` can be specified at a time.
+        :param map_outputs: Type and axis specification of the outputs of the mapping functions,
+            like extern_data and model_outputs.
+            To simplify the common case when no shapes change, this value can be left unspecified. The dataset then
+            assumes the same data layout as returned by the wrapped dataset.
+            Example: `map_outputs={"data": {"dim": 42}}`
+        :param kwargs: see :class:`CachedDataset2`, :class:`Dataset`
+        """
+        super().__init__(**kwargs)
+
+        if self.seq_ordering != "default":
+            raise ValueError(f"{self}: specify seq_ordering in wrapped dataset, not in {self.__class__.__name__}")
+        if map_seq is None and map_seq_stream is None:
+            raise ValueError(f"{self}: need to either set map_seq or map_seq_stream")
+        if map_seq and map_seq_stream:
+            raise ValueError(f"{self}: cannot set both map_seq and map_seq_stream")
+
+        self._dataset_def = dataset
+        self._map_seq = map_seq
+        self._map_seq_stream = map_seq_stream
+        self._map_outputs = map_outputs
+
+        self._dataset = init_dataset(self._dataset_def, parent_dataset=self)
+        if self._map_seq_stream is None:
+            # if the stream mapper is set, the num_seqs may change and the estimation is less accurate
+            self._estimated_num_seqs = self._dataset.estimated_num_seqs
+        self._data_iter: Optional[Iterator[Tuple[int, TensorDict]]] = None
+
+        self._in_tensor_dict_template = TensorDict(
+            {name: self._make_tensor_template_from_input(name) for name in self._dataset.get_data_keys()}
+        )
+        if self._map_outputs is not None:
+            self._out_tensor_dict_template = TensorDict()
+            self._out_tensor_dict_template.update(self._map_outputs, auto_convert=True)
+        else:
+            self._out_tensor_dict_template = self._in_tensor_dict_template
+        self.num_outputs = {
+            k: (t.sparse_dim.size if t.sparse_dim else t.shape[-1] if len(t.shape) > 0 else 1, t.ndim)
+            for k, t in self._out_tensor_dict_template.data.items()
+        }
+        self._default_input = "data" if "data" in self.num_outputs else next(iter(self.num_outputs.keys()))
+        self.num_inputs = self.num_outputs[self._default_input][0]
+
+        self.labels = {}
+        for k, t in self._out_tensor_dict_template.data.items():
+            if t.vocab:
+                self.labels[k] = t.vocab.labels
+            elif t.sparse_dim:  # sparse_dim but not vocab
+                self.labels[k] = list(map(str, range(t.sparse_dim.dimension)))  # dummy labels
+
+    def init_seq_order(
+        self, epoch: Optional[int] = None, seq_list: Optional[List[str]] = None, seq_order: Optional[List[int]] = None
+    ):
+        """
+        :param epoch:
+        :param seq_list:
+        :param seq_order:
+        :return: whether the order changed (True is always safe to return)
+        """
+        super().init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
+
+        if epoch is None and seq_list is None and seq_order is None:
+            self._num_seqs = 0
+            return True
+
+        assert self._dataset is not None
+        self._dataset.init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)
+        self._data_iter = enumerate(self._build_mapping_iter())
+        return True
+
+    def _collect_single_seq(self, seq_idx: int) -> Optional[DatasetSeq]:
+        while True:
+            try:
+                loaded_seq_idx, tensor_dict = next(self._data_iter)
+            except StopIteration:
+                return None
+            assert loaded_seq_idx <= seq_idx, "_collect_single_seq must be done monotonically"
+            if loaded_seq_idx != seq_idx:
+                continue
+            seq = DatasetSeq(features={k: t.raw_tensor for k, t in tensor_dict.data.items()}, seq_idx=seq_idx)
+            return seq
+
+    def _build_mapping_iter(self) -> Iterator[TensorDict]:
+        """
+        :return: an iterator applying both the segment level and across-segment transformations on the given dataset
+        """
+
+        def _validate_tensor_dict_iter(inner: Iterator[TensorDict]) -> Iterator[TensorDict]:
+            for t_dict in inner:
+                for data_key, out_t in self._out_tensor_dict_template.data.items():
+                    in_t = t_dict.data[data_key]
+                    assert (
+                        in_t.ndim == out_t.batch_ndim
+                        and in_t.dtype == out_t.dtype
+                        and all(d.dimension in (d_, None) for (d, d_) in zip(in_t.dims, out_t.shape))
+                    )
+                yield t_dict
+
+        data_iter = self._iterate_dataset()
+        if self._map_seq_stream is not None:
+            data_iter = self._map_seq_stream(data_iter)
+            assert isinstance(
+                data_iter, Iterator
+            ), f"map_seq_stream must produce an {Iterator.__name__}, but produced {type(data_iter).__name__}"
+        return _validate_tensor_dict_iter(data_iter)
+
+    def _iterate_dataset(self) -> Iterator[TensorDict]:
+        """
+        :return: generator providing data samples in the form of a TensorDict
+        """
+        data_keys = self._dataset.get_data_keys()
+
+        seq_index = 0
+        while self._dataset.is_less_than_num_seqs(seq_index):
+            self._dataset.load_seqs(seq_index, seq_index + 1)
+            tensor_dict = self._in_tensor_dict_template.copy_template()
+            for data_key in data_keys:
+                tensor_dict.data[data_key].raw_tensor = self._dataset.get_data(seq_index, data_key)
+            if self._map_seq is not None:
+                tensor_dict = self._map_seq(tensor_dict)
+                assert isinstance(
+                    tensor_dict, TensorDict
+                ), f"map_seq must produce a {TensorDict.__name__}, but produced {type(tensor_dict).__name__}"
+            yield tensor_dict
+            seq_index += 1
+
+    def _make_tensor_template_from_input(self, data_key: str) -> Tensor:
+        dtype = self._dataset.get_data_dtype(data_key)
+        if dtype == "string":
+            dims = []
+        else:
+            feature_dims = [
+                Dim(dimension=dim, name=f"{data_key}_dim{i + 1}")
+                for i, dim in enumerate(self._dataset.get_data_shape(data_key))
+            ]
+            dims = [Dim(dimension=None, name=f"{data_key}_frame"), *feature_dims]
+        sparse_dim = None
+        if self._dataset.is_data_sparse(data_key):
+            sparse_dim = Dim(dimension=self._dataset.get_data_dim(data_key), name=f"{data_key}_sparse")
+            if data_key in self._dataset.labels:
+                sparse_dim.vocab = Vocabulary.create_vocab_from_labels(self._dataset.labels[data_key])
+        return Tensor(data_key, dims=dims, dtype=dtype, sparse_dim=sparse_dim)
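For reference, a minimal map_seq post-processor could look as follows. This is a sketch only, assuming the wrapped dataset provides a dense float "data" stream; the function name and the scaling factor are illustrative and not part of this release:

    from returnn.tensor import TensorDict


    def map_seq(data: TensorDict) -> TensorDict:
        """Scale the "data" features of one segment in place and return the TensorDict."""
        feat = data.data["data"]  # RETURNN Tensor; raw_tensor holds the numpy array, e.g. [time, feature]
        feat.raw_tensor = (feat.raw_tensor * 0.9).astype(feat.raw_tensor.dtype)
        return data


    train = {
        "class": "PostprocessingDataset",
        "dataset": {"class": "HDFDataset", "files": ["/path/to/data.hdf"]},
        "map_seq": map_seq,
    }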
returnn/frontend/attention.py

@@ -2,7 +2,6 @@
 Attention
 """
 
-
 from __future__ import annotations
 from typing import Tuple, Union, Optional, Sequence
 import weakref

@@ -17,6 +16,7 @@ __all__ = [
     "SelfAttention",
     "CausalSelfAttention",
     "CausalSelfAttentionState",
+    "RotaryPosCausalSelfAttention",
     "RelPosSelfAttention",
    "RelPosCausalSelfAttention",
     "CrossAttention",
@@ -264,6 +264,69 @@ class CausalSelfAttentionState(rf.State):
         self.accum_axis = accum_axis
 
 
+class RotaryPosCausalSelfAttention(CausalSelfAttention):
+    """
+    Rotary positional encoding (RoPE)-based causal self attention
+    """
+
+    def __call__(
+        self,
+        source: Tensor,
+        axis: Dim,
+        *,
+        state: Optional[CausalSelfAttentionState] = None,
+    ) -> Tuple[Tensor, CausalSelfAttentionState]:
+        """forward"""
+        q, k, v = self.forward_qkv(source)
+        k, v, hist_dim, new_state = _causal_self_att_step(k, v, axis=axis, state=state, self=self)
+
+        # Apply RoPE using sinusoidal positional encoding.
+        # Note: base is a bit different in rf.sinusoidal_positional_encoding (like the original)
+        # vs how it's commonly used for RoPE.
+        # log(base) / (dim / 2 - 1) = log(10_000) * 2 / dim
+        # <=> log(base) = log(10_000) * (dim / 2 - 1) * 2 / dim = log(10_000) * (1 - 2 / dim)
+        # <=> base = 10_000 ** (1 - 2 / dim)
+        pos_enc = rf.sinusoidal_positional_encoding(
+            spatial_dim=hist_dim,
+            feat_dim=self.key_dim_per_head,
+            base=10_000 ** (1 - 2 / self.key_dim_per_head.dimension),
+        )  # [T,D]
+        q = _apply_rope(
+            q,
+            (
+                rf.gather(pos_enc, axis=hist_dim, indices=hist_dim.dyn_size_ext - 1)
+                if axis == single_step_dim
+                else rf.replace_dim(pos_enc, in_dim=hist_dim, out_dim=axis)[0]
+            ),
+            self.key_dim_per_head,
+        )
+        k = _apply_rope(k, pos_enc, self.key_dim_per_head)
+
+        output = self.attention(q, k, v, kv_axis=hist_dim)
+        return output, new_state
+
+
+def _apply_rope(x: Tensor, pos_enc: Tensor, feat_dim: Dim) -> Tensor:
+    """
+    :param x: [...,T,D] or [...,D]
+    :param pos_enc: [T,D] or [D]
+    :param feat_dim: D
+    :return: [...,T,D] or [...,D]
+    """
+    feat_half_dim = feat_dim.div_left(2)
+    pe_imag, pe_real = rf.split(pos_enc, axis=feat_dim, out_dims=[feat_half_dim] * 2)  # [T,D/2]
+    # pe_imag = sin, pe_real = cos
+    d2 = Dim(2, name="complex")
+    x = rf.split_dims(x, axis=feat_dim, dims=(feat_half_dim, d2))  # [...,T,D/2,2]
+    x_real = rf.gather(x, indices=0, axis=d2)
+    x_imag = rf.gather(x, indices=1, axis=d2)
+    x_real_ = x_real * pe_real - x_imag * pe_imag
+    x_imag_ = x_real * pe_imag + x_imag * pe_real
+    x_, _ = rf.stack((x_real_, x_imag_), out_dim=d2)  # [...,T,D/2,2]
+    x_, _ = rf.merge_dims(x_, dims=(feat_half_dim, d2), out_dim=feat_dim)  # [...,T,D]
+    return x_
+
+
 class RelPosSelfAttention(SelfAttentionBase):
     """
     Self-attention with relative positional encoding.

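_apply_rope above treats each pair of feature channels as one complex number and multiplies it by a unit complex number whose angle comes from the sinusoidal encoding (sin in the first half of the feature dim, cos in the second half). A standalone sketch of the same arithmetic in plain PyTorch, with names, shapes and frequencies chosen here for illustration rather than taken from RETURNN:

import torch


def apply_rope_pairs(x: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
    """x: [T, D] with D even; sin/cos: [T, D/2]. Adjacent pairs (x[2i], x[2i+1]) act as (real, imag)."""
    x = x.reshape(*x.shape[:-1], -1, 2)  # [T, D/2, 2]
    real, imag = x[..., 0], x[..., 1]
    rot_real = real * cos - imag * sin  # same rotation as x_real_/x_imag_ above
    rot_imag = real * sin + imag * cos
    return torch.stack((rot_real, rot_imag), dim=-1).flatten(-2)  # back to [T, D]


T, D = 5, 8
pos = torch.arange(T, dtype=torch.float32)[:, None]
inv_freq = 10_000.0 ** (-torch.arange(0, D, 2, dtype=torch.float32) / D)  # common RoPE frequencies
angles = pos * inv_freq  # [T, D/2]
out = apply_rope_pairs(torch.randn(T, D), angles.sin(), angles.cos())
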
@@ -836,7 +899,7 @@ def relative_positional_encoding(
     return emb, out_spatial_dim
 
 
-
+_sinusoidal_positional_encoding_cache = weakref.WeakKeyDictionary()  # run ctx -> (spatial_dim, feat_dim) -> enc
 
 
 def sinusoidal_positional_encoding(

@@ -844,6 +907,7 @@ def sinusoidal_positional_encoding(
     spatial_dim: Dim,
     feat_dim: Dim,
     offset: Optional[Union[int, Tensor]] = None,
+    base: Union[int, float] = 1e4,
     dtype: Optional[str] = None,
     device: Optional[str] = None,
 ) -> Tensor:

@@ -867,8 +931,8 @@ def sinusoidal_positional_encoding(
         dtype = rf.get_default_float_dtype()
     if not device:
         device = rf.get_default_device()
-    cache =
-    cache_key = (spatial_dim, feat_dim, offset, dtype, device)
+    cache = _sinusoidal_positional_encoding_cache.setdefault(rf.get_run_ctx(), {})
+    cache_key = (spatial_dim, feat_dim, offset, base, dtype, device)
    if cache_key in cache:
        return cache[cache_key]
    import math

@@ -886,7 +950,7 @@ def sinusoidal_positional_encoding(
 
     feat2_dim = feat_dim.div_left(2)
     div_term = rf.exp(
-        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(
+        rf.range_over_dim(feat2_dim, dtype=dtype, device=device) * -(math.log(base) / (feat2_dim.dimension - 1))
     )
     arg_sin = rf.combine_bc(rf.cast(indices, dtype), "*", div_term)
     arg_cos = arg_sin + math.pi / 2.0

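The new base argument exists so that callers can recover the standard RoPE frequencies 10000^(-2i/d) from an encoding that divides log(base) by d/2 - 1; that is exactly why RotaryPosCausalSelfAttention above passes base = 10_000 ** (1 - 2 / d). A quick numerical check of that identity (standalone arithmetic, not RETURNN code):

import math

d = 64  # per-head feature dim, chosen for illustration
base = 10_000 ** (1 - 2 / d)
for i in range(d // 2):
    freq_enc = math.exp(-i * math.log(base) / (d // 2 - 1))  # what the encoding computes
    freq_rope = 10_000 ** (-2 * i / d)  # the commonly used RoPE frequency
    assert math.isclose(freq_enc, freq_rope, rel_tol=1e-9)
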
returnn-1.20240731.50408/returnn/frontend/conversions/hf_llama.py
ADDED

@@ -0,0 +1,56 @@
+"""
+Import the parameters from the HuggingFace Llama model.
+"""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING
+import returnn.frontend as rf
+
+if TYPE_CHECKING:
+    from transformers.models.llama.modeling_llama import LlamaAttention
+
+
+def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_rf: rf.RotaryPosCausalSelfAttention):
+    """
+    Import the parameters from the HF attention module.
+    """
+    import torch
+
+    assert model_hf.num_heads == model_rf.num_heads.dimension
+    assert model_hf.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.hidden_size
+    nh = model_hf.num_heads
+    hdim = dim // nh
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    q = model_hf.q_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    k = model_hf.k_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    v = model_hf.v_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    q = q.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    k = k.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+    qkv = qkv.reshape(dim, 3 * dim)
+    assert model_hf.q_proj.bias is None  # not implemented
+    with torch.no_grad():
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.proj.weight.raw_tensor.copy_(model_hf.o_proj.weight.T)

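The "reorder complex numbers" step converts each head's feature layout from HF Llama's two-halves convention (all first components, then all second components) into the interleaved (real, imag) pairs that _apply_rope expects. A toy illustration of that permutation in plain PyTorch, with sizes invented for this example:

import torch

hdim = 8
w = torch.arange(hdim)            # stands in for one row of q_proj/k_proj weights within one head
halves = w.reshape(2, hdim // 2)  # HF layout: [x0..x3 | x4..x7] as two halves
pairs = halves.transpose(-1, -2).flatten(-2)
print(pairs)  # tensor([0, 4, 1, 5, 2, 6, 3, 7]) -> pairs (x0, x4), (x1, x5), ... as (real, imag)
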
{returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/decoder/transformer.py
RENAMED

@@ -33,11 +33,13 @@ class TransformerDecoder(rf.Module):
         model_dim: Union[Dim, int] = Dim(512, name="transformer-dec-default-model-dim"),
         *,
         num_layers: int,
+        ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Union[Dim, int] = NotSpecified,
-        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] =
+        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
         att_dropout: float = 0.1,
+        norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
         decoder_layer: Optional[Union[TransformerDecoderLayer, rf.Module, type, Any]] = None,
         decoder_layer_opts: Optional[Dict[str, Any]] = None,
         embed_dim: Optional[Dim] = None,

@@ -52,11 +54,13 @@ class TransformerDecoder(rf.Module):
         :param vocab_dim:
         :param model_dim: the output feature dimension
         :param num_layers: the number of encoder layers
+        :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
         :param dropout: the dropout value for the FF block
         :param num_heads: the number of attention heads
         :param att_dropout: attention dropout value
+        :param norm: pre-normalization for FF and attention blocks
         :param decoder_layer: an instance of :class:`TransformerDecoderLayer` or similar
         :param decoder_layer_opts: options for the encoder layer
         :param embed_dim: if given, will first have an embedding [vocab,embed] and then a linear [embed,model].

@@ -123,11 +127,13 @@ class TransformerDecoder(rf.Module):
             decoder_layer_opts_ = dict(
                 encoder_dim=encoder_dim,
                 out_dim=model_dim,
+                ff=ff,
                 ff_dim=ff_dim,
                 ff_activation=ff_activation,
                 dropout=dropout,
                 num_heads=num_heads,
                 att_dropout=att_dropout,
+                norm=norm,
             )
             if decoder_layer_opts:
                 decoder_layer_opts_.update(decoder_layer_opts)

@@ -140,7 +146,7 @@ class TransformerDecoder(rf.Module):
 
         self.layers = sequential(_copy.deepcopy(decoder_layer) for _ in range(num_layers))
 
-        self.final_layer_norm =
+        self.final_layer_norm = _make_norm(norm, model_dim)
 
         self.logits = rf.Linear(model_dim, vocab_dim, with_bias=logits_with_bias)
 

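Together with RotaryPosCausalSelfAttention above, the new ff and norm arguments allow a Llama-style decoder to be assembled purely through constructor options. A hedged configuration sketch: the dimension sizes are invented, the availability of rf.RMSNorm under that name is an assumption, and only FeedForwardGated and RotaryPosCausalSelfAttention are confirmed by this diff.

import returnn.frontend as rf
from returnn.tensor import Dim
from returnn.frontend.decoder.transformer import TransformerDecoder, FeedForwardGated

vocab_dim = Dim(32_000, name="vocab")
model_dim = Dim(512, name="model")

decoder = TransformerDecoder(
    vocab_dim=vocab_dim,
    model_dim=model_dim,
    num_layers=6,
    ff=FeedForwardGated,  # SwiGLU-style feed-forward block
    norm=rf.RMSNorm,  # assumed to exist in the RF frontend; used as pre-norm for FF and attention
    decoder_layer_opts=dict(self_att=rf.RotaryPosCausalSelfAttention),
)
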
@@ -217,17 +223,20 @@ class TransformerDecoderLayer(rf.Module):
         encoder_dim: Optional[Dim],
         out_dim: Dim = Dim(512, name="transformer-dec-default-out-dim"),
         *,
+        ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Union[Dim, int] = NotSpecified,
-        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] =
+        ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
         self_att: Optional[Union[rf.CausalSelfAttention, rf.RelPosCausalSelfAttention, rf.Module, type, Any]] = None,
         self_att_opts: Optional[Dict[str, Any]] = None,
         att_dropout: float = 0.1,
+        norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
     ):
         """
         :param encoder_dim: for cross-attention. None if no cross-attention.
         :param out_dim: the output feature dimension
+        :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
         :param dropout: the dropout value for the FF block

@@ -235,6 +244,7 @@ class TransformerDecoderLayer(rf.Module):
         :param self_att: the self-attention layer. RelPosSelfAttention originally and default
         :param self_att_opts: options for the self-attention layer, for :class:`nn.RelPosSelfAttention`
         :param att_dropout: attention dropout value
+        :param norm: pre-normalization for FF and attention blocks
         """
         super().__init__()
 

@@ -243,8 +253,23 @@ class TransformerDecoderLayer(rf.Module):
         self.dropout_broadcast = rf.dropout_broadcast_default()
         self.out_dim = out_dim
 
-
-
+        if ff is NotSpecified:
+            ff = FeedForward
+        if isinstance(ff, rf.Module):
+            ff = _copy.deepcopy(ff)
+        else:
+            ff_kwargs = dict(out_dim=out_dim, ff_dim=ff_dim, dropout=dropout, activation=ff_activation)
+            ff_kwargs = {k: v for (k, v) in ff_kwargs.items() if v is not NotSpecified}
+            if isinstance(ff, type):
+                ff = ff(**ff_kwargs)
+            elif isinstance(ff, dict):
+                ff = rf.build_from_dict(ff, **ff_kwargs)
+            else:
+                raise TypeError(f"unexpected ff type {ff!r}")
+        assert isinstance(ff, rf.Module)
+
+        self.ff = ff
+        self.ff_layer_norm = _make_norm(norm, out_dim)
 
         if self_att is None or isinstance(self_att, type):
             self_att_opts_ = dict(

@@ -263,7 +288,7 @@ class TransformerDecoderLayer(rf.Module):
                 self.self_att = self_att(**self_att_opts_)
         else:
             self.self_att = self_att
-        self.self_att_layer_norm =
+        self.self_att_layer_norm = _make_norm(norm, out_dim)
 
         self.cross_att = None
         self.cross_att_layer_norm = None

@@ -277,7 +302,7 @@ class TransformerDecoderLayer(rf.Module):
                 num_heads=num_heads,
                 att_dropout=att_dropout,
             )
-            self.cross_att_layer_norm =
+            self.cross_att_layer_norm = _make_norm(norm, out_dim)
 
     def default_initial_state(self, *, batch_dims: Sequence[Dim]) -> rf.State:
         """default initial state"""

@@ -326,14 +351,14 @@ class FeedForward(rf.Module):
         out_dim: Dim,
         *,
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
-        dropout: float,
-        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module],
+        dropout: float = 0.1,
+        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
     ):
         """
         :param out_dim: output feature dimension
         :param ff_dim: dimension of the feed-forward layers
         :param dropout: dropout value
-        :param activation: activation function
+        :param activation: activation function, relu by default
         """
         super().__init__()
 

@@ -344,7 +369,9 @@ class FeedForward(rf.Module):
         if not isinstance(ff_dim, Dim):
             raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")
 
-        if
+        if activation is NotSpecified:
+            activation = rf.relu
+        elif isinstance(activation, dict):
             activation = rf.build_from_dict(activation)
         elif not callable(activation):
             raise TypeError(f"{self}: unexpected activation type {activation!r}")

@@ -364,3 +391,69 @@ class FeedForward(rf.Module):
         x_drop = rf.dropout(x_act, self.dropout, axis=self.dropout_broadcast and self.linear_ff.out_dim)
         x_ff2 = self.linear_out(x_drop)
         return x_ff2
+
+
+class FeedForwardGated(rf.Module):
+    """
+    E.g. with f=swish=silu:
+    SwiGLU, from `GLU Variants Improve Transformer <https://arxiv.org/abs/2002.05202>`__::
+
+        f(Linear(x)) * Linear(x)
+
+    This is a feed-forward block based on SwiGLU, as defined in the paper.
+    """
+
+    def __init__(
+        self,
+        out_dim: Dim,
+        *,
+        ff_dim: Optional[Union[Dim, int]] = NotSpecified,
+        dropout: float = 0.1,
+        activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.swish,
+    ):
+        super().__init__()
+
+        if isinstance(ff_dim, int):
+            ff_dim = Dim(ff_dim, name="transformer-ff-dim")
+        if ff_dim is NotSpecified or ff_dim is None:
+            # Factor 2/3 to keep same number of parameters as in the original FF block, just as in the paper.
+            ff_dim = out_dim * 2 // 3
+        if not isinstance(ff_dim, Dim):
+            raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")
+
+        if activation is NotSpecified:
+            activation = rf.swish
+        elif isinstance(activation, dict):
+            activation = rf.build_from_dict(activation)
+        elif not callable(activation):
+            raise TypeError(f"{self}: unexpected activation type {activation!r}")
+
+        self.out_dim = out_dim
+        self.dropout = dropout
+        self.dropout_broadcast = rf.dropout_broadcast_default()
+        self.activation = activation
+
+        # Factor 2 because we concatenate the two paths.
+        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim)
+        self.linear_out = rf.Linear(ff_dim, out_dim)
+
+    def __call__(self, inp: Tensor) -> Tensor:
+        """forward"""
+        x_ff1 = self.linear_ff(inp)
+        x_ff1a, x_ff1b = rf.split(x_ff1, axis=self.linear_ff.out_dim, out_dims=[self.linear_out.in_dim] * 2)
+        x_act = self.activation(x_ff1a) * x_ff1b
+        x_drop = rf.dropout(x_act, self.dropout, axis=self.dropout_broadcast and self.linear_out.in_dim)
+        x_ff2 = self.linear_out(x_drop)
+        return x_ff2
+
+
+def _make_norm(norm: Union[type, Dict[str, Any], rf.Module, Callable], out_dim: Dim) -> Union[rf.Module, Callable]:
+    if isinstance(norm, type):
+        norm = norm(out_dim)
+    elif isinstance(norm, dict):
+        norm = rf.build_from_dict(norm, out_dim)
+    elif isinstance(norm, rf.Module):
+        norm = _copy.deepcopy(norm)
+    if not callable(norm):
+        raise TypeError(f"unexpected norm type {norm!r}")
+    return norm

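The 2/3 factor mentioned in FeedForwardGated's default ff_dim comment follows the bookkeeping from the GLU-variants paper: a gated block holds three weight matrices of size d x h instead of two, so shrinking the hidden size h by 2/3 keeps the total parameter count roughly unchanged. A quick standalone check of that arithmetic under the paper's convention (plain Python, not RETURNN code):

d = 512  # model dim, for illustration
h_plain = 4 * d  # the usual FF hidden size
h_gated = (2 * h_plain) // 3

params_plain = 2 * d * h_plain  # W_in and W_out
params_gated = 3 * d * h_gated  # W_in and W_gate (concatenated in linear_ff) plus W_out
print(params_plain, params_gated)  # 2097152 2096640 -> nearly identical
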
{returnn-1.20240730.135048 → returnn-1.20240731.50408}/returnn/frontend/linear.py
RENAMED

@@ -15,7 +15,7 @@ class Linear(rf.Module):
     Linear transformation.
     """
 
-    def __init__(self, in_dim: Dim, out_dim: Dim, *, with_bias=True):
+    def __init__(self, in_dim: Dim, out_dim: Dim, *, with_bias: bool = True):
         super().__init__()
         assert isinstance(in_dim, Dim) and isinstance(out_dim, Dim)
         self.in_dim = in_dim