returnn 1.20240802.134933__tar.gz → 1.20240824.1611__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of returnn might be problematic.
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/PKG-INFO +1 -1
- returnn-1.20240824.1611/_setup_info_generated.py +2 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_backend.py +9 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/array_.py +4 -0
- returnn-1.20240824.1611/returnn/frontend/conversions/hf_llama.py +246 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/decoder/transformer.py +60 -15
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/math_.py +9 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/_backend.py +5 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/_backend.py +11 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/updater.py +1 -1
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/debug.py +57 -27
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/file_cache.py +85 -21
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/math.py +5 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn.egg-info/SOURCES.txt +1 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Util.py +8 -3
- returnn-1.20240824.1611/tests/test_rf_decoder_transformer.py +163 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_math.py +16 -0
- returnn-1.20240802.134933/_setup_info_generated.py +0 -2
- returnn-1.20240802.134933/returnn/frontend/conversions/hf_llama.py +0 -56
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/.editorconfig +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/.gitignore +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/.gitmodules +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/.kateconfig +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/CHANGELOG.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/CODEOWNERS +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/CONTRIBUTING.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/LICENSE +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/MANIFEST.in +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/README.rst +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-fwd.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-rf.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-torch.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/demo.sh +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/pyproject.toml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/requirements.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/__main__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/__setup__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/config.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/basic.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/map.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/postprocessing.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/engine/base.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/engine/batch.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/forward_iface.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/attention.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/const.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/container.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/conversions/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/device.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/init.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/linear.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/module.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/state.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/types.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/import_/common.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/import_/git.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/import_/import_.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/log.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/native_op.cpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/native_op.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/pretrain.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/control.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/compat.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/engine.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/network.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/updater.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/engine.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/__init__.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/basic.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/bpe.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/fsa.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/pprint.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/task_system.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/rnn.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/setup.cfg +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/setup.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/lint_common.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/pylint.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/rf_utils.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/spelling.dic +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Config.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Dataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Fsa.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Log.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_ResNet.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_demos.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_array.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_attention.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_base.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_const.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_container.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_tensor.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_tools.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/test_torch_util.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tests/torch_utils.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/collect-words.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/compile_native_op.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-dataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-forward.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-network-json.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/dump-pickle.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/hdf_dump.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/_backend.py
RENAMED

@@ -1014,6 +1014,15 @@ class Backend(Generic[T]):
             return start + weight * (end - start)
         return rf.combine_bc(start, "+", rf.combine_bc(weight, "*", rf.combine_bc(end, "-", start)))
 
+    @staticmethod
+    def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
+        """
+        :param source:
+        :param spatial_dim:
+        :return: cumsum over spatial dim
+        """
+        raise NotImplementedError
+
     @staticmethod
     def matmul(a: Tensor[T], b: Tensor[T], *, reduce: Union[Dim, Sequence[Dim]], use_mask: bool = True) -> Tensor[T]:
         """
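The hunk above only declares the abstract backend hook. Per the file list, returnn/torch/frontend/_backend.py (+11) and returnn/frontend/math_.py (+9) presumably add the PyTorch implementation and the user-facing `rf.cumsum` wrapper, which are not shown in this excerpt. The following is a minimal hedged sketch of what a concrete PyTorch backend override could look like, assuming the existing RETURNN Tensor helpers (`copy_template`, `get_axis_from_description`, `raw_tensor`); it is not the code from this release.

```python
# Hedged sketch only -- not taken from this diff. The real TorchBackend.cumsum may differ.
import torch

from returnn.tensor import Tensor, Dim


def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
    """Would be a @staticmethod on the PyTorch backend class."""
    out = source.copy_template(name="cumsum")             # same dims/dtype as the input
    axis = source.get_axis_from_description(spatial_dim)  # map the Dim to a raw axis index
    out.raw_tensor = torch.cumsum(source.raw_tensor, dim=axis)
    return out
```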
- {returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/array_.py
RENAMED

@@ -520,9 +520,12 @@ def masked_select(
     tensor: Tensor, *, mask: Tensor, dims: Sequence[Dim], out_dim: Optional[Dim] = None
 ) -> Tuple[Tensor, Dim]:
     """
+    This will pack the tensor based on the mask.
     In TF, this is ``boolean_mask``.
     The inverse of this is :func:`masked_scatter`.
 
+    Related: :func:`pack_padded`, which uses :func:`sequence_mask` as the mask.
+
     :param tensor:
     :param mask:
     :param dims: the order of the dims defines the format. those dims should be exactly the dims of the mask.

@@ -553,6 +556,7 @@ def sequence_mask(dims: Union[Dim, Sequence[Dim]], *, device: Optional[str] = None
     """
     :param dims:
     :param device:
+    :return: mask based on the sequence lengths
     """
     if isinstance(dims, Dim):
         dims = [dims]
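The added docstring lines relate `masked_select` to `pack_padded` and `sequence_mask`. A small usage sketch follows (not from the diff; the dim names and the 0.5 threshold are made up for illustration, and it assumes the PyTorch backend is selected):

```python
import returnn.frontend as rf
from returnn.tensor import Dim

rf.select_backend_torch()  # assumption: run with the PyTorch backend

batch_dim = Dim(2, name="batch")
time_dim = Dim(5, name="time")
x = rf.random_uniform([batch_dim, time_dim])
mask = rf.compare(x, ">", 0.5)  # bool tensor over [batch, time]

# Pack all entries where mask is True into one new dim (TF: boolean_mask).
packed, packed_dim = rf.masked_select(x, mask=mask, dims=[batch_dim, time_dim])
# packed has the single dim packed_dim; rf.masked_scatter is the inverse.
# With a dynamic time dim, rf.sequence_mask([batch_dim, time_dim]) yields the padding
# mask, which is what rf.pack_padded uses for exactly this kind of packing.
```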
- returnn-1.20240824.1611/returnn/frontend/conversions/hf_llama.py
ADDED

@@ -0,0 +1,246 @@
+"""
+Import the parameters from the HuggingFace Llama model (PyTorch).
+"""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING, Union
+import returnn.frontend as rf
+from returnn.frontend.decoder.transformer import TransformerDecoder, TransformerDecoderLayer, FeedForwardGated
+
+if TYPE_CHECKING:
+    from transformers.models.llama.modeling_llama import (
+        LlamaModel,
+        LlamaForCausalLM,
+        LlamaDecoderLayer,
+        LlamaMLP,
+        LlamaRMSNorm,
+        LlamaAttention,
+    )
+
+
+def import_params_hf_llama_to_rf_transformer_decoder(
+    model_hf: Union[LlamaModel, LlamaForCausalLM], model_rf: TransformerDecoder
+):
+    """
+    Import params from HF Llama model to RF :class:`TransformerDecoder`.
+    """
+    import torch
+    from transformers.models.llama.modeling_llama import LlamaModel, LlamaForCausalLM, LlamaDecoderLayer
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    # Check if the number of parameters is the same below.
+    # First import individual sub modules.
+    # We might detect any mismatches there, and this will ease the debugging.
+
+    lm_head = None
+    if isinstance(model_hf, LlamaForCausalLM):
+        lm_head = model_hf.lm_head
+        model_hf = model_hf.model
+    else:
+        # Exclude logits.
+        num_params_rf -= model_rf.logits.weight.num_elements()
+    assert isinstance(model_hf, LlamaModel)
+    assert model_hf.norm.weight.shape[0] == model_rf.model_dim.dimension
+
+    assert len(model_hf.layers) == len(model_rf.layers)
+    for i, (layer_hf, layer_rf) in enumerate(zip(model_hf.layers, model_rf.layers)):
+        assert isinstance(layer_hf, LlamaDecoderLayer)
+        assert isinstance(layer_rf, TransformerDecoderLayer)
+        import_params_hf_llama_decoder_layer_to_rf(layer_hf, layer_rf)
+
+    assert model_hf.embed_tokens.weight.shape == model_rf.input_embedding.weight.raw_tensor.shape
+    with torch.no_grad():
+        model_rf.input_embedding.weight.raw_tensor.copy_(model_hf.embed_tokens.weight)  # (vocab,hidden)
+
+    assert isinstance(model_rf.final_layer_norm, rf.RMSNorm)
+    import_params_hf_llama_rms_norm_to_rf(model_hf.norm, model_rf.final_layer_norm)
+
+    if lm_head is not None:
+        assert lm_head.bias is None and model_rf.logits.bias is None  # not implemented
+        # Torch Linear: (out,in), but RF has (in,out).
+        with torch.no_grad():
+            model_rf.logits.weight.raw_tensor.copy_(lm_head.weight.T)  # (hidden,vocab)
+
+    assert num_params_rf == num_params_hf, f"mismatch num params: RF {num_params_rf} != HF {num_params_hf}"
+
+
+def import_params_hf_llama_decoder_layer_to_rf(model_hf: LlamaDecoderLayer, model_rf: TransformerDecoderLayer):
+    """
+    Import the parameters from the HF Llama decoder layer.
+    """
+    import torch
+
+    assert model_hf.hidden_size == model_rf.out_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    # Check if the number of parameters is the same below.
+    # First import individual sub modules.
+    # We might detect any mismatches there, and this will ease the debugging.
+
+    assert isinstance(model_rf.ff, FeedForwardGated), f"unexpected: {model_rf.ff}"
+    import_params_hf_llama_mlp_to_rf_feed_forward_gated(model_hf.mlp, model_rf.ff)
+
+    assert isinstance(model_rf.self_att, rf.RotaryPosCausalSelfAttention), f"unexpected: {model_rf.self_att}"
+    import_params_hf_llama_att_to_rf_rotary_att(model_hf.self_attn, model_rf.self_att)
+
+    assert isinstance(model_rf.self_att_layer_norm, rf.RMSNorm), f"unexpected: {model_rf.self_att_layer_norm}"
+    import_params_hf_llama_rms_norm_to_rf(model_hf.input_layernorm, model_rf.self_att_layer_norm)
+
+    assert isinstance(model_rf.ff_layer_norm, rf.RMSNorm), f"unexpected: {model_rf.ff_layer_norm}"
+    import_params_hf_llama_rms_norm_to_rf(model_hf.post_attention_layernorm, model_rf.ff_layer_norm)
+
+    assert num_params_rf == num_params_hf
+
+
+def import_params_hf_llama_mlp_to_rf_feed_forward_gated(model_hf: LlamaMLP, model_rf: FeedForwardGated):
+    """
+    Import the parameters from the HF Llama MLP module.
+    """
+    import torch
+
+    assert model_hf.hidden_size == model_rf.out_dim.dimension == model_rf.linear_ff.in_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    w1 = model_hf.gate_proj.weight.T  # (in,out)
+    w2 = model_hf.up_proj.weight.T  # (in,out)
+    w3 = model_hf.down_proj.weight.T  # (out,in)
+    assert model_hf.gate_proj.bias is None  # not implemented
+    assert model_hf.up_proj.bias is None  # not implemented
+    assert model_hf.down_proj.bias is None  # not implemented
+    with torch.no_grad():
+        w = torch.cat((w1, w2), dim=1)  # (in,out*2)
+        model_rf.linear_ff.weight.raw_tensor.copy_(w)
+        model_rf.linear_out.weight.raw_tensor.copy_(w3)
+
+
+def import_params_hf_llama_rms_norm_to_rf(model_hf: LlamaRMSNorm, model_rf: rf.RMSNorm):
+    """
+    Import the parameters from the HF Llama RMSNorm module.
+    """
+    import torch
+
+    assert model_hf.weight.shape[0] == model_rf.in_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    w = model_hf.weight  # (in,)
+    with torch.no_grad():
+        model_rf.scale.raw_tensor.copy_(w)
+
+
+def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_rf: rf.RotaryPosCausalSelfAttention):
+    """
+    Import the parameters from the HF Llama attention module.
+    """
+    import torch
+
+    assert model_hf.num_heads == model_rf.num_heads.dimension
+    assert model_hf.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.hidden_size
+    nh = model_hf.num_heads
+    hdim = dim // nh
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf, f"num params RF {num_params_rf} != params HF {num_params_hf}"
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    q = model_hf.q_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    k = model_hf.k_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    v = model_hf.v_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    q = q.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    k = k.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+    qkv = qkv.reshape(dim, 3 * dim)
+    assert model_hf.q_proj.bias is None  # not implemented
+    with torch.no_grad():
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.proj.weight.raw_tensor.copy_(model_hf.o_proj.weight.T)
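The asserts above require an RF `TransformerDecoder` built with RMSNorm, `FeedForwardGated`, rotary causal self-attention, and no absolute positional encoding. A hedged usage sketch of driving the new import function follows; the tiny HF config values are dummies, and the `TransformerDecoder` keyword arguments (in particular `decoder_layer_opts`) are assumptions about how to obtain that layout — the new tests/test_rf_decoder_transformer.py is the authoritative reference, not this sketch.

```python
# Hedged sketch -- not copied from this release.
import returnn.frontend as rf
from returnn.tensor import Dim
from returnn.frontend.decoder.transformer import TransformerDecoder, FeedForwardGated
from returnn.frontend.conversions.hf_llama import import_params_hf_llama_to_rf_transformer_decoder
from transformers import LlamaConfig, LlamaForCausalLM

rf.select_backend_torch()  # assumption: PyTorch backend

cfg = LlamaConfig(vocab_size=100, hidden_size=64, intermediate_size=256, num_hidden_layers=2, num_attention_heads=4)
model_hf = LlamaForCausalLM(cfg)  # randomly initialized toy Llama

model_rf = TransformerDecoder(
    encoder_dim=None,  # decoder-only LM
    vocab_dim=Dim(cfg.vocab_size, name="vocab"),
    model_dim=Dim(cfg.hidden_size, name="model"),
    num_layers=cfg.num_hidden_layers,
    num_heads=cfg.num_attention_heads,
    ff=FeedForwardGated,
    ff_dim=Dim(cfg.intermediate_size, name="ff"),
    pos_enc=None,  # Llama uses rotary attention, no absolute positional encoding
    norm=rf.RMSNorm,
    decoder_layer_opts=dict(self_att=rf.RotaryPosCausalSelfAttention),  # assumed kwarg name; extra self_att_opts may be needed
    share_embedding=False,
    dropout=0.0,
    att_dropout=0.0,
)
import_params_hf_llama_to_rf_transformer_decoder(model_hf, model_rf)
```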
{returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/decoder/transformer.py
RENAMED
@@ -13,10 +13,12 @@ References:
 
 from __future__ import annotations
 from typing import Optional, Any, Union, Tuple, Dict, Callable, Sequence
+from types import FunctionType
 import functools
 import logging
 import copy as _copy
 from returnn.util.basic import NotSpecified, BehaviorVersion
+from returnn.util.math import ceil_div
 import returnn.frontend as rf
 from returnn.tensor import Tensor, Dim, single_step_dim
 
@@ -36,6 +38,7 @@ class TransformerDecoder(rf.Module):
         ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
         ff_dim: Union[Dim, int] = NotSpecified,
         ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
+        pos_enc: Union[None, Callable, Dict[str, Any], rf.Module] = rf.sinusoidal_positional_encoding,
         dropout: float = 0.1,
         num_heads: int = 8,
         att_dropout: float = 0.1,
@@ -57,6 +60,7 @@
         :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
+        :param pos_enc: positional encoding. Default is sinusoidal positional encoding.
         :param dropout: the dropout value for the FF block
         :param num_heads: the number of attention heads
         :param att_dropout: attention dropout value
@@ -92,10 +96,21 @@
         if embed_dim:
             self.input_embedding_proj = rf.Linear(embed_dim, model_dim, with_bias=False)
 
-
-
-
-
+        if pos_enc is None:
+            pass
+        elif isinstance(pos_enc, dict):
+            pos_enc = rf.build_from_dict(
+                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
+            )
+        elif isinstance(pos_enc, rf.Module):
+            pass
+        elif isinstance(pos_enc, FunctionType):
+            pos_enc = functools.partial(
+                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
+            )
+        else:
+            raise TypeError(f"unexpected pos_enc type {pos_enc!r}")
+        self.pos_enc = pos_enc
         if share_embedding is None:
             if BehaviorVersion.get() < 20:
                 logging.getLogger("returnn.frontend").warning(
@@ -189,7 +204,8 @@
         new_state = rf.State()
 
         decoded = self.input_embedding(source) * self.input_embedding_scale
-
+        if self.pos_enc is not None:
+            decoded = decoded + self.pos_enc(spatial_dim=spatial_dim, offset=state.pos)
         decoded = rf.dropout(decoded, self.input_dropout)
         if self.input_embedding_proj is not None:
             decoded = self.input_embedding_proj(decoded)
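A usage sketch of the new pos_enc option (dims and the surrounding constructor arguments are assumptions here, not taken from the diff): passing pos_enc=None disables the additive encoding, e.g. when positions are handled inside the self-attention; the default stays rf.sinusoidal_positional_encoding.

    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import TransformerDecoder

    vocab_dim = Dim(1000, name="vocab")
    decoder = TransformerDecoder(
        encoder_dim=None,  # assumed decoder-only / LM-style usage
        vocab_dim=vocab_dim,
        model_dim=Dim(512, name="model"),
        num_layers=6,
        pos_enc=None,  # no absolute positional encoding added to the embedding
    )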
@@ -228,7 +244,9 @@ class TransformerDecoderLayer(rf.Module):
         ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
-        self_att: Optional[
+        self_att: Optional[
+            Union[rf.CausalSelfAttention, rf.RelPosCausalSelfAttention, rf.Module, type, Dict[str, Any]]
+        ] = None,
         self_att_opts: Optional[Dict[str, Any]] = None,
         att_dropout: float = 0.1,
         norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
@@ -271,7 +289,7 @@
         self.ff = ff
         self.ff_layer_norm = _make_norm(norm, out_dim)
 
-        if self_att is None or isinstance(self_att, type):
+        if self_att is None or isinstance(self_att, type) or isinstance(self_att, dict):
             self_att_opts_ = dict(
                 in_dim=out_dim,
                 proj_dim=out_dim,
@@ -284,10 +302,16 @@
                 self_att_opts_.update(self_att_opts)
             if self_att is None:
                 self.self_att = rf.CausalSelfAttention(**self_att_opts_)
-
+            elif isinstance(self_att, type):
                 self.self_att = self_att(**self_att_opts_)
+            elif isinstance(self_att, dict):
+                self.self_att = rf.build_from_dict(self_att, **self_att_opts_)
+            else:
+                raise TypeError(f"unexpected self_att type {self_att!r}")
+        elif isinstance(self_att, rf.Module):
+            self.self_att = _copy.deepcopy(self_att)
         else:
-
+            raise TypeError(f"unexpected self_att type {self_att!r}")
         self.self_att_layer_norm = _make_norm(norm, out_dim)
 
         self.cross_att = None
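A sketch of the new self_att dispatch (dim name and value are assumptions): besides an instance, the layer now also accepts a class or a dict; the class form is instantiated with the in_dim/proj_dim/num_heads options assembled above, and a dict goes through rf.build_from_dict.

    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import TransformerDecoderLayer

    model_dim = Dim(512, name="model")
    layer = TransformerDecoderLayer(
        out_dim=model_dim,
        self_att=rf.RotaryPosCausalSelfAttention,  # class branch; options are filled in by the layer
    )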
@@ -353,12 +377,15 @@ class FeedForward(rf.Module):
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
         dropout: float = 0.1,
         activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
+        with_bias: bool = True,
     ):
         """
         :param out_dim: output feature dimension
         :param ff_dim: dimension of the feed-forward layers
         :param dropout: dropout value
         :param activation: activation function, relu by default
+        :param with_bias: whether to use bias in the linear layers.
+            True by default for compatibility, but nowadays it's common to use without bias.
         """
         super().__init__()
 
@@ -381,8 +408,8 @@
         self.dropout_broadcast = rf.dropout_broadcast_default()
         self.activation = activation
 
-        self.linear_ff = rf.Linear(out_dim, ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
+        self.linear_ff = rf.Linear(out_dim, ff_dim, with_bias=with_bias)
+        self.linear_out = rf.Linear(ff_dim, out_dim, with_bias=with_bias)
 
     def __call__(self, inp: Tensor) -> Tensor:
         """forward"""
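A minimal sketch of the new option (the out_dim value is an assumed example): a bias-free FF block can now be requested directly.

    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import FeedForward

    ff = FeedForward(out_dim=Dim(512, name="model"), with_bias=False)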
@@ -401,6 +428,8 @@ class FeedForwardGated(rf.Module):
         f(Linear(x)) * Linear(x)
 
     This is a feed-forward block based on SwiGLU, as defined in the paper.
+
+    Alternative to :class:`FeedForward`.
     """
 
     def __init__(
@@ -410,14 +439,30 @@
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
         dropout: float = 0.1,
         activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.swish,
+        with_bias: bool = False,
     ):
+        """
+        :param out_dim:
+        :param ff_dim: intermediate dimension.
+            Unlike :class:`FeedForward`:
+            If not provided, factor 4*2/3 to keep same number of parameters as in the original :class:`FeedForward`,
+            just as in the paper, and also making it a multiple of 256.
+        :param dropout:
+        :param activation: activation function for the gating. unlike :class:`FeedForward`, default is swish.
+        :param with_bias: whether to use bias in the linear layers.
+            unlike :class:`FeedForward`, default is False.
+        """
         super().__init__()
 
         if isinstance(ff_dim, int):
             ff_dim = Dim(ff_dim, name="transformer-ff-dim")
         if ff_dim is NotSpecified or ff_dim is None:
-            # Factor
-
+            # Factor 4 as usual.
+            # The additional factor 2/3 to keep same number of parameters as in the original FF block,
+            # just as in the paper.
+            ff_dim_ = out_dim.dimension * 4 * 2 // 3
+            ff_dim_ = ceil_div(ff_dim_, 256) * 256  # make multiple of 256
+            ff_dim = Dim(ff_dim_, name="transformer-ff-dim")
         if not isinstance(ff_dim, Dim):
             raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")
 
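For concreteness, a worked example of the default ff_dim computation above, assuming an out_dim of 512 (the value is illustrative):

    from returnn.util.math import ceil_div

    out_dim = 512
    ff_dim = out_dim * 4 * 2 // 3          # 1365: factor 4, scaled by 2/3 to keep the FF parameter count
    ff_dim = ceil_div(ff_dim, 256) * 256   # 1536: rounded up to the next multiple of 256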
@@ -434,8 +479,8 @@
         self.activation = activation
 
         # Factor 2 because we concatenate the two paths.
-        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
+        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim, with_bias=with_bias)
+        self.linear_out = rf.Linear(ff_dim, out_dim, with_bias=with_bias)
 
     def __call__(self, inp: Tensor) -> Tensor:
         """forward"""
{returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/frontend/math_.py
RENAMED
@@ -69,6 +69,7 @@ __all__ = [
     "log_softmax",
     "gating",
     "lerp",
+    "cumsum",
 ]
 
 
@@ -612,3 +613,11 @@ def lerp(
     """
     # noinspection PyProtectedMember
    return start._raw_backend.lerp(start, end, weight, allow_broadcast_all_sources=allow_broadcast_all_sources)
+
+
+def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
+    """
+    Applies cumsum.
+    """
+    # noinspection PyProtectedMember
+    return source._raw_backend.cumsum(source, spatial_dim=spatial_dim)
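A small usage sketch of the new rf.cumsum (the dim, values, and explicit backend selection are assumptions for illustration):

    import returnn.frontend as rf
    from returnn.tensor import Dim

    rf.select_backend_torch()  # assuming the PyTorch backend here
    time_dim = Dim(5, name="time")
    x = rf.range_over_dim(time_dim)          # [0, 1, 2, 3, 4]
    y = rf.cumsum(x, spatial_dim=time_dim)   # [0, 1, 3, 6, 10]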
{returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/tf/frontend_layers/_backend.py
RENAMED
@@ -730,6 +730,11 @@ class ReturnnLayersBackend(Backend[Layer]):
             name="clip_by_value",
         )
 
+    @staticmethod
+    def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
+        """cumsum"""
+        return rfl.make_layer({"class": "cumsum", "from": source, "axis": spatial_dim}, name="cumsum")
+
     @staticmethod
     def matmul(a: Tensor, b: Tensor, *, reduce: Union[Dim, Sequence[Dim]], use_mask: bool = True) -> Tensor:
         """matmul"""
{returnn-1.20240802.134933 → returnn-1.20240824.1611}/returnn/torch/frontend/_backend.py
RENAMED
@@ -983,6 +983,9 @@ class TorchBackend(Backend[torch.Tensor]):
         elif axis_int == 0 and source.batch_ndim == 2:
             # This is exactly what torch.embedding is intended for. Let's use that.
             out.raw_tensor = torch.embedding(source.raw_tensor, indices.raw_tensor)
+        elif indices.batch_ndim <= 1:
+            # Note: This also works when indices is on CPU and source is on GPU.
+            out.raw_tensor = source.raw_tensor[(slice(None),) * axis_int + (indices.raw_tensor,)]
         else:
             out_raw = torch.index_select(source.raw_tensor, dim=axis_int, index=indices.raw_tensor.flatten())
             out_shape = (
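In plain PyTorch terms, the new 1-D-indices branch above builds an advanced-indexing tuple; a toy illustration (shapes assumed):

    import torch

    source = torch.arange(12).reshape(3, 4)   # gather over axis 1
    indices = torch.tensor([2, 0])            # 1-D indices (indices.batch_ndim <= 1)
    axis_int = 1
    out = source[(slice(None),) * axis_int + (indices,)]
    # same result as torch.index_select(source, dim=1, index=indices); out.shape == (3, 2)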
@@ -1189,6 +1192,14 @@
         )
         return out
 
+    @staticmethod
+    def cumsum(source: Tensor, *, spatial_dim: Dim) -> Tensor:
+        """cumsum"""
+        axis = source.get_axis_from_description(spatial_dim)
+        out = source.copy_template("cumsum")
+        out.raw_tensor = torch.cumsum(source.raw_tensor, dim=axis, dtype=source.raw_tensor.dtype)
+        return out
+
     @staticmethod
     def matmul(a: _TT, b: _TT, *, reduce: Union[Dim, Sequence[Dim]], use_mask: bool = True) -> _TT:
         """
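And a toy illustration of what the backend cumsum above computes on the raw tensor (layout assumed: dims (batch, time), with time at axis 1):

    import torch

    raw = torch.tensor([[1, 2, 3], [4, 5, 6]])
    out_raw = torch.cumsum(raw, dim=1, dtype=raw.dtype)  # dtype passed explicitly, as in the backend code
    print(out_raw.tolist())  # [[1, 3, 6], [4, 9, 15]]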