returnn 1.20240731.223820__tar.gz → 1.20240808.234227__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of returnn was flagged as potentially problematic by the registry diff service.
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/PKG-INFO +1 -1
- returnn-1.20240808.234227/_setup_info_generated.py +2 -0
- returnn-1.20240808.234227/returnn/frontend/conversions/hf_llama.py +246 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/decoder/transformer.py +60 -15
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/_backend.py +3 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/debug.py +9 -2
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/file_cache.py +85 -21
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/math.py +5 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn.egg-info/PKG-INFO +1 -1
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn.egg-info/SOURCES.txt +1 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Util.py +8 -3
- returnn-1.20240808.234227/tests/test_rf_decoder_transformer.py +163 -0
- returnn-1.20240731.223820/_setup_info_generated.py +0 -2
- returnn-1.20240731.223820/returnn/frontend/conversions/hf_llama.py +0 -56
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/.editorconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/.gitignore +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/.gitmodules +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/.kateconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/CHANGELOG.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/CODEOWNERS +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/CONTRIBUTING.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/LICENSE +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/MANIFEST.in +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/README.rst +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/12AX.cluster_map +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-fwd.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-list-devices.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-pretrain.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-rf.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-torch.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/demo.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/pyproject.toml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/requirements.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/__main__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/__setup__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/config.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/audio.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/basic.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/cached.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/generating.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/lm.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/map.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/meta.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/postprocessing.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/engine/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/engine/base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/engine/batch.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/forward_iface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/array_.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/attention.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/cond.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/const.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/container.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/conv.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/conversions/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/device.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/dims.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/graph.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/init.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/linear.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/loop.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/loss.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/math_.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/module.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/rand.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/rec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/signal.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/state.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/types.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/import_/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/import_/common.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/import_/git.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/import_/import_.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/log.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/native_op.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/native_op.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/pretrain.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/cache.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/control.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/sprint/interface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/dim.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tensor/utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/compat.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/distributed.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/engine.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/horovod.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/native_op.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/network.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/sprint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/updater.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/data.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/distributed.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/engine.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/updater.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/__init__.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/basic.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/bpe.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/fsa.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/pprint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/py_compat.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/task_system.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/rnn.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/setup.cfg +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/setup.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/DummySprintExec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/_setup_test_env.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/lint_common.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/pylint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/rf_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/spelling.dic +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Config.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Fsa.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Log.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_Pretrain.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_ResNet.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFEngine.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TFUtil.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_demos.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_fork_exec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_array.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_attention.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_base.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_cond.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_const.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_container.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_conv.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_loop.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_math.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_rec.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_rf_signal.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_tensor.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_tools.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_engine.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/test_torch_util.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tests/torch_utils.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/collect-words.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/compile_native_op.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-forward.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-network-json.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/dump-pickle.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/get-attention-weights.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/hdf_dump.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20240731.223820 → returnn-1.20240808.234227}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
@@ -0,0 +1,246 @@
+"""
+Import the parameters from the HuggingFace Llama model (PyTorch).
+"""
+
+from __future__ import annotations
+from typing import TYPE_CHECKING, Union
+import returnn.frontend as rf
+from returnn.frontend.decoder.transformer import TransformerDecoder, TransformerDecoderLayer, FeedForwardGated
+
+if TYPE_CHECKING:
+    from transformers.models.llama.modeling_llama import (
+        LlamaModel,
+        LlamaForCausalLM,
+        LlamaDecoderLayer,
+        LlamaMLP,
+        LlamaRMSNorm,
+        LlamaAttention,
+    )
+
+
+def import_params_hf_llama_to_rf_transformer_decoder(
+    model_hf: Union[LlamaModel, LlamaForCausalLM], model_rf: TransformerDecoder
+):
+    """
+    Import params from HF Llama model to RF :class:`TransformerDecoder`.
+    """
+    import torch
+    from transformers.models.llama.modeling_llama import LlamaModel, LlamaForCausalLM, LlamaDecoderLayer
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    # Check if the number of parameters is the same below.
+    # First import individual sub modules.
+    # We might detect any mismatches there, and this will easy the debugging.
+
+    lm_head = None
+    if isinstance(model_hf, LlamaForCausalLM):
+        lm_head = model_hf.lm_head
+        model_hf = model_hf.model
+    else:
+        # Exclude logits.
+        num_params_rf -= model_rf.logits.weight.num_elements()
+    assert isinstance(model_hf, LlamaModel)
+    assert model_hf.norm.weight.shape[0] == model_rf.model_dim.dimension
+
+    assert len(model_hf.layers) == len(model_rf.layers)
+    for i, (layer_hf, layer_rf) in enumerate(zip(model_hf.layers, model_rf.layers)):
+        assert isinstance(layer_hf, LlamaDecoderLayer)
+        assert isinstance(layer_rf, TransformerDecoderLayer)
+        import_params_hf_llama_decoder_layer_to_rf(layer_hf, layer_rf)
+
+    assert model_hf.embed_tokens.weight.shape == model_rf.input_embedding.weight.raw_tensor.shape
+    with torch.no_grad():
+        model_rf.input_embedding.weight.raw_tensor.copy_(model_hf.embed_tokens.weight)  # (vocab,hidden)
+
+    assert isinstance(model_rf.final_layer_norm, rf.RMSNorm)
+    import_params_hf_llama_rms_norm_to_rf(model_hf.norm, model_rf.final_layer_norm)
+
+    if lm_head is not None:
+        assert lm_head.bias is None and model_rf.logits.bias is None  # not implemented
+        # Torch Linear: (out,in), but RF has (in,out).
+        with torch.no_grad():
+            model_rf.logits.weight.raw_tensor.copy_(lm_head.weight.T)  # (hidden,vocab)
+
+    assert num_params_rf == num_params_hf, f"missmatch num params: RF {num_params_rf} != HF {num_params_hf}"
+
+
+def import_params_hf_llama_decoder_layer_to_rf(model_hf: LlamaDecoderLayer, model_rf: TransformerDecoderLayer):
+    """
+    Import the parameters from the HF Llama decoder layer.
+    """
+    import torch
+
+    assert model_hf.hidden_size == model_rf.out_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    # Check if the number of parameters is the same below.
+    # First import individual sub modules.
+    # We might detect any mismatches there, and this will easy the debugging.
+
+    assert isinstance(model_rf.ff, FeedForwardGated), f"unexpected: {model_rf.ff}"
+    import_params_hf_llama_mlp_to_rf_feed_forward_gated(model_hf.mlp, model_rf.ff)
+
+    assert isinstance(model_rf.self_att, rf.RotaryPosCausalSelfAttention), f"unexpected: {model_rf.self_att}"
+    import_params_hf_llama_att_to_rf_rotary_att(model_hf.self_attn, model_rf.self_att)
+
+    assert isinstance(model_rf.self_att_layer_norm, rf.RMSNorm), f"unexpected: {model_rf.self_att_layer_norm}"
+    import_params_hf_llama_rms_norm_to_rf(model_hf.input_layernorm, model_rf.self_att_layer_norm)
+
+    assert isinstance(model_rf.ff_layer_norm, rf.RMSNorm), f"unexpected: {model_rf.ff_layer_norm}"
+    import_params_hf_llama_rms_norm_to_rf(model_hf.post_attention_layernorm, model_rf.ff_layer_norm)
+
+    assert num_params_rf == num_params_hf
+
+
+def import_params_hf_llama_mlp_to_rf_feed_forward_gated(model_hf: LlamaMLP, model_rf: FeedForwardGated):
+    """
+    Import the parameters from the HF Llama MLP module.
+    """
+    import torch
+
+    assert model_hf.hidden_size == model_rf.out_dim.dimension == model_rf.linear_ff.in_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    w1 = model_hf.gate_proj.weight.T  # (in,out)
+    w2 = model_hf.up_proj.weight.T  # (in,out)
+    w3 = model_hf.down_proj.weight.T  # (out,in)
+    assert model_hf.gate_proj.bias is None  # not implemented
+    assert model_hf.up_proj.bias is None  # not implemented
+    assert model_hf.down_proj.bias is None  # not implemented
+    with torch.no_grad():
+        w = torch.cat((w1, w2), dim=1)  # (in,out*2)
+        model_rf.linear_ff.weight.raw_tensor.copy_(w)
+        model_rf.linear_out.weight.raw_tensor.copy_(w3)
+
+
+def import_params_hf_llama_rms_norm_to_rf(model_hf: LlamaRMSNorm, model_rf: rf.RMSNorm):
+    """
+    Import the parameters from the HF Llama RMSNorm module.
+    """
+    import torch
+
+    assert model_hf.weight.shape[0] == model_rf.in_dim.dimension
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf
+
+    w = model_hf.weight  # (in,)
+    with torch.no_grad():
+        model_rf.scale.raw_tensor.copy_(w)
+
+
+def import_params_hf_llama_att_to_rf_rotary_att(model_hf: LlamaAttention, model_rf: rf.RotaryPosCausalSelfAttention):
+    """
+    Import the parameters from the HF Llama attention module.
+    """
+    import torch
+
+    assert model_hf.num_heads == model_rf.num_heads.dimension
+    assert model_hf.hidden_size == model_rf.in_dim.dimension
+    dim = model_hf.hidden_size
+    nh = model_hf.num_heads
+    hdim = dim // nh
+
+    print("HF Model:")
+    print(model_hf)
+    print("Parameters:")
+    num_params_hf = 0
+    for k, v in model_hf.named_parameters():
+        print(f"{k}: {list(v.shape)} {v.dtype}")
+        num_params_hf += v.numel()
+    print("Total number of parameters:", num_params_hf)
+
+    print("RF Model:")
+    print(model_rf)
+    print("Parameters:")
+    num_params_rf = 0
+    for k, v in model_rf.named_parameters():
+        print(f"{k}: {list(v.dims)} {v.dtype}")
+        assert isinstance(v.raw_tensor, torch.nn.Parameter)
+        num_params_rf += v.num_elements()
+    print("Total number of parameters:", num_params_rf)
+    assert num_params_rf == num_params_hf, f"num params RF {num_params_rf} != params HF {num_params_hf}"
+
+    # Torch Linear: (out,in), but RF has (in,out).
+    q = model_hf.q_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    k = model_hf.k_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    v = model_hf.v_proj.weight.T.reshape(dim, nh, hdim)  # (in,h,out/h)
+    q = q.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    k = k.reshape(dim, nh, 2, hdim // 2).transpose(-1, -2).flatten(-2)  # reorder complex numbers
+    qkv = torch.cat([q, k, v], dim=2)  # (in,h,out/h*3)
+    qkv = qkv.reshape(dim, 3 * dim)
+    assert model_hf.q_proj.bias is None  # not implemented
+    with torch.no_grad():
+        model_rf.qkv.weight.raw_tensor.copy_(qkv)
+        model_rf.proj.weight.raw_tensor.copy_(model_hf.o_proj.weight.T)
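For orientation, a minimal usage sketch of the new conversion module follows; it is not part of the package. It assumes a Llama-like decoder configuration: the HF config fields and the TransformerDecoder constructor arguments shown here (encoder_dim, vocab_dim, model_dim, num_layers, decoder_layer_opts, with_bias) are assumptions about the RF API and are not taken from this diff; only import_params_hf_llama_to_rf_transformer_decoder and the classes it asserts on (rf.RMSNorm, rf.RotaryPosCausalSelfAttention, FeedForwardGated) come from the files above.

# Hypothetical sketch: build an RF TransformerDecoder that structurally matches
# Llama (RMSNorm, rotary causal self-attention without biases, gated feed-forward,
# no additive positional encoding) and copy the HF weights into it.
import returnn.frontend as rf
from returnn.tensor import Dim
from returnn.frontend.decoder.transformer import TransformerDecoder, FeedForwardGated
from returnn.frontend.conversions.hf_llama import import_params_hf_llama_to_rf_transformer_decoder
from transformers import LlamaForCausalLM  # HF transformers, assumed installed

model_hf = LlamaForCausalLM.from_pretrained("...")  # checkpoint name/path elided
cfg = model_hf.config

model_rf = TransformerDecoder(
    encoder_dim=None,  # decoder-only language model
    vocab_dim=Dim(cfg.vocab_size, name="vocab"),
    model_dim=Dim(cfg.hidden_size, name="model"),
    num_layers=cfg.num_hidden_layers,
    num_heads=cfg.num_attention_heads,
    ff=FeedForwardGated,
    ff_dim=cfg.intermediate_size,
    pos_enc=None,  # Llama uses rotary embeddings inside the attention instead
    norm=rf.RMSNorm,
    # kwarg names below (decoder_layer_opts, with_bias) are assumed, not from this diff
    decoder_layer_opts=dict(self_att=rf.RotaryPosCausalSelfAttention, self_att_opts=dict(with_bias=False)),
)

# Copies all weights (transposing Torch's (out,in) Linear layout to RF's (in,out))
# and asserts that parameter counts and shapes match.
import_params_hf_llama_to_rf_transformer_decoder(model_hf, model_rf)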
{returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/frontend/decoder/transformer.py
RENAMED
|
@@ -13,10 +13,12 @@ References:
|
|
|
13
13
|
|
|
14
14
|
from __future__ import annotations
|
|
15
15
|
from typing import Optional, Any, Union, Tuple, Dict, Callable, Sequence
|
|
16
|
+
from types import FunctionType
|
|
16
17
|
import functools
|
|
17
18
|
import logging
|
|
18
19
|
import copy as _copy
|
|
19
20
|
from returnn.util.basic import NotSpecified, BehaviorVersion
|
|
21
|
+
from returnn.util.math import ceil_div
|
|
20
22
|
import returnn.frontend as rf
|
|
21
23
|
from returnn.tensor import Tensor, Dim, single_step_dim
|
|
22
24
|
|
|
@@ -36,6 +38,7 @@ class TransformerDecoder(rf.Module):
|
|
|
36
38
|
ff: Union[type, Dict[str, Any], rf.Module] = NotSpecified,
|
|
37
39
|
ff_dim: Union[Dim, int] = NotSpecified,
|
|
38
40
|
ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
|
|
41
|
+
pos_enc: Union[None, Callable, Dict[str, Any], rf.Module] = rf.sinusoidal_positional_encoding,
|
|
39
42
|
dropout: float = 0.1,
|
|
40
43
|
num_heads: int = 8,
|
|
41
44
|
att_dropout: float = 0.1,
|
|
@@ -57,6 +60,7 @@ class TransformerDecoder(rf.Module):
         :param ff: feed-forward / MLP block. Default is :class:`FeedForward`
         :param ff_dim: the dimension of feed-forward layers. 2048 originally, or 4 times out_dim
         :param ff_activation: activation function for feed-forward network
+        :param pos_enc: positional encoding. Default is sinusoidal positional encoding.
         :param dropout: the dropout value for the FF block
         :param num_heads: the number of attention heads
         :param att_dropout: attention dropout value
@@ -92,10 +96,21 @@ class TransformerDecoder(rf.Module):
         if embed_dim:
             self.input_embedding_proj = rf.Linear(embed_dim, model_dim, with_bias=False)

-
-
-
-
+        if pos_enc is None:
+            pass
+        elif isinstance(pos_enc, dict):
+            pos_enc = rf.build_from_dict(
+                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
+            )
+        elif isinstance(pos_enc, rf.Module):
+            pass
+        elif isinstance(pos_enc, FunctionType):
+            pos_enc = functools.partial(
+                pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
+            )
+        else:
+            raise TypeError(f"unexpected pos_enc type {pos_enc!r}")
+        self.pos_enc = pos_enc
         if share_embedding is None:
             if BehaviorVersion.get() < 20:
                 logging.getLogger("returnn.frontend").warning(
@@ -189,7 +204,8 @@ class TransformerDecoder(rf.Module):
         new_state = rf.State()

         decoded = self.input_embedding(source) * self.input_embedding_scale
-
+        if self.pos_enc is not None:
+            decoded = decoded + self.pos_enc(spatial_dim=spatial_dim, offset=state.pos)
         decoded = rf.dropout(decoded, self.input_dropout)
         if self.input_embedding_proj is not None:
             decoded = self.input_embedding_proj(decoded)
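A hedged construction sketch for the new pos_enc option: passing pos_enc=None skips the additive positional encoding, e.g. when the self-attention layers encode positions themselves. The other TransformerDecoder arguments shown are assumptions based on its docstring, not part of this diff.

    # Hedged sketch, not from the package: argument names besides pos_enc are assumed.
    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import TransformerDecoder

    vocab_dim = Dim(10_000, name="vocab")
    enc_dim = Dim(512, name="enc")
    decoder = TransformerDecoder(
        num_layers=6,
        encoder_dim=enc_dim,
        vocab_dim=vocab_dim,
        model_dim=512,
        pos_enc=None,  # default remains rf.sinusoidal_positional_encoding
    )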
@@ -228,7 +244,9 @@ class TransformerDecoderLayer(rf.Module):
         ff_activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = NotSpecified,
         dropout: float = 0.1,
         num_heads: int = 8,
-        self_att: Optional[
+        self_att: Optional[
+            Union[rf.CausalSelfAttention, rf.RelPosCausalSelfAttention, rf.Module, type, Dict[str, Any]]
+        ] = None,
         self_att_opts: Optional[Dict[str, Any]] = None,
         att_dropout: float = 0.1,
         norm: Union[type, Dict[str, Any], rf.Module, Callable] = rf.LayerNorm,
@@ -271,7 +289,7 @@ class TransformerDecoderLayer(rf.Module):
         self.ff = ff
         self.ff_layer_norm = _make_norm(norm, out_dim)

-        if self_att is None or isinstance(self_att, type):
+        if self_att is None or isinstance(self_att, type) or isinstance(self_att, dict):
             self_att_opts_ = dict(
                 in_dim=out_dim,
                 proj_dim=out_dim,
@@ -284,10 +302,16 @@ class TransformerDecoderLayer(rf.Module):
                 self_att_opts_.update(self_att_opts)
             if self_att is None:
                 self.self_att = rf.CausalSelfAttention(**self_att_opts_)
-
+            elif isinstance(self_att, type):
                 self.self_att = self_att(**self_att_opts_)
+            elif isinstance(self_att, dict):
+                self.self_att = rf.build_from_dict(self_att, **self_att_opts_)
+            else:
+                raise TypeError(f"unexpected self_att type {self_att!r}")
+        elif isinstance(self_att, rf.Module):
+            self.self_att = _copy.deepcopy(self_att)
         else:
-
+            raise TypeError(f"unexpected self_att type {self_att!r}")
         self.self_att_layer_norm = _make_norm(norm, out_dim)

         self.cross_att = None
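The widened self_att argument now also accepts a dict (resolved via rf.build_from_dict, with in_dim, num_heads etc. filled in by the layer) or a prebuilt module (deep-copied per layer). A hedged sketch; the "class"-key convention for the dict and the remaining arguments are assumptions, not taken from this diff.

    # Hedged sketch, not from the package: the "class" key convention for
    # rf.build_from_dict and the other constructor arguments are assumed.
    import returnn.frontend as rf
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import TransformerDecoder

    vocab_dim = Dim(10_000, name="vocab")
    lm = TransformerDecoder(
        num_layers=12,
        encoder_dim=None,  # assumed: no cross-attention, decoder-only LM
        vocab_dim=vocab_dim,
        model_dim=768,
        pos_enc=None,  # rotary attention already encodes positions
        self_att={"class": rf.RotaryPosCausalSelfAttention, "with_bias": False},
    )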
@@ -353,12 +377,15 @@ class FeedForward(rf.Module):
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
         dropout: float = 0.1,
         activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.relu,
+        with_bias: bool = True,
     ):
         """
         :param out_dim: output feature dimension
         :param ff_dim: dimension of the feed-forward layers
         :param dropout: dropout value
         :param activation: activation function, relu by default
+        :param with_bias: whether to use bias in the linear layers.
+            True by default for compatibility, but nowadays it's common to use without bias.
         """
         super().__init__()

@@ -381,8 +408,8 @@ class FeedForward(rf.Module):
         self.dropout_broadcast = rf.dropout_broadcast_default()
         self.activation = activation

-        self.linear_ff = rf.Linear(out_dim, ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
+        self.linear_ff = rf.Linear(out_dim, ff_dim, with_bias=with_bias)
+        self.linear_out = rf.Linear(ff_dim, out_dim, with_bias=with_bias)

     def __call__(self, inp: Tensor) -> Tensor:
         """forward"""
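With the new with_bias flag, the vanilla FF block can drop its biases; a short hedged sketch with example dimensions only:

    # Hedged sketch, not from the package.
    from returnn.tensor import Dim
    from returnn.frontend.decoder.transformer import FeedForward

    model_dim = Dim(512, name="model")
    ff = FeedForward(out_dim=model_dim, with_bias=False)  # ff_dim defaults to 4 times out_dim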
@@ -401,6 +428,8 @@ class FeedForwardGated(rf.Module):
         f(Linear(x)) * Linear(x)

     This is a feed-forward block based on SwiGLU, as defined in the paper.
+
+    Alternative to :class:`FeedForward`.
     """

     def __init__(
@@ -410,14 +439,30 @@ class FeedForwardGated(rf.Module):
         ff_dim: Optional[Union[Dim, int]] = NotSpecified,
         dropout: float = 0.1,
         activation: Union[Callable[[Tensor], Tensor], Dict[str, Any], rf.Module] = rf.swish,
+        with_bias: bool = False,
     ):
+        """
+        :param out_dim:
+        :param ff_dim: intermediate dimension.
+            Unlike :class:`FeedForward`:
+            If not provided, factor 4*2/3 to keep same number of parameters as in the original :class:`FeedForward`,
+            just as in the paper, and also making it a multiple of 256.
+        :param dropout:
+        :param activation: activation function for the gating. unlike :class:`FeedForward`, default is swish.
+        :param with_bias: whether to use bias in the linear layers.
+            unlike :class:`FeedForward`, default is False.
+        """
         super().__init__()

         if isinstance(ff_dim, int):
             ff_dim = Dim(ff_dim, name="transformer-ff-dim")
         if ff_dim is NotSpecified or ff_dim is None:
-            # Factor
-
+            # Factor 4 as usual.
+            # The additional factor 2/3 to keep same number of parameters as in the original FF block,
+            # just as in the paper.
+            ff_dim_ = out_dim.dimension * 4 * 2 // 3
+            ff_dim_ = ceil_div(ff_dim_, 256) * 256  # make multiple of 256
+            ff_dim = Dim(ff_dim_, name="transformer-ff-dim")
         if not isinstance(ff_dim, Dim):
             raise TypeError(f"Transformer FeedForward: unexpected ff_dim {ff_dim!r} type {type(ff_dim)}")

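To make the new default concrete, a small standalone example of the ff_dim computation for FeedForwardGated (ceil_div is reproduced inline here so the snippet runs on its own; in the package it comes from returnn.util.math):

    # Worked example: factor 4 as usual, times 2/3 (SwiGLU paper), rounded up to a multiple of 256.
    def ceil_div(a: int, b: int) -> int:
        # same result as returnn.util.math.ceil_div for positive ints
        return -(-a // b)

    for model_dim in (512, 1024):
        ff_dim = model_dim * 4 * 2 // 3
        ff_dim = ceil_div(ff_dim, 256) * 256
        print(model_dim, "->", ff_dim)  # 512 -> 1536, 1024 -> 2816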
@@ -434,8 +479,8 @@ class FeedForwardGated(rf.Module):
         self.activation = activation

         # Factor 2 because we concatenate the two paths.
-        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim)
-        self.linear_out = rf.Linear(ff_dim, out_dim)
+        self.linear_ff = rf.Linear(out_dim, 2 * ff_dim, with_bias=with_bias)
+        self.linear_out = rf.Linear(ff_dim, out_dim, with_bias=with_bias)

     def __call__(self, inp: Tensor) -> Tensor:
         """forward"""
{returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/torch/frontend/_backend.py
RENAMED
@@ -983,6 +983,9 @@ class TorchBackend(Backend[torch.Tensor]):
         elif axis_int == 0 and source.batch_ndim == 2:
             # This is exactly what torch.embedding is intended for. Let's use that.
             out.raw_tensor = torch.embedding(source.raw_tensor, indices.raw_tensor)
+        elif indices.batch_ndim <= 1:
+            # Note: This also works when indices is on CPU and source is on GPU.
+            out.raw_tensor = source.raw_tensor[(slice(None),) * axis_int + (indices.raw_tensor,)]
         else:
             out_raw = torch.index_select(source.raw_tensor, dim=axis_int, index=indices.raw_tensor.flatten())
             out_shape = (
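The new branch uses plain advanced indexing instead of torch.index_select; a standalone check (plain PyTorch, not RETURNN code) that the two agree for a 1-D index tensor:

    import torch

    source = torch.randn(3, 5, 7)
    indices = torch.tensor([4, 0, 2])
    axis = 1

    a = source[(slice(None),) * axis + (indices,)]            # new gather path
    b = torch.index_select(source, dim=axis, index=indices)   # general path
    assert torch.equal(a, b)
    print(a.shape)  # torch.Size([3, 3, 7])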
{returnn-1.20240731.223820 → returnn-1.20240808.234227}/returnn/util/debug.py
RENAMED
@@ -628,7 +628,10 @@ class PyTracer:
                    continue
                prev = self.captured_locals[func][-1].get(k, None)
                if prev is None or prev[-1] is not v:
-                    print(
+                    print(
+                        f"{func.__qualname__}[{len(self.captured_locals[func]) - 1}]"
+                        f" {type(v).__qualname__} var changed: {k} = {v}"
+                    )
                    self.captured_locals[func][-1].setdefault(k, []).append(v)
        return self
    return prev_trace_func_res
@@ -686,9 +689,13 @@ def check_py_traces_rf_to_pt_equal(
         else:
             raise TypeError(f"invalid dim type: {dim!r}")

+    def _format_check(check: Tuple[Union[FunctionType, Callable], int, str, int]) -> str:
+        func, i, var_name, j = check
+        return f"{func.__qualname__}[{i}] {var_name}[{j}]"
+
     non_matching = []
     for check_rf, check_pt, pt_dims in checks:
-        print(f"checking {check_rf} vs {check_pt} ({pt_dims})...")
+        print(f"checking {_format_check(check_rf)} vs {_format_check(check_pt)} ({pt_dims})...")
         tensor_rf: Tensor = _get_entry(trace_rf, *check_rf)
         tensor_pt: torch.Tensor = _get_entry(trace_pt, *check_pt)
         if callable(pt_dims):
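For illustration, a standalone toy of what the new _format_check helper produces for a check tuple (function, call index, variable name, value index); the helper body is copied from the diff, the sample function is made up:

    def _format_check(check):
        func, i, var_name, j = check
        return f"{func.__qualname__}[{i}] {var_name}[{j}]"

    def decode_step(x):  # stand-in for a traced function
        return x

    print(_format_check((decode_step, 0, "logits", -1)))  # -> "decode_step[0] logits[-1]"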