returnn 1.20251007.115327__tar.gz → 1.20251007.223754__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of returnn might be problematic. Click here for more details.
- {returnn-1.20251007.115327/returnn.egg-info → returnn-1.20251007.223754}/PKG-INFO +1 -1
- returnn-1.20251007.223754/_setup_info_generated.py +2 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/basic.py +1 -0
- returnn-1.20251007.223754/returnn/datasets/huggingface.py +434 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754/returnn.egg-info}/PKG-INFO +1 -1
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn.egg-info/SOURCES.txt +2 -0
- returnn-1.20251007.223754/tests/test_datasets_huggingface.py +201 -0
- returnn-1.20251007.115327/_setup_info_generated.py +0 -2
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/.editorconfig +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/.gitignore +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/.gitmodules +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/.kateconfig +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/CHANGELOG.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/CODEOWNERS +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/CONTRIBUTING.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/LICENSE +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/MANIFEST.in +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/README.rst +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/12AX.cluster_map +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/_setup_returnn_env.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-fwd.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-horovod-mpi.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-horovod-mpi.py.sh +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-horovod-mpi.sh +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-hyper-param-tuning.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-iter-dataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-list-devices.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-lua-torch-layer.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-pretrain.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-record-and-push-to-webserver.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-returnn-as-framework.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-rf-pt-benchmark.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-rf.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-rhn-enwik8.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-sprint-interface.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-att-copy.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-attention.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-chunking-blstm.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-contribrnn-lstm.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-enc-dec.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-hard-att-copy.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-lstm-benchmark.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-maxgradnorm-lstm.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-native-lstm-lowmem.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-native-lstm.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-native-lstm2.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-native-lstm2.12ax.tuned.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-neural-transducer.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-rec-explicit-lstm.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-rec-explicit-rnn.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-rec-self-att.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-search-compiled-graph.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-tf-vanilla-lstm.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-timit-lstm-ctc.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-torch.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo-upd-mult-model.lstm.12ax.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/demo.sh +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/IAM_lines/a01-000u-00.png +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/IAM_lines/a01-007-04.png +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/IAM_lines/a01-007-06.png +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/README.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/chars.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/config_demo +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/config_fwd +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/config_real +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/create_IAM_dataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/decode.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/features/raw/demo.h5 +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/go.sh +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/lines.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/split/eval.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/split/train.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/IAM/split/valid.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/artificial/create_test_h5.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/artificial/forwardconfig +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/artificial/go.sh +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/artificial/trainconfig +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/artificial_rgb/create_test_h5.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/artificial_rgb/forwardconfig +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/artificial_rgb/go.sh +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/demos/mdlstm/artificial_rgb/trainconfig +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/pyproject.toml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/requirements.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/__main__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/__old_mod_loader__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/__setup__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/config.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/audio.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/bundle_file.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/cached.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/cached2.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/distrib_files.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/generating.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/hdf.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/lm.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/map.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/meta.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/multi_proc.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/normalization_data.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/numpy_dump.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/postprocessing.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/raw_wav.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/sprint.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/stereo.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/text_dict.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/util/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/util/feature_extraction.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/util/strings.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/datasets/util/vocabulary.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/engine/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/engine/base.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/engine/batch.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/__main__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/.git +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/.gitignore +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/LICENSE +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/aligner.gif +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/check.png +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/core.cu +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/core.h +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/core_cpu.cpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/LICENSE +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/MANIFEST.in +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/binding.cpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.cu +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/core.h +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/requirements.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/setup.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/pytorch_binding/warp_rna/test.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/ref_rna.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/setup.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op.cc +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/src/warp_rna_op_kernel_tmpl.h +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/tensorflow_binding/warp_rna/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/WarpRna/warp-rna/test.cpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/graph_editor/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/graph_editor/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/graph_editor/edit.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/graph_editor/reroute.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/graph_editor/select.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/graph_editor/subgraph.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/graph_editor/transform.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/extern/graph_editor/util.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/forward_iface.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_backend.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_cache.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_native/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_native/backend.cpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_native/backend.hpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_native/module.cpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_native/module.hpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_native/py_utils.hpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_native/tensor_ops.cpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_native/tensor_ops.hpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_numpy_backend.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_random_journal.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/_utils.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/array_.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/attention.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/audio/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/audio/mel.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/audio/specaugment.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/backend.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/build_from_dict.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/cond.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/const.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/container.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/control_flow_ctx.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/conv.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/conversions/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/conversions/espnet_e_branchformer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/conversions/hf_llama.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/conversions/torch_nn.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/decoder/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/decoder/transformer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/device.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/dims.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/dropout.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/dtype.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/encoder/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/encoder/base.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/encoder/conformer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/encoder/conformer_v2.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/encoder/e_branchformer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/encoder/transformer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/gradient.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/graph.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/hooks.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/init.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/label_smoothing.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/linear.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/loop.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/loss.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/math_.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/matmul.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/module.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/nested.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/normalization.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/parameter.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/parametrizations.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/parametrize.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/piecewise_linear.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/rand.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/rec.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/reduce.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/run_ctx.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/signal.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/state.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/stepwise_scheduler.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/tensor_array.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/frontend/types.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/import_/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/import_/common.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/import_/git.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/import_/import_.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/learning_rate_control.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/log.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/native_op.cpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/native_op.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/pretrain.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/sprint/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/sprint/cache.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/sprint/control.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/sprint/error_signals.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/sprint/extern_interface.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/sprint/interface.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/_dim_extra.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/_tensor_extra.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/_tensor_mixin_base.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/_tensor_op_overloads.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/control_flow_ctx.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/dim.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/marked_dim.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/tensor.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/tensor_dict.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tensor/utils.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/compat.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/data_pipeline.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/distributed.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/engine.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/_backend.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/_utils.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/cond.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/config_entry_points.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/debug_eager_mode.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/dims.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/layer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/loop.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/make_layer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/masked_computation.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/parameter_assign.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_layers/prev_tensor_ref.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_low_level/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/frontend_low_level/_backend.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/horovod.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/hyper_param_tuning.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/layers/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/layers/base.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/layers/basic.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/layers/rec.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/layers/segmental_model.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/layers/signal_processing.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/layers/variable.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/native_op.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/network.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/sprint.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/updater.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/util/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/util/basic.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/util/data.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/util/gradient_checkpoint.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/util/ken_lm.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/tf/util/open_fst.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/data/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/data/extern_data.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/data/pipeline.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/data/queued_data_iter.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/data/returnn_dataset_wrapper.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/data/tensor_utils.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/distributed.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/engine.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/frontend/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/frontend/_backend.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/frontend/_rand.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/frontend/bridge.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/frontend/raw_ops.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/optim/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/optim/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/optim/lion.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/updater.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/array_.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/debug_inf_nan.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/diagnose_gpu.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/exception_helper.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/gradient_checkpoint.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/module.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/torch/util/scaled_gradient.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/__init__.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/basic.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/better_exchook.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/bpe.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/debug.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/debug_helpers.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/file_cache.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/fsa.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/literal_py_to_pickle.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/lru_cache.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/math.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/multi_proc_non_daemonic_spawn.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/native_code_compiler.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/pprint.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/py-to-pickle.cpp +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/py_ext_mod_compiler.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/result_with_reason.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/sig_proc.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/task_system.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/train_proc_manager.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn/util/watch_memory.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn.egg-info/dependency_links.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn.egg-info/requires.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/returnn.egg-info/top_level.txt +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/rnn.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/setup.cfg +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/setup.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/DummySprintExec.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm-inspection-profile.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/.gitignore +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/.name +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/codeStyleSettings.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/codeStyles/Project.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/codeStyles/codeStyleConfig.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/inspectionProfiles/Project_Default.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/inspectionProfiles/profiles_settings.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/misc.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/modules.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/returnn.iml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/PyCharm.idea/scopes/scope_settings.xml +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/_set_num_threads1.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/_setup_returnn_env.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/_setup_test_env.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/bpe-unicode-demo.codes +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/bpe-unicode-demo.vocab +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/lexicon_opt.fst +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/lexicon_opt.isyms +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/lexicon_opt.jpg +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/lexicon_opt.osyms +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/lint_common.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/pycharm-inspect.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/pylint.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/returnn-as-framework.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/rf_utils.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/spelling.dic +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_Config.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_Dataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_Fsa.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_GeneratingDataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_HDFDataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_LearningRateControl.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_Log.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_MultiProcDataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_Pretrain.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_ResNet.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_SprintDataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_SprintInterface.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TFEngine.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TFNativeOp.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TFNetworkLayer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TFNetworkRecLayer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TFNetworkSigProcLayer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TFUpdater.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TFUtil.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TF_determinism.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TaskSystem.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TaskSystem_SharedMem.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_TranslationDataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_Util.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_demos.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_fork_exec.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_hdf_dump.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_array.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_attention.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_base.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_cond.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_const.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_container.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_conv.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_decoder_transformer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_encoder_conformer.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_gradient.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_label_smoothing.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_loop.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_math.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_normalization.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_piecewise_linear.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_rec.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_reduce.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_rf_signal.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_tensor.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_threading.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_tools.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_torch_dataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_torch_engine.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_torch_frontend.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_torch_internal_frontend.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/test_torch_util.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tests/torch_utils.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/_setup_returnn_env.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/analyze-dataset-batches.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/bliss-collect-seq-lens.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/bliss-dump-text.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/bliss-get-segment-names.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/bliss-to-ogg-zip.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/bpe-create-lexicon.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/calculate-word-error-rate.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/cleanup-old-models.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/collect-orth-symbols.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/collect-words.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/compile_native_op.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/compile_tf_graph.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/debug-dump-search-scores.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/debug-plot-search-scores.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/dump-dataset-raw-strings.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/dump-dataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/dump-forward-stats.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/dump-forward.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/dump-network-json.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/dump-pickle.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/extract_state_tying_from_dataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/get-attention-weights.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/get-best-model-epoch.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/hdf_dump.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/hdf_dump_translation_dataset.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/import-blocks-mt-model.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/import-t2t-mt-model.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/.gitignore +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/Makefile +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/example/README.md +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/example/libs_list +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/example/network.040/i600_m600_m600.sgd_b16_lr0_cl2.newbobabs.keep_over_epoch.lstm2.config +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/example/rescore_lattice.sh +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/example/state_vars_list +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/example/tensor_names_list +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/file.h +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/htklatticerescorer.cc +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/htklatticerescorer.h +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/main.cc +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/rescorer.h +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/vocabulary.cc +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/lattice_rescorer/vocabulary.h +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/tf_avg_checkpoints.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/tf_inspect_checkpoint.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/tf_inspect_summary_log.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/torch_avg_checkpoints.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/torch_export_to_onnx.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/torch_inspect_checkpoint.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/torch_inspect_checkpoint_and_opt.py +0 -0
- {returnn-1.20251007.115327 → returnn-1.20251007.223754}/tools/torch_scale_tuning.py +0 -0
|
@@ -1500,6 +1500,7 @@ def get_dataset_class(name: Union[str, Type[Dataset]]) -> Optional[Type[Dataset]
|
|
|
1500
1500
|
"distrib_files",
|
|
1501
1501
|
"postprocessing",
|
|
1502
1502
|
"text_dict",
|
|
1503
|
+
"huggingface",
|
|
1503
1504
|
]
|
|
1504
1505
|
for mod_name in mod_names:
|
|
1505
1506
|
mod = import_module("returnn.datasets.%s" % mod_name)
|
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HuggingFace dataset wrapper
|
|
3
|
+
|
|
4
|
+
See https://github.com/rwth-i6/returnn/issues/1257 for some initial discussion.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
from typing import TYPE_CHECKING, Optional, Union, Any, Callable, Sequence, Dict, List
|
|
9
|
+
import os
|
|
10
|
+
import re
|
|
11
|
+
import numpy
|
|
12
|
+
from returnn.tensor import Tensor
|
|
13
|
+
from returnn.util import file_cache
|
|
14
|
+
from .basic import DatasetSeq
|
|
15
|
+
from .cached2 import CachedDataset2
|
|
16
|
+
from .util.vocabulary import Vocabulary
|
|
17
|
+
from .util.strings import str_to_numpy_array
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
# noinspection PyUnresolvedReferences,PyPackageRequirements
|
|
21
|
+
import datasets
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HuggingFaceDataset(CachedDataset2):
    """
    HuggingFace dataset wrapper.

    The underlying :class:`datasets.Dataset` is loaded lazily (see :func:`_lazy_init`),
    and the columns listed in ``data_format`` are exposed as data keys.
    """

    def __init__(
        self,
        dataset_opts: Union[
            Dict[str, Any],
            str,
            os.PathLike,
            Sequence[Union[str, os.PathLike]],
            Callable[[], Union[Dict[str, Any], str, os.PathLike, Sequence[Union[str, os.PathLike]], datasets.Dataset]],
        ],
        *,
        use_file_cache: bool = False,
        map_func: Optional[Callable[[datasets.Dataset], datasets.Dataset]] = None,
        rename_columns: Optional[Dict[str, str]] = None,
        cast_columns: Optional[Dict[str, Dict[str, Any]]] = None,
        data_format: Dict[str, Dict[str, Any]],
        seq_tag_column: Optional[str] = "id",
        sorting_seq_len_column_data: Optional[str] = None,
        sorting_seq_len_column: Optional[str] = None,
        **kwargs,
    ):
        """
        :param dataset_opts: either a dict of options for :func:`datasets.load_dataset`
            or a path to a local dataset for :func:`datasets.load_from_disk`,
            or a list of Arrow filenames to load with :func:`datasets.Dataset.from_file` and concatenate.
            It can also be a callable returning one of the above,
            or returning a :class:`datasets.Dataset` directly.
        :param use_file_cache: if True, will cache the dataset files on local disk using :mod:`file_cache`.
            This only works for dataset_opts which is a str or list of str (or callable returning that).
        :param map_func: optional function to apply to the dataset after loading
        :param rename_columns: if given, will rename these columns
        :param cast_columns: if given, will cast these columns to the specified types.
            This is useful if the dataset does not have the expected types.
            See :func:`datasets.Dataset.cast` for details.
            You can also e.g. enforce some sample_rate for audio, etc.
        :param data_format:
            For each column name (data key), specify the format,
            as a dict with entries for "dim", "ndim", "shape", and/or "dtype",
            compatible to :class:`Tensor`.
            It can be a subset of the available columns.
            If "vocab" is specified, and the underlying HF datasets column is of dtype "string",
            it will automatically tokenize the string using the vocab.
        :param seq_tag_column: key (column name) in the dataset to use as sequence tag.
            If None, will use the sequence index as tag.
        :param sorting_seq_len_column_data: key (column name) in the dataset to use for sorting by sequence length.
            It will take len(dataset[sorting_seq_len_column_data]) as sequence length (only for sorting/shuffling).
        :param sorting_seq_len_column: key (column name) in the dataset to use for sorting by sequence length.
            It will take the value of dataset[sorting_seq_len_column] as sequence length (only for sorting/shuffling).
            E.g. some datasets provide "duration", "duration_ms", "wav_filesize" or similar such information
            which can be used.
        """
        super().__init__(**kwargs)

        self.dataset_opts = dataset_opts
        self.use_file_cache = use_file_cache
        self.map_func = map_func
        self.rename_columns = rename_columns
        self.cast_columns = cast_columns

        self.data_format: Dict[str, Tensor] = {k: _make_tensor_template(v, k) for k, v in data_format.items()}
        self.seq_tag_column: Optional[str] = seq_tag_column
        self.sorting_seq_len_column_data = sorting_seq_len_column_data
        self.sorting_seq_len_column = sorting_seq_len_column

        self.labels = {k: data.vocab.labels for k, data in self.data_format.items() if data.vocab}
        self.num_outputs = {k: (data.dim, data.ndim) for k, data in self.data_format.items()}

        self.hf_dataset: Optional[datasets.Dataset] = None  # lazily loaded, _lazy_init
        self._seq_order: Optional[Sequence[int]] = None  # init_seq_order
        self._seq_tags: Optional[List[str]] = None  # get_all_tags cache

    def _lazy_init(self):
        """
        Load and prepare ``self.hf_dataset`` on first use. Does nothing if it is already loaded.

        Steps: resolve ``dataset_opts`` (calling it if callable), optionally route the files through the file cache,
        load the dataset, then apply ``map_func``, ``rename_columns`` and ``cast_columns``,
        restrict to the needed columns, switch to the "numpy" format,
        and finally check the user-specified ``data_format`` against the format inferred from the HF features.
        """
        if self.hf_dataset is not None:
            return

        # Load the dataset
        # noinspection PyUnresolvedReferences,PyPackageRequirements
        import datasets

        dataset_opts = self.dataset_opts
        if callable(dataset_opts):
            dataset_opts = dataset_opts()
        if self.use_file_cache:
            assert isinstance(dataset_opts, (str, os.PathLike, list, tuple)), (
                f"{self}: with use_file_cache, dataset_opts must be str or list of str, got {type(dataset_opts)}"
            )
            if isinstance(dataset_opts, (str, os.PathLike)):
                dataset_opts = get_arrow_shard_files_from_hf_dataset_dir(dataset_opts)
            assert isinstance(dataset_opts, (list, tuple))
            cache = file_cache.get_instance()
            dataset_opts = [cache.get_file(os.fspath(fn)) for fn in dataset_opts]
            self.set_file_cache(cache)
        if isinstance(dataset_opts, dict):
            self.hf_dataset = datasets.load_dataset(**dataset_opts)
        elif isinstance(dataset_opts, (str, os.PathLike)):
            # os.PathLike is part of the documented dataset_opts types, so accept it here as well
            # (previously only str was handled, and a pathlib.Path ended up in the TypeError below).
            self.hf_dataset = datasets.load_from_disk(os.fspath(dataset_opts))
        elif isinstance(dataset_opts, (list, tuple)):
            # Normalize path-like entries to str before handing them to HF datasets.
            self.hf_dataset = datasets.concatenate_datasets(
                [datasets.Dataset.from_file(os.fspath(fn)) for fn in dataset_opts]
            )
        elif isinstance(dataset_opts, datasets.Dataset):
            self.hf_dataset = dataset_opts
        else:
            raise TypeError(f"{self}: invalid dataset_opts type {type(dataset_opts)}")
        assert isinstance(self.hf_dataset, datasets.Dataset), (
            f"{self}: Expected single dataset, got {type(self.hf_dataset)} {self.hf_dataset}. Specify split if needed."
        )

        if self.map_func is not None:
            self.hf_dataset = self.map_func(self.hf_dataset)

        if self.rename_columns:
            self.hf_dataset = self.hf_dataset.rename_columns(self.rename_columns)

        if self.cast_columns:
            # Note: prefer cast_column, as this can avoid using `map`, i.e. be faster.
            for key, column_format in self.cast_columns.items():
                assert key in self.hf_dataset.features, (
                    f"{self}: cast_column {key} not in dataset features {self.hf_dataset.features}"
                )
                feat = datasets.features.features.generate_from_dict(column_format)
                self.hf_dataset = self.hf_dataset.cast_column(key, feat)

        if self.seq_tag_column:
            assert self.seq_tag_column in self.hf_dataset.features, (
                f"{self}: seq_tag_column {self.seq_tag_column} not in dataset features {self.hf_dataset.features}"
            )
            assert self.hf_dataset.features[self.seq_tag_column].dtype in ("string", "int64"), (
                f"{self}: seq_tag_column {self.seq_tag_column} must be of dtype string or int64,"
                f" got {self.hf_dataset.features[self.seq_tag_column].dtype}"
            )

        # Keep only the columns we actually read: the data keys, the seq tag, and the sorting columns.
        selected_columns = list(self.data_format.keys())
        if self.seq_tag_column and self.seq_tag_column not in selected_columns:
            selected_columns.append(self.seq_tag_column)
        if self.sorting_seq_len_column and self.sorting_seq_len_column not in selected_columns:
            selected_columns.append(self.sorting_seq_len_column)
        if self.sorting_seq_len_column_data and self.sorting_seq_len_column_data not in selected_columns:
            selected_columns.append(self.sorting_seq_len_column_data)
        self.hf_dataset = self.hf_dataset.select_columns(selected_columns)

        self.hf_dataset.set_format("numpy")

        for key, user_format in self.data_format.items():
            feature = self.hf_dataset.features[key]
            inferred_format = _infer_data_format_for_feature(feature, f"{self}: column {key}: ")
            if user_format.vocab and inferred_format["dtype"] == "string":
                pass  # allow to auto-tokenize strings when vocab is specified
            else:
                for key_ in ["dtype", "ndim", "dim"]:
                    assert getattr(user_format, key_) == inferred_format[key_], (
                        f"{self}: column {key}, user-specified {user_format}, {key_}:"
                        f" user-specified {getattr(user_format, key_)} does not match inferred {inferred_format[key_]}"
                    )
            if "vocab" in inferred_format and not user_format.vocab:
                # The HF feature provides a vocab which the user did not specify: take it over.
                assert user_format.sparse, f"{self}: column {key}: user_format expected to be sparse, got {user_format}"
                user_format.sparse_dim.vocab = Vocabulary.create_vocab(**inferred_format["vocab"])
                self.labels[key] = user_format.vocab.labels

    def get_data_keys(self) -> List[str]:
        """:return: list of data keys"""
        return list(self.data_format.keys())

    def get_target_list(self) -> List[str]:
        """:return: list of target keys"""
        return self.get_data_keys()  # it's somewhat arbitrary...

    def get_data_shape(self, key: str) -> List[int]:
        """:return: data shape for the given key"""
        return list(self.data_format[key].shape)

    def get_data_dim(self, key: str) -> int:
        """:return: data dimension for the given key"""
        return self.data_format[key].dim

    def is_data_sparse(self, key: str) -> bool:
        """:return: whether the data is sparse for the given key"""
        return self.data_format[key].sparse

    def get_data_dtype(self, key: str) -> str:
        """:return: dtype"""
        return self.data_format[key].dtype

    def _get_seq_len(self, seq_idx: int) -> Union[int, float]:
        """
        Sequence length used only for sorting/shuffling.

        Once a per-index array of lengths is available, this method rebinds itself on the instance
        to the array's ``__getitem__``, so later calls skip the checks below.

        :param seq_idx: index into the HF dataset
        :return: sequence length
        :raises ValueError: if neither sorting_seq_len_column nor sorting_seq_len_column_data is set
        """
        if self._seq_order_seq_lens_by_idx is not None:
            self._get_seq_len = self._seq_order_seq_lens_by_idx.__getitem__  # faster
            return self._seq_order_seq_lens_by_idx[seq_idx]
        assert not self._seq_order_seq_lens_file  # not expected to call this
        if self.sorting_seq_len_column:
            # Read the whole column once and serve all further lookups from the array.
            # NOTE(review): this first call returns int(v), later calls go through __getitem__
            # and return the raw array element - confirm whether the truncation is intended.
            self._seq_order_seq_lens_by_idx = numpy.array(self.hf_dataset[self.sorting_seq_len_column])
            self._get_seq_len = self._seq_order_seq_lens_by_idx.__getitem__  # faster
            v = self._seq_order_seq_lens_by_idx[seq_idx]
            return int(v)  # noqa
        if self.sorting_seq_len_column_data:
            v = self.hf_dataset[seq_idx][self.sorting_seq_len_column_data]
            return len(v)  # noqa
        raise ValueError(
            f"{self}: sorting/shuffling by seq len not configured,"
            f" need sorting_seq_len_column or sorting_seq_len_column_data"
        )

    @property
    def num_seqs(self) -> int:
        """:return: number of sequences"""
        assert self._seq_order is not None, "num_seqs is only known after calling init_seq_order()"
        return len(self._seq_order)

    def get_tag(self, sorted_seq_idx: int) -> str:
        """:return: tag of the sequence"""
        corpus_seq_idx = self.get_corpus_seq_idx(sorted_seq_idx)
        self._lazy_init()
        dataset_item = self.hf_dataset[corpus_seq_idx]
        return self._get_seq_tag(corpus_seq_idx, dataset_item)

    def get_all_tags(self) -> List[str]:
        """:return: all tags"""
        if self._seq_tags is not None:
            return self._seq_tags
        self._lazy_init()
        if self.seq_tag_column:
            res = list(map(str, self.hf_dataset[self.seq_tag_column]))
        else:
            res = [f"seq-{i}" for i in range(self.hf_dataset.num_rows)]
        self._seq_tags = res
        return res

    def get_total_num_seqs(self, *, fast: bool = False) -> int:
        """:return: total number of sequences in the dataset"""
        if fast:
            return super().get_total_num_seqs(fast=True)
        self._lazy_init()
        return self.hf_dataset.num_rows

    def init_seq_order(
        self,
        epoch: Optional[int] = None,
        seq_list: Optional[Sequence[str]] = None,
        seq_order: Optional[Sequence[int]] = None,
    ) -> bool:
        """
        :param epoch:
        :param seq_list: List of sequence tags, to set a predefined order.
        :param seq_order: List of corpus sequence indices, to set a predefined order.
        :return: whether the order changed (True is always safe to return)
        :raises ValueError: if a tag from seq_list does not exist in the dataset
        """
        super().init_seq_order(epoch=epoch, seq_list=seq_list, seq_order=seq_order)

        if seq_order is not None:
            self._seq_order = seq_order
        elif seq_list is not None:
            # Build the tag -> index mapping once instead of a linear list.index() per tag.
            # setdefault keeps the first occurrence, matching list.index() for duplicate tags.
            tag_to_idx: Dict[str, int] = {}
            for idx, tag in enumerate(self.get_all_tags()):
                tag_to_idx.setdefault(tag, idx)
            try:
                self._seq_order = [tag_to_idx[tag] for tag in seq_list]
            except KeyError as exc:
                # Keep ValueError, which is what list.index() raised for an unknown tag.
                raise ValueError(f"{self}: seq tag {exc.args[0]!r} from seq_list not found in dataset") from None
        elif epoch is None:
            self._seq_order = ()
        else:
            self._lazy_init()
            self._seq_order = self.get_seq_order_for_epoch(
                epoch=epoch, num_seqs=self.hf_dataset.num_rows, get_seq_len=self._get_seq_len
            )
        return True

    def _collect_single_seq(self, seq_idx: int) -> DatasetSeq:
        """
        :param seq_idx: sorted seq idx
        :return: the sequence, with one numpy array per data key
        """
        # noinspection PyUnresolvedReferences,PyPackageRequirements
        import datasets

        corpus_seq_idx = self.get_corpus_seq_idx(seq_idx)

        def _ensure_numpy(k, x):
            """Convert the raw HF value ``x`` of column ``k`` to a numpy array."""
            if isinstance(x, numpy.ndarray):  # fast path
                return x
            if isinstance(x, str):
                if self.data_format[k].dtype == "string":
                    return str_to_numpy_array(x)
                if self.data_format[k].vocab:
                    # Tokenize the string via the user-specified vocab.
                    return numpy.array(self.data_format[k].vocab.get_seq(x), dtype=self.data_format[k].dtype)
                raise ValueError(f"{self}: column {k}: cannot convert string {x!r} to numpy array")
            feat = self.hf_dataset.features[k]
            if isinstance(feat, datasets.features.Audio):
                # In HF datasets 3, this is just a dict.
                # In HF datasets 4, this can also be a datasets.features._torchcodec.AudioDecoder.
                assert isinstance(x, dict) or x.__class__.__name__ == "AudioDecoder"
                if feat.decode:
                    x = x["array"]
                else:
                    x = x["bytes"]
            if isinstance(x, numpy.ndarray):  # fast path
                return x
            if isinstance(x, (bytes, bytearray)):
                return numpy.frombuffer(x, dtype=self.data_format[k].dtype)
            return numpy.array(x)

        self._lazy_init()
        dataset_item = self.hf_dataset[corpus_seq_idx]
        seq_tag = self._get_seq_tag(corpus_seq_idx, dataset_item)
        features = {k: _ensure_numpy(k, dataset_item[k]) for k in self.data_format}
        return DatasetSeq(seq_idx, features=features, seq_tag=seq_tag)

    def _get_seq_tag(self, corpus_seq_idx: int, dataset_item: Dict[str, Any]) -> str:
        """
        :param corpus_seq_idx: index into the HF dataset, used for the fallback tag
        :param dataset_item: the HF dataset row for that index
        :return: the value of seq_tag_column as str, or "seq-<corpus_seq_idx>" if no seq_tag_column is set
        """
        if self.seq_tag_column:
            seq_tag = dataset_item[self.seq_tag_column]
            assert isinstance(seq_tag, (str, int, numpy.int64)), f"got {type(seq_tag)} {seq_tag!r}"
            seq_tag = str(seq_tag)
        else:
            seq_tag = f"seq-{corpus_seq_idx}"
        return seq_tag

    def get_current_seq_order(self) -> Sequence[int]:
        """:return: list of corpus seq idx"""
        assert self._seq_order is not None
        return self._seq_order

    def get_corpus_seq_idx(self, sorted_seq_idx: int) -> int:
        """:return: corpus seq idx"""
        return int(self._seq_order[sorted_seq_idx])
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def get_arrow_shard_files_from_hf_dataset_dir(hf_data_dir: Union[str, os.PathLike]) -> List[str]:
    """
    Given some HF datasets directory (created via :func:`datasets.save_to_disk`),
    return the list of Arrow shard files (``data-*-of-*.arrow``).
    This also verifies that the directory looks like a valid HF datasets directory.
    The order of the returned list is by shard index.
    Note that this does not load the dataset, just inspects the directory structure.

    Files in the directory which do not match ``<prefix>-<idx>-of-<num>.arrow`` are ignored.

    :param hf_data_dir: directory
    :return: list of Arrow shard files
    :raises AssertionError: if ``state.json`` or ``dataset_info.json`` is missing,
        if no shard file is found, or if the shard files are inconsistent
        (different prefixes, different totals, missing or extra shard indices)
    """
    hf_data_dir = os.fspath(hf_data_dir)
    content = os.listdir(hf_data_dir)
    assert "state.json" in content, f"not a valid HF datasets dir: {hf_data_dir!r}"
    assert "dataset_info.json" in content, f"not a valid HF datasets dir: {hf_data_dir!r}"
    # Raw string with the dot escaped: only a literal ".arrow" suffix must match.
    # (An unescaped "." would also accept names such as "data-0-of-1xarrow".)
    pat = re.compile(r"^(.*)-([0-9]+)-of-([0-9]+)\.arrow$")
    content = [m for m in map(pat.match, content) if m]
    assert content, f"no matching .arrow files in {hf_data_dir!r} found, expected *-*-of-*.arrow"
    prefix = content[0].group(1)
    assert all(m.group(1) == prefix for m in content), (
        f"mismatching prefix in {hf_data_dir!r}, expected {prefix}, got {[m.group(1) for m in content]}"
    )
    num_shards = int(content[0].group(3))
    assert all(int(m.group(3)) == num_shards for m in content), (
        f"mismatching number of shards in {hf_data_dir!r}, expected {num_shards}, got {[m.group(3) for m in content]}"
    )
    assert len(content) == num_shards, f"expected {num_shards} shard files in {hf_data_dir!r}, got {content}"
    # Index by shard number; together with the length check above this guarantees
    # that each index 0..num_shards-1 occurs exactly once.
    content_by_idx = {int(m.group(2)): m for m in content}
    assert set(content_by_idx.keys()) == set(range(num_shards)), (
        f"expected shard indices 0..{num_shards - 1} in {hf_data_dir!r}, got {sorted(content_by_idx.keys())}"
    )
    return [hf_data_dir + "/" + content_by_idx[i].group(0) for i in range(num_shards)]
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
def _infer_data_format_for_feature(
    feature: Union[
        datasets.features.Sequence,
        datasets.features.ClassLabel,
        datasets.features.Value,
        datasets.features.Array2D,
        datasets.features.Array3D,
        datasets.features.Array4D,
        datasets.features.Audio,
    ],
    exc_prefix: str = "",
) -> Dict[str, Any]:
    """
    Derive a data-format dict from a HF datasets feature description.

    :param feature: HF feature, possibly wrapped in (nested) sequences
    :param exc_prefix: prefix for the error message on unsupported feature types
    :return: dict with keys ``dim``, ``ndim``, ``dtype``,
        and additionally ``sparse`` and ``vocab`` when the feature has class label names
    """
    # noinspection PyUnresolvedReferences,PyPackageRequirements
    import datasets

    hf_features = datasets.features

    vocab_labels = None
    dim = None
    ndim = 0

    # Peel off sequence wrappers: each one adds an axis,
    # and a fixed length (anything but -1) is taken as the dim.
    while isinstance(feature, hf_features.Sequence):
        feature: datasets.features.List  # typing for HF datasets 4
        ndim += 1
        if feature.length != -1:
            dim = feature.length
        feature = feature.feature

    if isinstance(feature, hf_features.ClassLabel):
        vocab_labels = feature.names
        dtype = feature.dtype
        dim = feature.num_classes  # noqa
    elif isinstance(feature, hf_features.Value):
        dtype = feature.dtype
    elif isinstance(feature, (hf_features.Array2D, hf_features.Array3D, hf_features.Array4D)):
        dtype = feature.dtype
        dim = feature.shape[-1]
        ndim += len(feature.shape)
    elif isinstance(feature, hf_features.Audio):
        # decoded audio -> samples, otherwise raw bytes
        dtype = "float32" if feature.decode else "uint8"
        ndim += 1  # time axis
    else:
        assert False, f"{exc_prefix}unsupported feature type {type(feature)} {feature}"

    result = {"dim": dim, "ndim": ndim, "dtype": dtype}
    if vocab_labels:
        result["sparse"] = True
        result["vocab"] = {"vocab_file": None, "labels": vocab_labels, "unknown_label": None}
    return result
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _make_tensor_template(data: Union[Dict[str, Any], Tensor], name: str) -> Tensor:
    """
    Create a batch-less :class:`Tensor` template with the given name.

    :param data: either an existing :class:`Tensor` (copied under the new name),
        or a dict of kwargs for the :class:`Tensor` constructor
    :param name: name of the resulting template
    :return: tensor template; its ``batch_dim_axis`` is asserted to be None
    """
    if not isinstance(data, Tensor):
        assert isinstance(data, dict)
        template = Tensor(name, batch_dim_axis=None, **data)
    else:
        template = data.copy(name)
    assert template.batch_dim_axis is None
    return template
|
|
@@ -103,6 +103,7 @@ returnn/datasets/cached2.py
|
|
|
103
103
|
returnn/datasets/distrib_files.py
|
|
104
104
|
returnn/datasets/generating.py
|
|
105
105
|
returnn/datasets/hdf.py
|
|
106
|
+
returnn/datasets/huggingface.py
|
|
106
107
|
returnn/datasets/lm.py
|
|
107
108
|
returnn/datasets/map.py
|
|
108
109
|
returnn/datasets/meta.py
|
|
@@ -377,6 +378,7 @@ tests/test_TaskSystem.py
|
|
|
377
378
|
tests/test_TaskSystem_SharedMem.py
|
|
378
379
|
tests/test_TranslationDataset.py
|
|
379
380
|
tests/test_Util.py
|
|
381
|
+
tests/test_datasets_huggingface.py
|
|
380
382
|
tests/test_demos.py
|
|
381
383
|
tests/test_fork_exec.py
|
|
382
384
|
tests/test_hdf_dump.py
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import _setup_test_env # noqa
|
|
4
|
+
import os
|
|
5
|
+
import tempfile
|
|
6
|
+
import atexit
|
|
7
|
+
import shutil
|
|
8
|
+
import pickle
|
|
9
|
+
import numpy
|
|
10
|
+
|
|
11
|
+
from returnn.datasets import init_dataset
|
|
12
|
+
from returnn.datasets.huggingface import HuggingFaceDataset
|
|
13
|
+
from test_Dataset import dummy_iter_dataset
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _setup_hf_env():
|
|
17
|
+
if "HF_HOME" not in os.environ:
|
|
18
|
+
os.environ["HF_HOME"] = _get_tmp_dir()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _get_tmp_dir() -> str:
|
|
22
|
+
fn = tempfile.mkdtemp()
|
|
23
|
+
atexit.register(shutil.rmtree, fn)
|
|
24
|
+
return fn
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Executed at import time, i.e. before any of the test functions below run:
# makes sure HF_HOME is set (to a temp dir unless already defined).
_setup_hf_env()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_HuggingFaceDataset_audio():
    """Iterate over a small audio dataset, with the ``audio`` column cast to an Audio feature."""
    dataset_opts = {"path": "datasets-examples/doc-audio-6", "split": "train"}
    audio_cast = {"_type": "Audio", "sample_rate": 16_000}
    audio_format = {"dtype": "float32", "shape": [None]}
    ds = HuggingFaceDataset(
        dataset_opts,
        cast_columns={"audio": audio_cast},
        data_format={"audio": audio_format},
        seq_tag_column=None,
    )
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    first_seq = seqs[0]
    print(first_seq.features["audio"])
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_HuggingFaceDataset_text1():
    """String columns only, with the seq tag taken from the ``task_id`` column; the tag must be a str."""
    string_columns = ("prompt", "sector", "occupation")
    ds = HuggingFaceDataset(
        {"path": "openai/gdpval", "split": "train"},
        seq_tag_column="task_id",
        data_format={col: {"dtype": "string", "shape": ()} for col in string_columns},
    )
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    first_tag = seqs[0].seq_tag
    print(repr(first_tag))
    assert type(first_tag) is str
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_HuggingFaceDataset_text2():
    """Mixed int/string scalar columns, with the seq tag taken from the integer ``id`` column."""
    dataset_opts = {"path": "lavita/medical-qa-shared-task-v1-toy", "split": "train"}
    data_format = {
        "id": {"dtype": "int64", "shape": ()},
        "startphrase": {"dtype": "string", "shape": ()},
        "label": {"dtype": "int64", "shape": ()},
    }
    ds = HuggingFaceDataset(dataset_opts, seq_tag_column="id", data_format=data_format)
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    assert seqs
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_HuggingFaceDataset_rename_tokens():
    """Rename the ``startphrase`` column to ``text`` and refer to it by the new name in the data format."""
    dataset_opts = {"path": "lavita/medical-qa-shared-task-v1-toy", "split": "train"}
    data_format = {
        "id": {"dtype": "int64", "shape": ()},
        "text": {"dtype": "string", "shape": ()},
        "label": {"dtype": "int64", "shape": ()},
    }
    ds = HuggingFaceDataset(
        dataset_opts,
        seq_tag_column="id",
        rename_columns={"startphrase": "text"},
        data_format=data_format,
    )
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    assert seqs
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_HuggingFaceDataset_text_tokenize():
    """Give the ``startphrase`` column a vocab (``Utf8ByteTargets``) and expect an int32 array for it."""
    dataset_opts = {"path": "lavita/medical-qa-shared-task-v1-toy", "split": "train"}
    data_format = {
        "id": {"dtype": "int64", "shape": ()},
        "startphrase": {"dtype": "int32", "vocab": {"class": "Utf8ByteTargets"}},
        "label": {"dtype": "int64", "shape": ()},
    }
    ds = HuggingFaceDataset(dataset_opts, seq_tag_column="id", data_format=data_format)
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    tokens = seqs[0].features["startphrase"]
    print("startphrase:", tokens)
    assert isinstance(tokens, numpy.ndarray) and tokens.dtype == numpy.int32
    # Map the token ids back to labels via the vocab, just for printing.
    vocab = ds.data_format["startphrase"].vocab
    token_labels = vocab.get_seq_labels(tokens)
    print("startphrase labels:", token_labels)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_HuggingFaceDataset_pickle():
    """An initialized dataset must survive a pickle round trip and still be iterable afterwards."""
    dataset_opts = {"path": "lavita/medical-qa-shared-task-v1-toy", "split": "train"}
    data_format = {
        "id": {"dtype": "int64", "shape": ()},
        "startphrase": {"dtype": "string", "shape": ()},
        "label": {"dtype": "int64", "shape": ()},
    }
    ds = HuggingFaceDataset(dataset_opts, seq_tag_column="id", data_format=data_format)
    ds.initialize()
    serialized = pickle.dumps(ds)
    # Rebind the same name, so the original instance is dropped before iterating the copy.
    ds = pickle.loads(serialized)
    assert isinstance(ds, HuggingFaceDataset)
    seqs = dummy_iter_dataset(ds)
    assert seqs
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_HuggingFaceDataset_load_from_disk():
    """Save a HF dataset via ``save_to_disk`` and pass that directory path as the dataset option."""
    from datasets import load_dataset

    save_dir = f"{_get_tmp_dir()}/hf-dataset-save-to-disk"
    hub_ds = load_dataset("lavita/medical-qa-shared-task-v1-toy", split="train")
    hub_ds.save_to_disk(save_dir)

    data_format = {
        "id": {"dtype": "int64", "shape": ()},
        "startphrase": {"dtype": "string", "shape": ()},
        "label": {"dtype": "int64", "shape": ()},
    }
    ds = HuggingFaceDataset(save_dir, seq_tag_column="id", data_format=data_format)
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    assert seqs
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def test_HuggingFaceDataset_single_arrows():
    """Save a dataset in 100 shards and load only every 5th Arrow shard file, given as a list of paths."""
    import datasets

    save_dir = f"{_get_tmp_dir()}/hf-dataset-save-to-disk"
    rows = [{"data": i} for i in range(100_000)]
    datasets.Dataset.from_list(rows).save_to_disk(save_dir, num_shards=100)

    # Check the on-disk layout first.
    dir_entries = os.listdir(save_dir)
    print("Saved dir content:", dir_entries)
    assert "state.json" in dir_entries
    assert "dataset_info.json" in dir_entries
    expected_shards = [f"data-{i:05}-of-00100.arrow" for i in range(100)]
    assert all(shard in dir_entries for shard in expected_shards)

    selected_files = [f"{save_dir}/data-{i:05}-of-00100.arrow" for i in range(0, 100, 5)]
    ds = HuggingFaceDataset(
        selected_files,
        seq_tag_column=None,
        data_format={"data": {"dtype": "int64", "shape": ()}},
    )
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    assert seqs
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def test_HuggingFaceDataset_file_cache_with_sharded():
    """Load a dataset directory saved in 100 shards, with ``use_file_cache=True``."""
    import datasets

    save_dir = f"{_get_tmp_dir()}/hf-dataset-save-to-disk"
    rows = [{"data": i} for i in range(100_000)]
    datasets.Dataset.from_list(rows).save_to_disk(save_dir, num_shards=100)

    ds = HuggingFaceDataset(
        save_dir,
        use_file_cache=True,
        seq_tag_column=None,
        data_format={"data": {"dtype": "int64", "shape": ()}},
    )
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    assert seqs
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def test_HuggingFaceDataset_in_multi_proc():
    """Wrap a ``HuggingFaceDataset`` config in a ``MultiProcDataset`` config (2 workers) and iterate it."""
    hf_ds_opts = {
        "class": "HuggingFaceDataset",
        "dataset_opts": {"path": "lavita/medical-qa-shared-task-v1-toy", "split": "train"},
        "seq_tag_column": "id",
        "data_format": {
            "id": {"dtype": "int64", "shape": ()},
            "startphrase": {"dtype": "string", "shape": ()},
            "label": {"dtype": "int64", "shape": ()},
        },
    }
    multi_proc_opts = {
        "class": "MultiProcDataset",
        "num_workers": 2,
        "buffer_size": 5,
        "dataset": hf_ds_opts,
    }
    ds = init_dataset(multi_proc_opts)
    ds.initialize()
    seqs = dummy_iter_dataset(ds)
    assert seqs
|