returnn 1.20260105.192646__py3-none-any.whl → 1.20260119.15400__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. returnn/PKG-INFO +1 -1
  2. returnn/__old_mod_loader__.py +26 -2
  3. returnn/_setup_info_generated.py +2 -2
  4. returnn/datasets/lm.py +110 -42
  5. returnn/frontend/__init__.py +1 -0
  6. returnn/frontend/_backend.py +41 -0
  7. returnn/frontend/_native/__init__.py +22 -0
  8. returnn/frontend/_numpy_backend.py +7 -0
  9. returnn/frontend/_utils.py +1 -1
  10. returnn/frontend/array_.py +6 -5
  11. returnn/frontend/assert_.py +35 -0
  12. returnn/frontend/device.py +14 -1
  13. returnn/frontend/encoder/conformer.py +19 -0
  14. returnn/frontend/loss.py +183 -3
  15. returnn/frontend/math_.py +54 -14
  16. returnn/native_op.cpp +104 -174
  17. returnn/native_op.py +36 -31
  18. returnn/tensor/_dim_extra.py +7 -7
  19. returnn/tensor/_tensor_extra.py +10 -10
  20. returnn/tensor/utils.py +1 -1
  21. returnn/tf/frontend_layers/_backend.py +3 -1
  22. returnn/tf/layers/basic.py +13 -2
  23. returnn/tf/native_op.py +16 -5
  24. returnn/tf/util/basic.py +7 -201
  25. returnn/torch/engine.py +120 -3
  26. returnn/torch/frontend/_backend.py +166 -22
  27. returnn/torch/frontend/bridge.py +61 -0
  28. returnn/torch/frontend/compile_helper.py +106 -0
  29. returnn/torch/util/array_.py +30 -0
  30. returnn/torch/util/assert_.py +122 -0
  31. returnn/torch/util/native_op.py +885 -0
  32. returnn/torch/util/native_op_code_compiler.py +308 -0
  33. returnn/util/basic.py +3 -1
  34. returnn/util/cuda_env.py +332 -0
  35. returnn/util/debug.py +1 -0
  36. returnn/util/fsa.py +17 -13
  37. returnn/util/native_code_compiler.py +104 -47
  38. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +1 -1
  39. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +42 -36
  40. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
  41. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
  42. {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
returnn/native_op.cpp CHANGED
@@ -1,3 +1,14 @@
+ /*
+ This file is imported in various ways.
+ The mode is determined via the preprocessor defines:
+
+ TENSORFLOW: If defined and set to 1, TensorFlow is used as backend.
+ TORCH: If defined and set to 1, PyTorch is used as backend.
+
+ CUDA: If defined and set to 1, CUDA is used for GPU support.
+ Otherwise, it uses CPU only.
+ The kernels are all expected to also compile in CPU-only mode.
+ */

  #include <assert.h>
  #include <iostream>
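For orientation (an illustrative sketch, not part of the diff): the defines documented in the new header comment gate the backend-specific sections that appear further down in this file, roughly as follows.

/* Illustrative only: mirrors the structure visible in the hunks below. */
#if TENSORFLOW
  #define Ndarray tensorflow::Tensor   // TensorFlow tensors back the Ndarray abstraction
#elif TORCH
  #define Ndarray torch::Tensor        // PyTorch tensors back the Ndarray abstraction
#else
  #error "No framework defined: TENSORFLOW or TORCH"
#endif

#if CUDA
  // GPU build: kernels run via CUDA on the framework's current stream.
#else
  // CPU-only build: the same kernels must also compile as plain host code.
#endif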
@@ -16,6 +27,10 @@
  #define TENSORFLOW 0
  #endif

+ #ifndef TORCH
+ #define TORCH 0
+ #endif
+
  #ifndef _ns
  #define _ns
  #endif
@@ -118,7 +133,7 @@ static inline int _host_float_as_int(float x) {
  #define INF_F int_as_float(0x7f800000)
  #define NAN_F int_as_float(0x7fffffff)

- #endif
+ #endif // CUDA


@@ -157,7 +172,7 @@ The BLAS functions expect the inputs in column-major and return in column-major.
  #define Ndarray tensorflow::Tensor
  #define Ndarray_DEV_DATA(x) ((float*) (x)->tensor_data().data())
  #define Ndarray_DEV_DATA_int32(x) ((int32_t*) (x)->tensor_data().data())
- #define Ndarray_DEV_DATA_int32_scalar(x) (x)->scalar<int32>()()
+ #define Ndarray_DEV_DATA_int32_scalar(x) (x)->scalar<int32_t>()()
  #define Ndarray_HOST_DIMS(x) DimsAccessor(x)
  #define Ndarray_DIMS Ndarray_HOST_DIMS
  #define Ndarray_NDIM(x) (x)->dims()
@@ -399,13 +414,13 @@ static void tf_cuda_sgemm_batched(


  #define Ndarray_sgemm( \
- transpose_A, transpose_B, \
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
+ transpose_A, transpose_B, \
+ m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
  tf_cuda_sgemm<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);

  #define Ndarray_sgemm_batched( \
- transpose_A, transpose_B, \
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream) \
+ transpose_A, transpose_B, \
+ m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream) \
  tf_cuda_sgemm_batched<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream);


@@ -415,21 +430,21 @@ static void tf_cuda_sgemm_batched(

  /*
  // matrices are in column-major form
- int sgemm_(char *transa, char *transb,
- integer *m, integer *n, integer *k,
- real *alpha, real *a, integer *lda,
- real *b, integer *ldb, real *beta,
- real *c, integer *ldc);
+ int sgemm_(char *transa, char *transb,
+ integer *m, integer *n, integer *k,
+ real *alpha, real *a, integer *lda,
+ real *b, integer *ldb, real *beta,
+ real *c, integer *ldc);
  */
  #define Ndarray_sgemm(\
- transpose_A, transpose_B, \
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
- { \
- char transa = transpose_A, transb = transpose_B; \
- int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
- sgemm_(&transa, &transb, \
- &m_, &n_, &k_, alpha, A, &lda_, B, &ldb_, beta, C, &ldc_); \
- }
+ transpose_A, transpose_B, \
+ m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
+ { \
+ char transa = transpose_A, transb = transpose_B; \
+ int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
+ sgemm_(&transa, &transb, \
+ &m_, &n_, &k_, alpha, A, &lda_, B, &ldb_, beta, C, &ldc_); \
+ }

  #else // HAVE_CUSTOM_BLAS

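As a rough illustration (not part of the diff; the operand names and sizes are made up), a call through this wrapper follows the classic column-major BLAS convention:

// Hypothetical call site: C = alpha * A * B + beta * C, all column-major.
// A is m x k (lda = m), B is k x n (ldb = k), C is m x n (ldc = m).
float alpha = 1.0f, beta = 0.0f;
Ndarray_sgemm('N', 'N', m, n, k, &alpha, A, m, B, k, &beta, C, m);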
@@ -494,77 +509,77 @@ static void tf_cpu_sgemm(
  }

  #define Ndarray_sgemm(\
- transpose_A, transpose_B, \
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
+ transpose_A, transpose_B, \
+ m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
  tf_cpu_sgemm<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);

  #endif // HAVE_CUSTOM_BLAS
  #endif // CUDA

+ #define CHECK_WITH_MSG(condition, message) \
+ if(!(condition)) { \
+ std::cerr << "NativeOp check failed: " << message << std::endl; \
+ assert(condition); \
+ }
+
  // See Context struct below.
  #define CONTEXT_ARGS context

- #else // TENSORFLOW
+
+ #elif TORCH
+ // https://github.com/rwth-i6/i6_native_ops/blob/main/i6_native_ops/common/returnn_definitions.h
+ // https://docs.pytorch.org/cppdocs/stable.html#tensor-class
+
+ #define Ndarray torch::Tensor
+ #define Ndarray_DEV_DATA(x) ((float*)(x)->data_ptr())
+ #define Ndarray_DEV_DATA_int32(x) ((int32_t*)(x)->data_ptr())
+ #define Ndarray_DEV_DATA_uint32(x) ((uint32_t*)(x)->data_ptr())
+ #define Ndarray_DEV_DATA_int32_scalar(x) ((x)->item().to<int32_t>())
+ #define Ndarray_HOST_DIMS(x) ((x)->sizes())
+ #define Ndarray_DIMS(x) ((x)->sizes())
+ typedef at::IntArrayRef Ndarray_DIMS_Type;
+ #define Ndarray_NDIM(x) (x)->dim()
+ #define Ndarray_dtype_size(x) torch::elementSize((x)->scalar_type())
+ typedef int64_t Ndarray_DIM_Type;
+ #define Ndarray_SIZE(x) ((x)->numel())
+ #define Ndarray_STRIDE(x, dim) ((x)->stride(dim))
+
+ #define CHECK_WITH_MSG TORCH_CHECK

  // See Context struct below.
  #define CONTEXT_ARGS

- #endif // TENSORFLOW
+ template<typename T>
+ static void Ndarray_sgemm(
+ char transa_, char transb_,
+ int m, int n, int k,
+ const T* alpha_ptr, const T* a_ptr, int lda,
+ const T* b_ptr, int ldb, const T* beta_ptr,
+ T* c_ptr, int ldc)
+ {
+ // TODO...
+ assert("Torch Ndarray_sgemm not implemented" && 0);
+ }

+ #else // TENSORFLOW or TORCH

+ #error "No framework defined: TENSORFLOW or TORCH"
+
+ #endif // TENSORFLOW or TORCH

- #if CUDA

+ #if CUDA

  #if TENSORFLOW
  // Ndarray and friends already declared above, they are same for CUDA and non-CUDA
  #define CUDA_CUR_STREAM (context->eigen_gpu_device().stream())

- #else // TENSORFLOW, thus Theano here
- #define CUDA_CUR_STREAM (0) // default stream
+ #elif TORCH

- // Defined here: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/cuda_ndarray.cuh
- // See also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/cuda_ndarray.cu
- #define Ndarray CudaNdarray
- #define Ndarray_DEV_DATA CudaNdarray_DEV_DATA
- #define Ndarray_DEV_DATA_int32(x) ((int32_t*) (Ndarray_DEV_DATA(x)))
- #define Ndarray_DEV_DATA_int32_scalar(x) Ndarray_DEV_DATA_int32(x)[0]
- #define Ndarray_HOST_DIMS CudaNdarray_HOST_DIMS
- #define Ndarray_DIMS Ndarray_HOST_DIMS
- #define Ndarray_STRIDE(x, i) (CudaNdarray_HOST_STRIDES(x)[i]) // return in elements. CudaNdarray stores like that
- #define Ndarray_NDIM(x) (x->nd)
- #define Ndarray_DIM_Type int
- typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
- #define Ndarray_dtype_size(x) sizeof(float)
- #define Ndarray_SIZE CudaNdarray_SIZE
- // PyObject *CudaNdarray_NewDims(int nd, const inttype * dims), uninitialized
- #define Ndarray_NewDims CudaNdarray_NewDims
- // PyObject * CudaNdarray_Copy(const CudaNdarray * self);
- #define Ndarray_Copy CudaNdarray_Copy
-
- /*
- // via: https://docs.nvidia.com/cuda/cublas/
- // matrices are in column-major form
- cublasStatus_t cublasSgemm(cublasHandle_t handle,
- cublasOperation_t transa, cublasOperation_t transb,
- int m, int n, int k,
- const float *alpha, const float *A, int lda,
- const float *B, int ldb, const float *beta,
- float *C, int ldc);
- */
- #define _cublasTranspose(t) \
- ((t == 'T') ? CUBLAS_OP_T : \
- (t == 'C') ? CUBLAS_OP_C : \
- (t == 'N') ? CUBLAS_OP_N : cublasOperation_t('E'))
- #define Ndarray_sgemm( \
- transpose_A, transpose_B, \
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
- (_cudaHandleError(cublasSgemm(handle, \
- _cublasTranspose(transpose_A), \
- _cublasTranspose(transpose_B), \
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc), \
- __FILE__, __LINE__ ))
+ #define CUDA_CUR_STREAM (at::cuda::getCurrentCUDAStream().stream())

+ #else
+ #error Unknown backend
  #endif

  #define Ndarray_memcpy(y, x, size) (cudaMemcpyAsync(y, x, size, cudaMemcpyDeviceToDevice, CUDA_CUR_STREAM))
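CHECK_WITH_MSG gives both backends one error-checking entry point: a cerr-plus-assert fallback under TensorFlow and a plain alias for TORCH_CHECK under PyTorch. A minimal, hypothetical usage sketch (this helper is not in the package):

// Hypothetical helper, only to illustrate CHECK_WITH_MSG; not from the diff.
static void check_same_n_elements(Ndarray* a, Ndarray* b) {
    CHECK_WITH_MSG(
        Ndarray_SIZE(a) == Ndarray_SIZE(b),
        "NativeOp: expected tensors with the same number of elements");
}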
@@ -581,48 +596,10 @@ typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;

  #define DEF_SHARED(type, name) extern __shared__ type name[];

- static const char *_cudaGetErrorEnum(cublasStatus_t error) {
- switch (error) {
- case CUBLAS_STATUS_SUCCESS:
- return "CUBLAS_STATUS_SUCCESS";
-
- case CUBLAS_STATUS_NOT_INITIALIZED:
- return "CUBLAS_STATUS_NOT_INITIALIZED";
-
- case CUBLAS_STATUS_ALLOC_FAILED:
- return "CUBLAS_STATUS_ALLOC_FAILED";
-
- case CUBLAS_STATUS_INVALID_VALUE:
- return "CUBLAS_STATUS_INVALID_VALUE";
-
- case CUBLAS_STATUS_ARCH_MISMATCH:
- return "CUBLAS_STATUS_ARCH_MISMATCH";
-
- case CUBLAS_STATUS_MAPPING_ERROR:
- return "CUBLAS_STATUS_MAPPING_ERROR";
-
- case CUBLAS_STATUS_EXECUTION_FAILED:
- return "CUBLAS_STATUS_EXECUTION_FAILED";
-
- case CUBLAS_STATUS_INTERNAL_ERROR:
- return "CUBLAS_STATUS_INTERNAL_ERROR";
- }
-
- return "<unknown>";
- }
-
- static void _cudaHandleError(cudaError_t err, const char *file, int line) {
- if (err != cudaSuccess) {
- printf("NativeOp: CUDA runtime error: '%s' in %s at line %d\n", cudaGetErrorString(err), file, line);
- exit(EXIT_FAILURE);
- }
- }
-
- static void _cudaHandleError(cublasStatus_t status, const char *file, int line) {
- if (status != CUBLAS_STATUS_SUCCESS) {
- printf("NativeOp: cuBLAS runtime error: '%s' in %s at line %d\n", _cudaGetErrorEnum(status), file, line);
- exit(EXIT_FAILURE);
- }
+ static void _cudaHandleError(cudaError_t err, const char* file, int line) {
+ CHECK_WITH_MSG(
+ err == cudaSuccess,
+ "NativeOp: CUDA runtime error: ", cudaGetErrorString(err), " in ", file, " at line ", line);
  }

  #define HANDLE_ERROR(status) (_cudaHandleError( status, __FILE__, __LINE__ ))
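HANDLE_ERROR keeps its existing call sites: CUDA runtime return codes are now funneled through CHECK_WITH_MSG with file/line context instead of printf-and-exit. A hypothetical call site (not from the diff):

// Hypothetical usage: wrap CUDA runtime calls so failures carry file/line info.
HANDLE_ERROR(cudaMemsetAsync(d_buf, 0, num_bytes, CUDA_CUR_STREAM));
HANDLE_ERROR(cudaStreamSynchronize(CUDA_CUR_STREAM));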
@@ -630,49 +607,7 @@ static void _cudaHandleError(cublasStatus_t status, const char *file, int line)

  #else // not CUDA

-
- #if !TENSORFLOW
- // Numpy, see: https://docs.scipy.org/doc/numpy/reference/c-api.array.html
- // And: https://deeplearning.net/software/theano/extending/extending_theano_c.html
- #define Ndarray PyArrayObject
- #define Ndarray_DEV_DATA(x) ((float*) PyArray_DATA(x))
- #define Ndarray_DEV_DATA_int32(x) ((int32_t*) (Ndarray_DEV_DATA(x)))
- #define Ndarray_DEV_DATA_int32_scalar(x) Ndarray_DEV_DATA_int32(x)[0]
- #define Ndarray_HOST_DIMS PyArray_DIMS
- #define Ndarray_STRIDE(x, i) (PyArray_STRIDE(x, i) / sizeof(float)) // return in elements. Numpy stores in bytes
- #define Ndarray_DIMS Ndarray_HOST_DIMS
- #define Ndarray_NDIM PyArray_NDIM
- #define Ndarray_DIM_Type npy_intp
- typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
- #define Ndarray_dtype_size(x) sizeof(float)
- #define Ndarray_SIZE PyArray_SIZE
- #define Ndarray_NewDims(nd, dims) (PyArray_SimpleNew(nd, dims, NPY_FLOAT32))
- #define Ndarray_Copy(x) (PyArray_FromArray(x, NULL, NPY_ARRAY_OUT_ARRAY | NPY_ARRAY_ENSURECOPY))
- /*
- // matrices are in column-major form
- int sgemm_(char *transa, char *transb,
- integer *m, integer *n, integer *k,
- real *alpha, real *a, integer *lda,
- real *b, integer *ldb, real *beta,
- real *c, integer *ldc);
-
- Cast to (float*) because we might have the C-style declaration incorrectly in the C++ scope.
- */
- #define Ndarray_sgemm(\
- transpose_A, transpose_B, \
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
- { \
- char transa = transpose_A, transb = transpose_B; \
- int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
- sgemm_(&transa, &transb, \
- &m_, &n_, &k_, alpha, (float*) A, &lda_, (float*) B, &ldb_, beta, C, &ldc_); \
- }
-
- static inline void* device_malloc(size_t size) { return malloc(size); }
- static inline void device_free(void* ptr) { free(ptr); }
- #endif
-
- #define HANDLE_LAST_ERROR() (0)
+ #define HANDLE_LAST_ERROR() {}

  #define Ndarray_memcpy(y, x, size) (memcpy(y, x, size))
  #define Ndarray_memset(s, c, size) (memset(s, c, size))
@@ -751,19 +686,9 @@ struct _KernelLoop {
  #endif


- Ndarray* Ndarray_uninitialized_like(Ndarray* a) {
- Ndarray_DIMS_Type dim = Ndarray_HOST_DIMS(a);
- #if TENSORFLOW
- Ndarray* res = (Ndarray*) Ndarray_NewDims(Ndarray_NDIM(a), dim);
- #else
- Ndarray* res = (Ndarray*) Ndarray_NewDims(Ndarray_NDIM(a), const_cast<Ndarray_DIM_Type*>(dim));
- #endif
- return res;
- }
-
- long Ndarray_get_n_total_elements(Ndarray* a) {
- long c = 1;
- for(long i = 0; i < Ndarray_NDIM(a); ++i)
+ int64_t Ndarray_get_n_total_elements(Ndarray* a) {
+ int64_t c = 1;
+ for(int i = 0; i < Ndarray_NDIM(a); ++i)
  c *= Ndarray_DIMS(a)[i];
  return c;
  }
@@ -849,17 +774,22 @@ void _free(void* ptr) {
  context->device()->GetAllocator(AllocatorAttributes());
  allocator->DeallocateRaw(ptr);
  }
- #define device_malloc Context(CONTEXT_ARGS)._malloc
- #define device_free Context(CONTEXT_ARGS)._free
+
+ #elif TORCH

  #if CUDA
- cublasHandle_t _handle() {
- assert("not available" && 0);
- return NULL;
- }
- #define handle Context(CONTEXT_ARGS)._handle()
- #endif
- #endif
+ void* _malloc(size_t num_bytes) { return c10::cuda::CUDACachingAllocator::raw_alloc(num_bytes); }
+ void _free(void* ptr) { c10::cuda::CUDACachingAllocator::raw_delete(ptr); }
+ #else // not CUDA
+ void* _malloc(size_t num_bytes) { return c10::GetCPUAllocator()->raw_allocate(num_bytes); }
+ void _free(void* ptr) { c10::GetCPUAllocator()->raw_deallocate(ptr); }
+ #endif // CUDA
+
+ #endif // TENSORFLOW or TORCH
+
+
+ #define device_malloc Context(CONTEXT_ARGS)._malloc
+ #define device_free Context(CONTEXT_ARGS)._free


  //C[x] += A[x]*B[x]
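With this change, device_malloc and device_free always go through the Context helpers, which dispatch to the active backend's allocator (the TensorFlow allocator, PyTorch's CUDACachingAllocator, or the c10 CPU allocator). A rough, hypothetical usage sketch (buffer and size are made up):

// Hypothetical scratch-buffer allocation via the unified macros.
float* scratch = (float*) device_malloc(n_elements * sizeof(float));
Ndarray_memset(scratch, 0, n_elements * sizeof(float));  // backend-appropriate memset
// ... launch native-op kernels that use the scratch buffer ...
device_free(scratch);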
returnn/native_op.py CHANGED
@@ -5,38 +5,40 @@ Generic interface which automatically creates:
  * inplace and not inplace
  * grad variants

- See :mod:`returnn.tf.native_op` and :mod:`returnn.theano.native_op`
- for usage in TensorFlow and Theano.
+ See :mod:`returnn.tf.native_op` and :mod:`returnn.torch.utils.native_op`
+ for usage in TensorFlow and PyTorch.

  See :ref:`native_ops` for more background.
  """

+ from __future__ import annotations
+ from typing import Optional, Union, Any, Callable, Dict, Sequence, Tuple
  import copy
  import numpy
- import typing
+
  from returnn.util.basic import make_hashable, unicode


  class NativeOpBaseMixin:
  """
- The purpose of having this as a separate base class is to make this independent of any Theano specific
- functionality so that we can also use this base for example for TensorFlow.
+ The purpose of having this as a separate base class
+ is to make this independent of any TensorFlow or PyTorch-specific functionality.
  """

  def __init__(
  self,
- in_info,
- out_info,
- c_fw_code,
- c_bw_code=None,
- c_extra_support_code=None,
- code_version=None,
- cpu_support=True,
- grad_input_map=None,
- name=None,
+ in_info: Sequence[Dict[str, Any]],
+ out_info: Sequence[Dict[str, Any]],
+ c_fw_code: str,
+ c_bw_code: Optional[str] = None,
+ c_extra_support_code: Union[None, str, Dict[str, str]] = None,
+ code_version: Optional[Tuple[int, ...]] = None,
+ cpu_support: bool = True,
+ grad_input_map: Union[None, Tuple[int, ...], Callable] = None,
+ name: Optional[str] = None,
  ):
  """
- :param list[dict(str)] in_info: each dict describes one input var.
+ :param in_info: each dict describes one input var.
  attribs in the dict:
  int ndim: the ndim.
  tuple shape: tuple and can contain None for specific dimensions.
@@ -49,18 +51,18 @@ class NativeOpBaseMixin:
  str gradient: can be "disconnected". see grad().
  bool bw_input: True by default. add this param to the bw input.
  other attribs are just ignored.
- :param list[dict(str)] out_info: like in_info.
+ :param out_info: like in_info.
  slightly different behavior for:
  shape: we also allow refs to the in_info in the form (in-idx,dim). see infer_shape().
  need_contiguous/want_inplace: used for bw, in case for bw_input == True.
- :param str c_fw_code: C code for forward pass
- :param str|dict[str] c_extra_support_code: C support code (for c_support_code)
- :param str|None c_bw_code: C code for backward pass (for gradient)
- :param tuple[int] code_version: will be returned by c_code_cache_version.
- :param bool cpu_support:
- :param tuple[int]|callable grad_input_map: selection of grad inputs.
+ :param c_fw_code: C code for forward pass
+ :param c_extra_support_code: C support code (for c_support_code)
+ :param c_bw_code: C code for backward pass (for gradient)
+ :param code_version: will be returned by c_code_cache_version.
+ :param cpu_support:
+ :param grad_input_map: selection of grad inputs.
  by default, we get all inputs + all outputs + all grad outputs.
- :param str name: name
+ :param name: name
  """
  assert isinstance(in_info, (list, tuple))
  assert isinstance(out_info, (list, tuple))
@@ -251,12 +253,12 @@ class NativeOpGenBase:
  See NativeOp.__init__() for attribs.
  """

- in_info = None # type: typing.Tuple[typing.Dict[str]]
- out_info = None # type: typing.Tuple[typing.Dict[str]]
- c_fw_code = None # type: str
- c_bw_code = None # type: str
- c_extra_support_code = None # type: typing.Dict[str,str]
- code_version = None # type: typing.Union[typing.Tuple[int], int]
+ in_info: Optional[Tuple[Dict[str, Any], ...]] = None
+ out_info: Optional[Tuple[Dict[str, Any], ...]] = None
+ c_fw_code: Optional[str] = None
+ c_bw_code: Optional[str] = None
+ c_extra_support_code: Optional[Dict[str, str]] = None
+ code_version: Union[None, Tuple[int, ...], int] = None
  grad_input_map = None
  theano_custom_grad = None
  cpu_support = True
@@ -4699,7 +4701,7 @@ class FastViterbiOp(NativeOpGenBase):
  int n_states,
  int n_edges,
  int t,
- int32* cur_state, // (n_batch,)
+ int32_t* cur_state, // (n_batch,)
  const IdxAndVal* frame,
  const int32_t* d_am_seq_len,
  const int32_t* d_edge_from,
@@ -5339,7 +5341,10 @@ class EditDistanceOp(NativeOpGenBase):
  sub_cost = last1_dist[last1_idx];
  if(a[batch_idx * n_a_max_len + t_a - 1] != b[batch_idx * n_b_max_len + t_b - 1])
  ++sub_cost;
- //printf("t_a %i, t_b %i, del %i, ins %i, sub %i\\n", t_a, t_b, del_cost, ins_cost, sub_cost);
+ /*printf("t_a %i, t_b %i, a %d, b %d, del %i, ins %i, sub %i\\n",
+ t_a, t_b,
+ a[batch_idx * n_a_max_len + t_a - 1], b[batch_idx * n_b_max_len + t_b - 1],
+ del_cost, ins_cost, sub_cost);*/
  int min_cost = del_cost;
  if(min_cost > ins_cost) min_cost = ins_cost;
  if(min_cost > sub_cost) min_cost = sub_cost;
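The kernel fragment above updates one cell of the Levenshtein recurrence; as an illustrative restatement (not the package's kernel code):

// Illustrative only: cost = min(deletion, insertion, substitution), where the
// substitution cost is increased by 1 only if the two symbols differ.
static int edit_cell(int del_cost, int ins_cost, int sub_cost_base, bool symbols_differ) {
    int sub_cost = sub_cost_base + (symbols_differ ? 1 : 0);
    int min_cost = del_cost;
    if (min_cost > ins_cost) min_cost = ins_cost;
    if (min_cost > sub_cost) min_cost = sub_cost;
    return min_cost;
}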
returnn/tensor/_dim_extra.py CHANGED
@@ -858,7 +858,7 @@ class _DimMixin:
  self._make_extra()
  dim_order_default = self.dyn_size_ext.dims + (self,)
  if dim_order is not None:
- dim_order = tuple(d for d in dim_order if d in dim_order_default) # filter
+ dim_order = tuple([d for d in dim_order if d in dim_order_default]) # filter
  else:
  dim_order = dim_order_default
  cache_key = (device, dim_order)
@@ -2484,16 +2484,16 @@ _BinOpStrs = {

  def _math_get_dim_via_bin_op(dims: Sequence[Union[Dim, int]], op_kind: str) -> Dim:
  dims = [d if isinstance(d, _d.Dim) else _make_constant_static_dim(d) for d in dims]
- if all(d.dimension is not None for d in dims):
+ if all([d.dimension is not None for d in dims]):
  op = _BinOps[op_kind]
  dim_value = dims[0].dimension
  for d in dims[1:]:
  dim_value = op(dim_value, d.dimension)
  else:
  dim_value = None
- if all(d.is_constant_static_dim() for d in dims):
+ if all([d.is_constant_static_dim() for d in dims]):
  return _make_constant_static_dim(dim_value, kind=_get_merged_dim_kind(dims))
- desc = _BinOpStrs[op_kind].join(_get_description(d) for d in dims)
+ desc = _BinOpStrs[op_kind].join([_get_description(d) for d in dims])
  if op_kind.startswith("ceildiv"):
  desc = f"⌈{desc}⌉"
  return _d.Dim(
@@ -2676,16 +2676,16 @@ def _get_description(dim, brackets=True):


  def _get_merged_dim_kind(dim_tags: Sequence[Dim]) -> Entity:
- if any(tag.is_batch_dim() for tag in dim_tags):
+ if any([tag.is_batch_dim() for tag in dim_tags]):
  return DimTypes.Batch
- elif any(tag.is_feature_dim() for tag in dim_tags):
+ elif any([tag.is_feature_dim() for tag in dim_tags]):
  return DimTypes.Feature
  else:
  return DimTypes.Spatial


  def _representative_tag(terms: Sequence[Dim]) -> Optional[Dim]:
- if any(not term_.auto_generated for term_ in terms):
+ if any([not term_.auto_generated for term_ in terms]):
  # Always prefer non-auto-generated.
  terms = [term_ for term_ in terms if not term_.auto_generated]
  # First find any dynamic.
returnn/tensor/_tensor_extra.py CHANGED
@@ -32,8 +32,8 @@ class _TensorExtra:
  tensor: Tensor,
  time_dim_axis=NotSpecified,
  available_for_inference=True,
- batch=None,
- beam=None,
+ batch: Optional[BatchInfo] = None,
+ beam: Optional[SearchBeam] = None,
  control_flow_ctx=None,
  ):
  """
@@ -41,8 +41,8 @@ class _TensorExtra:
  :param int|None|NotSpecified time_dim_axis: where we have the time dim axis, after we added the batch-dim.
  this is often 1. however, can be None if there is no time-dim.
  :param bool available_for_inference: e.g. the extern data "classes" is usually not available for inference
- :param BatchInfo|None batch:
- :param SearchBeam|None beam: the batch-dim could be extended by a beam-size,
+ :param batch:
+ :param beam: the batch-dim could be extended by a beam-size,
  such that it represents the merged dims [batch, beam_size].
  :param ControlFlowContext|None control_flow_ctx:
  """
@@ -668,11 +668,11 @@ class _TensorMixin(_TensorMixinBase):
  if not perm:
  return self.copy()
  if allow_int and isinstance(perm[0], int):
- assert all(isinstance(a, int) for a in perm), f"{self}: invalid perm {perm!r} types"
+ assert all([isinstance(a, int) for a in perm]), f"{self}: invalid perm {perm!r} types"
  assert set(perm) == set(range(len(perm))), f"{self}: invalid perm {perm!r}"
  return self._copy_compatible_to_dims_with_perm([self._dims[i] for i in perm], perm)
  else:
- assert all(isinstance(a, Dim) for a in perm), f"{self}: invalid perm {perm!r} types"
+ assert all([isinstance(a, Dim) for a in perm]), f"{self}: invalid perm {perm!r} types"
  return self.copy_compatible_to_dims(perm)

  def copy_move_axis(self, old_axis, new_axis) -> _t.Tensor:
@@ -1155,7 +1155,7 @@ class _TensorMixin(_TensorMixinBase):
  )

  assert v.batch_ndim == data.batch_ndim
- assert all(mapped_axes[ax] == ax for ax in range(v.batch_ndim))
+ assert all([mapped_axes[ax] == ax for ax in range(v.batch_ndim)])

  if self.version == 1:
  # Ensure time_dim_axis and feature_dim_axis is same as in data
@@ -1702,7 +1702,7 @@ class _TensorMixin(_TensorMixinBase):
  """
  :return: shape with added batch-dim. e.g. (batch,time,feat) = (None,None,128)
  """
- return tuple(tag.dimension for tag in self.dim_tags)
+ return tuple([tag.dimension for tag in self.dim_tags])

  # noinspection PyShadowingNames
  def get_batch_shape(self, batch_dim):
@@ -3214,7 +3214,7 @@ class _TensorMixin(_TensorMixinBase):
  if len(sources) == 1:
  return sources[0].copy_template()
  max_ndim = max([s.batch_ndim for s in sources])
- if any(src.batch for src in sources):
+ if any([src.batch for src in sources]):
  from returnn.tf.util.data import BatchInfo

  common_batch = BatchInfo.get_common_batch_info([src.batch for src in sources if src.batch])
@@ -3254,7 +3254,7 @@ class _TensorMixin(_TensorMixinBase):
  else:
  axis = common.get_default_new_axis_for_dim_tag(dim_tag)
  common = common.copy_add_dim_by_tag(dim_tag, unbroadcast=True, axis=axis)
- if all(s.batch_ndim < common.batch_ndim for s in sources):
+ if all([s.batch_ndim < common.batch_ndim for s in sources]):
  from returnn.util.basic import validate_broadcast_all_sources

  validate_broadcast_all_sources(
returnn/tensor/utils.py CHANGED
@@ -71,7 +71,7 @@ def tensor_fill_random_numpy_(
  # Make sure at least one of the dyn sizes matches the max size.
  i = rnd.randint(0, dim.dyn_size_ext.raw_tensor.size)
  dim.dyn_size_ext.raw_tensor.flat[i] = dyn_dim_max_sizes[dim]
- if dim in dyn_dim_min_sizes:
+ if dim in dyn_dim_min_sizes and dim.dyn_size_ext.raw_tensor.size > 1:
  j = rnd.randint(0, dim.dyn_size_ext.raw_tensor.size - 1)
  if j >= i:
  j += 1
returnn/tf/frontend_layers/_backend.py CHANGED
@@ -465,6 +465,8 @@
  targets_spatial_dim: Dim,
  blank_index: int,
  max_approx: bool = False,
+ use_native_op: Optional[bool] = None,
+ label_loop: bool = True,
  ) -> Tensor:
  """CTC"""
  assert targets.sparse_dim and targets.sparse_dim.dimension <= logits.feature_dim.dimension
@@ -482,6 +484,7 @@ class ReturnnLayersBackend(Backend[Layer]):
  "targets": targets,
  "blank_index": blank_index,
  "max_approx": max_approx,
+ "label_loop": label_loop,
  },
  name="ctc_loss",
  )
@@ -944,7 +947,6 @@ class ReturnnLayersBackend(Backend[Layer]):
  """
  assert mask.dtype == "bool"
  assert set(mask.dims) == set(dims)
- assert set(mask.dims).issubset(set(tensor.dims))
  if not out_dim:
  out_dim = Dim(None, name="mask")
  return (