returnn 1.20251027.232712__py3-none-any.whl → 1.20260119.15400__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. returnn/PKG-INFO +2 -2
  2. returnn/__old_mod_loader__.py +26 -2
  3. returnn/_setup_info_generated.py +2 -2
  4. returnn/datasets/lm.py +130 -42
  5. returnn/datasets/meta.py +93 -43
  6. returnn/datasets/postprocessing.py +597 -108
  7. returnn/datasets/util/vocabulary.py +90 -0
  8. returnn/frontend/__init__.py +1 -0
  9. returnn/frontend/_backend.py +41 -0
  10. returnn/frontend/_native/__init__.py +22 -0
  11. returnn/frontend/_numpy_backend.py +7 -0
  12. returnn/frontend/_utils.py +1 -1
  13. returnn/frontend/array_.py +48 -2
  14. returnn/frontend/assert_.py +35 -0
  15. returnn/frontend/attention.py +54 -20
  16. returnn/frontend/conv.py +273 -54
  17. returnn/frontend/device.py +14 -1
  18. returnn/frontend/encoder/conformer.py +20 -0
  19. returnn/frontend/encoder/transformer.py +2 -0
  20. returnn/frontend/loss.py +222 -3
  21. returnn/frontend/math_.py +54 -14
  22. returnn/native_op.cpp +182 -172
  23. returnn/native_op.py +36 -31
  24. returnn/sprint/cache.py +12 -13
  25. returnn/tensor/_dim_extra.py +7 -7
  26. returnn/tensor/_tensor_extra.py +10 -10
  27. returnn/tensor/utils.py +8 -5
  28. returnn/tf/frontend_layers/_backend.py +7 -3
  29. returnn/tf/layers/basic.py +27 -40
  30. returnn/tf/native_op.py +27 -63
  31. returnn/tf/network.py +1 -1
  32. returnn/tf/util/basic.py +22 -197
  33. returnn/torch/engine.py +157 -6
  34. returnn/torch/frontend/_backend.py +280 -29
  35. returnn/torch/frontend/bridge.py +61 -0
  36. returnn/torch/frontend/compile_helper.py +106 -0
  37. returnn/torch/util/array_.py +30 -0
  38. returnn/torch/util/assert_.py +122 -0
  39. returnn/torch/util/exception_helper.py +7 -1
  40. returnn/torch/util/native_op.py +885 -0
  41. returnn/torch/util/native_op_code_compiler.py +308 -0
  42. returnn/util/basic.py +6 -7
  43. returnn/util/better_exchook.py +4 -0
  44. returnn/util/cuda_env.py +332 -0
  45. returnn/util/debug.py +12 -2
  46. returnn/util/file_cache.py +15 -1
  47. returnn/util/fsa.py +17 -13
  48. returnn/util/native_code_compiler.py +104 -47
  49. returnn/util/task_system.py +1 -1
  50. {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +2 -2
  51. {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +54 -48
  52. {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
  53. {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
  54. {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
returnn/native_op.cpp CHANGED
@@ -1,3 +1,14 @@
1
+ /*
2
+ This file is imported in various ways.
3
+ The mode is determined via the preprocessor defines:
4
+
5
+ TENSORFLOW: If defined and set to 1, TensorFlow is used as backend.
6
+ TORCH: If defined and set to 1, PyTorch is used as backend.
7
+
8
+ CUDA: If defined and set to 1, CUDA is used for GPU support.
9
+ Otherwise, it uses CPU only.
10
+ The kernels are all expected to also compile in CPU-only mode.
11
+ */
1
12
 
2
13
  #include <assert.h>
3
14
  #include <iostream>
@@ -16,6 +27,10 @@
16
27
  #define TENSORFLOW 0
17
28
  #endif
18
29
 
30
+ #ifndef TORCH
31
+ #define TORCH 0
32
+ #endif
33
+
19
34
  #ifndef _ns
20
35
  #define _ns
21
36
  #endif
@@ -118,7 +133,7 @@ static inline int _host_float_as_int(float x) {
118
133
  #define INF_F int_as_float(0x7f800000)
119
134
  #define NAN_F int_as_float(0x7fffffff)
120
135
 
121
- #endif
136
+ #endif // CUDA
122
137
 
123
138
 
124
139
 
@@ -157,7 +172,7 @@ The BLAS functions expect the inputs in column-major and return in column-major.
157
172
  #define Ndarray tensorflow::Tensor
158
173
  #define Ndarray_DEV_DATA(x) ((float*) (x)->tensor_data().data())
159
174
  #define Ndarray_DEV_DATA_int32(x) ((int32_t*) (x)->tensor_data().data())
160
- #define Ndarray_DEV_DATA_int32_scalar(x) (x)->scalar<int32>()()
175
+ #define Ndarray_DEV_DATA_int32_scalar(x) (x)->scalar<int32_t>()()
161
176
  #define Ndarray_HOST_DIMS(x) DimsAccessor(x)
162
177
  #define Ndarray_DIMS Ndarray_HOST_DIMS
163
178
  #define Ndarray_NDIM(x) (x)->dims()
@@ -206,6 +221,14 @@ Ndarray* Ndarray_Copy(const Ndarray* self) {
206
221
 
207
222
  #include "tensorflow/core/public/version.h"
208
223
 
224
+ #ifndef TF_MAJOR_VERSION
225
+ #error "TF_MAJOR_VERSION is not defined!"
226
+ #endif
227
+
228
+ #ifndef TF_MINOR_VERSION
229
+ #error "TF_MINOR_VERSION is not defined!"
230
+ #endif
231
+
209
232
  #if (TF_MAJOR_VERSION == 1 && TF_MINOR_VERSION >= 6) || (TF_MAJOR_VERSION > 1)
210
233
  #define TF_issue_6602_workaround 0
211
234
  #define TWOD_LSTM_SUPPORT 1
@@ -391,100 +414,172 @@ static void tf_cuda_sgemm_batched(
391
414
 
392
415
 
393
416
  #define Ndarray_sgemm( \
394
- transpose_A, transpose_B, \
395
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
417
+ transpose_A, transpose_B, \
418
+ m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
396
419
  tf_cuda_sgemm<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
397
420
 
398
421
  #define Ndarray_sgemm_batched( \
399
- transpose_A, transpose_B, \
400
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream) \
422
+ transpose_A, transpose_B, \
423
+ m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream) \
401
424
  tf_cuda_sgemm_batched<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream);
402
425
 
403
426
 
404
427
  #else // CUDA
428
+
429
+ #ifdef HAVE_CUSTOM_BLAS
430
+
405
431
  /*
406
432
  // matrices are in column-major form
407
- int sgemm_(char *transa, char *transb,
408
- integer *m, integer *n, integer *k,
409
- real *alpha, real *a, integer *lda,
410
- real *b, integer *ldb, real *beta,
411
- real *c, integer *ldc);
433
+ int sgemm_(char *transa, char *transb,
434
+ integer *m, integer *n, integer *k,
435
+ real *alpha, real *a, integer *lda,
436
+ real *b, integer *ldb, real *beta,
437
+ real *c, integer *ldc);
412
438
  */
413
439
  #define Ndarray_sgemm(\
414
- transpose_A, transpose_B, \
415
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
416
- { \
417
- char transa = transpose_A, transb = transpose_B; \
418
- int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
419
- sgemm_(&transa, &transb, \
420
- &m_, &n_, &k_, alpha, A, &lda_, B, &ldb_, beta, C, &ldc_); \
421
- }
440
+ transpose_A, transpose_B, \
441
+ m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
442
+ { \
443
+ char transa = transpose_A, transb = transpose_B; \
444
+ int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
445
+ sgemm_(&transa, &transb, \
446
+ &m_, &n_, &k_, alpha, A, &lda_, B, &ldb_, beta, C, &ldc_); \
447
+ }
448
+
449
+ #else // HAVE_CUSTOM_BLAS
450
+
451
+ template<typename T>
452
+ static void tf_cpu_sgemm(
453
+ OpKernelContext* context,
454
+ char transa_, char transb_,
455
+ int m, int n, int k,
456
+ const T* alpha_ptr, const T* a_ptr, int lda,
457
+ const T* b_ptr, int ldb, const T* beta_ptr,
458
+ T* c_ptr, int ldc)
459
+ {
460
+ if (m <= 0 || n <= 0 || k <= 0) return;
461
+
462
+ auto d = context->eigen_cpu_device();
463
+ const T alpha = *alpha_ptr;
464
+ const T beta = *beta_ptr;
465
+
466
+ bool transa = (transa_ == 'T' || transa_ == 't' || transa_ == 'C' || transa_ == 'c');
467
+ bool transb = (transb_ == 'T' || transb_ == 't' || transb_ == 'C' || transb_ == 'c');
468
+
469
+ // 1. Map as COLUMN-MAJOR
470
+ // The physical row count (height) of each Map is always the leading dimension (lda, ldb, ldc)
471
+ typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::ColMajor>, Eigen::Unaligned> ConstMap;
472
+ typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::ColMajor>, Eigen::Unaligned> MutableMap;
473
+
474
+ // Logical height/width of slices before any transposition
475
+ int a_slice_rows = transa ? k : m;
476
+ int a_slice_cols = transa ? m : k;
477
+ int b_slice_rows = transb ? n : k;
478
+ int b_slice_cols = transb ? k : n;
479
+
480
+ // Map and Slice
481
+ auto a = ConstMap(a_ptr, lda, a_slice_cols).slice(
482
+ Eigen::array<Eigen::Index, 2>({0, 0}),
483
+ Eigen::array<Eigen::Index, 2>({(Eigen::Index)a_slice_rows, (Eigen::Index)a_slice_cols}));
484
+
485
+ auto b = ConstMap(b_ptr, ldb, b_slice_cols).slice(
486
+ Eigen::array<Eigen::Index, 2>({0, 0}),
487
+ Eigen::array<Eigen::Index, 2>({(Eigen::Index)b_slice_rows, (Eigen::Index)b_slice_cols}));
488
+
489
+ auto c = MutableMap(c_ptr, ldc, n).slice(
490
+ Eigen::array<Eigen::Index, 2>({0, 0}),
491
+ Eigen::array<Eigen::Index, 2>({(Eigen::Index)m, (Eigen::Index)n}));
492
+
493
+ // 2. Define Contraction Pairs based on Transposition
494
+ // Column-Major Matrix Mult: (M x K) * (K x N)
495
+ // Standard: Contract Axis 1 of A with Axis 0 of B
496
+ // If A is Transposed: A is (K x M), contract Axis 0 of A
497
+ // If B is Transposed: B is (N x K), contract Axis 1 of B
498
+ Eigen::array<Eigen::IndexPair<int>, 1> pairs;
499
+ pairs[0] = Eigen::IndexPair<int>(transa ? 0 : 1, transb ? 1 : 0);
500
+
501
+ // 3. Execution
502
+ if (alpha == T(1) && beta == T(0)) {
503
+ c.device(d) = a.contract(b, pairs);
504
+ } else if (alpha == T(1) && beta == T(1)) {
505
+ c.device(d) += a.contract(b, pairs);
506
+ } else {
507
+ c.device(d) = a.contract(b, pairs) * alpha + c * beta;
508
+ }
509
+ }
510
+
511
+ #define Ndarray_sgemm(\
512
+ transpose_A, transpose_B, \
513
+ m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
514
+ tf_cpu_sgemm<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
515
+
516
+ #endif // HAVE_CUSTOM_BLAS
422
517
  #endif // CUDA
423
518
 
519
+ #define CHECK_WITH_MSG(condition, message) \
520
+ if(!(condition)) { \
521
+ std::cerr << "NativeOp check failed: " << message << std::endl; \
522
+ assert(condition); \
523
+ }
524
+
424
525
  // See Context struct below.
425
526
  #define CONTEXT_ARGS context
426
527
 
427
- #else // TENSORFLOW
528
+
529
+ #elif TORCH
530
+ // https://github.com/rwth-i6/i6_native_ops/blob/main/i6_native_ops/common/returnn_definitions.h
531
+ // https://docs.pytorch.org/cppdocs/stable.html#tensor-class
532
+
533
+ #define Ndarray torch::Tensor
534
+ #define Ndarray_DEV_DATA(x) ((float*)(x)->data_ptr())
535
+ #define Ndarray_DEV_DATA_int32(x) ((int32_t*)(x)->data_ptr())
536
+ #define Ndarray_DEV_DATA_uint32(x) ((uint32_t*)(x)->data_ptr())
537
+ #define Ndarray_DEV_DATA_int32_scalar(x) ((x)->item().to<int32_t>())
538
+ #define Ndarray_HOST_DIMS(x) ((x)->sizes())
539
+ #define Ndarray_DIMS(x) ((x)->sizes())
540
+ typedef at::IntArrayRef Ndarray_DIMS_Type;
541
+ #define Ndarray_NDIM(x) (x)->dim()
542
+ #define Ndarray_dtype_size(x) torch::elementSize((x)->scalar_type())
543
+ typedef int64_t Ndarray_DIM_Type;
544
+ #define Ndarray_SIZE(x) ((x)->numel())
545
+ #define Ndarray_STRIDE(x, dim) ((x)->stride(dim))
546
+
547
+ #define CHECK_WITH_MSG TORCH_CHECK
428
548
 
429
549
  // See Context struct below.
430
550
  #define CONTEXT_ARGS
431
551
 
432
- #endif // TENSORFLOW
552
+ template<typename T>
553
+ static void Ndarray_sgemm(
554
+ char transa_, char transb_,
555
+ int m, int n, int k,
556
+ const T* alpha_ptr, const T* a_ptr, int lda,
557
+ const T* b_ptr, int ldb, const T* beta_ptr,
558
+ T* c_ptr, int ldc)
559
+ {
560
+ // TODO...
561
+ assert("Torch Ndarray_sgemm not implemented" && 0);
562
+ }
433
563
 
564
+ #else // TENSORFLOW or TORCH
434
565
 
566
+ #error "No framework defined: TENSORFLOW or TORCH"
435
567
 
436
- #if CUDA
568
+ #endif // TENSORFLOW or TORCH
437
569
 
438
570
 
571
+ #if CUDA
572
+
439
573
  #if TENSORFLOW
440
574
  // Ndarray and friends already declared above, they are same for CUDA and non-CUDA
441
575
  #define CUDA_CUR_STREAM (context->eigen_gpu_device().stream())
442
576
 
443
- #else // TENSORFLOW, thus Theano here
444
- #define CUDA_CUR_STREAM (0) // default stream
577
+ #elif TORCH
445
578
 
446
- // Defined here: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/cuda_ndarray.cuh
447
- // See also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/cuda_ndarray.cu
448
- #define Ndarray CudaNdarray
449
- #define Ndarray_DEV_DATA CudaNdarray_DEV_DATA
450
- #define Ndarray_DEV_DATA_int32(x) ((int32_t*) (Ndarray_DEV_DATA(x)))
451
- #define Ndarray_DEV_DATA_int32_scalar(x) Ndarray_DEV_DATA_int32(x)[0]
452
- #define Ndarray_HOST_DIMS CudaNdarray_HOST_DIMS
453
- #define Ndarray_DIMS Ndarray_HOST_DIMS
454
- #define Ndarray_STRIDE(x, i) (CudaNdarray_HOST_STRIDES(x)[i]) // return in elements. CudaNdarray stores like that
455
- #define Ndarray_NDIM(x) (x->nd)
456
- #define Ndarray_DIM_Type int
457
- typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
458
- #define Ndarray_dtype_size(x) sizeof(float)
459
- #define Ndarray_SIZE CudaNdarray_SIZE
460
- // PyObject *CudaNdarray_NewDims(int nd, const inttype * dims), uninitialized
461
- #define Ndarray_NewDims CudaNdarray_NewDims
462
- // PyObject * CudaNdarray_Copy(const CudaNdarray * self);
463
- #define Ndarray_Copy CudaNdarray_Copy
464
-
465
- /*
466
- // via: https://docs.nvidia.com/cuda/cublas/
467
- // matrices are in column-major form
468
- cublasStatus_t cublasSgemm(cublasHandle_t handle,
469
- cublasOperation_t transa, cublasOperation_t transb,
470
- int m, int n, int k,
471
- const float *alpha, const float *A, int lda,
472
- const float *B, int ldb, const float *beta,
473
- float *C, int ldc);
474
- */
475
- #define _cublasTranspose(t) \
476
- ((t == 'T') ? CUBLAS_OP_T : \
477
- (t == 'C') ? CUBLAS_OP_C : \
478
- (t == 'N') ? CUBLAS_OP_N : cublasOperation_t('E'))
479
- #define Ndarray_sgemm( \
480
- transpose_A, transpose_B, \
481
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
482
- (_cudaHandleError(cublasSgemm(handle, \
483
- _cublasTranspose(transpose_A), \
484
- _cublasTranspose(transpose_B), \
485
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc), \
486
- __FILE__, __LINE__ ))
579
+ #define CUDA_CUR_STREAM (at::cuda::getCurrentCUDAStream().stream())
487
580
 
581
+ #else
582
+ #error Unknown backend
488
583
  #endif
489
584
 
490
585
  #define Ndarray_memcpy(y, x, size) (cudaMemcpyAsync(y, x, size, cudaMemcpyDeviceToDevice, CUDA_CUR_STREAM))
@@ -501,48 +596,10 @@ typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
501
596
 
502
597
  #define DEF_SHARED(type, name) extern __shared__ type name[];
503
598
 
504
- static const char *_cudaGetErrorEnum(cublasStatus_t error) {
505
- switch (error) {
506
- case CUBLAS_STATUS_SUCCESS:
507
- return "CUBLAS_STATUS_SUCCESS";
508
-
509
- case CUBLAS_STATUS_NOT_INITIALIZED:
510
- return "CUBLAS_STATUS_NOT_INITIALIZED";
511
-
512
- case CUBLAS_STATUS_ALLOC_FAILED:
513
- return "CUBLAS_STATUS_ALLOC_FAILED";
514
-
515
- case CUBLAS_STATUS_INVALID_VALUE:
516
- return "CUBLAS_STATUS_INVALID_VALUE";
517
-
518
- case CUBLAS_STATUS_ARCH_MISMATCH:
519
- return "CUBLAS_STATUS_ARCH_MISMATCH";
520
-
521
- case CUBLAS_STATUS_MAPPING_ERROR:
522
- return "CUBLAS_STATUS_MAPPING_ERROR";
523
-
524
- case CUBLAS_STATUS_EXECUTION_FAILED:
525
- return "CUBLAS_STATUS_EXECUTION_FAILED";
526
-
527
- case CUBLAS_STATUS_INTERNAL_ERROR:
528
- return "CUBLAS_STATUS_INTERNAL_ERROR";
529
- }
530
-
531
- return "<unknown>";
532
- }
533
-
534
- static void _cudaHandleError(cudaError_t err, const char *file, int line) {
535
- if (err != cudaSuccess) {
536
- printf("NativeOp: CUDA runtime error: '%s' in %s at line %d\n", cudaGetErrorString(err), file, line);
537
- exit(EXIT_FAILURE);
538
- }
539
- }
540
-
541
- static void _cudaHandleError(cublasStatus_t status, const char *file, int line) {
542
- if (status != CUBLAS_STATUS_SUCCESS) {
543
- printf("NativeOp: cuBLAS runtime error: '%s' in %s at line %d\n", _cudaGetErrorEnum(status), file, line);
544
- exit(EXIT_FAILURE);
545
- }
599
+ static void _cudaHandleError(cudaError_t err, const char* file, int line) {
600
+ CHECK_WITH_MSG(
601
+ err == cudaSuccess,
602
+ "NativeOp: CUDA runtime error: ", cudaGetErrorString(err), " in ", file, " at line ", line);
546
603
  }
547
604
 
548
605
  #define HANDLE_ERROR(status) (_cudaHandleError( status, __FILE__, __LINE__ ))
@@ -550,49 +607,7 @@ static void _cudaHandleError(cublasStatus_t status, const char *file, int line)
550
607
 
551
608
  #else // not CUDA
552
609
 
553
-
554
- #if !TENSORFLOW
555
- // Numpy, see: https://docs.scipy.org/doc/numpy/reference/c-api.array.html
556
- // And: https://deeplearning.net/software/theano/extending/extending_theano_c.html
557
- #define Ndarray PyArrayObject
558
- #define Ndarray_DEV_DATA(x) ((float*) PyArray_DATA(x))
559
- #define Ndarray_DEV_DATA_int32(x) ((int32_t*) (Ndarray_DEV_DATA(x)))
560
- #define Ndarray_DEV_DATA_int32_scalar(x) Ndarray_DEV_DATA_int32(x)[0]
561
- #define Ndarray_HOST_DIMS PyArray_DIMS
562
- #define Ndarray_STRIDE(x, i) (PyArray_STRIDE(x, i) / sizeof(float)) // return in elements. Numpy stores in bytes
563
- #define Ndarray_DIMS Ndarray_HOST_DIMS
564
- #define Ndarray_NDIM PyArray_NDIM
565
- #define Ndarray_DIM_Type npy_intp
566
- typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
567
- #define Ndarray_dtype_size(x) sizeof(float)
568
- #define Ndarray_SIZE PyArray_SIZE
569
- #define Ndarray_NewDims(nd, dims) (PyArray_SimpleNew(nd, dims, NPY_FLOAT32))
570
- #define Ndarray_Copy(x) (PyArray_FromArray(x, NULL, NPY_ARRAY_OUT_ARRAY | NPY_ARRAY_ENSURECOPY))
571
- /*
572
- // matrices are in column-major form
573
- int sgemm_(char *transa, char *transb,
574
- integer *m, integer *n, integer *k,
575
- real *alpha, real *a, integer *lda,
576
- real *b, integer *ldb, real *beta,
577
- real *c, integer *ldc);
578
-
579
- Cast to (float*) because we might have the C-style declaration incorrectly in the C++ scope.
580
- */
581
- #define Ndarray_sgemm(\
582
- transpose_A, transpose_B, \
583
- m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
584
- { \
585
- char transa = transpose_A, transb = transpose_B; \
586
- int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
587
- sgemm_(&transa, &transb, \
588
- &m_, &n_, &k_, alpha, (float*) A, &lda_, (float*) B, &ldb_, beta, C, &ldc_); \
589
- }
590
-
591
- static inline void* device_malloc(size_t size) { return malloc(size); }
592
- static inline void device_free(void* ptr) { free(ptr); }
593
- #endif
594
-
595
- #define HANDLE_LAST_ERROR() (0)
610
+ #define HANDLE_LAST_ERROR() {}
596
611
 
597
612
  #define Ndarray_memcpy(y, x, size) (memcpy(y, x, size))
598
613
  #define Ndarray_memset(s, c, size) (memset(s, c, size))
@@ -671,19 +686,9 @@ struct _KernelLoop {
671
686
  #endif
672
687
 
673
688
 
674
- Ndarray* Ndarray_uninitialized_like(Ndarray* a) {
675
- Ndarray_DIMS_Type dim = Ndarray_HOST_DIMS(a);
676
- #if TENSORFLOW
677
- Ndarray* res = (Ndarray*) Ndarray_NewDims(Ndarray_NDIM(a), dim);
678
- #else
679
- Ndarray* res = (Ndarray*) Ndarray_NewDims(Ndarray_NDIM(a), const_cast<Ndarray_DIM_Type*>(dim));
680
- #endif
681
- return res;
682
- }
683
-
684
- long Ndarray_get_n_total_elements(Ndarray* a) {
685
- long c = 1;
686
- for(long i = 0; i < Ndarray_NDIM(a); ++i)
689
+ int64_t Ndarray_get_n_total_elements(Ndarray* a) {
690
+ int64_t c = 1;
691
+ for(int i = 0; i < Ndarray_NDIM(a); ++i)
687
692
  c *= Ndarray_DIMS(a)[i];
688
693
  return c;
689
694
  }
@@ -769,17 +774,22 @@ void _free(void* ptr) {
769
774
  context->device()->GetAllocator(AllocatorAttributes());
770
775
  allocator->DeallocateRaw(ptr);
771
776
  }
772
- #define device_malloc Context(CONTEXT_ARGS)._malloc
773
- #define device_free Context(CONTEXT_ARGS)._free
777
+
778
+ #elif TORCH
774
779
 
775
780
  #if CUDA
776
- cublasHandle_t _handle() {
777
- assert("not available" && 0);
778
- return NULL;
779
- }
780
- #define handle Context(CONTEXT_ARGS)._handle()
781
- #endif
782
- #endif
781
+ void* _malloc(size_t num_bytes) { return c10::cuda::CUDACachingAllocator::raw_alloc(num_bytes); }
782
+ void _free(void* ptr) { c10::cuda::CUDACachingAllocator::raw_delete(ptr); }
783
+ #else // not CUDA
784
+ void* _malloc(size_t num_bytes) { return c10::GetCPUAllocator()->raw_allocate(num_bytes); }
785
+ void _free(void* ptr) { c10::GetCPUAllocator()->raw_deallocate(ptr); }
786
+ #endif // CUDA
787
+
788
+ #endif // TENSORFLOW or TORCH
789
+
790
+
791
+ #define device_malloc Context(CONTEXT_ARGS)._malloc
792
+ #define device_free Context(CONTEXT_ARGS)._free
783
793
 
784
794
 
785
795
  //C[x] += A[x]*B[x]
returnn/native_op.py CHANGED
@@ -5,38 +5,40 @@ Generic interface which automatically creates:
5
5
  * inplace and not inplace
6
6
  * grad variants
7
7
 
8
- See :mod:`returnn.tf.native_op` and :mod:`returnn.theano.native_op`
9
- for usage in TensorFlow and Theano.
8
+ See :mod:`returnn.tf.native_op` and :mod:`returnn.torch.util.native_op`
9
+ for usage in TensorFlow and PyTorch.
10
10
 
11
11
  See :ref:`native_ops` for more background.
12
12
  """
13
13
 
14
+ from __future__ import annotations
15
+ from typing import Optional, Union, Any, Callable, Dict, Sequence, Tuple
14
16
  import copy
15
17
  import numpy
16
- import typing
18
+
17
19
  from returnn.util.basic import make_hashable, unicode
18
20
 
19
21
 
20
22
  class NativeOpBaseMixin:
21
23
  """
22
- The purpose of having this as a separate base class is to make this independent of any Theano specific
23
- functionality so that we can also use this base for example for TensorFlow.
24
+ The purpose of having this as a separate base class
25
+ is to make this independent of any TensorFlow or PyTorch-specific functionality.
24
26
  """
25
27
 
26
28
  def __init__(
27
29
  self,
28
- in_info,
29
- out_info,
30
- c_fw_code,
31
- c_bw_code=None,
32
- c_extra_support_code=None,
33
- code_version=None,
34
- cpu_support=True,
35
- grad_input_map=None,
36
- name=None,
30
+ in_info: Sequence[Dict[str, Any]],
31
+ out_info: Sequence[Dict[str, Any]],
32
+ c_fw_code: str,
33
+ c_bw_code: Optional[str] = None,
34
+ c_extra_support_code: Union[None, str, Dict[str, str]] = None,
35
+ code_version: Optional[Tuple[int, ...]] = None,
36
+ cpu_support: bool = True,
37
+ grad_input_map: Union[None, Tuple[int, ...], Callable] = None,
38
+ name: Optional[str] = None,
37
39
  ):
38
40
  """
39
- :param list[dict(str)] in_info: each dict describes one input var.
41
+ :param in_info: each dict describes one input var.
40
42
  attribs in the dict:
41
43
  int ndim: the ndim.
42
44
  tuple shape: tuple and can contain None for specific dimensions.
@@ -49,18 +51,18 @@ class NativeOpBaseMixin:
49
51
  str gradient: can be "disconnected". see grad().
50
52
  bool bw_input: True by default. add this param to the bw input.
51
53
  other attribs are just ignored.
52
- :param list[dict(str)] out_info: like in_info.
54
+ :param out_info: like in_info.
53
55
  slightly different behavior for:
54
56
  shape: we also allow refs to the in_info in the form (in-idx,dim). see infer_shape().
55
57
  need_contiguous/want_inplace: used for bw, in case for bw_input == True.
56
- :param str c_fw_code: C code for forward pass
57
- :param str|dict[str] c_extra_support_code: C support code (for c_support_code)
58
- :param str|None c_bw_code: C code for backward pass (for gradient)
59
- :param tuple[int] code_version: will be returned by c_code_cache_version.
60
- :param bool cpu_support:
61
- :param tuple[int]|callable grad_input_map: selection of grad inputs.
58
+ :param c_fw_code: C code for forward pass
59
+ :param c_extra_support_code: C support code (for c_support_code)
60
+ :param c_bw_code: C code for backward pass (for gradient)
61
+ :param code_version: will be returned by c_code_cache_version.
62
+ :param cpu_support:
63
+ :param grad_input_map: selection of grad inputs.
62
64
  by default, we get all inputs + all outputs + all grad outputs.
63
- :param str name: name
65
+ :param name: name
64
66
  """
65
67
  assert isinstance(in_info, (list, tuple))
66
68
  assert isinstance(out_info, (list, tuple))
@@ -251,12 +253,12 @@ class NativeOpGenBase:
251
253
  See NativeOp.__init__() for attribs.
252
254
  """
253
255
 
254
- in_info = None # type: typing.Tuple[typing.Dict[str]]
255
- out_info = None # type: typing.Tuple[typing.Dict[str]]
256
- c_fw_code = None # type: str
257
- c_bw_code = None # type: str
258
- c_extra_support_code = None # type: typing.Dict[str,str]
259
- code_version = None # type: typing.Union[typing.Tuple[int], int]
256
+ in_info: Optional[Tuple[Dict[str, Any], ...]] = None
257
+ out_info: Optional[Tuple[Dict[str, Any], ...]] = None
258
+ c_fw_code: Optional[str] = None
259
+ c_bw_code: Optional[str] = None
260
+ c_extra_support_code: Optional[Dict[str, str]] = None
261
+ code_version: Union[None, Tuple[int, ...], int] = None
260
262
  grad_input_map = None
261
263
  theano_custom_grad = None
262
264
  cpu_support = True
@@ -4699,7 +4701,7 @@ class FastViterbiOp(NativeOpGenBase):
4699
4701
  int n_states,
4700
4702
  int n_edges,
4701
4703
  int t,
4702
- int32* cur_state, // (n_batch,)
4704
+ int32_t* cur_state, // (n_batch,)
4703
4705
  const IdxAndVal* frame,
4704
4706
  const int32_t* d_am_seq_len,
4705
4707
  const int32_t* d_edge_from,
@@ -5339,7 +5341,10 @@ class EditDistanceOp(NativeOpGenBase):
5339
5341
  sub_cost = last1_dist[last1_idx];
5340
5342
  if(a[batch_idx * n_a_max_len + t_a - 1] != b[batch_idx * n_b_max_len + t_b - 1])
5341
5343
  ++sub_cost;
5342
- //printf("t_a %i, t_b %i, del %i, ins %i, sub %i\\n", t_a, t_b, del_cost, ins_cost, sub_cost);
5344
+ /*printf("t_a %i, t_b %i, a %d, b %d, del %i, ins %i, sub %i\\n",
5345
+ t_a, t_b,
5346
+ a[batch_idx * n_a_max_len + t_a - 1], b[batch_idx * n_b_max_len + t_b - 1],
5347
+ del_cost, ins_cost, sub_cost);*/
5343
5348
  int min_cost = del_cost;
5344
5349
  if(min_cost > ins_cost) min_cost = ins_cost;
5345
5350
  if(min_cost > sub_cost) min_cost = sub_cost;
returnn/sprint/cache.py CHANGED
@@ -7,10 +7,9 @@ This module is about reading (maybe later also writing) the Sprint archive forma
7
7
  """
8
8
 
9
9
  from __future__ import annotations
10
- from typing import List, Optional, Tuple
10
+ from typing import Optional, List, Tuple, Dict
11
11
  import sys
12
12
  import os
13
- import typing
14
13
  import array
15
14
  from struct import pack, unpack
16
15
  import numpy
@@ -212,7 +211,7 @@ class FileArchive:
212
211
  def __init__(self, filename, must_exists=True, encoding="ascii"):
213
212
  self.encoding = encoding
214
213
 
215
- self.ft = {} # type: typing.Dict[str,FileInfo]
214
+ self.ft: Dict[str, FileInfo] = {}
216
215
  if os.path.exists(filename):
217
216
  self.allophones = []
218
217
  self.f = open(filename, "rb")
@@ -334,8 +333,8 @@ class FileArchive:
334
333
  # print(typ)
335
334
  assert type_ == "vector-f32"
336
335
  count = self.read_U32()
337
- data = [None] * count # type: typing.List[typing.Optional[numpy.ndarray]]
338
- time_ = [None] * count # type: typing.List[typing.Optional[numpy.ndarray]]
336
+ data: List[Optional[numpy.ndarray]] = [None] * count
337
+ time_: List[Optional[numpy.ndarray]] = [None] * count
339
338
  for i in range(count):
340
339
  size = self.read_U32()
341
340
  data[i] = self.read_v("f", size) # size x f32
@@ -450,7 +449,7 @@ class FileArchive:
450
449
  a = array.array("b")
451
450
  a.fromfile(self.f, comp)
452
451
  # unpack
453
- b = zlib.decompress(a.tostring(), 15 + 32)
452
+ b = zlib.decompress(a.tobytes(), 15 + 32)
454
453
  # substitute self.f by an anonymous memmap file object
455
454
  # restore original file handle after we're done
456
455
  backup_f = self.f
@@ -575,17 +574,17 @@ class FileArchiveBundle:
575
574
  :param str encoding: encoding used in the files
576
575
  """
577
576
  # filename -> FileArchive
578
- self.archives = {} # type: typing.Dict[str,FileArchive]
577
+ self.archives: Dict[str, FileArchive] = {}
579
578
  # archive content file -> FileArchive
580
- self.files = {} # type: typing.Dict[str,FileArchive]
579
+ self.files: Dict[str, FileArchive] = {}
581
580
  self._short_seg_names = {}
582
581
  if filename is not None:
583
582
  self.add_bundle(filename=filename, encoding=encoding)
584
583
 
585
- def add_bundle(self, filename, encoding="ascii"):
584
+ def add_bundle(self, filename: str, encoding: str = "ascii"):
586
585
  """
587
- :param str filename: bundle
588
- :param str encoding:
586
+ :param filename: bundle
587
+ :param encoding:
589
588
  """
590
589
  file_dir = os.path.dirname(filename) or "."
591
590
  for line in open(filename).read().splitlines():
@@ -837,7 +836,7 @@ class MixtureSet:
837
836
  """
838
837
  a = array.array("b")
839
838
  a.fromfile(self.f, length)
840
- return a.tostring().decode(encoding)
839
+ return a.tobytes().decode(encoding)
841
840
 
842
841
  def read_f32(self):
843
842
  """
@@ -1003,7 +1002,7 @@ class WordBoundaries:
1003
1002
  """
1004
1003
  a = array.array("b")
1005
1004
  a.fromfile(self.f, length)
1006
- return a.tostring().decode(encoding)
1005
+ return a.tobytes().decode(encoding)
1007
1006
 
1008
1007
  def __init__(self, filename):
1009
1008
  """