returnn 1.20260105.192646__py3-none-any.whl → 1.20260119.15400__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- returnn/PKG-INFO +1 -1
- returnn/__old_mod_loader__.py +26 -2
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/lm.py +110 -42
- returnn/frontend/__init__.py +1 -0
- returnn/frontend/_backend.py +41 -0
- returnn/frontend/_native/__init__.py +22 -0
- returnn/frontend/_numpy_backend.py +7 -0
- returnn/frontend/_utils.py +1 -1
- returnn/frontend/array_.py +6 -5
- returnn/frontend/assert_.py +35 -0
- returnn/frontend/device.py +14 -1
- returnn/frontend/encoder/conformer.py +19 -0
- returnn/frontend/loss.py +183 -3
- returnn/frontend/math_.py +54 -14
- returnn/native_op.cpp +104 -174
- returnn/native_op.py +36 -31
- returnn/tensor/_dim_extra.py +7 -7
- returnn/tensor/_tensor_extra.py +10 -10
- returnn/tensor/utils.py +1 -1
- returnn/tf/frontend_layers/_backend.py +3 -1
- returnn/tf/layers/basic.py +13 -2
- returnn/tf/native_op.py +16 -5
- returnn/tf/util/basic.py +7 -201
- returnn/torch/engine.py +120 -3
- returnn/torch/frontend/_backend.py +166 -22
- returnn/torch/frontend/bridge.py +61 -0
- returnn/torch/frontend/compile_helper.py +106 -0
- returnn/torch/util/array_.py +30 -0
- returnn/torch/util/assert_.py +122 -0
- returnn/torch/util/native_op.py +885 -0
- returnn/torch/util/native_op_code_compiler.py +308 -0
- returnn/util/basic.py +3 -1
- returnn/util/cuda_env.py +332 -0
- returnn/util/debug.py +1 -0
- returnn/util/fsa.py +17 -13
- returnn/util/native_code_compiler.py +104 -47
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +1 -1
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +42 -36
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
- {returnn-1.20260105.192646.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
returnn/native_op.cpp
CHANGED
@@ -1,3 +1,14 @@
+/*
+This file is imported in various ways.
+The mode is determined via the preprocessor defines:
+
+TENSORFLOW: If defined and set to 1, TensorFlow is used as backend.
+TORCH: If defined and set to 1, PyTorch is used as backend.
+
+CUDA: If defined and set to 1, CUDA is used for GPU support.
+Otherwise, it uses CPU only.
+The kernels are all expected to also compile in CPU-only mode.
+*/
 
 #include <assert.h>
 #include <iostream>
@@ -16,6 +27,10 @@
 #define TENSORFLOW 0
 #endif
 
+#ifndef TORCH
+#define TORCH 0
+#endif
+
 #ifndef _ns
 #define _ns
 #endif
@@ -118,7 +133,7 @@ static inline int _host_float_as_int(float x) {
 #define INF_F int_as_float(0x7f800000)
 #define NAN_F int_as_float(0x7fffffff)
 
-#endif
+#endif // CUDA
@@ -157,7 +172,7 @@ The BLAS functions expect the inputs in column-major and return in column-major.
 #define Ndarray tensorflow::Tensor
 #define Ndarray_DEV_DATA(x) ((float*) (x)->tensor_data().data())
 #define Ndarray_DEV_DATA_int32(x) ((int32_t*) (x)->tensor_data().data())
-#define Ndarray_DEV_DATA_int32_scalar(x) (x)->scalar<
+#define Ndarray_DEV_DATA_int32_scalar(x) (x)->scalar<int32_t>()()
 #define Ndarray_HOST_DIMS(x) DimsAccessor(x)
 #define Ndarray_DIMS Ndarray_HOST_DIMS
 #define Ndarray_NDIM(x) (x)->dims()
@@ -399,13 +414,13 @@ static void tf_cuda_sgemm_batched(
 #define Ndarray_sgemm( \
-
-
+transpose_A, transpose_B, \
+m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
 tf_cuda_sgemm<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 
 #define Ndarray_sgemm_batched( \
-
-
+transpose_A, transpose_B, \
+m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream) \
 tf_cuda_sgemm_batched<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream);
@@ -415,21 +430,21 @@ static void tf_cuda_sgemm_batched(
 /*
 // matrices are in column-major form
-
-
-
-
-
+int sgemm_(char *transa, char *transb,
+integer *m, integer *n, integer *k,
+real *alpha, real *a, integer *lda,
+real *b, integer *ldb, real *beta,
+real *c, integer *ldc);
 */
 #define Ndarray_sgemm(\
-
-
-
-
-
-
-
-
+transpose_A, transpose_B, \
+m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
+{ \
+char transa = transpose_A, transb = transpose_B; \
+int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
+sgemm_(&transa, &transb, \
+&m_, &n_, &k_, alpha, A, &lda_, B, &ldb_, beta, C, &ldc_); \
+}
 
 #else // HAVE_CUSTOM_BLAS
@@ -494,77 +509,77 @@ static void tf_cpu_sgemm(
 }
 
 #define Ndarray_sgemm(\
-
-
+transpose_A, transpose_B, \
+m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
 tf_cpu_sgemm<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
 
 #endif // HAVE_CUSTOM_BLAS
 #endif // CUDA
 
+#define CHECK_WITH_MSG(condition, message) \
+if(!(condition)) { \
+std::cerr << "NativeOp check failed: " << message << std::endl; \
+assert(condition); \
+}
+
 // See Context struct below.
 #define CONTEXT_ARGS context
 
-
+
+#elif TORCH
+// https://github.com/rwth-i6/i6_native_ops/blob/main/i6_native_ops/common/returnn_definitions.h
+// https://docs.pytorch.org/cppdocs/stable.html#tensor-class
+
+#define Ndarray torch::Tensor
+#define Ndarray_DEV_DATA(x) ((float*)(x)->data_ptr())
+#define Ndarray_DEV_DATA_int32(x) ((int32_t*)(x)->data_ptr())
+#define Ndarray_DEV_DATA_uint32(x) ((uint32_t*)(x)->data_ptr())
+#define Ndarray_DEV_DATA_int32_scalar(x) ((x)->item().to<int32_t>())
+#define Ndarray_HOST_DIMS(x) ((x)->sizes())
+#define Ndarray_DIMS(x) ((x)->sizes())
+typedef at::IntArrayRef Ndarray_DIMS_Type;
+#define Ndarray_NDIM(x) (x)->dim()
+#define Ndarray_dtype_size(x) torch::elementSize((x)->scalar_type())
+typedef int64_t Ndarray_DIM_Type;
+#define Ndarray_SIZE(x) ((x)->numel())
+#define Ndarray_STRIDE(x, dim) ((x)->stride(dim))
+
+#define CHECK_WITH_MSG TORCH_CHECK
 
 // See Context struct below.
 #define CONTEXT_ARGS
 
-
+template<typename T>
+static void Ndarray_sgemm(
+char transa_, char transb_,
+int m, int n, int k,
+const T* alpha_ptr, const T* a_ptr, int lda,
+const T* b_ptr, int ldb, const T* beta_ptr,
+T* c_ptr, int ldc)
+{
+// TODO...
+assert("Torch Ndarray_sgemm not implemented" && 0);
+}
 
+#else // TENSORFLOW or TORCH
 
+#error "No framework defined: TENSORFLOW or TORCH"
+
+#endif // TENSORFLOW or TORCH
 
-#if CUDA
 
+#if CUDA
 
 #if TENSORFLOW
 // Ndarray and friends already declared above, they are same for CUDA and non-CUDA
 #define CUDA_CUR_STREAM (context->eigen_gpu_device().stream())
 
-#
-#define CUDA_CUR_STREAM (0) // default stream
+#elif TORCH
 
-
-// See also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/cuda_ndarray.cu
-#define Ndarray CudaNdarray
-#define Ndarray_DEV_DATA CudaNdarray_DEV_DATA
-#define Ndarray_DEV_DATA_int32(x) ((int32_t*) (Ndarray_DEV_DATA(x)))
-#define Ndarray_DEV_DATA_int32_scalar(x) Ndarray_DEV_DATA_int32(x)[0]
-#define Ndarray_HOST_DIMS CudaNdarray_HOST_DIMS
-#define Ndarray_DIMS Ndarray_HOST_DIMS
-#define Ndarray_STRIDE(x, i) (CudaNdarray_HOST_STRIDES(x)[i]) // return in elements. CudaNdarray stores like that
-#define Ndarray_NDIM(x) (x->nd)
-#define Ndarray_DIM_Type int
-typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
-#define Ndarray_dtype_size(x) sizeof(float)
-#define Ndarray_SIZE CudaNdarray_SIZE
-// PyObject *CudaNdarray_NewDims(int nd, const inttype * dims), uninitialized
-#define Ndarray_NewDims CudaNdarray_NewDims
-// PyObject * CudaNdarray_Copy(const CudaNdarray * self);
-#define Ndarray_Copy CudaNdarray_Copy
-
-/*
-// via: https://docs.nvidia.com/cuda/cublas/
-// matrices are in column-major form
-cublasStatus_t cublasSgemm(cublasHandle_t handle,
-cublasOperation_t transa, cublasOperation_t transb,
-int m, int n, int k,
-const float *alpha, const float *A, int lda,
-const float *B, int ldb, const float *beta,
-float *C, int ldc);
-*/
-#define _cublasTranspose(t) \
-((t == 'T') ? CUBLAS_OP_T : \
-(t == 'C') ? CUBLAS_OP_C : \
-(t == 'N') ? CUBLAS_OP_N : cublasOperation_t('E'))
-#define Ndarray_sgemm( \
-transpose_A, transpose_B, \
-m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
-(_cudaHandleError(cublasSgemm(handle, \
-_cublasTranspose(transpose_A), \
-_cublasTranspose(transpose_B), \
-m, n, k, alpha, A, lda, B, ldb, beta, C, ldc), \
-__FILE__, __LINE__ ))
+#define CUDA_CUR_STREAM (at::cuda::getCurrentCUDAStream().stream())
 
+#else
+#error Unknown backend
 #endif
 
 #define Ndarray_memcpy(y, x, size) (cudaMemcpyAsync(y, x, size, cudaMemcpyDeviceToDevice, CUDA_CUR_STREAM))
@@ -581,48 +596,10 @@ typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
 
 #define DEF_SHARED(type, name) extern __shared__ type name[];
 
-static const char
-
-
-
-
-case CUBLAS_STATUS_NOT_INITIALIZED:
-return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-case CUBLAS_STATUS_ALLOC_FAILED:
-return "CUBLAS_STATUS_ALLOC_FAILED";
-
-case CUBLAS_STATUS_INVALID_VALUE:
-return "CUBLAS_STATUS_INVALID_VALUE";
-
-case CUBLAS_STATUS_ARCH_MISMATCH:
-return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-case CUBLAS_STATUS_MAPPING_ERROR:
-return "CUBLAS_STATUS_MAPPING_ERROR";
-
-case CUBLAS_STATUS_EXECUTION_FAILED:
-return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-case CUBLAS_STATUS_INTERNAL_ERROR:
-return "CUBLAS_STATUS_INTERNAL_ERROR";
-}
-
-return "<unknown>";
-}
-
-static void _cudaHandleError(cudaError_t err, const char *file, int line) {
-if (err != cudaSuccess) {
-printf("NativeOp: CUDA runtime error: '%s' in %s at line %d\n", cudaGetErrorString(err), file, line);
-exit(EXIT_FAILURE);
-}
-}
-
-static void _cudaHandleError(cublasStatus_t status, const char *file, int line) {
-if (status != CUBLAS_STATUS_SUCCESS) {
-printf("NativeOp: cuBLAS runtime error: '%s' in %s at line %d\n", _cudaGetErrorEnum(status), file, line);
-exit(EXIT_FAILURE);
-}
+static void _cudaHandleError(cudaError_t err, const char* file, int line) {
+CHECK_WITH_MSG(
+err == cudaSuccess,
+"NativeOp: CUDA runtime error: ", cudaGetErrorString(err), " in ", file, " at line ", line);
 }
 
 #define HANDLE_ERROR(status) (_cudaHandleError( status, __FILE__, __LINE__ ))
@@ -630,49 +607,7 @@ static void _cudaHandleError(cublasStatus_t status, const char *file, int line)
 
 #else // not CUDA
 
-
-#if !TENSORFLOW
-// Numpy, see: https://docs.scipy.org/doc/numpy/reference/c-api.array.html
-// And: https://deeplearning.net/software/theano/extending/extending_theano_c.html
-#define Ndarray PyArrayObject
-#define Ndarray_DEV_DATA(x) ((float*) PyArray_DATA(x))
-#define Ndarray_DEV_DATA_int32(x) ((int32_t*) (Ndarray_DEV_DATA(x)))
-#define Ndarray_DEV_DATA_int32_scalar(x) Ndarray_DEV_DATA_int32(x)[0]
-#define Ndarray_HOST_DIMS PyArray_DIMS
-#define Ndarray_STRIDE(x, i) (PyArray_STRIDE(x, i) / sizeof(float)) // return in elements. Numpy stores in bytes
-#define Ndarray_DIMS Ndarray_HOST_DIMS
-#define Ndarray_NDIM PyArray_NDIM
-#define Ndarray_DIM_Type npy_intp
-typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
-#define Ndarray_dtype_size(x) sizeof(float)
-#define Ndarray_SIZE PyArray_SIZE
-#define Ndarray_NewDims(nd, dims) (PyArray_SimpleNew(nd, dims, NPY_FLOAT32))
-#define Ndarray_Copy(x) (PyArray_FromArray(x, NULL, NPY_ARRAY_OUT_ARRAY | NPY_ARRAY_ENSURECOPY))
-/*
-// matrices are in column-major form
-int sgemm_(char *transa, char *transb,
-integer *m, integer *n, integer *k,
-real *alpha, real *a, integer *lda,
-real *b, integer *ldb, real *beta,
-real *c, integer *ldc);
-
-Cast to (float*) because we might have the C-style declaration incorrectly in the C++ scope.
-*/
-#define Ndarray_sgemm(\
-transpose_A, transpose_B, \
-m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
-{ \
-char transa = transpose_A, transb = transpose_B; \
-int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
-sgemm_(&transa, &transb, \
-&m_, &n_, &k_, alpha, (float*) A, &lda_, (float*) B, &ldb_, beta, C, &ldc_); \
-}
-
-static inline void* device_malloc(size_t size) { return malloc(size); }
-static inline void device_free(void* ptr) { free(ptr); }
-#endif
-
-#define HANDLE_LAST_ERROR() (0)
+#define HANDLE_LAST_ERROR() {}
 
 #define Ndarray_memcpy(y, x, size) (memcpy(y, x, size))
 #define Ndarray_memset(s, c, size) (memset(s, c, size))
@@ -751,19 +686,9 @@ struct _KernelLoop {
 #endif
 
 
-
-
-
-Ndarray* res = (Ndarray*) Ndarray_NewDims(Ndarray_NDIM(a), dim);
-#else
-Ndarray* res = (Ndarray*) Ndarray_NewDims(Ndarray_NDIM(a), const_cast<Ndarray_DIM_Type*>(dim));
-#endif
-return res;
-}
-
-long Ndarray_get_n_total_elements(Ndarray* a) {
-long c = 1;
-for(long i = 0; i < Ndarray_NDIM(a); ++i)
+int64_t Ndarray_get_n_total_elements(Ndarray* a) {
+int64_t c = 1;
+for(int i = 0; i < Ndarray_NDIM(a); ++i)
 c *= Ndarray_DIMS(a)[i];
 return c;
 }
@@ -849,17 +774,22 @@ void _free(void* ptr) {
 context->device()->GetAllocator(AllocatorAttributes());
 allocator->DeallocateRaw(ptr);
 }
-
-#
+
+#elif TORCH
 
 #if CUDA
-
-
-
-}
-
-#endif
-
+void* _malloc(size_t num_bytes) { return c10::cuda::CUDACachingAllocator::raw_alloc(num_bytes); }
+void _free(void* ptr) { c10::cuda::CUDACachingAllocator::raw_delete(ptr); }
+#else // not CUDA
+void* _malloc(size_t num_bytes) { return c10::GetCPUAllocator()->raw_allocate(num_bytes); }
+void _free(void* ptr) { c10::GetCPUAllocator()->raw_deallocate(ptr); }
+#endif // CUDA
+
+#endif // TENSORFLOW or TORCH
+
+
+#define device_malloc Context(CONTEXT_ARGS)._malloc
+#define device_free Context(CONTEXT_ARGS)._free
 
 
 //C[x] += A[x]*B[x]
returnn/native_op.py
CHANGED
@@ -5,38 +5,40 @@ Generic interface which automatically creates:
 * inplace and not inplace
 * grad variants
 
-See :mod:`returnn.tf.native_op` and :mod:`returnn.
-for usage in TensorFlow and
+See :mod:`returnn.tf.native_op` and :mod:`returnn.torch.utils.native_op`
+for usage in TensorFlow and PyTorch.
 
 See :ref:`native_ops` for more background.
 """
 
+from __future__ import annotations
+from typing import Optional, Union, Any, Callable, Dict, Sequence, Tuple
 import copy
 import numpy
-
+
 from returnn.util.basic import make_hashable, unicode
 
 
 class NativeOpBaseMixin:
     """
-    The purpose of having this as a separate base class
-
+    The purpose of having this as a separate base class
+    is to make this independent of any TensorFlow or PyTorch-specific functionality.
     """
 
     def __init__(
         self,
-        in_info,
-        out_info,
-        c_fw_code,
-        c_bw_code=None,
-        c_extra_support_code=None,
-        code_version=None,
-        cpu_support=True,
-        grad_input_map=None,
-        name=None,
+        in_info: Sequence[Dict[str, Any]],
+        out_info: Sequence[Dict[str, Any]],
+        c_fw_code: str,
+        c_bw_code: Optional[str] = None,
+        c_extra_support_code: Union[None, str, Dict[str, str]] = None,
+        code_version: Optional[Tuple[int, ...]] = None,
+        cpu_support: bool = True,
+        grad_input_map: Union[None, Tuple[int, ...], Callable] = None,
+        name: Optional[str] = None,
     ):
         """
-        :param
+        :param in_info: each dict describes one input var.
             attribs in the dict:
             int ndim: the ndim.
            tuple shape: tuple and can contain None for specific dimensions.
@@ -49,18 +51,18 @@ class NativeOpBaseMixin:
            str gradient: can be "disconnected". see grad().
            bool bw_input: True by default. add this param to the bw input.
            other attribs are just ignored.
-        :param
+        :param out_info: like in_info.
            slightly different behavior for:
            shape: we also allow refs to the in_info in the form (in-idx,dim). see infer_shape().
            need_contiguous/want_inplace: used for bw, in case for bw_input == True.
-        :param
-        :param
-        :param
-        :param
-        :param
-        :param
+        :param c_fw_code: C code for forward pass
+        :param c_extra_support_code: C support code (for c_support_code)
+        :param c_bw_code: C code for backward pass (for gradient)
+        :param code_version: will be returned by c_code_cache_version.
+        :param cpu_support:
+        :param grad_input_map: selection of grad inputs.
            by default, we get all inputs + all outputs + all grad outputs.
-        :param
+        :param name: name
        """
        assert isinstance(in_info, (list, tuple))
        assert isinstance(out_info, (list, tuple))
@@ -251,12 +253,12 @@ class NativeOpGenBase:
    See NativeOp.__init__() for attribs.
    """
 
-    in_info
-    out_info
-    c_fw_code = None
-    c_bw_code = None
-    c_extra_support_code
-    code_version
+    in_info: Optional[Tuple[Dict[str, Any], ...]] = None
+    out_info: Optional[Tuple[Dict[str, Any], ...]] = None
+    c_fw_code: Optional[str] = None
+    c_bw_code: Optional[str] = None
+    c_extra_support_code: Optional[Dict[str, str]] = None
+    code_version: Union[None, Tuple[int, ...], int] = None
    grad_input_map = None
    theano_custom_grad = None
    cpu_support = True
@@ -4699,7 +4701,7 @@ class FastViterbiOp(NativeOpGenBase):
        int n_states,
        int n_edges,
        int t,
-
+        int32_t* cur_state, // (n_batch,)
        const IdxAndVal* frame,
        const int32_t* d_am_seq_len,
        const int32_t* d_edge_from,
@@ -5339,7 +5341,10 @@ class EditDistanceOp(NativeOpGenBase):
            sub_cost = last1_dist[last1_idx];
            if(a[batch_idx * n_a_max_len + t_a - 1] != b[batch_idx * n_b_max_len + t_b - 1])
                ++sub_cost;
-
+            /*printf("t_a %i, t_b %i, a %d, b %d, del %i, ins %i, sub %i\\n",
+                t_a, t_b,
+                a[batch_idx * n_a_max_len + t_a - 1], b[batch_idx * n_b_max_len + t_b - 1],
+                del_cost, ins_cost, sub_cost);*/
            int min_cost = del_cost;
            if(min_cost > ins_cost) min_cost = ins_cost;
            if(min_cost > sub_cost) min_cost = sub_cost;
returnn/tensor/_dim_extra.py
CHANGED
@@ -858,7 +858,7 @@ class _DimMixin:
         self._make_extra()
         dim_order_default = self.dyn_size_ext.dims + (self,)
         if dim_order is not None:
-            dim_order = tuple(d for d in dim_order if d in dim_order_default)  # filter
+            dim_order = tuple([d for d in dim_order if d in dim_order_default])  # filter
         else:
             dim_order = dim_order_default
         cache_key = (device, dim_order)
@@ -2484,16 +2484,16 @@ _BinOpStrs = {
 
 def _math_get_dim_via_bin_op(dims: Sequence[Union[Dim, int]], op_kind: str) -> Dim:
     dims = [d if isinstance(d, _d.Dim) else _make_constant_static_dim(d) for d in dims]
-    if all(d.dimension is not None for d in dims):
+    if all([d.dimension is not None for d in dims]):
         op = _BinOps[op_kind]
         dim_value = dims[0].dimension
         for d in dims[1:]:
             dim_value = op(dim_value, d.dimension)
     else:
         dim_value = None
-    if all(d.is_constant_static_dim() for d in dims):
+    if all([d.is_constant_static_dim() for d in dims]):
         return _make_constant_static_dim(dim_value, kind=_get_merged_dim_kind(dims))
-    desc = _BinOpStrs[op_kind].join(_get_description(d) for d in dims)
+    desc = _BinOpStrs[op_kind].join([_get_description(d) for d in dims])
     if op_kind.startswith("ceildiv"):
         desc = f"⌈{desc}⌉"
     return _d.Dim(
@@ -2676,16 +2676,16 @@ def _get_description(dim, brackets=True):
 
 
 def _get_merged_dim_kind(dim_tags: Sequence[Dim]) -> Entity:
-    if any(tag.is_batch_dim() for tag in dim_tags):
+    if any([tag.is_batch_dim() for tag in dim_tags]):
         return DimTypes.Batch
-    elif any(tag.is_feature_dim() for tag in dim_tags):
+    elif any([tag.is_feature_dim() for tag in dim_tags]):
         return DimTypes.Feature
     else:
         return DimTypes.Spatial
 
 
 def _representative_tag(terms: Sequence[Dim]) -> Optional[Dim]:
-    if any(not term_.auto_generated for term_ in terms):
+    if any([not term_.auto_generated for term_ in terms]):
         # Always prefer non-auto-generated.
         terms = [term_ for term_ in terms if not term_.auto_generated]
     # First find any dynamic.
returnn/tensor/_tensor_extra.py
CHANGED
@@ -32,8 +32,8 @@ class _TensorExtra:
         tensor: Tensor,
         time_dim_axis=NotSpecified,
         available_for_inference=True,
-        batch=None,
-        beam=None,
+        batch: Optional[BatchInfo] = None,
+        beam: Optional[SearchBeam] = None,
         control_flow_ctx=None,
     ):
         """
@@ -41,8 +41,8 @@ class _TensorExtra:
         :param int|None|NotSpecified time_dim_axis: where we have the time dim axis, after we added the batch-dim.
             this is often 1. however, can be None if there is no time-dim.
         :param bool available_for_inference: e.g. the extern data "classes" is usually not available for inference
-        :param
-        :param
+        :param batch:
+        :param beam: the batch-dim could be extended by a beam-size,
             such that it represents the merged dims [batch, beam_size].
         :param ControlFlowContext|None control_flow_ctx:
         """
@@ -668,11 +668,11 @@ class _TensorMixin(_TensorMixinBase):
         if not perm:
             return self.copy()
         if allow_int and isinstance(perm[0], int):
-            assert all(isinstance(a, int) for a in perm), f"{self}: invalid perm {perm!r} types"
+            assert all([isinstance(a, int) for a in perm]), f"{self}: invalid perm {perm!r} types"
             assert set(perm) == set(range(len(perm))), f"{self}: invalid perm {perm!r}"
             return self._copy_compatible_to_dims_with_perm([self._dims[i] for i in perm], perm)
         else:
-            assert all(isinstance(a, Dim) for a in perm), f"{self}: invalid perm {perm!r} types"
+            assert all([isinstance(a, Dim) for a in perm]), f"{self}: invalid perm {perm!r} types"
             return self.copy_compatible_to_dims(perm)
 
     def copy_move_axis(self, old_axis, new_axis) -> _t.Tensor:
@@ -1155,7 +1155,7 @@ class _TensorMixin(_TensorMixinBase):
         )
 
         assert v.batch_ndim == data.batch_ndim
-        assert all(mapped_axes[ax] == ax for ax in range(v.batch_ndim))
+        assert all([mapped_axes[ax] == ax for ax in range(v.batch_ndim)])
 
         if self.version == 1:
             # Ensure time_dim_axis and feature_dim_axis is same as in data
@@ -1702,7 +1702,7 @@ class _TensorMixin(_TensorMixinBase):
         """
         :return: shape with added batch-dim. e.g. (batch,time,feat) = (None,None,128)
         """
-        return tuple(tag.dimension for tag in self.dim_tags)
+        return tuple([tag.dimension for tag in self.dim_tags])
 
     # noinspection PyShadowingNames
     def get_batch_shape(self, batch_dim):
@@ -3214,7 +3214,7 @@ class _TensorMixin(_TensorMixinBase):
         if len(sources) == 1:
             return sources[0].copy_template()
         max_ndim = max([s.batch_ndim for s in sources])
-        if any(src.batch for src in sources):
+        if any([src.batch for src in sources]):
             from returnn.tf.util.data import BatchInfo
 
             common_batch = BatchInfo.get_common_batch_info([src.batch for src in sources if src.batch])
@@ -3254,7 +3254,7 @@ class _TensorMixin(_TensorMixinBase):
             else:
                 axis = common.get_default_new_axis_for_dim_tag(dim_tag)
                 common = common.copy_add_dim_by_tag(dim_tag, unbroadcast=True, axis=axis)
-        if all(s.batch_ndim < common.batch_ndim for s in sources):
+        if all([s.batch_ndim < common.batch_ndim for s in sources]):
             from returnn.util.basic import validate_broadcast_all_sources
 
             validate_broadcast_all_sources(
returnn/tensor/utils.py
CHANGED
@@ -71,7 +71,7 @@ def tensor_fill_random_numpy_(
             # Make sure at least one of the dyn sizes matches the max size.
             i = rnd.randint(0, dim.dyn_size_ext.raw_tensor.size)
             dim.dyn_size_ext.raw_tensor.flat[i] = dyn_dim_max_sizes[dim]
-            if dim in dyn_dim_min_sizes:
+            if dim in dyn_dim_min_sizes and dim.dyn_size_ext.raw_tensor.size > 1:
                 j = rnd.randint(0, dim.dyn_size_ext.raw_tensor.size - 1)
                 if j >= i:
                     j += 1
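The added `size > 1` guard matters because numpy's `RandomState.randint(low, high)` requires `low < high`: with a single-element dynamic size, drawing the second index would call `randint(0, 0)` and fail. A standalone sketch of the failure mode the guard avoids:

```python
import numpy

rnd = numpy.random.RandomState(42)

# With only one dyn-size entry, picking a second, different index would mean
# rnd.randint(0, 0), which numpy rejects; the new `size > 1` check skips the
# min-size assignment in that case.
try:
    rnd.randint(0, 0)
except ValueError as exc:
    print("as expected:", exc)
```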
returnn/tf/frontend_layers/_backend.py
CHANGED

@@ -465,6 +465,8 @@ class ReturnnLayersBackend(Backend[Layer]):
         targets_spatial_dim: Dim,
         blank_index: int,
         max_approx: bool = False,
+        use_native_op: Optional[bool] = None,
+        label_loop: bool = True,
     ) -> Tensor:
         """CTC"""
         assert targets.sparse_dim and targets.sparse_dim.dimension <= logits.feature_dim.dimension
@@ -482,6 +484,7 @@ class ReturnnLayersBackend(Backend[Layer]):
                 "targets": targets,
                 "blank_index": blank_index,
                 "max_approx": max_approx,
+                "label_loop": label_loop,
             },
             name="ctc_loss",
         )
@@ -944,7 +947,6 @@ class ReturnnLayersBackend(Backend[Layer]):
         """
         assert mask.dtype == "bool"
         assert set(mask.dims) == set(dims)
-        assert set(mask.dims).issubset(set(tensor.dims))
         if not out_dim:
             out_dim = Dim(None, name="mask")
         return (