returnn 1.20251027.232712__py3-none-any.whl → 1.20260119.15400__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- returnn/PKG-INFO +2 -2
- returnn/__old_mod_loader__.py +26 -2
- returnn/_setup_info_generated.py +2 -2
- returnn/datasets/lm.py +130 -42
- returnn/datasets/meta.py +93 -43
- returnn/datasets/postprocessing.py +597 -108
- returnn/datasets/util/vocabulary.py +90 -0
- returnn/frontend/__init__.py +1 -0
- returnn/frontend/_backend.py +41 -0
- returnn/frontend/_native/__init__.py +22 -0
- returnn/frontend/_numpy_backend.py +7 -0
- returnn/frontend/_utils.py +1 -1
- returnn/frontend/array_.py +48 -2
- returnn/frontend/assert_.py +35 -0
- returnn/frontend/attention.py +54 -20
- returnn/frontend/conv.py +273 -54
- returnn/frontend/device.py +14 -1
- returnn/frontend/encoder/conformer.py +20 -0
- returnn/frontend/encoder/transformer.py +2 -0
- returnn/frontend/loss.py +222 -3
- returnn/frontend/math_.py +54 -14
- returnn/native_op.cpp +182 -172
- returnn/native_op.py +36 -31
- returnn/sprint/cache.py +12 -13
- returnn/tensor/_dim_extra.py +7 -7
- returnn/tensor/_tensor_extra.py +10 -10
- returnn/tensor/utils.py +8 -5
- returnn/tf/frontend_layers/_backend.py +7 -3
- returnn/tf/layers/basic.py +27 -40
- returnn/tf/native_op.py +27 -63
- returnn/tf/network.py +1 -1
- returnn/tf/util/basic.py +22 -197
- returnn/torch/engine.py +157 -6
- returnn/torch/frontend/_backend.py +280 -29
- returnn/torch/frontend/bridge.py +61 -0
- returnn/torch/frontend/compile_helper.py +106 -0
- returnn/torch/util/array_.py +30 -0
- returnn/torch/util/assert_.py +122 -0
- returnn/torch/util/exception_helper.py +7 -1
- returnn/torch/util/native_op.py +885 -0
- returnn/torch/util/native_op_code_compiler.py +308 -0
- returnn/util/basic.py +6 -7
- returnn/util/better_exchook.py +4 -0
- returnn/util/cuda_env.py +332 -0
- returnn/util/debug.py +12 -2
- returnn/util/file_cache.py +15 -1
- returnn/util/fsa.py +17 -13
- returnn/util/native_code_compiler.py +104 -47
- returnn/util/task_system.py +1 -1
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/METADATA +2 -2
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/RECORD +54 -48
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/WHEEL +1 -1
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/LICENSE +0 -0
- {returnn-1.20251027.232712.dist-info → returnn-1.20260119.15400.dist-info}/top_level.txt +0 -0
returnn/native_op.cpp
CHANGED
|
@@ -1,3 +1,14 @@
|
|
|
1
|
+
/*
|
|
2
|
+
This file is imported in various ways.
|
|
3
|
+
The mode is determined via the preprocessor defines:
|
|
4
|
+
|
|
5
|
+
TENSORFLOW: If defined and set to 1, TensorFlow is used as backend.
|
|
6
|
+
TORCH: If defined and set to 1, PyTorch is used as backend.
|
|
7
|
+
|
|
8
|
+
CUDA: If defined and set to 1, CUDA is used for GPU support.
|
|
9
|
+
Otherwise, it uses CPU only.
|
|
10
|
+
The kernels are all expected to also compile in CPU-only mode.
|
|
11
|
+
*/
|
|
1
12
|
|
|
2
13
|
#include <assert.h>
|
|
3
14
|
#include <iostream>
|
|
@@ -16,6 +27,10 @@
|
|
|
16
27
|
#define TENSORFLOW 0
|
|
17
28
|
#endif
|
|
18
29
|
|
|
30
|
+
#ifndef TORCH
|
|
31
|
+
#define TORCH 0
|
|
32
|
+
#endif
|
|
33
|
+
|
|
19
34
|
#ifndef _ns
|
|
20
35
|
#define _ns
|
|
21
36
|
#endif
|
|
@@ -118,7 +133,7 @@ static inline int _host_float_as_int(float x) {
|
|
|
118
133
|
#define INF_F int_as_float(0x7f800000)
|
|
119
134
|
#define NAN_F int_as_float(0x7fffffff)
|
|
120
135
|
|
|
121
|
-
#endif
|
|
136
|
+
#endif // CUDA
|
|
122
137
|
|
|
123
138
|
|
|
124
139
|
|
|
@@ -157,7 +172,7 @@ The BLAS functions expect the inputs in column-major and return in column-major.
|
|
|
157
172
|
#define Ndarray tensorflow::Tensor
|
|
158
173
|
#define Ndarray_DEV_DATA(x) ((float*) (x)->tensor_data().data())
|
|
159
174
|
#define Ndarray_DEV_DATA_int32(x) ((int32_t*) (x)->tensor_data().data())
|
|
160
|
-
#define Ndarray_DEV_DATA_int32_scalar(x) (x)->scalar<
|
|
175
|
+
#define Ndarray_DEV_DATA_int32_scalar(x) (x)->scalar<int32_t>()()
|
|
161
176
|
#define Ndarray_HOST_DIMS(x) DimsAccessor(x)
|
|
162
177
|
#define Ndarray_DIMS Ndarray_HOST_DIMS
|
|
163
178
|
#define Ndarray_NDIM(x) (x)->dims()
|
|
@@ -206,6 +221,14 @@ Ndarray* Ndarray_Copy(const Ndarray* self) {
|
|
|
206
221
|
|
|
207
222
|
#include "tensorflow/core/public/version.h"
|
|
208
223
|
|
|
224
|
+
#ifndef TF_MAJOR_VERSION
|
|
225
|
+
#error "TF_MAJOR_VERSION is not defined!"
|
|
226
|
+
#endif
|
|
227
|
+
|
|
228
|
+
#ifndef TF_MINOR_VERSION
|
|
229
|
+
#error "TF_MINOR_VERSION is not defined!"
|
|
230
|
+
#endif
|
|
231
|
+
|
|
209
232
|
#if (TF_MAJOR_VERSION == 1 && TF_MINOR_VERSION >= 6) || (TF_MAJOR_VERSION > 1)
|
|
210
233
|
#define TF_issue_6602_workaround 0
|
|
211
234
|
#define TWOD_LSTM_SUPPORT 1
|
|
@@ -391,100 +414,172 @@ static void tf_cuda_sgemm_batched(
|
|
|
391
414
|
|
|
392
415
|
|
|
393
416
|
#define Ndarray_sgemm( \
|
|
394
|
-
|
|
395
|
-
|
|
417
|
+
transpose_A, transpose_B, \
|
|
418
|
+
m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
|
|
396
419
|
tf_cuda_sgemm<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
|
|
397
420
|
|
|
398
421
|
#define Ndarray_sgemm_batched( \
|
|
399
|
-
|
|
400
|
-
|
|
422
|
+
transpose_A, transpose_B, \
|
|
423
|
+
m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream) \
|
|
401
424
|
tf_cuda_sgemm_batched<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, batchSize, finalize_stream);
|
|
402
425
|
|
|
403
426
|
|
|
404
427
|
#else // CUDA
|
|
428
|
+
|
|
429
|
+
#ifdef HAVE_CUSTOM_BLAS
|
|
430
|
+
|
|
405
431
|
/*
|
|
406
432
|
// matrices are in column-major form
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
433
|
+
int sgemm_(char *transa, char *transb,
|
|
434
|
+
integer *m, integer *n, integer *k,
|
|
435
|
+
real *alpha, real *a, integer *lda,
|
|
436
|
+
real *b, integer *ldb, real *beta,
|
|
437
|
+
real *c, integer *ldc);
|
|
412
438
|
*/
|
|
413
439
|
#define Ndarray_sgemm(\
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
440
|
+
transpose_A, transpose_B, \
|
|
441
|
+
m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
|
|
442
|
+
{ \
|
|
443
|
+
char transa = transpose_A, transb = transpose_B; \
|
|
444
|
+
int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
|
|
445
|
+
sgemm_(&transa, &transb, \
|
|
446
|
+
&m_, &n_, &k_, alpha, A, &lda_, B, &ldb_, beta, C, &ldc_); \
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
#else // HAVE_CUSTOM_BLAS
|
|
450
|
+
|
|
451
|
+
template<typename T>
|
|
452
|
+
static void tf_cpu_sgemm(
|
|
453
|
+
OpKernelContext* context,
|
|
454
|
+
char transa_, char transb_,
|
|
455
|
+
int m, int n, int k,
|
|
456
|
+
const T* alpha_ptr, const T* a_ptr, int lda,
|
|
457
|
+
const T* b_ptr, int ldb, const T* beta_ptr,
|
|
458
|
+
T* c_ptr, int ldc)
|
|
459
|
+
{
|
|
460
|
+
if (m <= 0 || n <= 0 || k <= 0) return;
|
|
461
|
+
|
|
462
|
+
auto d = context->eigen_cpu_device();
|
|
463
|
+
const T alpha = *alpha_ptr;
|
|
464
|
+
const T beta = *beta_ptr;
|
|
465
|
+
|
|
466
|
+
bool transa = (transa_ == 'T' || transa_ == 't' || transa_ == 'C' || transa_ == 'c');
|
|
467
|
+
bool transb = (transb_ == 'T' || transb_ == 't' || transb_ == 'C' || transb_ == 'c');
|
|
468
|
+
|
|
469
|
+
// 1. Map as COLUMN-MAJOR
|
|
470
|
+
// Physical rows (height) for the Map is always the leading dimension (lda, ldb, ldc)
|
|
471
|
+
typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::ColMajor>, Eigen::Unaligned> ConstMap;
|
|
472
|
+
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::ColMajor>, Eigen::Unaligned> MutableMap;
|
|
473
|
+
|
|
474
|
+
// Logical height/width of slices before any transposition
|
|
475
|
+
int a_slice_rows = transa ? k : m;
|
|
476
|
+
int a_slice_cols = transa ? m : k;
|
|
477
|
+
int b_slice_rows = transb ? n : k;
|
|
478
|
+
int b_slice_cols = transb ? k : n;
|
|
479
|
+
|
|
480
|
+
// Map and Slice
|
|
481
|
+
auto a = ConstMap(a_ptr, lda, a_slice_cols).slice(
|
|
482
|
+
Eigen::array<Eigen::Index, 2>({0, 0}),
|
|
483
|
+
Eigen::array<Eigen::Index, 2>({(Eigen::Index)a_slice_rows, (Eigen::Index)a_slice_cols}));
|
|
484
|
+
|
|
485
|
+
auto b = ConstMap(b_ptr, ldb, b_slice_cols).slice(
|
|
486
|
+
Eigen::array<Eigen::Index, 2>({0, 0}),
|
|
487
|
+
Eigen::array<Eigen::Index, 2>({(Eigen::Index)b_slice_rows, (Eigen::Index)b_slice_cols}));
|
|
488
|
+
|
|
489
|
+
auto c = MutableMap(c_ptr, ldc, n).slice(
|
|
490
|
+
Eigen::array<Eigen::Index, 2>({0, 0}),
|
|
491
|
+
Eigen::array<Eigen::Index, 2>({(Eigen::Index)m, (Eigen::Index)n}));
|
|
492
|
+
|
|
493
|
+
// 2. Define Contraction Pairs based on Transposition
|
|
494
|
+
// Column-Major Matrix Mult: (M x K) * (K x N)
|
|
495
|
+
// Standard: Contract Axis 1 of A with Axis 0 of B
|
|
496
|
+
// If A is Transposed: A is (K x M), contract Axis 0 of A
|
|
497
|
+
// If B is Transposed: B is (N x K), contract Axis 1 of B
|
|
498
|
+
Eigen::array<Eigen::IndexPair<int>, 1> pairs;
|
|
499
|
+
pairs[0] = Eigen::IndexPair<int>(transa ? 0 : 1, transb ? 1 : 0);
|
|
500
|
+
|
|
501
|
+
// 3. Execution
|
|
502
|
+
if (alpha == T(1) && beta == T(0)) {
|
|
503
|
+
c.device(d) = a.contract(b, pairs);
|
|
504
|
+
} else if (alpha == T(1) && beta == T(1)) {
|
|
505
|
+
c.device(d) += a.contract(b, pairs);
|
|
506
|
+
} else {
|
|
507
|
+
c.device(d) = a.contract(b, pairs) * alpha + c * beta;
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
#define Ndarray_sgemm(\
|
|
512
|
+
transpose_A, transpose_B, \
|
|
513
|
+
m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
|
|
514
|
+
tf_cpu_sgemm<float>(context, transpose_A, transpose_B, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
|
|
515
|
+
|
|
516
|
+
#endif // HAVE_CUSTOM_BLAS
|
|
422
517
|
#endif // CUDA
|
|
423
518
|
|
|
519
|
+
#define CHECK_WITH_MSG(condition, message) \
|
|
520
|
+
if(!(condition)) { \
|
|
521
|
+
std::cerr << "NativeOp check failed: " << message << std::endl; \
|
|
522
|
+
assert(condition); \
|
|
523
|
+
}
|
|
524
|
+
|
|
424
525
|
// See Context struct below.
|
|
425
526
|
#define CONTEXT_ARGS context
|
|
426
527
|
|
|
427
|
-
|
|
528
|
+
|
|
529
|
+
#elif TORCH
|
|
530
|
+
// https://github.com/rwth-i6/i6_native_ops/blob/main/i6_native_ops/common/returnn_definitions.h
|
|
531
|
+
// https://docs.pytorch.org/cppdocs/stable.html#tensor-class
|
|
532
|
+
|
|
533
|
+
#define Ndarray torch::Tensor
|
|
534
|
+
#define Ndarray_DEV_DATA(x) ((float*)(x)->data_ptr())
|
|
535
|
+
#define Ndarray_DEV_DATA_int32(x) ((int32_t*)(x)->data_ptr())
|
|
536
|
+
#define Ndarray_DEV_DATA_uint32(x) ((uint32_t*)(x)->data_ptr())
|
|
537
|
+
#define Ndarray_DEV_DATA_int32_scalar(x) ((x)->item().to<int32_t>())
|
|
538
|
+
#define Ndarray_HOST_DIMS(x) ((x)->sizes())
|
|
539
|
+
#define Ndarray_DIMS(x) ((x)->sizes())
|
|
540
|
+
typedef at::IntArrayRef Ndarray_DIMS_Type;
|
|
541
|
+
#define Ndarray_NDIM(x) (x)->dim()
|
|
542
|
+
#define Ndarray_dtype_size(x) torch::elementSize((x)->scalar_type())
|
|
543
|
+
typedef int64_t Ndarray_DIM_Type;
|
|
544
|
+
#define Ndarray_SIZE(x) ((x)->numel())
|
|
545
|
+
#define Ndarray_STRIDE(x, dim) ((x)->stride(dim))
|
|
546
|
+
|
|
547
|
+
#define CHECK_WITH_MSG TORCH_CHECK
|
|
428
548
|
|
|
429
549
|
// See Context struct below.
|
|
430
550
|
#define CONTEXT_ARGS
|
|
431
551
|
|
|
432
|
-
|
|
552
|
+
template<typename T>
|
|
553
|
+
static void Ndarray_sgemm(
|
|
554
|
+
char transa_, char transb_,
|
|
555
|
+
int m, int n, int k,
|
|
556
|
+
const T* alpha_ptr, const T* a_ptr, int lda,
|
|
557
|
+
const T* b_ptr, int ldb, const T* beta_ptr,
|
|
558
|
+
T* c_ptr, int ldc)
|
|
559
|
+
{
|
|
560
|
+
// TODO...
|
|
561
|
+
assert("Torch Ndarray_sgemm not implemented" && 0);
|
|
562
|
+
}
|
|
433
563
|
|
|
564
|
+
#else // TENSORFLOW or TORCH
|
|
434
565
|
|
|
566
|
+
#error "No framework defined: TENSORFLOW or TORCH"
|
|
435
567
|
|
|
436
|
-
#
|
|
568
|
+
#endif // TENSORFLOW or TORCH
|
|
437
569
|
|
|
438
570
|
|
|
571
|
+
#if CUDA
|
|
572
|
+
|
|
439
573
|
#if TENSORFLOW
|
|
440
574
|
// Ndarray and friends already declared above, they are same for CUDA and non-CUDA
|
|
441
575
|
#define CUDA_CUR_STREAM (context->eigen_gpu_device().stream())
|
|
442
576
|
|
|
443
|
-
#
|
|
444
|
-
#define CUDA_CUR_STREAM (0) // default stream
|
|
577
|
+
#elif TORCH
|
|
445
578
|
|
|
446
|
-
|
|
447
|
-
// See also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/cuda_ndarray.cu
|
|
448
|
-
#define Ndarray CudaNdarray
|
|
449
|
-
#define Ndarray_DEV_DATA CudaNdarray_DEV_DATA
|
|
450
|
-
#define Ndarray_DEV_DATA_int32(x) ((int32_t*) (Ndarray_DEV_DATA(x)))
|
|
451
|
-
#define Ndarray_DEV_DATA_int32_scalar(x) Ndarray_DEV_DATA_int32(x)[0]
|
|
452
|
-
#define Ndarray_HOST_DIMS CudaNdarray_HOST_DIMS
|
|
453
|
-
#define Ndarray_DIMS Ndarray_HOST_DIMS
|
|
454
|
-
#define Ndarray_STRIDE(x, i) (CudaNdarray_HOST_STRIDES(x)[i]) // return in elements. CudaNdarray stores like that
|
|
455
|
-
#define Ndarray_NDIM(x) (x->nd)
|
|
456
|
-
#define Ndarray_DIM_Type int
|
|
457
|
-
typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
|
|
458
|
-
#define Ndarray_dtype_size(x) sizeof(float)
|
|
459
|
-
#define Ndarray_SIZE CudaNdarray_SIZE
|
|
460
|
-
// PyObject *CudaNdarray_NewDims(int nd, const inttype * dims), uninitialized
|
|
461
|
-
#define Ndarray_NewDims CudaNdarray_NewDims
|
|
462
|
-
// PyObject * CudaNdarray_Copy(const CudaNdarray * self);
|
|
463
|
-
#define Ndarray_Copy CudaNdarray_Copy
|
|
464
|
-
|
|
465
|
-
/*
|
|
466
|
-
// via: https://docs.nvidia.com/cuda/cublas/
|
|
467
|
-
// matrices are in column-major form
|
|
468
|
-
cublasStatus_t cublasSgemm(cublasHandle_t handle,
|
|
469
|
-
cublasOperation_t transa, cublasOperation_t transb,
|
|
470
|
-
int m, int n, int k,
|
|
471
|
-
const float *alpha, const float *A, int lda,
|
|
472
|
-
const float *B, int ldb, const float *beta,
|
|
473
|
-
float *C, int ldc);
|
|
474
|
-
*/
|
|
475
|
-
#define _cublasTranspose(t) \
|
|
476
|
-
((t == 'T') ? CUBLAS_OP_T : \
|
|
477
|
-
(t == 'C') ? CUBLAS_OP_C : \
|
|
478
|
-
(t == 'N') ? CUBLAS_OP_N : cublasOperation_t('E'))
|
|
479
|
-
#define Ndarray_sgemm( \
|
|
480
|
-
transpose_A, transpose_B, \
|
|
481
|
-
m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
|
|
482
|
-
(_cudaHandleError(cublasSgemm(handle, \
|
|
483
|
-
_cublasTranspose(transpose_A), \
|
|
484
|
-
_cublasTranspose(transpose_B), \
|
|
485
|
-
m, n, k, alpha, A, lda, B, ldb, beta, C, ldc), \
|
|
486
|
-
__FILE__, __LINE__ ))
|
|
579
|
+
#define CUDA_CUR_STREAM (at::cuda::getCurrentCUDAStream().stream())
|
|
487
580
|
|
|
581
|
+
#else
|
|
582
|
+
#error Unknown backend
|
|
488
583
|
#endif
|
|
489
584
|
|
|
490
585
|
#define Ndarray_memcpy(y, x, size) (cudaMemcpyAsync(y, x, size, cudaMemcpyDeviceToDevice, CUDA_CUR_STREAM))
|
|
@@ -501,48 +596,10 @@ typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
|
|
|
501
596
|
|
|
502
597
|
#define DEF_SHARED(type, name) extern __shared__ type name[];
|
|
503
598
|
|
|
504
|
-
static const char
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
case CUBLAS_STATUS_NOT_INITIALIZED:
|
|
510
|
-
return "CUBLAS_STATUS_NOT_INITIALIZED";
|
|
511
|
-
|
|
512
|
-
case CUBLAS_STATUS_ALLOC_FAILED:
|
|
513
|
-
return "CUBLAS_STATUS_ALLOC_FAILED";
|
|
514
|
-
|
|
515
|
-
case CUBLAS_STATUS_INVALID_VALUE:
|
|
516
|
-
return "CUBLAS_STATUS_INVALID_VALUE";
|
|
517
|
-
|
|
518
|
-
case CUBLAS_STATUS_ARCH_MISMATCH:
|
|
519
|
-
return "CUBLAS_STATUS_ARCH_MISMATCH";
|
|
520
|
-
|
|
521
|
-
case CUBLAS_STATUS_MAPPING_ERROR:
|
|
522
|
-
return "CUBLAS_STATUS_MAPPING_ERROR";
|
|
523
|
-
|
|
524
|
-
case CUBLAS_STATUS_EXECUTION_FAILED:
|
|
525
|
-
return "CUBLAS_STATUS_EXECUTION_FAILED";
|
|
526
|
-
|
|
527
|
-
case CUBLAS_STATUS_INTERNAL_ERROR:
|
|
528
|
-
return "CUBLAS_STATUS_INTERNAL_ERROR";
|
|
529
|
-
}
|
|
530
|
-
|
|
531
|
-
return "<unknown>";
|
|
532
|
-
}
|
|
533
|
-
|
|
534
|
-
static void _cudaHandleError(cudaError_t err, const char *file, int line) {
|
|
535
|
-
if (err != cudaSuccess) {
|
|
536
|
-
printf("NativeOp: CUDA runtime error: '%s' in %s at line %d\n", cudaGetErrorString(err), file, line);
|
|
537
|
-
exit(EXIT_FAILURE);
|
|
538
|
-
}
|
|
539
|
-
}
|
|
540
|
-
|
|
541
|
-
static void _cudaHandleError(cublasStatus_t status, const char *file, int line) {
|
|
542
|
-
if (status != CUBLAS_STATUS_SUCCESS) {
|
|
543
|
-
printf("NativeOp: cuBLAS runtime error: '%s' in %s at line %d\n", _cudaGetErrorEnum(status), file, line);
|
|
544
|
-
exit(EXIT_FAILURE);
|
|
545
|
-
}
|
|
599
|
+
static void _cudaHandleError(cudaError_t err, const char* file, int line) {
|
|
600
|
+
CHECK_WITH_MSG(
|
|
601
|
+
err == cudaSuccess,
|
|
602
|
+
"NativeOp: CUDA runtime error: ", cudaGetErrorString(err), " in ", file, " at line ", line);
|
|
546
603
|
}
|
|
547
604
|
|
|
548
605
|
#define HANDLE_ERROR(status) (_cudaHandleError( status, __FILE__, __LINE__ ))
|
|
@@ -550,49 +607,7 @@ static void _cudaHandleError(cublasStatus_t status, const char *file, int line)
|
|
|
550
607
|
|
|
551
608
|
#else // not CUDA
|
|
552
609
|
|
|
553
|
-
|
|
554
|
-
#if !TENSORFLOW
|
|
555
|
-
// Numpy, see: https://docs.scipy.org/doc/numpy/reference/c-api.array.html
|
|
556
|
-
// And: https://deeplearning.net/software/theano/extending/extending_theano_c.html
|
|
557
|
-
#define Ndarray PyArrayObject
|
|
558
|
-
#define Ndarray_DEV_DATA(x) ((float*) PyArray_DATA(x))
|
|
559
|
-
#define Ndarray_DEV_DATA_int32(x) ((int32_t*) (Ndarray_DEV_DATA(x)))
|
|
560
|
-
#define Ndarray_DEV_DATA_int32_scalar(x) Ndarray_DEV_DATA_int32(x)[0]
|
|
561
|
-
#define Ndarray_HOST_DIMS PyArray_DIMS
|
|
562
|
-
#define Ndarray_STRIDE(x, i) (PyArray_STRIDE(x, i) / sizeof(float)) // return in elements. Numpy stores in bytes
|
|
563
|
-
#define Ndarray_DIMS Ndarray_HOST_DIMS
|
|
564
|
-
#define Ndarray_NDIM PyArray_NDIM
|
|
565
|
-
#define Ndarray_DIM_Type npy_intp
|
|
566
|
-
typedef Ndarray_DIM_Type const* Ndarray_DIMS_Type;
|
|
567
|
-
#define Ndarray_dtype_size(x) sizeof(float)
|
|
568
|
-
#define Ndarray_SIZE PyArray_SIZE
|
|
569
|
-
#define Ndarray_NewDims(nd, dims) (PyArray_SimpleNew(nd, dims, NPY_FLOAT32))
|
|
570
|
-
#define Ndarray_Copy(x) (PyArray_FromArray(x, NULL, NPY_ARRAY_OUT_ARRAY | NPY_ARRAY_ENSURECOPY))
|
|
571
|
-
/*
|
|
572
|
-
// matrices are in column-major form
|
|
573
|
-
int sgemm_(char *transa, char *transb,
|
|
574
|
-
integer *m, integer *n, integer *k,
|
|
575
|
-
real *alpha, real *a, integer *lda,
|
|
576
|
-
real *b, integer *ldb, real *beta,
|
|
577
|
-
real *c, integer *ldc);
|
|
578
|
-
|
|
579
|
-
Cast to (float*) because we might have the C-style declaration incorrectly in the C++ scope.
|
|
580
|
-
*/
|
|
581
|
-
#define Ndarray_sgemm(\
|
|
582
|
-
transpose_A, transpose_B, \
|
|
583
|
-
m, n, k, alpha, A, lda, B, ldb, beta, C, ldc) \
|
|
584
|
-
{ \
|
|
585
|
-
char transa = transpose_A, transb = transpose_B; \
|
|
586
|
-
int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; \
|
|
587
|
-
sgemm_(&transa, &transb, \
|
|
588
|
-
&m_, &n_, &k_, alpha, (float*) A, &lda_, (float*) B, &ldb_, beta, C, &ldc_); \
|
|
589
|
-
}
|
|
590
|
-
|
|
591
|
-
static inline void* device_malloc(size_t size) { return malloc(size); }
|
|
592
|
-
static inline void device_free(void* ptr) { free(ptr); }
|
|
593
|
-
#endif
|
|
594
|
-
|
|
595
|
-
#define HANDLE_LAST_ERROR() (0)
|
|
610
|
+
#define HANDLE_LAST_ERROR() {}
|
|
596
611
|
|
|
597
612
|
#define Ndarray_memcpy(y, x, size) (memcpy(y, x, size))
|
|
598
613
|
#define Ndarray_memset(s, c, size) (memset(s, c, size))
|
|
@@ -671,19 +686,9 @@ struct _KernelLoop {
|
|
|
671
686
|
#endif
|
|
672
687
|
|
|
673
688
|
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
Ndarray* res = (Ndarray*) Ndarray_NewDims(Ndarray_NDIM(a), dim);
|
|
678
|
-
#else
|
|
679
|
-
Ndarray* res = (Ndarray*) Ndarray_NewDims(Ndarray_NDIM(a), const_cast<Ndarray_DIM_Type*>(dim));
|
|
680
|
-
#endif
|
|
681
|
-
return res;
|
|
682
|
-
}
|
|
683
|
-
|
|
684
|
-
long Ndarray_get_n_total_elements(Ndarray* a) {
|
|
685
|
-
long c = 1;
|
|
686
|
-
for(long i = 0; i < Ndarray_NDIM(a); ++i)
|
|
689
|
+
int64_t Ndarray_get_n_total_elements(Ndarray* a) {
|
|
690
|
+
int64_t c = 1;
|
|
691
|
+
for(int i = 0; i < Ndarray_NDIM(a); ++i)
|
|
687
692
|
c *= Ndarray_DIMS(a)[i];
|
|
688
693
|
return c;
|
|
689
694
|
}
|
|
@@ -769,17 +774,22 @@ void _free(void* ptr) {
|
|
|
769
774
|
context->device()->GetAllocator(AllocatorAttributes());
|
|
770
775
|
allocator->DeallocateRaw(ptr);
|
|
771
776
|
}
|
|
772
|
-
|
|
773
|
-
#
|
|
777
|
+
|
|
778
|
+
#elif TORCH
|
|
774
779
|
|
|
775
780
|
#if CUDA
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
}
|
|
780
|
-
|
|
781
|
-
#endif
|
|
782
|
-
|
|
781
|
+
void* _malloc(size_t num_bytes) { return c10::cuda::CUDACachingAllocator::raw_alloc(num_bytes); }
|
|
782
|
+
void _free(void* ptr) { c10::cuda::CUDACachingAllocator::raw_delete(ptr); }
|
|
783
|
+
#else // not CUDA
|
|
784
|
+
void* _malloc(size_t num_bytes) { return c10::GetCPUAllocator()->raw_allocate(num_bytes); }
|
|
785
|
+
void _free(void* ptr) { c10::GetCPUAllocator()->raw_deallocate(ptr); }
|
|
786
|
+
#endif // CUDA
|
|
787
|
+
|
|
788
|
+
#endif // TENSORFLOW or TORCH
|
|
789
|
+
|
|
790
|
+
|
|
791
|
+
#define device_malloc Context(CONTEXT_ARGS)._malloc
|
|
792
|
+
#define device_free Context(CONTEXT_ARGS)._free
|
|
783
793
|
|
|
784
794
|
|
|
785
795
|
//C[x] += A[x]*B[x]
|
returnn/native_op.py
CHANGED
|
@@ -5,38 +5,40 @@ Generic interface which automatically creates:
|
|
|
5
5
|
* inplace and not inplace
|
|
6
6
|
* grad variants
|
|
7
7
|
|
|
8
|
-
See :mod:`returnn.tf.native_op` and :mod:`returnn.
|
|
9
|
-
for usage in TensorFlow and
|
|
8
|
+
See :mod:`returnn.tf.native_op` and :mod:`returnn.torch.util.native_op`
|
|
9
|
+
for usage in TensorFlow and PyTorch.
|
|
10
10
|
|
|
11
11
|
See :ref:`native_ops` for more background.
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
from typing import Optional, Union, Any, Callable, Dict, Sequence, Tuple
|
|
14
16
|
import copy
|
|
15
17
|
import numpy
|
|
16
|
-
|
|
18
|
+
|
|
17
19
|
from returnn.util.basic import make_hashable, unicode
|
|
18
20
|
|
|
19
21
|
|
|
20
22
|
class NativeOpBaseMixin:
|
|
21
23
|
"""
|
|
22
|
-
The purpose of having this as a separate base class
|
|
23
|
-
|
|
24
|
+
The purpose of having this as a separate base class
|
|
25
|
+
is to make this independent of any TensorFlow or PyTorch-specific functionality.
|
|
24
26
|
"""
|
|
25
27
|
|
|
26
28
|
def __init__(
|
|
27
29
|
self,
|
|
28
|
-
in_info,
|
|
29
|
-
out_info,
|
|
30
|
-
c_fw_code,
|
|
31
|
-
c_bw_code=None,
|
|
32
|
-
c_extra_support_code=None,
|
|
33
|
-
code_version=None,
|
|
34
|
-
cpu_support=True,
|
|
35
|
-
grad_input_map=None,
|
|
36
|
-
name=None,
|
|
30
|
+
in_info: Sequence[Dict[str, Any]],
|
|
31
|
+
out_info: Sequence[Dict[str, Any]],
|
|
32
|
+
c_fw_code: str,
|
|
33
|
+
c_bw_code: Optional[str] = None,
|
|
34
|
+
c_extra_support_code: Union[None, str, Dict[str, str]] = None,
|
|
35
|
+
code_version: Optional[Tuple[int, ...]] = None,
|
|
36
|
+
cpu_support: bool = True,
|
|
37
|
+
grad_input_map: Union[None, Tuple[int, ...], Callable] = None,
|
|
38
|
+
name: Optional[str] = None,
|
|
37
39
|
):
|
|
38
40
|
"""
|
|
39
|
-
:param
|
|
41
|
+
:param in_info: each dict describes one input var.
|
|
40
42
|
attribs in the dict:
|
|
41
43
|
int ndim: the ndim.
|
|
42
44
|
tuple shape: tuple and can contain None for specific dimensions.
|
|
@@ -49,18 +51,18 @@ class NativeOpBaseMixin:
|
|
|
49
51
|
str gradient: can be "disconnected". see grad().
|
|
50
52
|
bool bw_input: True by default. add this param to the bw input.
|
|
51
53
|
other attribs are just ignored.
|
|
52
|
-
:param
|
|
54
|
+
:param out_info: like in_info.
|
|
53
55
|
slightly different behavior for:
|
|
54
56
|
shape: we also allow refs to the in_info in the form (in-idx,dim). see infer_shape().
|
|
55
57
|
need_contiguous/want_inplace: used for bw, in case for bw_input == True.
|
|
56
|
-
:param
|
|
57
|
-
:param
|
|
58
|
-
:param
|
|
59
|
-
:param
|
|
60
|
-
:param
|
|
61
|
-
:param
|
|
58
|
+
:param c_fw_code: C code for forward pass
|
|
59
|
+
:param c_extra_support_code: C support code (for c_support_code)
|
|
60
|
+
:param c_bw_code: C code for backward pass (for gradient)
|
|
61
|
+
:param code_version: will be returned by c_code_cache_version.
|
|
62
|
+
:param cpu_support:
|
|
63
|
+
:param grad_input_map: selection of grad inputs.
|
|
62
64
|
by default, we get all inputs + all outputs + all grad outputs.
|
|
63
|
-
:param
|
|
65
|
+
:param name: name
|
|
64
66
|
"""
|
|
65
67
|
assert isinstance(in_info, (list, tuple))
|
|
66
68
|
assert isinstance(out_info, (list, tuple))
|
|
@@ -251,12 +253,12 @@ class NativeOpGenBase:
|
|
|
251
253
|
See NativeOp.__init__() for attribs.
|
|
252
254
|
"""
|
|
253
255
|
|
|
254
|
-
in_info
|
|
255
|
-
out_info
|
|
256
|
-
c_fw_code = None
|
|
257
|
-
c_bw_code = None
|
|
258
|
-
c_extra_support_code
|
|
259
|
-
code_version
|
|
256
|
+
in_info: Optional[Tuple[Dict[str, Any], ...]] = None
|
|
257
|
+
out_info: Optional[Tuple[Dict[str, Any], ...]] = None
|
|
258
|
+
c_fw_code: Optional[str] = None
|
|
259
|
+
c_bw_code: Optional[str] = None
|
|
260
|
+
c_extra_support_code: Optional[Dict[str, str]] = None
|
|
261
|
+
code_version: Union[None, Tuple[int, ...], int] = None
|
|
260
262
|
grad_input_map = None
|
|
261
263
|
theano_custom_grad = None
|
|
262
264
|
cpu_support = True
|
|
@@ -4699,7 +4701,7 @@ class FastViterbiOp(NativeOpGenBase):
|
|
|
4699
4701
|
int n_states,
|
|
4700
4702
|
int n_edges,
|
|
4701
4703
|
int t,
|
|
4702
|
-
|
|
4704
|
+
int32_t* cur_state, // (n_batch,)
|
|
4703
4705
|
const IdxAndVal* frame,
|
|
4704
4706
|
const int32_t* d_am_seq_len,
|
|
4705
4707
|
const int32_t* d_edge_from,
|
|
@@ -5339,7 +5341,10 @@ class EditDistanceOp(NativeOpGenBase):
|
|
|
5339
5341
|
sub_cost = last1_dist[last1_idx];
|
|
5340
5342
|
if(a[batch_idx * n_a_max_len + t_a - 1] != b[batch_idx * n_b_max_len + t_b - 1])
|
|
5341
5343
|
++sub_cost;
|
|
5342
|
-
|
|
5344
|
+
/*printf("t_a %i, t_b %i, a %d, b %d, del %i, ins %i, sub %i\\n",
|
|
5345
|
+
t_a, t_b,
|
|
5346
|
+
a[batch_idx * n_a_max_len + t_a - 1], b[batch_idx * n_b_max_len + t_b - 1],
|
|
5347
|
+
del_cost, ins_cost, sub_cost);*/
|
|
5343
5348
|
int min_cost = del_cost;
|
|
5344
5349
|
if(min_cost > ins_cost) min_cost = ins_cost;
|
|
5345
5350
|
if(min_cost > sub_cost) min_cost = sub_cost;
|
returnn/sprint/cache.py
CHANGED
|
@@ -7,10 +7,9 @@ This module is about reading (maybe later also writing) the Sprint archive forma
|
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
9
|
from __future__ import annotations
|
|
10
|
-
from typing import List,
|
|
10
|
+
from typing import Optional, List, Tuple, Dict
|
|
11
11
|
import sys
|
|
12
12
|
import os
|
|
13
|
-
import typing
|
|
14
13
|
import array
|
|
15
14
|
from struct import pack, unpack
|
|
16
15
|
import numpy
|
|
@@ -212,7 +211,7 @@ class FileArchive:
|
|
|
212
211
|
def __init__(self, filename, must_exists=True, encoding="ascii"):
|
|
213
212
|
self.encoding = encoding
|
|
214
213
|
|
|
215
|
-
self.ft
|
|
214
|
+
self.ft: Dict[str, FileInfo] = {}
|
|
216
215
|
if os.path.exists(filename):
|
|
217
216
|
self.allophones = []
|
|
218
217
|
self.f = open(filename, "rb")
|
|
@@ -334,8 +333,8 @@ class FileArchive:
|
|
|
334
333
|
# print(typ)
|
|
335
334
|
assert type_ == "vector-f32"
|
|
336
335
|
count = self.read_U32()
|
|
337
|
-
data
|
|
338
|
-
time_
|
|
336
|
+
data: List[Optional[numpy.ndarray]] = [None] * count
|
|
337
|
+
time_: List[Optional[numpy.ndarray]] = [None] * count
|
|
339
338
|
for i in range(count):
|
|
340
339
|
size = self.read_U32()
|
|
341
340
|
data[i] = self.read_v("f", size) # size x f32
|
|
@@ -450,7 +449,7 @@ class FileArchive:
|
|
|
450
449
|
a = array.array("b")
|
|
451
450
|
a.fromfile(self.f, comp)
|
|
452
451
|
# unpack
|
|
453
|
-
b = zlib.decompress(a.
|
|
452
|
+
b = zlib.decompress(a.tobytes(), 15 + 32)
|
|
454
453
|
# substitute self.f by an anonymous memmap file object
|
|
455
454
|
# restore original file handle after we're done
|
|
456
455
|
backup_f = self.f
|
|
@@ -575,17 +574,17 @@ class FileArchiveBundle:
|
|
|
575
574
|
:param str encoding: encoding used in the files
|
|
576
575
|
"""
|
|
577
576
|
# filename -> FileArchive
|
|
578
|
-
self.archives
|
|
577
|
+
self.archives: Dict[str, FileArchive] = {}
|
|
579
578
|
# archive content file -> FileArchive
|
|
580
|
-
self.files
|
|
579
|
+
self.files: Dict[str, FileArchive] = {}
|
|
581
580
|
self._short_seg_names = {}
|
|
582
581
|
if filename is not None:
|
|
583
582
|
self.add_bundle(filename=filename, encoding=encoding)
|
|
584
583
|
|
|
585
|
-
def add_bundle(self, filename, encoding="ascii"):
|
|
584
|
+
def add_bundle(self, filename: str, encoding: str = "ascii"):
|
|
586
585
|
"""
|
|
587
|
-
:param
|
|
588
|
-
:param
|
|
586
|
+
:param filename: bundle
|
|
587
|
+
:param encoding:
|
|
589
588
|
"""
|
|
590
589
|
file_dir = os.path.dirname(filename) or "."
|
|
591
590
|
for line in open(filename).read().splitlines():
|
|
@@ -837,7 +836,7 @@ class MixtureSet:
|
|
|
837
836
|
"""
|
|
838
837
|
a = array.array("b")
|
|
839
838
|
a.fromfile(self.f, length)
|
|
840
|
-
return a.
|
|
839
|
+
return a.tobytes().decode(encoding)
|
|
841
840
|
|
|
842
841
|
def read_f32(self):
|
|
843
842
|
"""
|
|
@@ -1003,7 +1002,7 @@ class WordBoundaries:
|
|
|
1003
1002
|
"""
|
|
1004
1003
|
a = array.array("b")
|
|
1005
1004
|
a.fromfile(self.f, length)
|
|
1006
|
-
return a.
|
|
1005
|
+
return a.tobytes().decode(encoding)
|
|
1007
1006
|
|
|
1008
1007
|
def __init__(self, filename):
|
|
1009
1008
|
"""
|