normal-grain-merge 0.0.1__cp313-cp313-win_amd64.whl → 0.1.2__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- normal_grain_merge/normal_grain_merge.c +672 -206
- normal_grain_merge/normal_grain_merge.cp313-win_amd64.pyd +0 -0
- normal_grain_merge/normal_grain_merge.pyi +1 -1
- {normal_grain_merge-0.0.1.dist-info → normal_grain_merge-0.1.2.dist-info}/METADATA +26 -19
- normal_grain_merge-0.1.2.dist-info/RECORD +10 -0
- normal_grain_merge-0.0.1.dist-info/RECORD +0 -10
- {normal_grain_merge-0.0.1.dist-info → normal_grain_merge-0.1.2.dist-info}/WHEEL +0 -0
- {normal_grain_merge-0.0.1.dist-info → normal_grain_merge-0.1.2.dist-info}/licenses/LICENSE +0 -0
- {normal_grain_merge-0.0.1.dist-info → normal_grain_merge-0.1.2.dist-info}/top_level.txt +0 -0
normal_grain_merge/normal_grain_merge.c

@@ -2,11 +2,18 @@
 #include <stdio.h>
 #include <math.h>
 #include <float.h>
+#include <stdint.h>
+#include <string.h>
 #include <Python.h>
 #include <numpy/arrayobject.h>
 #include <smmintrin.h>
+#include <tmmintrin.h>
 #include <immintrin.h> /* AVX2 + SSE4.2 */

+#if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
+#define NGM_HAS_FMA 1
+#endif
+
 /* ----- Runtime CPU feature detection (GCC/Clang + MSVC) ----- */
 #if defined(_MSC_VER)
 #include <intrin.h>
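The hunk above only shows the headers pulled in for the file's runtime CPU feature detection; the detection code itself sits outside the diff. As orientation only, here is a minimal self-contained sketch of that general GCC/Clang-vs-MSVC pattern (my illustration, not code from the package; a production AVX2 check would also confirm OS YMM-state support via XGETBV):

```c
#include <stdio.h>

#if defined(_MSC_VER)
#include <intrin.h>
static int has_avx2(void) {
    int regs[4] = {0, 0, 0, 0};
    __cpuidex(regs, 7, 0);        /* CPUID leaf 7, subleaf 0 */
    return (regs[1] >> 5) & 1;    /* EBX bit 5 = AVX2 (CPU support only) */
}
static int has_sse42(void) {
    int regs[4] = {0, 0, 0, 0};
    __cpuid(regs, 1);             /* CPUID leaf 1 */
    return (regs[2] >> 20) & 1;   /* ECX bit 20 = SSE4.2 */
}
#else
static int has_avx2(void)  { return __builtin_cpu_supports("avx2"); }
static int has_sse42(void) { return __builtin_cpu_supports("sse4.2"); }
#endif

int main(void) {
    printf("avx2=%d sse4.2=%d\n", has_avx2(), has_sse42());
    return 0;
}
```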
@@ -126,8 +133,8 @@ static inline int check_shape_requirements(PyArrayObject *base,
     }
     *texture_has_alpha = (tc == 4);

-    if (PyArray_NDIM(skin) != 3 || PyArray_DIMS(skin)[2] !=
-        PyErr_SetString(PyExc_ValueError, "skin must have shape (H, W,
+    if (PyArray_NDIM(skin) != 3 || PyArray_DIMS(skin)[2] != 3) {
+        PyErr_SetString(PyExc_ValueError, "skin must have shape (H, W, 3)");
         return 0;
     }
     if (PyArray_NDIM(im_alpha) != 2) {
@@ -184,10 +191,9 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,
         const uint8_t t_g = texture[3*i+1];
         const uint8_t t_b = texture[3*i+2];

-        const uint8_t s_r = skin[
-        const uint8_t s_g = skin[
-        const uint8_t s_b = skin[
-        const uint8_t s_a = skin[4*i+3];
+        const uint8_t s_r = skin[3*i+0];
+        const uint8_t s_g = skin[3*i+1];
+        const uint8_t s_b = skin[3*i+2];

         const uint8_t a_im = im_alpha[i];

@@ -203,8 +209,6 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,
         const float fs_r = s_r * (1.0f/255.0f);
         const float fs_g = s_g * (1.0f/255.0f);
         const float fs_b = s_b * (1.0f/255.0f);
-        const float fs_a = s_a * (1.0f/255.0f);
-
         const float fa_im = a_im * (1.0f/255.0f);

         /*
@@ -240,8 +244,8 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,

         /* Normal merge
          * n_out = gm_out * texture_alpha + base * inverse_tpa
-         *
-         * In this case, texture_alpha is
+         *
+         * In this case, texture_alpha is supplied by im_alpha since texture doesn't have an alpha channel here.
          */
         fr = fr * fa_im + fb_r * fit_a;
         fg = fg * fa_im + fb_g * fit_a;
@@ -267,10 +271,9 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
         const uint8_t t_b = texture[4*i+2];
         const uint8_t t_a = texture[4*i+3]; /* present in RGBA branch */

-        const uint8_t s_r = skin[
-        const uint8_t s_g = skin[
-        const uint8_t s_b = skin[
-        const uint8_t s_a = skin[4*i+3];
+        const uint8_t s_r = skin[3*i+0];
+        const uint8_t s_g = skin[3*i+1];
+        const uint8_t s_b = skin[3*i+2];

         const uint8_t a_im = im_alpha[i];

@@ -286,8 +289,6 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
         const float fs_r = s_r * (1.0f/255.0f);
         const float fs_g = s_g * (1.0f/255.0f);
         const float fs_b = s_b * (1.0f/255.0f);
-        const float fs_a = s_a * (1.0f/255.0f);
-
         const float fa_im = a_im * (1.0f/255.0f);

         /*
@@ -295,7 +296,7 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
          * normal grain merge *
          **********************
          */
-        /* Merge texture
+        /* Merge texture alpha with the external mask */

         /* texture_alpha = texture[..., 3] * im_alpha*/
         ft_a = ft_a * fa_im;
@@ -341,34 +342,119 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
 }

 /* ---------- AVX2 helpers ----------
-   Interleaved RGB(A) is awkward for SIMD.
-
-   You can later replace gathers with better deinterleaving if needed.
+   Interleaved RGB(A) is awkward for SIMD. We build 8-lane vectors per channel by
+   reusing the scalar u8x4 -> f32 helpers instead of relying on gathers.
 */

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+static inline __m128 bytes4_to_unit_f32(__m128i bytes, __m128 inv255) {
+    __m128i v32 = _mm_cvtepu8_epi32(bytes);
+    return _mm_mul_ps(_mm_cvtepi32_ps(v32), inv255);
+}
+
+/* Forward declarations for SSE4.2 kernels used in AVX2 tail handling. */
+static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
+                             const uint8_t *skin, const uint8_t *im_alpha,
+                             uint8_t *out, npy_intp pixels);
+static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
+                              const uint8_t *skin, const uint8_t *im_alpha,
+                              uint8_t *out, npy_intp pixels);
+
+static inline void load4_rgb_to_unit_f32(const uint8_t *p, __m128 inv255,
+                                         __m128 *r, __m128 *g, __m128 *b) {
+    const __m128i src = _mm_loadu_si128((const __m128i*)p);
+    const __m128i mask_r = _mm_setr_epi8(0, 3, 6, 9,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+    const __m128i mask_g = _mm_setr_epi8(1, 4, 7, 10,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+    const __m128i mask_b = _mm_setr_epi8(2, 5, 8, 11,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+
+    __m128i rb = _mm_shuffle_epi8(src, mask_r);
+    __m128i gb = _mm_shuffle_epi8(src, mask_g);
+    __m128i bb = _mm_shuffle_epi8(src, mask_b);
+
+    *r = bytes4_to_unit_f32(rb, inv255);
+    *g = bytes4_to_unit_f32(gb, inv255);
+    *b = bytes4_to_unit_f32(bb, inv255);
+}
+
+static inline void load4_rgba_to_unit_f32(const uint8_t *p, __m128 inv255,
+                                          __m128 *r, __m128 *g, __m128 *b, __m128 *a) {
+    const __m128i src = _mm_loadu_si128((const __m128i*)p);
+    const __m128i mask_r = _mm_setr_epi8(0, 4, 8, 12,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+    const __m128i mask_g = _mm_setr_epi8(1, 5, 9, 13,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+    const __m128i mask_b = _mm_setr_epi8(2, 6, 10, 14,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+    const __m128i mask_a = _mm_setr_epi8(3, 7, 11, 15,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80,
+                                         (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+
+    __m128i rb = _mm_shuffle_epi8(src, mask_r);
+    __m128i gb = _mm_shuffle_epi8(src, mask_g);
+    __m128i bb = _mm_shuffle_epi8(src, mask_b);
+    __m128i ab = _mm_shuffle_epi8(src, mask_a);
+
+    *r = bytes4_to_unit_f32(rb, inv255);
+    *g = bytes4_to_unit_f32(gb, inv255);
+    *b = bytes4_to_unit_f32(bb, inv255);
+    *a = bytes4_to_unit_f32(ab, inv255);
+}
+
+static inline __m256 mul_add_ps256(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+    return _mm256_fmadd_ps(a, b, c);
+#else
+    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
+#endif
+}
+
+static inline __m256 fnmadd_ps256(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+    return _mm256_fnmadd_ps(a, b, c);
+#else
+    return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
+#endif
 }

 /* Convert 8 consecutive u8 to float32 in [0,1] (for grayscale im_alpha). */
-static inline __m256 load8_u8_to_unit_f32_avx2(const uint8_t *p) {
+static inline __m256 load8_u8_to_unit_f32_avx2(const uint8_t *p, __m256 inv255) {
     __m128i v8 = _mm_loadl_epi64((const __m128i*)p); /* 8 bytes -> XMM */
     __m256i v32 = _mm256_cvtepu8_epi32(v8); /* widen to 8 x u32 */
-    return _mm256_mul_ps(_mm256_cvtepi32_ps(v32),
+    return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), inv255);
+}
+
+static inline void load16_u8_to_unit_f32_avx2(const uint8_t *p, __m256 inv255,
+                                              __m256 *lo, __m256 *hi) {
+    __m128i v16 = _mm_loadu_si128((const __m128i*)p); /* 16 bytes */
+    __m256i v32_lo = _mm256_cvtepu8_epi32(v16);
+    __m128i v8_hi = _mm_srli_si128(v16, 8);
+    __m256i v32_hi = _mm256_cvtepu8_epi32(v8_hi);
+    *lo = _mm256_mul_ps(_mm256_cvtepi32_ps(v32_lo), inv255);
+    *hi = _mm256_mul_ps(_mm256_cvtepi32_ps(v32_hi), inv255);
+}
+
+static inline void load16_u8_to_unit_f32_avx2_from_xmm(__m128i v16, __m256 inv255,
+                                                       __m256 *lo, __m256 *hi) {
+    __m256i v32_lo = _mm256_cvtepu8_epi32(v16);
+    __m128i v8_hi = _mm_srli_si128(v16, 8);
+    __m256i v32_hi = _mm256_cvtepu8_epi32(v8_hi);
+    *lo = _mm256_mul_ps(_mm256_cvtepi32_ps(v32_lo), inv255);
+    *hi = _mm256_mul_ps(_mm256_cvtepi32_ps(v32_hi), inv255);
 }

 static inline __m256 clamp01_ps(__m256 x) {
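To make the shuffle-based deinterleave in load4_rgb_to_unit_f32 above easier to follow, here is a small standalone sketch (my own illustration, not code shipped in the wheel) that applies the same _mm_shuffle_epi8 mask trick to four RGB pixels and prints the resulting [0, 1] floats; it assumes an SSE4.1-capable build, e.g. `gcc -msse4.1 demo.c`:

```c
#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_shuffle_epi8 */
#include <smmintrin.h>   /* SSE4.1: _mm_cvtepu8_epi32 */

/* Widen the low 4 bytes to 4 x f32 and scale into [0,1]. */
static __m128 lanes_to_unit_f32(__m128i bytes) {
    return _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepu8_epi32(bytes)),
                      _mm_set1_ps(1.0f / 255.0f));
}

int main(void) {
    /* 4 interleaved RGB pixels plus 4 bytes of padding so the 16-byte load stays in bounds. */
    uint8_t rgb[16] = { 255, 0, 0,   0, 255, 0,   0, 0, 255,   128, 64, 32,
                        0, 0, 0, 0 };
    __m128i src = _mm_loadu_si128((const __m128i *)rgb);
    /* Byte indices 0,3,6,9 / 1,4,7,10 / 2,5,8,11 gather R, G and B; -1 zeroes a lane. */
    const __m128i mask_r = _mm_setr_epi8(0, 3, 6, 9,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m128i mask_g = _mm_setr_epi8(1, 4, 7, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    const __m128i mask_b = _mm_setr_epi8(2, 5, 8, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

    float r[4], g[4], b[4];
    _mm_storeu_ps(r, lanes_to_unit_f32(_mm_shuffle_epi8(src, mask_r)));
    _mm_storeu_ps(g, lanes_to_unit_f32(_mm_shuffle_epi8(src, mask_g)));
    _mm_storeu_ps(b, lanes_to_unit_f32(_mm_shuffle_epi8(src, mask_b)));
    for (int k = 0; k < 4; ++k)
        printf("pixel %d: r=%.3f g=%.3f b=%.3f\n", k, r[k], g[k], b[k]);
    return 0;
}
```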
@@ -381,57 +467,269 @@ static inline __m256 nan_to_num_ps(__m256 x) {
     return _mm256_blendv_ps(_mm256_set1_ps(0.0f), x, cmp);
 }

-/*
-static inline
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+/* Convert 4 float32 RGB vectors in [0,1] to uint8_t RGBRGBRGBRGB without branches. */
+static inline __m128i pack_unit_f32_to_u8_rgb4(__m128 fr, __m128 fg, __m128 fb) {
+    const __m128 scale = _mm_set1_ps(255.0f);
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i max255 = _mm_set1_epi32(255);
+
+    __m128i ir = _mm_cvttps_epi32(_mm_mul_ps(fr, scale));
+    __m128i ig = _mm_cvttps_epi32(_mm_mul_ps(fg, scale));
+    __m128i ib = _mm_cvttps_epi32(_mm_mul_ps(fb, scale));
+
+    ir = _mm_min_epi32(_mm_max_epi32(ir, zero), max255);
+    ig = _mm_min_epi32(_mm_max_epi32(ig, zero), max255);
+    ib = _mm_min_epi32(_mm_max_epi32(ib, zero), max255);
+
+    __m128i ir16 = _mm_packus_epi32(ir, zero);
+    __m128i ig16 = _mm_packus_epi32(ig, zero);
+    __m128i ib16 = _mm_packus_epi32(ib, zero);
+
+    __m128i ir8 = _mm_packus_epi16(ir16, zero);
+    __m128i ig8 = _mm_packus_epi16(ig16, zero);
+    __m128i ib8 = _mm_packus_epi16(ib16, zero);
+
+    const __m128i mask_r = _mm_setr_epi8(
+        0, (char)0x80, (char)0x80, 1,
+        (char)0x80, (char)0x80, 2, (char)0x80,
+        (char)0x80, 3, (char)0x80, (char)0x80,
+        (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+    const __m128i mask_g = _mm_setr_epi8(
+        (char)0x80, 0, (char)0x80, (char)0x80,
+        1, (char)0x80, (char)0x80, 2,
+        (char)0x80, (char)0x80, 3, (char)0x80,
+        (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+    const __m128i mask_b = _mm_setr_epi8(
+        (char)0x80, (char)0x80, 0, (char)0x80,
+        (char)0x80, 1, (char)0x80, (char)0x80,
+        2, (char)0x80, (char)0x80, 3,
+        (char)0x80, (char)0x80, (char)0x80, (char)0x80);
+
+    __m128i packed = _mm_or_si128(
+        _mm_or_si128(_mm_shuffle_epi8(ir8, mask_r),
+                     _mm_shuffle_epi8(ig8, mask_g)),
+        _mm_shuffle_epi8(ib8, mask_b));
+
+    return packed;
+}
+
+static inline void store_unit_f32_to_u8_rgb4(__m128 fr, __m128 fg, __m128 fb,
+                                             uint8_t *out_ptr) {
+    __m128i packed = pack_unit_f32_to_u8_rgb4(fr, fg, fb);
+    _mm_storel_epi64((__m128i*)out_ptr, packed);
+    __m128i tail_vec = _mm_srli_si128(packed, 8);
+    uint32_t tail = (uint32_t)_mm_cvtsi128_si32(tail_vec);
+    memcpy(out_ptr + 8, &tail, sizeof(tail));
+}
+
+static inline void store_unit_f32_to_u8_rgb4_u16(__m128 fr, __m128 fg, __m128 fb,
+                                                 uint8_t *out_ptr) {
+    __m128i packed = pack_unit_f32_to_u8_rgb4(fr, fg, fb);
+    _mm_storeu_si128((__m128i*)out_ptr, packed);
 }

 /* texture is RGB: texture_alpha = im_alpha broadcast, inverse_tpa = 1 - texture_alpha */
 static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
                             const uint8_t *skin, const uint8_t *im_alpha,
                             uint8_t *out, npy_intp pixels) {
-    const
+    const __m256 inv255 = _mm256_set1_ps(1.0f/255.0f);
+    const __m128 inv255_128 = _mm_set1_ps(1.0f/255.0f);
     const __m256 half = _mm256_set1_ps(0.5f);
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 w = _mm256_set1_ps((float)SKIN_WEIGHT);
     const __m256 invw = _mm256_set1_ps(1.0f - (float)SKIN_WEIGHT);

     npy_intp i = 0;
-    for (; i +
+    for (; i + 18 <= pixels; i += 16) {
+        if (i + 32 < pixels) {
+            _mm_prefetch((const char*)(base + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(texture + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(skin + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(im_alpha + (i + 32)), _MM_HINT_T0);
+        }
+
+        const uint8_t *base_blk = base + 3*i;
+        const uint8_t *tex_blk = texture + 3*i;
+        const uint8_t *skin_blk = skin + 3*i;
+
+        __m128i a16 = _mm_loadu_si128((const __m128i*)(im_alpha + i));
+        __m128i a_zero = _mm_cmpeq_epi8(a16, _mm_setzero_si128());
+        if (_mm_movemask_epi8(a_zero) == 0xFFFF) {
+            memcpy(out + 3*i, base_blk, 48);
+            continue;
+        }
+
+        __m256 fa_im0, fa_im1;
+        load16_u8_to_unit_f32_avx2_from_xmm(a16, inv255, &fa_im0, &fa_im1);
+        __m256 fit_a0 = fnmadd_ps256(fa_im0, one, one);
+        __m256 fit_a1 = fnmadd_ps256(fa_im1, one, one);
+
+        /* base RGB in [0,1] */
+        __m128 fb_r0, fb_g0, fb_b0;
+        __m128 fb_r1, fb_g1, fb_b1;
+        load4_rgb_to_unit_f32(base_blk, inv255_128, &fb_r0, &fb_g0, &fb_b0);
+        load4_rgb_to_unit_f32(base_blk + 12, inv255_128, &fb_r1, &fb_g1, &fb_b1);
+        __m256 fb_r = _mm256_set_m128(fb_r1, fb_r0);
+        __m256 fb_g = _mm256_set_m128(fb_g1, fb_g0);
+        __m256 fb_b = _mm256_set_m128(fb_b1, fb_b0);
+
+        __m128 fb_r2, fb_g2, fb_b2;
+        __m128 fb_r3, fb_g3, fb_b3;
+        load4_rgb_to_unit_f32(base_blk + 24, inv255_128, &fb_r2, &fb_g2, &fb_b2);
+        load4_rgb_to_unit_f32(base_blk + 36, inv255_128, &fb_r3, &fb_g3, &fb_b3);
+        __m256 fb_r_2 = _mm256_set_m128(fb_r3, fb_r2);
+        __m256 fb_g_2 = _mm256_set_m128(fb_g3, fb_g2);
+        __m256 fb_b_2 = _mm256_set_m128(fb_b3, fb_b2);
+
+        /* texture RGB in [0,1] */
+        __m128 ft_r0, ft_g0, ft_b0;
+        __m128 ft_r1, ft_g1, ft_b1;
+        load4_rgb_to_unit_f32(tex_blk, inv255_128, &ft_r0, &ft_g0, &ft_b0);
+        load4_rgb_to_unit_f32(tex_blk + 12, inv255_128, &ft_r1, &ft_g1, &ft_b1);
+        __m256 ft_r = _mm256_set_m128(ft_r1, ft_r0);
+        __m256 ft_g = _mm256_set_m128(ft_g1, ft_g0);
+        __m256 ft_b = _mm256_set_m128(ft_b1, ft_b0);
+
+        __m128 ft_r2, ft_g2, ft_b2;
+        __m128 ft_r3, ft_g3, ft_b3;
+        load4_rgb_to_unit_f32(tex_blk + 24, inv255_128, &ft_r2, &ft_g2, &ft_b2);
+        load4_rgb_to_unit_f32(tex_blk + 36, inv255_128, &ft_r3, &ft_g3, &ft_b3);
+        __m256 ft_r_2 = _mm256_set_m128(ft_r3, ft_r2);
+        __m256 ft_g_2 = _mm256_set_m128(ft_g3, ft_g2);
+        __m256 ft_b_2 = _mm256_set_m128(ft_b3, ft_b2);
+
+        /* skin RGB in [0,1] */
+        __m128 fs_r0, fs_g0, fs_b0;
+        __m128 fs_r1, fs_g1, fs_b1;
+        load4_rgb_to_unit_f32(skin_blk, inv255_128, &fs_r0, &fs_g0, &fs_b0);
+        load4_rgb_to_unit_f32(skin_blk + 12, inv255_128, &fs_r1, &fs_g1, &fs_b1);
+        __m256 fs_r = _mm256_set_m128(fs_r1, fs_r0);
+        __m256 fs_g = _mm256_set_m128(fs_g1, fs_g0);
+        __m256 fs_b = _mm256_set_m128(fs_b1, fs_b0);
+
+        __m128 fs_r2, fs_g2, fs_b2;
+        __m128 fs_r3, fs_g3, fs_b3;
+        load4_rgb_to_unit_f32(skin_blk + 24, inv255_128, &fs_r2, &fs_g2, &fs_b2);
+        load4_rgb_to_unit_f32(skin_blk + 36, inv255_128, &fs_r3, &fs_g3, &fs_b3);
+        __m256 fs_r_2 = _mm256_set_m128(fs_r3, fs_r2);
+        __m256 fs_g_2 = _mm256_set_m128(fs_g3, fs_g2);
+        __m256 fs_b_2 = _mm256_set_m128(fs_b3, fs_b2);
+
+        /* gm_out = clip(texture + skin - 0.5) */
+        __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
+        __m256 gm_g = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g, fs_g), half));
+        __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));
+
+        __m256 gm_r2 = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r_2, fs_r_2), half));
+        __m256 gm_g2 = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g_2, fs_g_2), half));
+        __m256 gm_b2 = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b_2, fs_b_2), half));
+
+        /* gm_out = gm_out * texture_alpha + texture * inverse_tpa */
+        gm_r = mul_add_ps256(gm_r, fa_im0, _mm256_mul_ps(ft_r, fit_a0));
+        gm_g = mul_add_ps256(gm_g, fa_im0, _mm256_mul_ps(ft_g, fit_a0));
+        gm_b = mul_add_ps256(gm_b, fa_im0, _mm256_mul_ps(ft_b, fit_a0));
+
+        gm_r2 = mul_add_ps256(gm_r2, fa_im1, _mm256_mul_ps(ft_r_2, fit_a1));
+        gm_g2 = mul_add_ps256(gm_g2, fa_im1, _mm256_mul_ps(ft_g_2, fit_a1));
+        gm_b2 = mul_add_ps256(gm_b2, fa_im1, _mm256_mul_ps(ft_b_2, fit_a1));
+
+        /* gm_out = gm_out * (1 - w) + skin * w */
+        gm_r = mul_add_ps256(gm_r, invw, _mm256_mul_ps(fs_r, w));
+        gm_g = mul_add_ps256(gm_g, invw, _mm256_mul_ps(fs_g, w));
+        gm_b = mul_add_ps256(gm_b, invw, _mm256_mul_ps(fs_b, w));
+
+        gm_r2 = mul_add_ps256(gm_r2, invw, _mm256_mul_ps(fs_r_2, w));
+        gm_g2 = mul_add_ps256(gm_g2, invw, _mm256_mul_ps(fs_g_2, w));
+        gm_b2 = mul_add_ps256(gm_b2, invw, _mm256_mul_ps(fs_b_2, w));
+
+        /* nan_to_num */
+        gm_r = nan_to_num_ps(gm_r);
+        gm_g = nan_to_num_ps(gm_g);
+        gm_b = nan_to_num_ps(gm_b);
+
+        gm_r2 = nan_to_num_ps(gm_r2);
+        gm_g2 = nan_to_num_ps(gm_g2);
+        gm_b2 = nan_to_num_ps(gm_b2);
+
+        /* n_out = gm_out * texture_alpha + base * inverse_tpa */
+        __m256 fr = mul_add_ps256(gm_r, fa_im0, _mm256_mul_ps(fb_r, fit_a0));
+        __m256 fg = mul_add_ps256(gm_g, fa_im0, _mm256_mul_ps(fb_g, fit_a0));
+        __m256 fb = mul_add_ps256(gm_b, fa_im0, _mm256_mul_ps(fb_b, fit_a0));
+
+        __m256 fr2 = mul_add_ps256(gm_r2, fa_im1, _mm256_mul_ps(fb_r_2, fit_a1));
+        __m256 fg2 = mul_add_ps256(gm_g2, fa_im1, _mm256_mul_ps(fb_g_2, fit_a1));
+        __m256 fb2 = mul_add_ps256(gm_b2, fa_im1, _mm256_mul_ps(fb_b_2, fit_a1));
+
+        __m128 fr_lo = _mm256_castps256_ps128(fr);
+        __m128 fg_lo = _mm256_castps256_ps128(fg);
+        __m128 fb_lo = _mm256_castps256_ps128(fb);
+        store_unit_f32_to_u8_rgb4_u16(fr_lo, fg_lo, fb_lo, out + 3*i);
+
+        __m128 fr_hi = _mm256_extractf128_ps(fr, 1);
+        __m128 fg_hi = _mm256_extractf128_ps(fg, 1);
+        __m128 fb_hi = _mm256_extractf128_ps(fb, 1);
+        store_unit_f32_to_u8_rgb4_u16(fr_hi, fg_hi, fb_hi, out + 3*i + 12);
+
+        __m128 fr2_lo = _mm256_castps256_ps128(fr2);
+        __m128 fg2_lo = _mm256_castps256_ps128(fg2);
+        __m128 fb2_lo = _mm256_castps256_ps128(fb2);
+        store_unit_f32_to_u8_rgb4_u16(fr2_lo, fg2_lo, fb2_lo, out + 3*i + 24);
+
+        __m128 fr2_hi = _mm256_extractf128_ps(fr2, 1);
+        __m128 fg2_hi = _mm256_extractf128_ps(fg2, 1);
+        __m128 fb2_hi = _mm256_extractf128_ps(fb2, 1);
+        store_unit_f32_to_u8_rgb4_u16(fr2_hi, fg2_hi, fb2_hi, out + 3*i + 36);
+    }
+
+    for (; i + 10 <= pixels; i += 8) {
+        const uint8_t *base_blk = base + 3*i;
+        const uint8_t *tex_blk = texture + 3*i;
+        const uint8_t *skin_blk = skin + 3*i;
+
         /* base RGB in [0,1] */
-
-
-
+        __m128 fb_r0, fb_g0, fb_b0;
+        __m128 fb_r1, fb_g1, fb_b1;
+        load4_rgb_to_unit_f32(base_blk, inv255_128, &fb_r0, &fb_g0, &fb_b0);
+        load4_rgb_to_unit_f32(base_blk + 12, inv255_128, &fb_r1, &fb_g1, &fb_b1);
+        __m256 fb_r = _mm256_set_m128(fb_r1, fb_r0);
+        __m256 fb_g = _mm256_set_m128(fb_g1, fb_g0);
+        __m256 fb_b = _mm256_set_m128(fb_b1, fb_b0);

         /* texture RGB in [0,1] */
-
-
-
+        __m128 ft_r0, ft_g0, ft_b0;
+        __m128 ft_r1, ft_g1, ft_b1;
+        load4_rgb_to_unit_f32(tex_blk, inv255_128, &ft_r0, &ft_g0, &ft_b0);
+        load4_rgb_to_unit_f32(tex_blk + 12, inv255_128, &ft_r1, &ft_g1, &ft_b1);
+        __m256 ft_r = _mm256_set_m128(ft_r1, ft_r0);
+        __m256 ft_g = _mm256_set_m128(ft_g1, ft_g0);
+        __m256 ft_b = _mm256_set_m128(ft_b1, ft_b0);

         /* skin RGB in [0,1] */
-
-
-
+        __m128 fs_r0, fs_g0, fs_b0;
+        __m128 fs_r1, fs_g1, fs_b1;
+        load4_rgb_to_unit_f32(skin_blk, inv255_128, &fs_r0, &fs_g0, &fs_b0);
+        load4_rgb_to_unit_f32(skin_blk + 12, inv255_128, &fs_r1, &fs_g1, &fs_b1);
+        __m256 fs_r = _mm256_set_m128(fs_r1, fs_r0);
+        __m256 fs_g = _mm256_set_m128(fs_g1, fs_g0);
+        __m256 fs_b = _mm256_set_m128(fs_b1, fs_b0);
+
+        if (i + 32 < pixels) {
+            _mm_prefetch((const char*)(base + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(texture + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(skin + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(im_alpha + (i + 32)), _MM_HINT_T0);
+        }
+
+        __m128i a8 = _mm_loadl_epi64((const __m128i*)(im_alpha + i));
+        __m128i a_zero = _mm_cmpeq_epi8(a8, _mm_setzero_si128());
+        if (_mm_movemask_epi8(a_zero) == 0xFFFF) {
+            memcpy(out + 3*i, base_blk, 24);
+            continue;
+        }

         /* texture_alpha = im_alpha */
-        __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i);
-        __m256 fit_a =
+        __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i, inv255);
+        __m256 fit_a = fnmadd_ps256(fa_im, one, one);

         /* gm_out = clip(texture + skin - 0.5) */
         __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
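The comments inside kernel_avx2_rgb above spell out the blend as four steps. For reference, here is a scalar single-channel version of the same arithmetic (a sketch of mine, not a function in the package; `w` stands in for the SKIN_WEIGHT constant, which the README formula shows as 0.3):

```c
#include <math.h>
#include <stdio.h>

static float clamp01(float x) { return x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x); }

/* t, s, b = texture, skin, base channel values in [0,1]; ta = texture_alpha. */
static float normal_grain_merge_channel(float t, float s, float b,
                                        float ta, float w) {
    float gm = clamp01(t + s - 0.5f);   /* grain merge, clipped to [0,1] */
    gm = gm * ta + t * (1.0f - ta);     /* blend grain merge over the texture */
    gm = gm * (1.0f - w) + s * w;       /* mix a little skin back in */
    if (isnan(gm)) gm = 0.0f;           /* nan_to_num */
    return gm * ta + b * (1.0f - ta);   /* normal merge over the base */
}

int main(void) {
    /* Example values only; matches the worked README formula with w = 0.3. */
    printf("%.3f\n", normal_grain_merge_channel(0.6f, 0.5f, 0.2f, 0.5f, 0.3f)); /* 0.385 */
    return 0;
}
```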
@@ -439,14 +737,14 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
         __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));

         /* gm_out = gm_out * texture_alpha + texture * inverse_tpa */
-        gm_r =
-        gm_g =
-        gm_b =
+        gm_r = mul_add_ps256(gm_r, fa_im, _mm256_mul_ps(ft_r, fit_a));
+        gm_g = mul_add_ps256(gm_g, fa_im, _mm256_mul_ps(ft_g, fit_a));
+        gm_b = mul_add_ps256(gm_b, fa_im, _mm256_mul_ps(ft_b, fit_a));

         /* gm_out = gm_out * (1 - w) + skin * w */
-        gm_r =
-        gm_g =
-        gm_b =
+        gm_r = mul_add_ps256(gm_r, invw, _mm256_mul_ps(fs_r, w));
+        gm_g = mul_add_ps256(gm_g, invw, _mm256_mul_ps(fs_g, w));
+        gm_b = mul_add_ps256(gm_b, invw, _mm256_mul_ps(fs_b, w));

         /* nan_to_num */
         gm_r = nan_to_num_ps(gm_r);
@@ -454,16 +752,30 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
         gm_b = nan_to_num_ps(gm_b);

         /* n_out = gm_out * texture_alpha + base * inverse_tpa */
-        __m256 fr =
-        __m256 fg =
-        __m256 fb =
-
-
+        __m256 fr = mul_add_ps256(gm_r, fa_im, _mm256_mul_ps(fb_r, fit_a));
+        __m256 fg = mul_add_ps256(gm_g, fa_im, _mm256_mul_ps(fb_g, fit_a));
+        __m256 fb = mul_add_ps256(gm_b, fa_im, _mm256_mul_ps(fb_b, fit_a));
+
+        __m128 fr_lo = _mm256_castps256_ps128(fr);
+        __m128 fg_lo = _mm256_castps256_ps128(fg);
+        __m128 fb_lo = _mm256_castps256_ps128(fb);
+        store_unit_f32_to_u8_rgb4_u16(fr_lo, fg_lo, fb_lo, out + 3*i);
+
+        __m128 fr_hi = _mm256_extractf128_ps(fr, 1);
+        __m128 fg_hi = _mm256_extractf128_ps(fg, 1);
+        __m128 fb_hi = _mm256_extractf128_ps(fb, 1);
+        store_unit_f32_to_u8_rgb4_u16(fr_hi, fg_hi, fb_hi, out + 3*i + 12);
     }

     if (i < pixels) {
-
-
+        npy_intp rem = pixels - i;
+        if (rem >= 6) {
+            kernel_sse42_rgb(base + 3*i, texture + 3*i, skin + 3*i, im_alpha + i,
+                             out + 3*i, rem);
+        } else {
+            kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 3*i, im_alpha + i,
+                              out + 3*i, rem);
+        }
     }
 }

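Both AVX2 kernels (and the SSE4.2 kernels further down) open each block with the same "all alpha bytes are zero" fast path: compare the alpha bytes against zero and test the movemask, then just copy the base pixels through. Shown in isolation as a hedged sketch of that idiom, not package code:

```c
#include <stdio.h>
#include <stdint.h>
#include <emmintrin.h>   /* SSE2: _mm_cmpeq_epi8 / _mm_movemask_epi8 */

static int all_zero_16(const uint8_t *p) {
    __m128i v    = _mm_loadu_si128((const __m128i *)p);
    __m128i zero = _mm_cmpeq_epi8(v, _mm_setzero_si128()); /* 0xFF per zero byte */
    return _mm_movemask_epi8(zero) == 0xFFFF;              /* one mask bit per byte */
}

int main(void) {
    uint8_t a[16] = {0};
    uint8_t b[16] = {0};
    b[7] = 1;
    printf("%d %d\n", all_zero_16(a), all_zero_16(b));  /* prints: 1 0 */
    return 0;
}
```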
@@ -471,68 +783,253 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
 static void kernel_avx2_rgba(const uint8_t *base, const uint8_t *texture,
                              const uint8_t *skin, const uint8_t *im_alpha,
                              uint8_t *out, npy_intp pixels) {
-    const
+    const __m256 inv255 = _mm256_set1_ps(1.0f/255.0f);
+    const __m128 inv255_128 = _mm_set1_ps(1.0f/255.0f);
     const __m256 half = _mm256_set1_ps(0.5f);
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 w = _mm256_set1_ps((float)SKIN_WEIGHT);
     const __m256 invw = _mm256_set1_ps(1.0f - (float)SKIN_WEIGHT);

     npy_intp i = 0;
+    for (; i + 16 <= pixels; i += 16) {
+        if (i + 32 < pixels) {
+            _mm_prefetch((const char*)(base + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(texture + 4*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(skin + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(im_alpha + (i + 32)), _MM_HINT_T0);
+        }
+
+        const uint8_t *base_blk = base + 3*i;
+        const uint8_t *tex_blk = texture + 4*i;
+        const uint8_t *skin_blk = skin + 3*i;
+
+        __m128i a16 = _mm_loadu_si128((const __m128i*)(im_alpha + i));
+        __m128i a_zero = _mm_cmpeq_epi8(a16, _mm_setzero_si128());
+        if (_mm_movemask_epi8(a_zero) == 0xFFFF) {
+            memcpy(out + 3*i, base_blk, 48);
+            continue;
+        }
+
+        __m128i a_ff = _mm_cmpeq_epi8(a16, _mm_set1_epi8((char)0xFF));
+        const int all_ff = (_mm_movemask_epi8(a_ff) == 0xFFFF);
+        __m256 fa_im0, fa_im1;
+        if (all_ff) {
+            fa_im0 = _mm256_set1_ps(1.0f);
+            fa_im1 = _mm256_set1_ps(1.0f);
+        } else {
+            load16_u8_to_unit_f32_avx2_from_xmm(a16, inv255, &fa_im0, &fa_im1);
+        }
+
+        __m128 fb_r0, fb_g0, fb_b0;
+        __m128 fb_r1, fb_g1, fb_b1;
+        load4_rgb_to_unit_f32(base_blk, inv255_128, &fb_r0, &fb_g0, &fb_b0);
+        load4_rgb_to_unit_f32(base_blk + 12, inv255_128, &fb_r1, &fb_g1, &fb_b1);
+        __m256 fb_r = _mm256_set_m128(fb_r1, fb_r0);
+        __m256 fb_g = _mm256_set_m128(fb_g1, fb_g0);
+        __m256 fb_b = _mm256_set_m128(fb_b1, fb_b0);
+
+        __m128 ft_r0, ft_g0, ft_b0, ft_a0;
+        __m128 ft_r1, ft_g1, ft_b1, ft_a1;
+        load4_rgba_to_unit_f32(tex_blk, inv255_128, &ft_r0, &ft_g0, &ft_b0, &ft_a0);
+        load4_rgba_to_unit_f32(tex_blk + 16, inv255_128, &ft_r1, &ft_g1, &ft_b1, &ft_a1);
+        __m256 ft_r = _mm256_set_m128(ft_r1, ft_r0);
+        __m256 ft_g = _mm256_set_m128(ft_g1, ft_g0);
+        __m256 ft_b = _mm256_set_m128(ft_b1, ft_b0);
+        __m256 ft_a = _mm256_set_m128(ft_a1, ft_a0); /* texture alpha */
+
+        __m128 fs_r0, fs_g0, fs_b0;
+        __m128 fs_r1, fs_g1, fs_b1;
+        load4_rgb_to_unit_f32(skin_blk, inv255_128, &fs_r0, &fs_g0, &fs_b0);
+        load4_rgb_to_unit_f32(skin_blk + 12, inv255_128, &fs_r1, &fs_g1, &fs_b1);
+        __m256 fs_r = _mm256_set_m128(fs_r1, fs_r0);
+        __m256 fs_g = _mm256_set_m128(fs_g1, fs_g0);
+        __m256 fs_b = _mm256_set_m128(fs_b1, fs_b0);
+
+        __m256 fta = all_ff ? ft_a : _mm256_mul_ps(ft_a, fa_im0); /* texture_alpha */
+        __m256 fit_a = fnmadd_ps256(fta, one, one); /* inverse_tpa */
+
+        __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
+        __m256 gm_g = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g, fs_g), half));
+        __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));
+
+        gm_r = mul_add_ps256(gm_r, fta, _mm256_mul_ps(ft_r, fit_a));
+        gm_g = mul_add_ps256(gm_g, fta, _mm256_mul_ps(ft_g, fit_a));
+        gm_b = mul_add_ps256(gm_b, fta, _mm256_mul_ps(ft_b, fit_a));
+
+        gm_r = mul_add_ps256(gm_r, invw, _mm256_mul_ps(fs_r, w));
+        gm_g = mul_add_ps256(gm_g, invw, _mm256_mul_ps(fs_g, w));
+        gm_b = mul_add_ps256(gm_b, invw, _mm256_mul_ps(fs_b, w));
+
+        gm_r = nan_to_num_ps(gm_r);
+        gm_g = nan_to_num_ps(gm_g);
+        gm_b = nan_to_num_ps(gm_b);
+
+        __m256 fr = mul_add_ps256(gm_r, fta, _mm256_mul_ps(fb_r, fit_a));
+        __m256 fg = mul_add_ps256(gm_g, fta, _mm256_mul_ps(fb_g, fit_a));
+        __m256 fb = mul_add_ps256(gm_b, fta, _mm256_mul_ps(fb_b, fit_a));
+
+        __m128 fr_lo = _mm256_castps256_ps128(fr);
+        __m128 fg_lo = _mm256_castps256_ps128(fg);
+        __m128 fb_lo = _mm256_castps256_ps128(fb);
+        store_unit_f32_to_u8_rgb4_u16(fr_lo, fg_lo, fb_lo, out + 3*i);
+
+        __m128 fr_hi = _mm256_extractf128_ps(fr, 1);
+        __m128 fg_hi = _mm256_extractf128_ps(fg, 1);
+        __m128 fb_hi = _mm256_extractf128_ps(fb, 1);
+        store_unit_f32_to_u8_rgb4(fr_hi, fg_hi, fb_hi, out + 3*i + 12);
+
+        __m128 fb_r2, fb_g2, fb_b2;
+        __m128 fb_r3, fb_g3, fb_b3;
+        load4_rgb_to_unit_f32(base_blk + 24, inv255_128, &fb_r2, &fb_g2, &fb_b2);
+        load4_rgb_to_unit_f32(base_blk + 36, inv255_128, &fb_r3, &fb_g3, &fb_b3);
+        __m256 fb_r_2 = _mm256_set_m128(fb_r3, fb_r2);
+        __m256 fb_g_2 = _mm256_set_m128(fb_g3, fb_g2);
+        __m256 fb_b_2 = _mm256_set_m128(fb_b3, fb_b2);
+
+        __m128 ft_r2, ft_g2, ft_b2, ft_a2;
+        __m128 ft_r3, ft_g3, ft_b3, ft_a3;
+        load4_rgba_to_unit_f32(tex_blk + 32, inv255_128, &ft_r2, &ft_g2, &ft_b2, &ft_a2);
+        load4_rgba_to_unit_f32(tex_blk + 48, inv255_128, &ft_r3, &ft_g3, &ft_b3, &ft_a3);
+        __m256 ft_r_2 = _mm256_set_m128(ft_r3, ft_r2);
+        __m256 ft_g_2 = _mm256_set_m128(ft_g3, ft_g2);
+        __m256 ft_b_2 = _mm256_set_m128(ft_b3, ft_b2);
+        __m256 ft_a_2 = _mm256_set_m128(ft_a3, ft_a2);
+
+        __m128 fs_r2, fs_g2, fs_b2;
+        __m128 fs_r3, fs_g3, fs_b3;
+        load4_rgb_to_unit_f32(skin_blk + 24, inv255_128, &fs_r2, &fs_g2, &fs_b2);
+        load4_rgb_to_unit_f32(skin_blk + 36, inv255_128, &fs_r3, &fs_g3, &fs_b3);
+        __m256 fs_r_2 = _mm256_set_m128(fs_r3, fs_r2);
+        __m256 fs_g_2 = _mm256_set_m128(fs_g3, fs_g2);
+        __m256 fs_b_2 = _mm256_set_m128(fs_b3, fs_b2);
+
+        __m256 fta2 = all_ff ? ft_a_2 : _mm256_mul_ps(ft_a_2, fa_im1);
+        __m256 fit_a2 = fnmadd_ps256(fta2, one, one);
+
+        __m256 gm_r2 = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r_2, fs_r_2), half));
+        __m256 gm_g2 = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g_2, fs_g_2), half));
+        __m256 gm_b2 = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b_2, fs_b_2), half));
+
+        gm_r2 = mul_add_ps256(gm_r2, fta2, _mm256_mul_ps(ft_r_2, fit_a2));
+        gm_g2 = mul_add_ps256(gm_g2, fta2, _mm256_mul_ps(ft_g_2, fit_a2));
+        gm_b2 = mul_add_ps256(gm_b2, fta2, _mm256_mul_ps(ft_b_2, fit_a2));
+
+        gm_r2 = mul_add_ps256(gm_r2, invw, _mm256_mul_ps(fs_r_2, w));
+        gm_g2 = mul_add_ps256(gm_g2, invw, _mm256_mul_ps(fs_g_2, w));
+        gm_b2 = mul_add_ps256(gm_b2, invw, _mm256_mul_ps(fs_b_2, w));
+
+        gm_r2 = nan_to_num_ps(gm_r2);
+        gm_g2 = nan_to_num_ps(gm_g2);
+        gm_b2 = nan_to_num_ps(gm_b2);
+
+        __m256 fr2 = mul_add_ps256(gm_r2, fta2, _mm256_mul_ps(fb_r_2, fit_a2));
+        __m256 fg2 = mul_add_ps256(gm_g2, fta2, _mm256_mul_ps(fb_g_2, fit_a2));
+        __m256 fb2 = mul_add_ps256(gm_b2, fta2, _mm256_mul_ps(fb_b_2, fit_a2));
+
+        __m128 fr2_lo = _mm256_castps256_ps128(fr2);
+        __m128 fg2_lo = _mm256_castps256_ps128(fg2);
+        __m128 fb2_lo = _mm256_castps256_ps128(fb2);
+        store_unit_f32_to_u8_rgb4_u16(fr2_lo, fg2_lo, fb2_lo, out + 3*i + 24);
+
+        __m128 fr2_hi = _mm256_extractf128_ps(fr2, 1);
+        __m128 fg2_hi = _mm256_extractf128_ps(fg2, 1);
+        __m128 fb2_hi = _mm256_extractf128_ps(fb2, 1);
+        store_unit_f32_to_u8_rgb4(fr2_hi, fg2_hi, fb2_hi, out + 3*i + 36);
+    }
+
     for (; i + 8 <= pixels; i += 8) {
-
-
-
+        if (i + 32 < pixels) {
+            _mm_prefetch((const char*)(base + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(texture + 4*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(skin + 3*(i + 32)), _MM_HINT_T0);
+            _mm_prefetch((const char*)(im_alpha + (i + 32)), _MM_HINT_T0);
+        }

-
-
-
-        __m256 ft_a = gather_u8_to_unit_f32_avx2(texture+3, stride4, i); /* texture alpha */
+        const uint8_t *base_blk = base + 3*i;
+        const uint8_t *tex_blk = texture + 4*i;
+        const uint8_t *skin_blk = skin + 3*i;

-
-
-
+        __m128i a8 = _mm_loadl_epi64((const __m128i*)(im_alpha + i));
+        __m128i a_zero = _mm_cmpeq_epi8(a8, _mm_setzero_si128());
+        if (_mm_movemask_epi8(a_zero) == 0xFFFF) {
+            memcpy(out + 3*i, base_blk, 24);
+            continue;
+        }

-
-
-
+        __m128 fb_r0, fb_g0, fb_b0;
+        __m128 fb_r1, fb_g1, fb_b1;
+        load4_rgb_to_unit_f32(base_blk, inv255_128, &fb_r0, &fb_g0, &fb_b0);
+        load4_rgb_to_unit_f32(base_blk + 12, inv255_128, &fb_r1, &fb_g1, &fb_b1);
+        __m256 fb_r = _mm256_set_m128(fb_r1, fb_r0);
+        __m256 fb_g = _mm256_set_m128(fb_g1, fb_g0);
+        __m256 fb_b = _mm256_set_m128(fb_b1, fb_b0);
+
+        __m128 ft_r0, ft_g0, ft_b0, ft_a0;
+        __m128 ft_r1, ft_g1, ft_b1, ft_a1;
+        load4_rgba_to_unit_f32(tex_blk, inv255_128, &ft_r0, &ft_g0, &ft_b0, &ft_a0);
+        load4_rgba_to_unit_f32(tex_blk + 16, inv255_128, &ft_r1, &ft_g1, &ft_b1, &ft_a1);
+        __m256 ft_r = _mm256_set_m128(ft_r1, ft_r0);
+        __m256 ft_g = _mm256_set_m128(ft_g1, ft_g0);
+        __m256 ft_b = _mm256_set_m128(ft_b1, ft_b0);
+        __m256 ft_a = _mm256_set_m128(ft_a1, ft_a0);
+
+        __m128 fs_r0, fs_g0, fs_b0;
+        __m128 fs_r1, fs_g1, fs_b1;
+        load4_rgb_to_unit_f32(skin_blk, inv255_128, &fs_r0, &fs_g0, &fs_b0);
+        load4_rgb_to_unit_f32(skin_blk + 12, inv255_128, &fs_r1, &fs_g1, &fs_b1);
+        __m256 fs_r = _mm256_set_m128(fs_r1, fs_r0);
+        __m256 fs_g = _mm256_set_m128(fs_g1, fs_g0);
+        __m256 fs_b = _mm256_set_m128(fs_b1, fs_b0);
+
+        __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i, inv255);
+        __m256 fta = _mm256_mul_ps(ft_a, fa_im);
+        __m256 fit_a = fnmadd_ps256(fta, one, one);

         __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
         __m256 gm_g = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g, fs_g), half));
         __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));

-        gm_r =
-        gm_g =
-        gm_b =
+        gm_r = mul_add_ps256(gm_r, fta, _mm256_mul_ps(ft_r, fit_a));
+        gm_g = mul_add_ps256(gm_g, fta, _mm256_mul_ps(ft_g, fit_a));
+        gm_b = mul_add_ps256(gm_b, fta, _mm256_mul_ps(ft_b, fit_a));

-        gm_r =
-        gm_g =
-        gm_b =
+        gm_r = mul_add_ps256(gm_r, invw, _mm256_mul_ps(fs_r, w));
+        gm_g = mul_add_ps256(gm_g, invw, _mm256_mul_ps(fs_g, w));
+        gm_b = mul_add_ps256(gm_b, invw, _mm256_mul_ps(fs_b, w));

         gm_r = nan_to_num_ps(gm_r);
         gm_g = nan_to_num_ps(gm_g);
         gm_b = nan_to_num_ps(gm_b);

-        __m256 fr =
-        __m256 fg =
-        __m256 fb =
+        __m256 fr = mul_add_ps256(gm_r, fta, _mm256_mul_ps(fb_r, fit_a));
+        __m256 fg = mul_add_ps256(gm_g, fta, _mm256_mul_ps(fb_g, fit_a));
+        __m256 fb = mul_add_ps256(gm_b, fta, _mm256_mul_ps(fb_b, fit_a));
+
+        __m128 fr_lo = _mm256_castps256_ps128(fr);
+        __m128 fg_lo = _mm256_castps256_ps128(fg);
+        __m128 fb_lo = _mm256_castps256_ps128(fb);
+        store_unit_f32_to_u8_rgb4_u16(fr_lo, fg_lo, fb_lo, out + 3*i);

-
+        __m128 fr_hi = _mm256_extractf128_ps(fr, 1);
+        __m128 fg_hi = _mm256_extractf128_ps(fg, 1);
+        __m128 fb_hi = _mm256_extractf128_ps(fb, 1);
+        store_unit_f32_to_u8_rgb4(fr_hi, fg_hi, fb_hi, out + 3*i + 12);
     }

     if (i < pixels) {
-
-
+        npy_intp rem = pixels - i;
+        if (rem >= 4) {
+            kernel_sse42_rgba(base + 3*i, texture + 4*i, skin + 3*i, im_alpha + i,
+                              out + 3*i, rem);
+        } else {
+            kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 3*i, im_alpha + i,
+                               out + 3*i, rem);
+        }
     }
 }

 /* ---------- SSE4.2 skeleton (process 4 pixels via manual loads) ---------- */

-/* 4-lane u8->f32 [0,1] from scalar bytes (works with interleaved strides) */
-static inline __m128 u8x4_to_unit_f32(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
-    __m128i vi = _mm_setr_epi32((int)a, (int)b, (int)c, (int)d);
-    return _mm_mul_ps(_mm_cvtepi32_ps(vi), _mm_set1_ps(1.0f/255.0f));
-}
-
 static inline __m128 load4_u8_to_unit_f32(const uint8_t *p) {
     /* p[0..3] are consecutive bytes (for im_alpha) */
     __m128i v8 = _mm_cvtsi32_si128(*(const int*)p); /* 4 bytes into xmm */
@@ -550,6 +1047,13 @@ static inline __m128 nan_to_num_ps128(__m128 x) {
     return _mm_blendv_ps(_mm_set1_ps(0.0f), x, cmp);
 }

+static inline __m128 mul_add_ps128(__m128 a, __m128 b, __m128 c) {
+#ifdef __FMA__
+    return _mm_fmadd_ps(a, b, c);
+#else
+    return _mm_add_ps(_mm_mul_ps(a, b), c);
+#endif
+}

 static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
                              const uint8_t *skin, const uint8_t *im_alpha,
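mul_add_ps128 above (like mul_add_ps256 earlier) keeps a non-FMA fallback behind #ifdef __FMA__. The two branches are not bit-identical: a fused multiply-add rounds once, while multiply-then-add rounds twice. A tiny standalone illustration of that difference (mine, not package code):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    /* Chosen so a*b is not exactly representable in float. */
    float a = 1.0f + 0x1.0p-23f;
    float b = 1.0f - 0x1.0p-23f;
    float c = -1.0f;
    float fused   = fmaf(a, b, c);  /* single rounding: about -2^-46 */
    float twostep = a * b + c;      /* a*b rounds to 1.0f first, so this is 0.0f */
    /* Build with FMA contraction disabled (e.g. gcc -ffp-contract=off, link -lm)
     * to keep the compiler from fusing the second expression too. */
    printf("fused=%g two-step=%g\n", fused, twostep);
    return 0;
}
```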
@@ -558,29 +1062,23 @@ static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
     const __m128 one = _mm_set1_ps(1.0f);
     const __m128 w = _mm_set1_ps((float)SKIN_WEIGHT);
     const __m128 invw = _mm_set1_ps(1.0f - (float)SKIN_WEIGHT);
+    const __m128 inv255 = _mm_set1_ps(1.0f/255.0f);

     npy_intp i = 0;
-    for (; i +
-
-
-
-
-
-
-
-        __m128
-
-        __m128
-
-
-
-
-        __m128 fs_r = u8x4_to_unit_f32(skin[4*(i+0)+0], skin[4*(i+1)+0],
-                                       skin[4*(i+2)+0], skin[4*(i+3)+0]);
-        __m128 fs_g = u8x4_to_unit_f32(skin[4*(i+0)+1], skin[4*(i+1)+1],
-                                       skin[4*(i+2)+1], skin[4*(i+3)+1]);
-        __m128 fs_b = u8x4_to_unit_f32(skin[4*(i+0)+2], skin[4*(i+1)+2],
-                                       skin[4*(i+2)+2], skin[4*(i+3)+2]);
+    for (; i + 6 <= pixels; i += 4) {
+        __m128i a4 = _mm_cvtsi32_si128(*(const int*)(im_alpha + i));
+        __m128i a_zero = _mm_cmpeq_epi8(a4, _mm_setzero_si128());
+        if (_mm_movemask_epi8(a_zero) == 0xFFFF) {
+            memcpy(out + 3*i, base + 3*i, 12);
+            continue;
+        }
+
+        __m128 fb_r, fb_g, fb_b;
+        __m128 ft_r, ft_g, ft_b;
+        __m128 fs_r, fs_g, fs_b;
+        load4_rgb_to_unit_f32(base + 3*i, inv255, &fb_r, &fb_g, &fb_b);
+        load4_rgb_to_unit_f32(texture + 3*i, inv255, &ft_r, &ft_g, &ft_b);
+        load4_rgb_to_unit_f32(skin + 3*i, inv255, &fs_r, &fs_g, &fs_b);

         __m128 fa_im = load4_u8_to_unit_f32(im_alpha + i);
         __m128 fit_a = _mm_sub_ps(one, fa_im);
@@ -589,39 +1087,27 @@ static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
         __m128 gm_g = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_g, fs_g), half));
         __m128 gm_b = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_b, fs_b), half));

-        gm_r =
-        gm_g =
-        gm_b =
+        gm_r = mul_add_ps128(gm_r, fa_im, _mm_mul_ps(ft_r, fit_a));
+        gm_g = mul_add_ps128(gm_g, fa_im, _mm_mul_ps(ft_g, fit_a));
+        gm_b = mul_add_ps128(gm_b, fa_im, _mm_mul_ps(ft_b, fit_a));

-        gm_r =
-        gm_g =
-        gm_b =
+        gm_r = mul_add_ps128(gm_r, invw, _mm_mul_ps(fs_r, w));
+        gm_g = mul_add_ps128(gm_g, invw, _mm_mul_ps(fs_g, w));
+        gm_b = mul_add_ps128(gm_b, invw, _mm_mul_ps(fs_b, w));

         gm_r = nan_to_num_ps128(gm_r);
         gm_g = nan_to_num_ps128(gm_g);
         gm_b = nan_to_num_ps128(gm_b);

-        __m128 fr =
-        __m128 fg =
-        __m128 fb =
-
-
-        _mm_storeu_ps(rr, fr);
-        _mm_storeu_ps(gg, fg);
-        _mm_storeu_ps(bb, fb);
-
-        for (int k = 0; k < 4; ++k) {
-            int r = (int)(rr[k] * 255.0f);
-            int g = (int)(gg[k] * 255.0f);
-            int b = (int)(bb[k] * 255.0f);
-            out[3*(i+k)+0] = (uint8_t)(r < 0 ? 0 : r > 255 ? 255 : r);
-            out[3*(i+k)+1] = (uint8_t)(g < 0 ? 0 : g > 255 ? 255 : g);
-            out[3*(i+k)+2] = (uint8_t)(b < 0 ? 0 : b > 255 ? 255 : b);
-        }
+        __m128 fr = mul_add_ps128(gm_r, fa_im, _mm_mul_ps(fb_r, fit_a));
+        __m128 fg = mul_add_ps128(gm_g, fa_im, _mm_mul_ps(fb_g, fit_a));
+        __m128 fb = mul_add_ps128(gm_b, fa_im, _mm_mul_ps(fb_b, fit_a));
+
+        store_unit_f32_to_u8_rgb4_u16(fr, fg, fb, out + 3*i);
     }

     if (i < pixels) {
-        kernel_scalar_rgb(base + 3*i, texture + 3*i, skin +
+        kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 3*i, im_alpha + i,
                           out + 3*i, pixels - i);
     }
 }
@@ -633,31 +1119,23 @@ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
     const __m128 one = _mm_set1_ps(1.0f);
     const __m128 w = _mm_set1_ps((float)SKIN_WEIGHT);
     const __m128 invw = _mm_set1_ps(1.0f - (float)SKIN_WEIGHT);
+    const __m128 inv255 = _mm_set1_ps(1.0f/255.0f);

     npy_intp i = 0;
     for (; i + 4 <= pixels; i += 4) {
-
-
-
-
-
-
-
-        __m128
-
-        __m128
-
-
-
-        __m128 ft_a = u8x4_to_unit_f32(texture[4*(i+0)+3], texture[4*(i+1)+3],
-                                       texture[4*(i+2)+3], texture[4*(i+3)+3]);
-
-        __m128 fs_r = u8x4_to_unit_f32(skin[4*(i+0)+0], skin[4*(i+1)+0],
-                                       skin[4*(i+2)+0], skin[4*(i+3)+0]);
-        __m128 fs_g = u8x4_to_unit_f32(skin[4*(i+0)+1], skin[4*(i+1)+1],
-                                       skin[4*(i+2)+1], skin[4*(i+3)+1]);
-        __m128 fs_b = u8x4_to_unit_f32(skin[4*(i+0)+2], skin[4*(i+1)+2],
-                                       skin[4*(i+2)+2], skin[4*(i+3)+2]);
+        __m128i a4 = _mm_cvtsi32_si128(*(const int*)(im_alpha + i));
+        __m128i a_zero = _mm_cmpeq_epi8(a4, _mm_setzero_si128());
+        if (_mm_movemask_epi8(a_zero) == 0xFFFF) {
+            memcpy(out + 3*i, base + 3*i, 12);
+            continue;
+        }
+
+        __m128 fb_r, fb_g, fb_b;
+        __m128 ft_r, ft_g, ft_b, ft_a;
+        __m128 fs_r, fs_g, fs_b;
+        load4_rgb_to_unit_f32(base + 3*i, inv255, &fb_r, &fb_g, &fb_b);
+        load4_rgba_to_unit_f32(texture + 4*i, inv255, &ft_r, &ft_g, &ft_b, &ft_a);
+        load4_rgb_to_unit_f32(skin + 3*i, inv255, &fs_r, &fs_g, &fs_b);

         __m128 fa_im = load4_u8_to_unit_f32(im_alpha + i);
         __m128 fta = _mm_mul_ps(ft_a, fa_im); /* texture_alpha */
@@ -667,39 +1145,27 @@ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
         __m128 gm_g = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_g, fs_g), half));
         __m128 gm_b = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_b, fs_b), half));

-        gm_r =
-        gm_g =
-        gm_b =
+        gm_r = mul_add_ps128(gm_r, fta, _mm_mul_ps(ft_r, fit_a));
+        gm_g = mul_add_ps128(gm_g, fta, _mm_mul_ps(ft_g, fit_a));
+        gm_b = mul_add_ps128(gm_b, fta, _mm_mul_ps(ft_b, fit_a));

-        gm_r =
-        gm_g =
-        gm_b =
+        gm_r = mul_add_ps128(gm_r, invw, _mm_mul_ps(fs_r, w));
+        gm_g = mul_add_ps128(gm_g, invw, _mm_mul_ps(fs_g, w));
+        gm_b = mul_add_ps128(gm_b, invw, _mm_mul_ps(fs_b, w));

         gm_r = nan_to_num_ps128(gm_r);
         gm_g = nan_to_num_ps128(gm_g);
         gm_b = nan_to_num_ps128(gm_b);

-        __m128 fr =
-        __m128 fg =
-        __m128 fb =
-
-
-        _mm_storeu_ps(rr, fr);
-        _mm_storeu_ps(gg, fg);
-        _mm_storeu_ps(bb, fb);
-
-        for (int k = 0; k < 4; ++k) {
-            int r = (int)(rr[k] * 255.0f);
-            int g = (int)(gg[k] * 255.0f);
-            int b = (int)(bb[k] * 255.0f);
-            out[3*(i+k)+0] = (uint8_t)(r < 0 ? 0 : r > 255 ? 255 : r);
-            out[3*(i+k)+1] = (uint8_t)(g < 0 ? 0 : g > 255 ? 255 : g);
-            out[3*(i+k)+2] = (uint8_t)(b < 0 ? 0 : b > 255 ? 255 : b);
-        }
+        __m128 fr = mul_add_ps128(gm_r, fta, _mm_mul_ps(fb_r, fit_a));
+        __m128 fg = mul_add_ps128(gm_g, fta, _mm_mul_ps(fb_g, fit_a));
+        __m128 fb = mul_add_ps128(gm_b, fta, _mm_mul_ps(fb_b, fit_a));
+
+        store_unit_f32_to_u8_rgb4(fr, fg, fb, out + 3*i);
     }

     if (i < pixels) {
-        kernel_scalar_rgba(base + 3*i, texture + 4*i, skin +
+        kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 3*i, im_alpha + i,
                            out + 3*i, pixels - i);
     }
 }
normal_grain_merge/normal_grain_merge.cp313-win_amd64.pyd

Binary file
normal_grain_merge/normal_grain_merge.pyi

@@ -16,7 +16,7 @@ def normal_grain_merge(
     Channel ordering doesn't matter as long as it is consistent.
     :param base: The base RGB image.
     :param texture: The texture, either RGB or RGBA.
-    :param skin: The
+    :param skin: The RGB skin cutout.
     :param im_alpha: The alpha from the cutout.
     :param kernel: Which kernel to use.
         The `auto` kernel chooses between avx2 and sse4.2 when compiled with gcc and uses `scaler` on Windows.
{normal_grain_merge-0.0.1.dist-info → normal_grain_merge-0.1.2.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: normal_grain_merge
-Version: 0.0.1
+Version: 0.1.2
 Summary: Fused normal and grain merge C extension
 Author: Samuel Howard
 License: MIT
@@ -26,6 +26,12 @@ $$
 (((\mathrm{t_{rgb}} + \mathrm{s_{rgb}} - 0.5) * \mathrm{t_\alpha} + \mathrm{t_{rgb}} * (1 - \mathrm{t_\alpha})) * (1 - 0.3) + \mathrm{s_{rgb}} * 0.3) * \mathrm{t_\alpha} + \mathrm{b_{rgb}} * (1 - \mathrm{t_\alpha})
 $$

+## Installation
+
+```shell
+pip install normal-grain-merge
+```
+
 ## Usage
 ```py
 import numpy as np
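Reading the README formula in the hunk above with concrete numbers makes the staging clearer. For example, with t_rgb = 0.6, s_rgb = 0.5, b_rgb = 0.2 and t_α = 0.5 (illustrative values, not taken from the package):

$$
\big((0.6 + 0.5 - 0.5) \cdot 0.5 + 0.6 \cdot (1 - 0.5)\big) \cdot (1 - 0.3) + 0.5 \cdot 0.3 = 0.57,
\qquad
0.57 \cdot 0.5 + 0.2 \cdot (1 - 0.5) = 0.385
$$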
@@ -82,21 +88,22 @@ One of `KernelKind`.
 The entire reason for me writing this was NumPy being slow when this operation is in the hot path.
 So, I decided to write a SIMD version that does the type casting outside NumPy with only the intermediate values being in FP32.

-How much of a speedup is this? All numbers are from a Ryzen 7 4800H running
-
-| Method/Kernel | Average Iteration Time |
-
-| C scalar kernel | 0.
-| C SSE4.2 kernel | 0.
-| C AVX2 kernel | 0.
-| NumPy version | 0.
-| Old NumPy version | 0.
-
-| Method Comparison | Speedup
-
-| NumPy -> scalar |
-| NumPy -> SSE4.2 |
-| NumPy -> AVX2 |
-| Old np -> SSE4.2 |
-|
-| C scalar ->
+How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Ubuntu 24.04 and Python 3.12.3.
+
+| Method/Kernel     | Average Iteration Time (RGB) | Average Iteration Time (RGBA) |
+|-------------------|------------------------------|-------------------------------|
+| C scalar kernel   | 0.016109s                    | 0.016679s                     |
+| C SSE4.2 kernel   | 0.002446s                    | 0.002478s                     |
+| C AVX2 kernel     | 0.002336s                    | 0.002520s                     |
+| NumPy version     | 0.160623s                    | 0.258044s                     |
+| Old NumPy version | 0.248160s                    | 0.232046s                     |
+
+| Method Comparison  | Speedup (RGB) | Speedup (RGBA) |
+|--------------------|---------------|----------------|
+| NumPy -> scalar    | 89.9709%      | 93.5363%       |
+| NumPy -> SSE4.2    | 98.4769%      | 99.0397%       |
+| NumPy -> AVX2      | 98.5454%      | 99.0235%       |
+| Old np -> SSE4.2   | 99.0142%      | 98.9321%       |
+| Old np -> AVX2     | 99.0585%      | 98.9141%       |
+| C scalar -> SSE4.2 | 84.8135%      | 85.1437%       |
+| C scalar -> AVX2   | 85.4964%      | 84.8923%       |
normal_grain_merge-0.1.2.dist-info/RECORD

@@ -0,0 +1,10 @@
+normal_grain_merge/__init__.py,sha256=Roc1wQ7_13LG_Z3Bd82zhk8wn7R1BrcO63fCdsvnnJU,89
+normal_grain_merge/kernel_kind.py,sha256=3cP4WRQSG9ZZeHsrXpXJ5Kcc8wABsmRSgex0rwRT8K4,162
+normal_grain_merge/normal_grain_merge.c,sha256=FtrCLe8ajPrd9mSFLloFwKaxfYgjTKKOo5nvSpu2lcQ,56968
+normal_grain_merge/normal_grain_merge.cp313-win_amd64.pyd,sha256=nAmw48P3W6fcANdcCp0f7Cy5qQBP4qF_SHq2SUw2tBY,29184
+normal_grain_merge/normal_grain_merge.pyi,sha256=HXa55A0wdcmzPpJzi7qgJws5y2q_uGjdJZQXzTkw9vc,1089
+normal_grain_merge-0.1.2.dist-info/licenses/LICENSE,sha256=qbUDFP46iOpV1ouBhpqjX-kS_cCVMHgrLBNcdTlq7Qc,1089
+normal_grain_merge-0.1.2.dist-info/METADATA,sha256=fE6tnhl1A2BGfOF8RWp3MzdtoJgoLg7hvlQsF-h5Ta0,4061
+normal_grain_merge-0.1.2.dist-info/WHEEL,sha256=qV0EIPljj1XC_vuSatRWjn02nZIz3N1t8jsZz7HBr2U,101
+normal_grain_merge-0.1.2.dist-info/top_level.txt,sha256=jfUAUKWrxBshHvZ0xTu3uF5VJsUpbWp5NkxUj8OXqu8,19
+normal_grain_merge-0.1.2.dist-info/RECORD,,
normal_grain_merge-0.0.1.dist-info/RECORD

@@ -1,10 +0,0 @@
-normal_grain_merge/__init__.py,sha256=Roc1wQ7_13LG_Z3Bd82zhk8wn7R1BrcO63fCdsvnnJU,89
-normal_grain_merge/kernel_kind.py,sha256=3cP4WRQSG9ZZeHsrXpXJ5Kcc8wABsmRSgex0rwRT8K4,162
-normal_grain_merge/normal_grain_merge.c,sha256=n2dJ-E_DlpKtDoLW2oQ6XoLDrDPKWx_DknA_lbzuB-g,36136
-normal_grain_merge/normal_grain_merge.cp313-win_amd64.pyd,sha256=bQxWdhmNZJJ9PEuFVSexBf__7SBNTOmfnKluV_fXKaw,25088
-normal_grain_merge/normal_grain_merge.pyi,sha256=Tz5RVlNbBqn_MsQ46WikaohEPctHdWsFxK3bloRZl1M,1090
-normal_grain_merge-0.0.1.dist-info/licenses/LICENSE,sha256=qbUDFP46iOpV1ouBhpqjX-kS_cCVMHgrLBNcdTlq7Qc,1089
-normal_grain_merge-0.0.1.dist-info/METADATA,sha256=t4r6jBfoXUoqtbrnPt7YEzOYzcAOqNEtrKTtgEg-dQs,3492
-normal_grain_merge-0.0.1.dist-info/WHEEL,sha256=qV0EIPljj1XC_vuSatRWjn02nZIz3N1t8jsZz7HBr2U,101
-normal_grain_merge-0.0.1.dist-info/top_level.txt,sha256=jfUAUKWrxBshHvZ0xTu3uF5VJsUpbWp5NkxUj8OXqu8,19
-normal_grain_merge-0.0.1.dist-info/RECORD,,
{normal_grain_merge-0.0.1.dist-info → normal_grain_merge-0.1.2.dist-info}/WHEEL: File without changes
{normal_grain_merge-0.0.1.dist-info → normal_grain_merge-0.1.2.dist-info}/licenses/LICENSE: File without changes
{normal_grain_merge-0.0.1.dist-info → normal_grain_merge-0.1.2.dist-info}/top_level.txt: File without changes