normal-grain-merge 0.0.2.tar.gz → 0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of normal-grain-merge might be problematic.

Files changed (17)
  1. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/LICENSE +0 -0
  2. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/PKG-INFO +14 -13
  3. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/README.md +13 -12
  4. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/__init__.py +0 -0
  5. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/kernel_kind.py +0 -0
  6. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/normal_grain_merge.c +138 -112
  7. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/normal_grain_merge.pyi +1 -1
  8. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/PKG-INFO +14 -13
  9. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/pyproject.toml +1 -1
  10. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/setup.py +2 -2
  11. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/tests/test_ngm.py +9 -13
  12. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/tests/test_speed.py +4 -5
  13. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/SOURCES.txt +0 -0
  14. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/dependency_links.txt +0 -0
  15. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/requires.txt +0 -0
  16. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/top_level.txt +0 -0
  17. {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: normal_grain_merge
-Version: 0.0.2
+Version: 0.1.0
 Summary: Fused normal and grain merge C extension
 Author: Samuel Howard
 License: MIT
@@ -88,21 +88,22 @@ One of `KernelKind`.
 The entire reason for me writing this was NumPy being slow when this operation is in the hot path.
 So, I decided to write a SIMD version that does the type casting outside NumPy with only the intermediate values being in FP32.
 
-How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Windows 11 and Python 3.12.4.
+How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Ubuntu 24.04 and Python 3.12.3.
 
 | Method/Kernel     | Average Iteration Time |
 |-------------------|------------------------|
-| C scalar kernel   | 0.019565s              |
-| C SSE4.2 kernel   | 0.013705s              |
-| C AVX2 kernel     | 0.016842s              |
-| NumPy version     | 0.228098s              |
-| Old NumPy version | 0.350554s              |
+| C scalar kernel   | 0.016007s              |
+| C SSE4.2 kernel   | 0.011155s              |
+| C AVX2 kernel     | 0.014575s              |
+| NumPy version     | 0.190392s              |
+| Old NumPy version | 0.274065s              |
 
 | Method Comparison  | Speedup  |
 |--------------------|----------|
-| NumPy -> scalar    | 91.4227% |
-| NumPy -> SSE4.2    | 93.9915% |
-| NumPy -> AVX2      | 92.6165% |
-| Old np -> SSE4.2   | 96.0904% |
-| C scalar -> SSE4.2 | 29.9487% |
-| C scalar -> AVX2   | 13.9183% |
+| NumPy -> scalar    | 91.5927% |
+| NumPy -> SSE4.2    | 94.1409% |
+| NumPy -> AVX2      | 92.3448% |
+| Old np -> SSE4.2   | 95.9297% |
+| Old np -> AVX2     | 94.6819% |
+| C scalar -> SSE4.2 | 30.3086% |
+| C scalar -> AVX2   | 8.9448%  |
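For orientation, the fused operation these kernels implement can be sketched in NumPy from the formulas in the normal_grain_merge.c comments (grain merge, blend over the texture, skin weighting, then a normal merge over the base). This is only an illustration, not the package's benchmarked NumPy path, and the SKIN_WEIGHT value below is a placeholder assumption:

import numpy as np

SKIN_WEIGHT = 0.5  # placeholder; the real constant is defined in the C source

def normal_grain_merge_sketch(base, texture, skin, im_alpha):
    """Rough NumPy sketch of the fused blend for an RGB texture (texture_alpha = im_alpha)."""
    b = base.astype(np.float32) / 255.0                    # (H, W, 3)
    t = texture.astype(np.float32) / 255.0                 # (H, W, 3)
    s = skin.astype(np.float32) / 255.0                    # (H, W, 3)
    a = (im_alpha.astype(np.float32) / 255.0)[..., None]   # (H, W, 1)

    gm = np.clip(t + s - 0.5, 0.0, 1.0)                    # grain merge
    gm = gm * a + t * (1.0 - a)                            # blend grain merge over the texture
    gm = gm * (1.0 - SKIN_WEIGHT) + s * SKIN_WEIGHT        # weight the skin colour back in
    gm = np.nan_to_num(gm)
    out = gm * a + b * (1.0 - a)                           # normal merge over the base
    return np.clip(out * 255.0, 0.0, 255.0).astype(np.uint8)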
@@ -70,21 +70,22 @@ One of `KernelKind`.
 The entire reason for me writing this was NumPy being slow when this operation is in the hot path.
 So, I decided to write a SIMD version that does the type casting outside NumPy with only the intermediate values being in FP32.
 
-How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Windows 11 and Python 3.12.4.
+How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Ubuntu 24.04 and Python 3.12.3.
 
 | Method/Kernel     | Average Iteration Time |
 |-------------------|------------------------|
-| C scalar kernel   | 0.019565s              |
-| C SSE4.2 kernel   | 0.013705s              |
-| C AVX2 kernel     | 0.016842s              |
-| NumPy version     | 0.228098s              |
-| Old NumPy version | 0.350554s              |
+| C scalar kernel   | 0.016007s              |
+| C SSE4.2 kernel   | 0.011155s              |
+| C AVX2 kernel     | 0.014575s              |
+| NumPy version     | 0.190392s              |
+| Old NumPy version | 0.274065s              |
 
 | Method Comparison  | Speedup  |
 |--------------------|----------|
-| NumPy -> scalar    | 91.4227% |
-| NumPy -> SSE4.2    | 93.9915% |
-| NumPy -> AVX2      | 92.6165% |
-| Old np -> SSE4.2   | 96.0904% |
-| C scalar -> SSE4.2 | 29.9487% |
-| C scalar -> AVX2   | 13.9183% |
+| NumPy -> scalar    | 91.5927% |
+| NumPy -> SSE4.2    | 94.1409% |
+| NumPy -> AVX2      | 92.3448% |
+| Old np -> SSE4.2   | 95.9297% |
+| Old np -> AVX2     | 94.6819% |
+| C scalar -> SSE4.2 | 30.3086% |
+| C scalar -> AVX2   | 8.9448%  |
@@ -7,6 +7,10 @@
 #include <smmintrin.h>
 #include <immintrin.h> /* AVX2 + SSE4.2 */
 
+#if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
+#define NGM_HAS_FMA 1
+#endif
+
 /* ----- Runtime CPU feature detection (GCC/Clang + MSVC) ----- */
 #if defined(_MSC_VER)
 #include <intrin.h>
@@ -126,8 +130,8 @@ static inline int check_shape_requirements(PyArrayObject *base,
     }
     *texture_has_alpha = (tc == 4);
 
-    if (PyArray_NDIM(skin) != 3 || PyArray_DIMS(skin)[2] != 4) {
-        PyErr_SetString(PyExc_ValueError, "skin must have shape (H, W, 4)");
+    if (PyArray_NDIM(skin) != 3 || PyArray_DIMS(skin)[2] != 3) {
+        PyErr_SetString(PyExc_ValueError, "skin must have shape (H, W, 3)");
         return 0;
     }
     if (PyArray_NDIM(im_alpha) != 2) {
@@ -184,10 +188,9 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,
         const uint8_t t_g = texture[3*i+1];
         const uint8_t t_b = texture[3*i+2];
 
-        const uint8_t s_r = skin[4*i+0];
-        const uint8_t s_g = skin[4*i+1];
-        const uint8_t s_b = skin[4*i+2];
-        const uint8_t s_a = skin[4*i+3];
+        const uint8_t s_r = skin[3*i+0];
+        const uint8_t s_g = skin[3*i+1];
+        const uint8_t s_b = skin[3*i+2];
 
         const uint8_t a_im = im_alpha[i];
 
@@ -203,8 +206,6 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,
         const float fs_r = s_r * (1.0f/255.0f);
         const float fs_g = s_g * (1.0f/255.0f);
         const float fs_b = s_b * (1.0f/255.0f);
-        const float fs_a = s_a * (1.0f/255.0f);
-
         const float fa_im = a_im * (1.0f/255.0f);
 
         /*
@@ -240,8 +241,8 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,
 
         /* Normal merge
         * n_out = gm_out * texture_alpha + base * inverse_tpa
-        *
-        * In this case, texture_alpha is the skin alpha since texture doesn't have an alpha channel here.
+        *
+        * In this case, texture_alpha is supplied by im_alpha since texture doesn't have an alpha channel here.
         */
         fr = fr * fa_im + fb_r * fit_a;
         fg = fg * fa_im + fb_g * fit_a;
@@ -267,10 +268,9 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
         const uint8_t t_b = texture[4*i+2];
         const uint8_t t_a = texture[4*i+3]; /* present in RGBA branch */
 
-        const uint8_t s_r = skin[4*i+0];
-        const uint8_t s_g = skin[4*i+1];
-        const uint8_t s_b = skin[4*i+2];
-        const uint8_t s_a = skin[4*i+3];
+        const uint8_t s_r = skin[3*i+0];
+        const uint8_t s_g = skin[3*i+1];
+        const uint8_t s_b = skin[3*i+2];
 
         const uint8_t a_im = im_alpha[i];
 
@@ -286,8 +286,6 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
         const float fs_r = s_r * (1.0f/255.0f);
         const float fs_g = s_g * (1.0f/255.0f);
         const float fs_b = s_b * (1.0f/255.0f);
-        const float fs_a = s_a * (1.0f/255.0f);
-
         const float fa_im = a_im * (1.0f/255.0f);
 
         /*
@@ -295,7 +293,7 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
         * normal grain merge *
         **********************
         */
-        /* Merge texture and skin alphas */
+        /* Merge texture alpha with the external mask */
 
         /* texture_alpha = texture[..., 3] * im_alpha*/
         ft_a = ft_a * fa_im;
@@ -346,29 +344,37 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
 You can later replace gathers with better deinterleaving if needed.
 */
 
-/* Convert 8 u8 interleaved channel samples (stride 3 or 4) to float32 in [0,1] via gather. */
-static inline __m256 gather_u8_to_unit_f32_avx2(const uint8_t *base_ptr, int stride,
-                                                npy_intp start_idx) {
-    const int i0 = (int)((start_idx + 0) * stride);
-    const int i1 = (int)((start_idx + 1) * stride);
-    const int i2 = (int)((start_idx + 2) * stride);
-    const int i3 = (int)((start_idx + 3) * stride);
-    const int i4 = (int)((start_idx + 4) * stride);
-    const int i5 = (int)((start_idx + 5) * stride);
-    const int i6 = (int)((start_idx + 6) * stride);
-    const int i7 = (int)((start_idx + 7) * stride);
-
-    __m256i offs = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7);
-    __m256i v32 = _mm256_i32gather_epi32((const int*)base_ptr, offs, 1); /* read 8 x u8 as u32 */
-    v32 = _mm256_and_si256(v32, _mm256_set1_epi32(0xFF));
-    return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), _mm256_set1_ps(1.0f/255.0f));
+/* Convert 8 u8 interleaved samples addressed by idx to float32 in [0,1]. */
+static inline __m256 gather_u8_block_to_unit_f32_avx2(const uint8_t *block_ptr,
+                                                      __m256i idx,
+                                                      __m256i mask_ff,
+                                                      __m256 inv255) {
+    __m256i v32 = _mm256_i32gather_epi32((const int*)block_ptr, idx, 1);
+    v32 = _mm256_and_si256(v32, mask_ff);
+    return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), inv255);
+}
+
+static inline __m256 mul_add_ps256(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+    return _mm256_fmadd_ps(a, b, c);
+#else
+    return _mm256_add_ps(_mm256_mul_ps(a, b), c);
+#endif
+}
+
+static inline __m256 fnmadd_ps256(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+    return _mm256_fnmadd_ps(a, b, c);
+#else
+    return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
+#endif
 }
 
 /* Convert 8 consecutive u8 to float32 in [0,1] (for grayscale im_alpha). */
-static inline __m256 load8_u8_to_unit_f32_avx2(const uint8_t *p) {
+static inline __m256 load8_u8_to_unit_f32_avx2(const uint8_t *p, __m256 inv255) {
     __m128i v8 = _mm_loadl_epi64((const __m128i*)p); /* 8 bytes -> XMM */
     __m256i v32 = _mm256_cvtepu8_epi32(v8); /* widen to 8 x u32 */
-    return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), _mm256_set1_ps(1.0f/255.0f));
+    return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), inv255);
 }
 
 static inline __m256 clamp01_ps(__m256 x) {
@@ -406,7 +412,9 @@ static inline void store_unit_f32_to_u8_rgb8_avx2(__m256 fr, __m256 fg, __m256 f
 static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
                             const uint8_t *skin, const uint8_t *im_alpha,
                             uint8_t *out, npy_intp pixels) {
-    const int stride3 = 3, stride4 = 4;
+    const __m256 inv255 = _mm256_set1_ps(1.0f/255.0f);
+    const __m256i mask_ff = _mm256_set1_epi32(0xFF);
+    const __m256i idx_rgb = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
     const __m256 half = _mm256_set1_ps(0.5f);
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 w = _mm256_set1_ps((float)SKIN_WEIGHT);
@@ -414,24 +422,28 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
 
     npy_intp i = 0;
     for (; i + 8 <= pixels; i += 8) {
+        const uint8_t *base_blk = base + 3*i;
+        const uint8_t *tex_blk = texture + 3*i;
+        const uint8_t *skin_blk = skin + 3*i;
+
         /* base RGB in [0,1] */
-        __m256 fb_r = gather_u8_to_unit_f32_avx2(base+0, stride3, i);
-        __m256 fb_g = gather_u8_to_unit_f32_avx2(base+1, stride3, i);
-        __m256 fb_b = gather_u8_to_unit_f32_avx2(base+2, stride3, i);
+        __m256 fb_r = gather_u8_block_to_unit_f32_avx2(base_blk + 0, idx_rgb, mask_ff, inv255);
+        __m256 fb_g = gather_u8_block_to_unit_f32_avx2(base_blk + 1, idx_rgb, mask_ff, inv255);
+        __m256 fb_b = gather_u8_block_to_unit_f32_avx2(base_blk + 2, idx_rgb, mask_ff, inv255);
 
         /* texture RGB in [0,1] */
-        __m256 ft_r = gather_u8_to_unit_f32_avx2(texture+0, stride3, i);
-        __m256 ft_g = gather_u8_to_unit_f32_avx2(texture+1, stride3, i);
-        __m256 ft_b = gather_u8_to_unit_f32_avx2(texture+2, stride3, i);
+        __m256 ft_r = gather_u8_block_to_unit_f32_avx2(tex_blk + 0, idx_rgb, mask_ff, inv255);
+        __m256 ft_g = gather_u8_block_to_unit_f32_avx2(tex_blk + 1, idx_rgb, mask_ff, inv255);
+        __m256 ft_b = gather_u8_block_to_unit_f32_avx2(tex_blk + 2, idx_rgb, mask_ff, inv255);
 
         /* skin RGB in [0,1] */
-        __m256 fs_r = gather_u8_to_unit_f32_avx2(skin+0, stride4, i);
-        __m256 fs_g = gather_u8_to_unit_f32_avx2(skin+1, stride4, i);
-        __m256 fs_b = gather_u8_to_unit_f32_avx2(skin+2, stride4, i);
+        __m256 fs_r = gather_u8_block_to_unit_f32_avx2(skin_blk + 0, idx_rgb, mask_ff, inv255);
+        __m256 fs_g = gather_u8_block_to_unit_f32_avx2(skin_blk + 1, idx_rgb, mask_ff, inv255);
+        __m256 fs_b = gather_u8_block_to_unit_f32_avx2(skin_blk + 2, idx_rgb, mask_ff, inv255);
 
         /* texture_alpha = im_alpha */
-        __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i);
-        __m256 fit_a = _mm256_sub_ps(one, fa_im);
+        __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i, inv255);
+        __m256 fit_a = fnmadd_ps256(fa_im, one, one);
 
         /* gm_out = clip(texture + skin - 0.5) */
         __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
@@ -439,14 +451,14 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
         __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));
 
         /* gm_out = gm_out * texture_alpha + texture * inverse_tpa */
-        gm_r = _mm256_add_ps(_mm256_mul_ps(gm_r, fa_im), _mm256_mul_ps(ft_r, fit_a));
-        gm_g = _mm256_add_ps(_mm256_mul_ps(gm_g, fa_im), _mm256_mul_ps(ft_g, fit_a));
-        gm_b = _mm256_add_ps(_mm256_mul_ps(gm_b, fa_im), _mm256_mul_ps(ft_b, fit_a));
+        gm_r = mul_add_ps256(gm_r, fa_im, _mm256_mul_ps(ft_r, fit_a));
+        gm_g = mul_add_ps256(gm_g, fa_im, _mm256_mul_ps(ft_g, fit_a));
+        gm_b = mul_add_ps256(gm_b, fa_im, _mm256_mul_ps(ft_b, fit_a));
 
         /* gm_out = gm_out * (1 - w) + skin * w */
-        gm_r = _mm256_add_ps(_mm256_mul_ps(gm_r, invw), _mm256_mul_ps(fs_r, w));
-        gm_g = _mm256_add_ps(_mm256_mul_ps(gm_g, invw), _mm256_mul_ps(fs_g, w));
-        gm_b = _mm256_add_ps(_mm256_mul_ps(gm_b, invw), _mm256_mul_ps(fs_b, w));
+        gm_r = mul_add_ps256(gm_r, invw, _mm256_mul_ps(fs_r, w));
+        gm_g = mul_add_ps256(gm_g, invw, _mm256_mul_ps(fs_g, w));
+        gm_b = mul_add_ps256(gm_b, invw, _mm256_mul_ps(fs_b, w));
 
         /* nan_to_num */
         gm_r = nan_to_num_ps(gm_r);
@@ -454,15 +466,15 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
         gm_b = nan_to_num_ps(gm_b);
 
         /* n_out = gm_out * texture_alpha + base * inverse_tpa */
-        __m256 fr = _mm256_add_ps(_mm256_mul_ps(gm_r, fa_im), _mm256_mul_ps(fb_r, fit_a));
-        __m256 fg = _mm256_add_ps(_mm256_mul_ps(gm_g, fa_im), _mm256_mul_ps(fb_g, fit_a));
-        __m256 fb = _mm256_add_ps(_mm256_mul_ps(gm_b, fa_im), _mm256_mul_ps(fb_b, fit_a));
+        __m256 fr = mul_add_ps256(gm_r, fa_im, _mm256_mul_ps(fb_r, fit_a));
+        __m256 fg = mul_add_ps256(gm_g, fa_im, _mm256_mul_ps(fb_g, fit_a));
+        __m256 fb = mul_add_ps256(gm_b, fa_im, _mm256_mul_ps(fb_b, fit_a));
 
         store_unit_f32_to_u8_rgb8_avx2(fr, fg, fb, out, i);
     }
 
     if (i < pixels) {
-        kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 4*i, im_alpha + i,
+        kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 3*i, im_alpha + i,
                           out + 3*i, pixels - i);
     }
 }
@@ -471,7 +483,10 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
 static void kernel_avx2_rgba(const uint8_t *base, const uint8_t *texture,
                              const uint8_t *skin, const uint8_t *im_alpha,
                              uint8_t *out, npy_intp pixels) {
-    const int stride3 = 3, stride4 = 4;
+    const __m256 inv255 = _mm256_set1_ps(1.0f/255.0f);
+    const __m256i mask_ff = _mm256_set1_epi32(0xFF);
+    const __m256i idx_rgb = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
+    const __m256i idx_rgba = _mm256_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28);
     const __m256 half = _mm256_set1_ps(0.5f);
     const __m256 one = _mm256_set1_ps(1.0f);
     const __m256 w = _mm256_set1_ps((float)SKIN_WEIGHT);
@@ -479,48 +494,52 @@ static void kernel_avx2_rgba(const uint8_t *base, const uint8_t *texture,
 
     npy_intp i = 0;
     for (; i + 8 <= pixels; i += 8) {
-        __m256 fb_r = gather_u8_to_unit_f32_avx2(base+0, stride3, i);
-        __m256 fb_g = gather_u8_to_unit_f32_avx2(base+1, stride3, i);
-        __m256 fb_b = gather_u8_to_unit_f32_avx2(base+2, stride3, i);
+        const uint8_t *base_blk = base + 3*i;
+        const uint8_t *tex_blk = texture + 4*i;
+        const uint8_t *skin_blk = skin + 3*i;
 
-        __m256 ft_r = gather_u8_to_unit_f32_avx2(texture+0, stride4, i);
-        __m256 ft_g = gather_u8_to_unit_f32_avx2(texture+1, stride4, i);
-        __m256 ft_b = gather_u8_to_unit_f32_avx2(texture+2, stride4, i);
-        __m256 ft_a = gather_u8_to_unit_f32_avx2(texture+3, stride4, i); /* texture alpha */
+        __m256 fb_r = gather_u8_block_to_unit_f32_avx2(base_blk + 0, idx_rgb, mask_ff, inv255);
+        __m256 fb_g = gather_u8_block_to_unit_f32_avx2(base_blk + 1, idx_rgb, mask_ff, inv255);
+        __m256 fb_b = gather_u8_block_to_unit_f32_avx2(base_blk + 2, idx_rgb, mask_ff, inv255);
 
-        __m256 fs_r = gather_u8_to_unit_f32_avx2(skin+0, stride4, i);
-        __m256 fs_g = gather_u8_to_unit_f32_avx2(skin+1, stride4, i);
-        __m256 fs_b = gather_u8_to_unit_f32_avx2(skin+2, stride4, i);
+        __m256 ft_r = gather_u8_block_to_unit_f32_avx2(tex_blk + 0, idx_rgba, mask_ff, inv255);
+        __m256 ft_g = gather_u8_block_to_unit_f32_avx2(tex_blk + 1, idx_rgba, mask_ff, inv255);
+        __m256 ft_b = gather_u8_block_to_unit_f32_avx2(tex_blk + 2, idx_rgba, mask_ff, inv255);
+        __m256 ft_a = gather_u8_block_to_unit_f32_avx2(tex_blk + 3, idx_rgba, mask_ff, inv255); /* texture alpha */
 
-        __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i);
+        __m256 fs_r = gather_u8_block_to_unit_f32_avx2(skin_blk + 0, idx_rgb, mask_ff, inv255);
+        __m256 fs_g = gather_u8_block_to_unit_f32_avx2(skin_blk + 1, idx_rgb, mask_ff, inv255);
+        __m256 fs_b = gather_u8_block_to_unit_f32_avx2(skin_blk + 2, idx_rgb, mask_ff, inv255);
+
+        __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i, inv255);
         __m256 fta = _mm256_mul_ps(ft_a, fa_im); /* texture_alpha */
-        __m256 fit_a = _mm256_sub_ps(one, fta); /* inverse_tpa */
+        __m256 fit_a = fnmadd_ps256(fta, one, one); /* inverse_tpa */
 
         __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
         __m256 gm_g = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g, fs_g), half));
         __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));
 
-        gm_r = _mm256_add_ps(_mm256_mul_ps(gm_r, fta), _mm256_mul_ps(ft_r, fit_a));
-        gm_g = _mm256_add_ps(_mm256_mul_ps(gm_g, fta), _mm256_mul_ps(ft_g, fit_a));
-        gm_b = _mm256_add_ps(_mm256_mul_ps(gm_b, fta), _mm256_mul_ps(ft_b, fit_a));
+        gm_r = mul_add_ps256(gm_r, fta, _mm256_mul_ps(ft_r, fit_a));
+        gm_g = mul_add_ps256(gm_g, fta, _mm256_mul_ps(ft_g, fit_a));
+        gm_b = mul_add_ps256(gm_b, fta, _mm256_mul_ps(ft_b, fit_a));
 
-        gm_r = _mm256_add_ps(_mm256_mul_ps(gm_r, invw), _mm256_mul_ps(fs_r, w));
-        gm_g = _mm256_add_ps(_mm256_mul_ps(gm_g, invw), _mm256_mul_ps(fs_g, w));
-        gm_b = _mm256_add_ps(_mm256_mul_ps(gm_b, invw), _mm256_mul_ps(fs_b, w));
+        gm_r = mul_add_ps256(gm_r, invw, _mm256_mul_ps(fs_r, w));
+        gm_g = mul_add_ps256(gm_g, invw, _mm256_mul_ps(fs_g, w));
+        gm_b = mul_add_ps256(gm_b, invw, _mm256_mul_ps(fs_b, w));
 
         gm_r = nan_to_num_ps(gm_r);
         gm_g = nan_to_num_ps(gm_g);
         gm_b = nan_to_num_ps(gm_b);
 
-        __m256 fr = _mm256_add_ps(_mm256_mul_ps(gm_r, fta), _mm256_mul_ps(fb_r, fit_a));
-        __m256 fg = _mm256_add_ps(_mm256_mul_ps(gm_g, fta), _mm256_mul_ps(fb_g, fit_a));
-        __m256 fb = _mm256_add_ps(_mm256_mul_ps(gm_b, fta), _mm256_mul_ps(fb_b, fit_a));
+        __m256 fr = mul_add_ps256(gm_r, fta, _mm256_mul_ps(fb_r, fit_a));
+        __m256 fg = mul_add_ps256(gm_g, fta, _mm256_mul_ps(fb_g, fit_a));
+        __m256 fb = mul_add_ps256(gm_b, fta, _mm256_mul_ps(fb_b, fit_a));
 
         store_unit_f32_to_u8_rgb8_avx2(fr, fg, fb, out, i);
     }
 
     if (i < pixels) {
-        kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 4*i, im_alpha + i,
+        kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 3*i, im_alpha + i,
                            out + 3*i, pixels - i);
     }
 }
@@ -550,6 +569,13 @@ static inline __m128 nan_to_num_ps128(__m128 x) {
     return _mm_blendv_ps(_mm_set1_ps(0.0f), x, cmp);
 }
 
+static inline __m128 mul_add_ps128(__m128 a, __m128 b, __m128 c) {
+#ifdef __FMA__
+    return _mm_fmadd_ps(a, b, c);
+#else
+    return _mm_add_ps(_mm_mul_ps(a, b), c);
+#endif
+}
 
 static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
                              const uint8_t *skin, const uint8_t *im_alpha,
@@ -575,12 +601,12 @@ static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
         __m128 ft_b = u8x4_to_unit_f32(texture[3*(i+0)+2], texture[3*(i+1)+2],
                                        texture[3*(i+2)+2], texture[3*(i+3)+2]);
 
-        __m128 fs_r = u8x4_to_unit_f32(skin[4*(i+0)+0], skin[4*(i+1)+0],
-                                       skin[4*(i+2)+0], skin[4*(i+3)+0]);
-        __m128 fs_g = u8x4_to_unit_f32(skin[4*(i+0)+1], skin[4*(i+1)+1],
-                                       skin[4*(i+2)+1], skin[4*(i+3)+1]);
-        __m128 fs_b = u8x4_to_unit_f32(skin[4*(i+0)+2], skin[4*(i+1)+2],
-                                       skin[4*(i+2)+2], skin[4*(i+3)+2]);
+        __m128 fs_r = u8x4_to_unit_f32(skin[3*(i+0)+0], skin[3*(i+1)+0],
+                                       skin[3*(i+2)+0], skin[3*(i+3)+0]);
+        __m128 fs_g = u8x4_to_unit_f32(skin[3*(i+0)+1], skin[3*(i+1)+1],
+                                       skin[3*(i+2)+1], skin[3*(i+3)+1]);
+        __m128 fs_b = u8x4_to_unit_f32(skin[3*(i+0)+2], skin[3*(i+1)+2],
+                                       skin[3*(i+2)+2], skin[3*(i+3)+2]);
 
         __m128 fa_im = load4_u8_to_unit_f32(im_alpha + i);
         __m128 fit_a = _mm_sub_ps(one, fa_im);
589
615
  __m128 gm_g = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_g, fs_g), half));
590
616
  __m128 gm_b = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_b, fs_b), half));
591
617
 
592
- gm_r = _mm_add_ps(_mm_mul_ps(gm_r, fa_im), _mm_mul_ps(ft_r, fit_a));
593
- gm_g = _mm_add_ps(_mm_mul_ps(gm_g, fa_im), _mm_mul_ps(ft_g, fit_a));
594
- gm_b = _mm_add_ps(_mm_mul_ps(gm_b, fa_im), _mm_mul_ps(ft_b, fit_a));
618
+ gm_r = mul_add_ps128(gm_r, fa_im, _mm_mul_ps(ft_r, fit_a));
619
+ gm_g = mul_add_ps128(gm_g, fa_im, _mm_mul_ps(ft_g, fit_a));
620
+ gm_b = mul_add_ps128(gm_b, fa_im, _mm_mul_ps(ft_b, fit_a));
595
621
 
596
- gm_r = _mm_add_ps(_mm_mul_ps(gm_r, invw), _mm_mul_ps(fs_r, w));
597
- gm_g = _mm_add_ps(_mm_mul_ps(gm_g, invw), _mm_mul_ps(fs_g, w));
598
- gm_b = _mm_add_ps(_mm_mul_ps(gm_b, invw), _mm_mul_ps(fs_b, w));
622
+ gm_r = mul_add_ps128(gm_r, invw, _mm_mul_ps(fs_r, w));
623
+ gm_g = mul_add_ps128(gm_g, invw, _mm_mul_ps(fs_g, w));
624
+ gm_b = mul_add_ps128(gm_b, invw, _mm_mul_ps(fs_b, w));
599
625
 
600
626
  gm_r = nan_to_num_ps128(gm_r);
601
627
  gm_g = nan_to_num_ps128(gm_g);
602
628
  gm_b = nan_to_num_ps128(gm_b);
603
629
 
604
- __m128 fr = _mm_add_ps(_mm_mul_ps(gm_r, fa_im), _mm_mul_ps(fb_r, fit_a));
605
- __m128 fg = _mm_add_ps(_mm_mul_ps(gm_g, fa_im), _mm_mul_ps(fb_g, fit_a));
606
- __m128 fb = _mm_add_ps(_mm_mul_ps(gm_b, fa_im), _mm_mul_ps(fb_b, fit_a));
630
+ __m128 fr = mul_add_ps128(gm_r, fa_im, _mm_mul_ps(fb_r, fit_a));
631
+ __m128 fg = mul_add_ps128(gm_g, fa_im, _mm_mul_ps(fb_g, fit_a));
632
+ __m128 fb = mul_add_ps128(gm_b, fa_im, _mm_mul_ps(fb_b, fit_a));
607
633
 
608
634
  float rr[4], gg[4], bb[4];
609
635
  _mm_storeu_ps(rr, fr);
@@ -621,7 +647,7 @@ static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
621
647
  }
622
648
 
623
649
  if (i < pixels) {
624
- kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 4*i, im_alpha + i,
650
+ kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 3*i, im_alpha + i,
625
651
  out + 3*i, pixels - i);
626
652
  }
627
653
  }
@@ -652,12 +678,12 @@ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
         __m128 ft_a = u8x4_to_unit_f32(texture[4*(i+0)+3], texture[4*(i+1)+3],
                                        texture[4*(i+2)+3], texture[4*(i+3)+3]);
 
-        __m128 fs_r = u8x4_to_unit_f32(skin[4*(i+0)+0], skin[4*(i+1)+0],
-                                       skin[4*(i+2)+0], skin[4*(i+3)+0]);
-        __m128 fs_g = u8x4_to_unit_f32(skin[4*(i+0)+1], skin[4*(i+1)+1],
-                                       skin[4*(i+2)+1], skin[4*(i+3)+1]);
-        __m128 fs_b = u8x4_to_unit_f32(skin[4*(i+0)+2], skin[4*(i+1)+2],
-                                       skin[4*(i+2)+2], skin[4*(i+3)+2]);
+        __m128 fs_r = u8x4_to_unit_f32(skin[3*(i+0)+0], skin[3*(i+1)+0],
+                                       skin[3*(i+2)+0], skin[3*(i+3)+0]);
+        __m128 fs_g = u8x4_to_unit_f32(skin[3*(i+0)+1], skin[3*(i+1)+1],
+                                       skin[3*(i+2)+1], skin[3*(i+3)+1]);
+        __m128 fs_b = u8x4_to_unit_f32(skin[3*(i+0)+2], skin[3*(i+1)+2],
+                                       skin[3*(i+2)+2], skin[3*(i+3)+2]);
 
         __m128 fa_im = load4_u8_to_unit_f32(im_alpha + i);
         __m128 fta = _mm_mul_ps(ft_a, fa_im); /* texture_alpha */
@@ -667,21 +693,21 @@ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
         __m128 gm_g = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_g, fs_g), half));
         __m128 gm_b = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_b, fs_b), half));
 
-        gm_r = _mm_add_ps(_mm_mul_ps(gm_r, fta), _mm_mul_ps(ft_r, fit_a));
-        gm_g = _mm_add_ps(_mm_mul_ps(gm_g, fta), _mm_mul_ps(ft_g, fit_a));
-        gm_b = _mm_add_ps(_mm_mul_ps(gm_b, fta), _mm_mul_ps(ft_b, fit_a));
+        gm_r = mul_add_ps128(gm_r, fta, _mm_mul_ps(ft_r, fit_a));
+        gm_g = mul_add_ps128(gm_g, fta, _mm_mul_ps(ft_g, fit_a));
+        gm_b = mul_add_ps128(gm_b, fta, _mm_mul_ps(ft_b, fit_a));
 
-        gm_r = _mm_add_ps(_mm_mul_ps(gm_r, invw), _mm_mul_ps(fs_r, w));
-        gm_g = _mm_add_ps(_mm_mul_ps(gm_g, invw), _mm_mul_ps(fs_g, w));
-        gm_b = _mm_add_ps(_mm_mul_ps(gm_b, invw), _mm_mul_ps(fs_b, w));
+        gm_r = mul_add_ps128(gm_r, invw, _mm_mul_ps(fs_r, w));
+        gm_g = mul_add_ps128(gm_g, invw, _mm_mul_ps(fs_g, w));
+        gm_b = mul_add_ps128(gm_b, invw, _mm_mul_ps(fs_b, w));
 
         gm_r = nan_to_num_ps128(gm_r);
         gm_g = nan_to_num_ps128(gm_g);
         gm_b = nan_to_num_ps128(gm_b);
 
-        __m128 fr = _mm_add_ps(_mm_mul_ps(gm_r, fta), _mm_mul_ps(fb_r, fit_a));
-        __m128 fg = _mm_add_ps(_mm_mul_ps(gm_g, fta), _mm_mul_ps(fb_g, fit_a));
-        __m128 fb = _mm_add_ps(_mm_mul_ps(gm_b, fta), _mm_mul_ps(fb_b, fit_a));
+        __m128 fr = mul_add_ps128(gm_r, fta, _mm_mul_ps(fb_r, fit_a));
+        __m128 fg = mul_add_ps128(gm_g, fta, _mm_mul_ps(fb_g, fit_a));
+        __m128 fb = mul_add_ps128(gm_b, fta, _mm_mul_ps(fb_b, fit_a));
 
         float rr[4], gg[4], bb[4];
         _mm_storeu_ps(rr, fr);
@@ -699,7 +725,7 @@ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
     }
 
     if (i < pixels) {
-        kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 4*i, im_alpha + i,
+        kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 3*i, im_alpha + i,
                            out + 3*i, pixels - i);
     }
 }
@@ -16,7 +16,7 @@ def normal_grain_merge(
     Channel ordering doesn't matter as long as it is consistent.
     :param base: The base RGB image.
     :param texture: The texture, either RGB or RGBA.
-    :param skin: The RGBA skin cutout.
+    :param skin: The RGB skin cutout.
     :param im_alpha: The alpha from the cutout.
     :param kernel: Which kernel to use.
     The `auto` kernel chooses between avx2 and sse4.2 when compiled with gcc and uses `scaler` on Windows.
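With the docstring change above, skin is now passed as a plain RGB array and the mask travels separately in im_alpha. A minimal call, mirroring the updated tests; the import layout is assumed here (it is not shown in this diff), while the KernelKind member names come from the test files:

import numpy as np
from normal_grain_merge import normal_grain_merge, KernelKind  # assumed import layout

H, W = 480, 640
base = np.zeros((H, W, 3), dtype=np.uint8)      # RGB base image
texture = np.zeros((H, W, 3), dtype=np.uint8)   # RGB texture; (H, W, 4) RGBA also accepted
skin = np.zeros((H, W, 3), dtype=np.uint8)      # RGB skin cutout, no alpha channel anymore
im_alpha = np.zeros((H, W), dtype=np.uint8)     # separate alpha mask

# Kernel selection as in the test suite; the scalar kernel works everywhere.
out = normal_grain_merge(base, texture, skin, im_alpha, KernelKind.KERNEL_SCALAR.value)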
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: normal_grain_merge
-Version: 0.0.2
+Version: 0.1.0
 Summary: Fused normal and grain merge C extension
 Author: Samuel Howard
 License: MIT
@@ -88,21 +88,22 @@ One of `KernelKind`.
 The entire reason for me writing this was NumPy being slow when this operation is in the hot path.
 So, I decided to write a SIMD version that does the type casting outside NumPy with only the intermediate values being in FP32.
 
-How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Windows 11 and Python 3.12.4.
+How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Ubuntu 24.04 and Python 3.12.3.
 
 | Method/Kernel     | Average Iteration Time |
 |-------------------|------------------------|
-| C scalar kernel   | 0.019565s              |
-| C SSE4.2 kernel   | 0.013705s              |
-| C AVX2 kernel     | 0.016842s              |
-| NumPy version     | 0.228098s              |
-| Old NumPy version | 0.350554s              |
+| C scalar kernel   | 0.016007s              |
+| C SSE4.2 kernel   | 0.011155s              |
+| C AVX2 kernel     | 0.014575s              |
+| NumPy version     | 0.190392s              |
+| Old NumPy version | 0.274065s              |
 
 | Method Comparison  | Speedup  |
 |--------------------|----------|
-| NumPy -> scalar    | 91.4227% |
-| NumPy -> SSE4.2    | 93.9915% |
-| NumPy -> AVX2      | 92.6165% |
-| Old np -> SSE4.2   | 96.0904% |
-| C scalar -> SSE4.2 | 29.9487% |
-| C scalar -> AVX2   | 13.9183% |
+| NumPy -> scalar    | 91.5927% |
+| NumPy -> SSE4.2    | 94.1409% |
+| NumPy -> AVX2      | 92.3448% |
+| Old np -> SSE4.2   | 95.9297% |
+| Old np -> AVX2     | 94.6819% |
+| C scalar -> SSE4.2 | 30.3086% |
+| C scalar -> AVX2   | 8.9448%  |
@@ -8,7 +8,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "normal_grain_merge"
-version = "0.0.2"
+version = "0.1.0"
 description = "Fused normal and grain merge C extension"
 readme = "README.md"
 authors = [
@@ -12,7 +12,7 @@ if sys.platform == "win32":
 elif "arm" in arch or "aarch64" in arch:
     extra_compile_args += ["-O3"]
 else:
-    extra_compile_args += ["-O3", "-march=x86-64", "-mavx2", "-msse4.2"]
+    extra_compile_args += ["-O3", "-march=x86-64", "-mavx2", "-msse4.2", "-flto", "-mfma",]
 
 module = Extension(
     "normal_grain_merge.normal_grain_merge",
@@ -23,7 +23,7 @@ module = Extension(
 
 setup(
     name="normal_grain_merge",
-    version="0.0.2",
+    version="0.1.0",
     description="Normal grain merge C extension",
     ext_modules=[module],
     packages=["normal_grain_merge"],
@@ -33,8 +33,8 @@ class TestNGM(unittest.TestCase):
         """
         self.base = cv2.imread("base.png")
         self.texture = cv2.imread("texture.png")
-        self.skin = cv2.imread("skin.png", cv2.IMREAD_UNCHANGED)
-        self.im_alpha = self.skin[..., 3]
+        self.skin = self.base.copy()
+        self.im_alpha = cv2.imread("skin.png", cv2.IMREAD_UNCHANGED)[..., 3]
 
     def test_dummy_arrays(self):
         """
@@ -42,7 +42,7 @@ class TestNGM(unittest.TestCase):
         """
        base = np.zeros((100, 100, 3), dtype=np.uint8)
        texture = np.zeros((100, 100, 3), dtype=np.uint8)
-        skin = np.zeros((100, 100, 4), dtype=np.uint8)
+        skin = np.zeros((100, 100, 3), dtype=np.uint8)
        im_alpha = np.zeros((100, 100), dtype=np.uint8)
 
        result_scalar = normal_grain_merge(base, texture, skin, im_alpha, KernelKind.KERNEL_SCALAR.value)
@@ -53,15 +53,14 @@ class TestNGM(unittest.TestCase):
         """
         Test the common case; RGB versions of each kernel.
         """
-        result_py = apply_texture(self.base, self.skin, self.texture, self.im_alpha)
+        result_py = apply_texture(self.base, np.dstack([self.skin, self.im_alpha]), self.texture, self.im_alpha)
         self.skin = cv2.cvtColor(
             cv2.cvtColor(
-                self.skin[..., :3],
+                self.skin,
                 cv2.COLOR_BGR2GRAY),
             cv2.COLOR_GRAY2BGR
         )
         # Skin is BGR at this point
-        self.skin = np.dstack([self.skin, self.im_alpha])
         result_scalar = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SCALAR.value)
         result_sse = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SSE42.value)
         result_avx = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_AVX2.value)
@@ -81,15 +80,14 @@ class TestNGM(unittest.TestCase):
         """
         self.skin = cv2.cvtColor(
             cv2.cvtColor(
-                self.skin[..., :3],
+                self.skin,
                 cv2.COLOR_BGR2GRAY),
             cv2.COLOR_GRAY2BGR
         )
         mask = vertical_fill(self.base.shape[0], self.base.shape[1], self.base.shape[1] // 2)
         new_alpha = np.bitwise_and(self.im_alpha, mask)
-        self.skin = np.dstack((self.skin[..., :3], new_alpha))
 
-        result_py = apply_texture(self.base, self.skin, self.texture, new_alpha)
+        result_py = apply_texture(self.base, np.dstack((self.skin[..., :3], new_alpha)), self.texture, new_alpha)
         result_scalar = normal_grain_merge(self.base, self.texture, self.skin, new_alpha, KernelKind.KERNEL_SCALAR.value)
         result_sse = normal_grain_merge(self.base, self.texture, self.skin, new_alpha, KernelKind.KERNEL_SSE42.value)
         result_avx = normal_grain_merge(self.base, self.texture, self.skin, new_alpha, KernelKind.KERNEL_AVX2.value)
@@ -114,13 +112,12 @@ class TestNGM(unittest.TestCase):
 
         self.skin = cv2.cvtColor(
             cv2.cvtColor(
-                self.skin[..., :3],
+                self.skin,
                 cv2.COLOR_BGR2GRAY),
             cv2.COLOR_GRAY2BGR
         )
         result_py = apply_texture(self.base, self.skin, self.texture, self.im_alpha)
         # Skin is BGR at this point
-        self.skin = np.dstack([self.skin, self.im_alpha])
         result_scalar = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SCALAR.value)
         result_sse = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SSE42.value)
         result_avx = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_AVX2.value)
@@ -143,12 +140,11 @@ class TestNGM(unittest.TestCase):
         result_py = apply_texture(self.base, self.skin, self.texture, self.im_alpha)
         self.skin = cv2.cvtColor(
             cv2.cvtColor(
-                self.skin[..., :3],
+                self.skin,
                 cv2.COLOR_BGR2GRAY),
             cv2.COLOR_GRAY2BGR
         )
         # Skin is BGR at this point
-        self.skin = np.dstack([self.skin, self.im_alpha])
         result_scalar = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SCALAR.value)
         result_sse = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SSE42.value)
         result_avx = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_AVX2.value)
@@ -30,16 +30,15 @@ class TestNGM(unittest.TestCase):
         global_start = time.perf_counter()
         base = cv2.imread("base.png")
         texture = cv2.imread("texture.png")
-        skin = cv2.imread("skin.png", cv2.IMREAD_UNCHANGED)
-        im_alpha = skin[..., 3]
+        skin = base.copy()
+        im_alpha = cv2.imread("skin.png", cv2.IMREAD_UNCHANGED)[..., 3]
         skin = cv2.cvtColor(
             cv2.cvtColor(
-                skin[..., :3],
+                skin,
                 cv2.COLOR_BGR2GRAY),
             cv2.COLOR_GRAY2BGR
         )
         # Skin is BGR at this point
-        skin = np.dstack([skin, im_alpha])
 
         # Scaler kernel
         start_c_scalar = time.perf_counter()
@@ -60,7 +59,6 @@ class TestNGM(unittest.TestCase):
         end_c_avx = time.perf_counter()
 
         # NumPy "just do less" version.
-        skin = skin[..., :3]
         start_py = time.perf_counter()
         for _ in range(ITERATIONS):
             result = normal_grain_merge_py(base, texture, skin, im_alpha)
@@ -90,6 +88,7 @@ class TestNGM(unittest.TestCase):
               f"NumPy -> SSE4.2: {percent_change(c_avg_sse, np_avg):.4f}%\n"
               f"NumPy -> AVX2: {percent_change(c_avg_avx, np_avg):.4f}%\n"
               f"Old np -> SSE: {percent_change(c_avg_sse, np_old_avg):.4f}%\n"
+              f"Old np -> AVX2: {percent_change(c_avg_avx, np_old_avg):.4f}%\n"
               f"C scalar -> SSE: {percent_change(c_avg_sse, c_avg_scalar):.4f}%\n"
               f"C scalar -> AVX: {percent_change(c_avg_avx, c_avg_scalar):.4f}%\n")
         print(f"Test time: {end - global_start:.4f}s")