normal-grain-merge 0.0.2__tar.gz → 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of normal-grain-merge might be problematic.
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/LICENSE +0 -0
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/PKG-INFO +14 -13
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/README.md +13 -12
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/__init__.py +0 -0
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/kernel_kind.py +0 -0
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/normal_grain_merge.c +138 -112
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/normal_grain_merge.pyi +1 -1
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/PKG-INFO +14 -13
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/pyproject.toml +1 -1
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/setup.py +2 -2
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/tests/test_ngm.py +9 -13
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/tests/test_speed.py +4 -5
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/SOURCES.txt +0 -0
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/dependency_links.txt +0 -0
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/requires.txt +0 -0
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/top_level.txt +0 -0
- {normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/setup.cfg +0 -0
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/LICENSE
File without changes
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: normal_grain_merge
-Version: 0.0.2
+Version: 0.1.0
 Summary: Fused normal and grain merge C extension
 Author: Samuel Howard
 License: MIT

@@ -88,21 +88,22 @@ One of `KernelKind`.
 The entire reason for me writing this was NumPy being slow when this operation is in the hot path.
 So, I decided to write a SIMD version that does the type casting outside NumPy with only the intermediate values being in FP32.

-How much of a speedup is this? All numbers are from a Ryzen 7 4800H running
+How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Ubuntu 24.04 and Python 3.12.3.

 | Method/Kernel     | Average Iteration Time |
 |-------------------|------------------------|
-| C scalar kernel   | 0.
-| C SSE4.2 kernel   | 0.
-| C AVX2 kernel     | 0.
-| NumPy version     | 0.
-| Old NumPy version | 0.
+| C scalar kernel   | 0.016007s |
+| C SSE4.2 kernel   | 0.011155s |
+| C AVX2 kernel     | 0.014575s |
+| NumPy version     | 0.190392s |
+| Old NumPy version | 0.274065s |

 | Method Comparison  | Speedup  |
 |--------------------|----------|
-| NumPy -> scalar    | 91.
-| NumPy -> SSE4.2    |
-| NumPy -> AVX2      | 92.
-| Old np -> SSE4.2   |
-|
-| C scalar ->
+| NumPy -> scalar    | 91.5927% |
+| NumPy -> SSE4.2    | 94.1409% |
+| NumPy -> AVX2      | 92.3448% |
+| Old np -> SSE4.2   | 95.9297% |
+| Old np -> AVX2     | 94.6819% |
+| C scalar -> SSE4.2 | 30.3086% |
+| C scalar -> AVX2   | 8.9448%  |
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/README.md

@@ -70,21 +70,22 @@ One of `KernelKind`.
 The entire reason for me writing this was NumPy being slow when this operation is in the hot path.
 So, I decided to write a SIMD version that does the type casting outside NumPy with only the intermediate values being in FP32.

-How much of a speedup is this? All numbers are from a Ryzen 7 4800H running
+How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Ubuntu 24.04 and Python 3.12.3.

 | Method/Kernel     | Average Iteration Time |
 |-------------------|------------------------|
-| C scalar kernel   | 0.
-| C SSE4.2 kernel   | 0.
-| C AVX2 kernel     | 0.
-| NumPy version     | 0.
-| Old NumPy version | 0.
+| C scalar kernel   | 0.016007s |
+| C SSE4.2 kernel   | 0.011155s |
+| C AVX2 kernel     | 0.014575s |
+| NumPy version     | 0.190392s |
+| Old NumPy version | 0.274065s |

 | Method Comparison  | Speedup  |
 |--------------------|----------|
-| NumPy -> scalar    | 91.
-| NumPy -> SSE4.2    |
-| NumPy -> AVX2      | 92.
-| Old np -> SSE4.2   |
-|
-| C scalar ->
+| NumPy -> scalar    | 91.5927% |
+| NumPy -> SSE4.2    | 94.1409% |
+| NumPy -> AVX2      | 92.3448% |
+| Old np -> SSE4.2   | 95.9297% |
+| Old np -> AVX2     | 94.6819% |
+| C scalar -> SSE4.2 | 30.3086% |
+| C scalar -> AVX2   | 8.9448%  |
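For readers skimming the tables above: the speedup percentages are just the relative reduction in average iteration time, computed from the timings in the first table. A quick sketch of the arithmetic (values copied from the table):

# Relative reduction in average iteration time, e.g. "NumPy -> scalar"
numpy_time = 0.190392    # NumPy version, seconds per iteration
scalar_time = 0.016007   # C scalar kernel, seconds per iteration
speedup = (numpy_time - scalar_time) / numpy_time * 100
print(f"{speedup:.4f}%")  # ~91.59%, matching the table entry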
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/__init__.py
File without changes

{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/kernel_kind.py
File without changes
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/normal_grain_merge.c
RENAMED

@@ -7,6 +7,10 @@
 #include <smmintrin.h>
 #include <immintrin.h> /* AVX2 + SSE4.2 */

+#if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
+#define NGM_HAS_FMA 1
+#endif
+
 /* ----- Runtime CPU feature detection (GCC/Clang + MSVC) ----- */
 #if defined(_MSC_VER)
 #include <intrin.h>
@@ -126,8 +130,8 @@ static inline int check_shape_requirements(PyArrayObject *base,
 }
 *texture_has_alpha = (tc == 4);

-if (PyArray_NDIM(skin) != 3 || PyArray_DIMS(skin)[2] !=
-PyErr_SetString(PyExc_ValueError, "skin must have shape (H, W,
+if (PyArray_NDIM(skin) != 3 || PyArray_DIMS(skin)[2] != 3) {
+PyErr_SetString(PyExc_ValueError, "skin must have shape (H, W, 3)");
 return 0;
 }
 if (PyArray_NDIM(im_alpha) != 2) {
@@ -184,10 +188,9 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,
 const uint8_t t_g = texture[3*i+1];
 const uint8_t t_b = texture[3*i+2];

-const uint8_t s_r = skin[
-const uint8_t s_g = skin[
-const uint8_t s_b = skin[
-const uint8_t s_a = skin[4*i+3];
+const uint8_t s_r = skin[3*i+0];
+const uint8_t s_g = skin[3*i+1];
+const uint8_t s_b = skin[3*i+2];

 const uint8_t a_im = im_alpha[i];

@@ -203,8 +206,6 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,
 const float fs_r = s_r * (1.0f/255.0f);
 const float fs_g = s_g * (1.0f/255.0f);
 const float fs_b = s_b * (1.0f/255.0f);
-const float fs_a = s_a * (1.0f/255.0f);
-
 const float fa_im = a_im * (1.0f/255.0f);

 /*
@@ -240,8 +241,8 @@ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,

 /* Normal merge
 * n_out = gm_out * texture_alpha + base * inverse_tpa
-*
-* In this case, texture_alpha is
+*
+* In this case, texture_alpha is supplied by im_alpha since texture doesn't have an alpha channel here.
 */
 fr = fr * fa_im + fb_r * fit_a;
 fg = fg * fa_im + fb_g * fit_a;
@@ -267,10 +268,9 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
 const uint8_t t_b = texture[4*i+2];
 const uint8_t t_a = texture[4*i+3]; /* present in RGBA branch */

-const uint8_t s_r = skin[
-const uint8_t s_g = skin[
-const uint8_t s_b = skin[
-const uint8_t s_a = skin[4*i+3];
+const uint8_t s_r = skin[3*i+0];
+const uint8_t s_g = skin[3*i+1];
+const uint8_t s_b = skin[3*i+2];

 const uint8_t a_im = im_alpha[i];

@@ -286,8 +286,6 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
 const float fs_r = s_r * (1.0f/255.0f);
 const float fs_g = s_g * (1.0f/255.0f);
 const float fs_b = s_b * (1.0f/255.0f);
-const float fs_a = s_a * (1.0f/255.0f);
-
 const float fa_im = a_im * (1.0f/255.0f);

 /*
@@ -295,7 +293,7 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
 * normal grain merge *
 **********************
 */
-/* Merge texture
+/* Merge texture alpha with the external mask */

 /* texture_alpha = texture[..., 3] * im_alpha*/
 ft_a = ft_a * fa_im;
@@ -346,29 +344,37 @@ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
 You can later replace gathers with better deinterleaving if needed.
 */

-/* Convert 8 u8 interleaved
-static inline __m256
-
-
-
-
-
-
-
-
-
-
-
-
-
+/* Convert 8 u8 interleaved samples addressed by idx to float32 in [0,1]. */
+static inline __m256 gather_u8_block_to_unit_f32_avx2(const uint8_t *block_ptr,
+__m256i idx,
+__m256i mask_ff,
+__m256 inv255) {
+__m256i v32 = _mm256_i32gather_epi32((const int*)block_ptr, idx, 1);
+v32 = _mm256_and_si256(v32, mask_ff);
+return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), inv255);
+}
+
+static inline __m256 mul_add_ps256(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+return _mm256_fmadd_ps(a, b, c);
+#else
+return _mm256_add_ps(_mm256_mul_ps(a, b), c);
+#endif
+}
+
+static inline __m256 fnmadd_ps256(__m256 a, __m256 b, __m256 c) {
+#ifdef __FMA__
+return _mm256_fnmadd_ps(a, b, c);
+#else
+return _mm256_sub_ps(c, _mm256_mul_ps(a, b));
+#endif
 }

 /* Convert 8 consecutive u8 to float32 in [0,1] (for grayscale im_alpha). */
-static inline __m256 load8_u8_to_unit_f32_avx2(const uint8_t *p) {
+static inline __m256 load8_u8_to_unit_f32_avx2(const uint8_t *p, __m256 inv255) {
 __m128i v8 = _mm_loadl_epi64((const __m128i*)p); /* 8 bytes -> XMM */
 __m256i v32 = _mm256_cvtepu8_epi32(v8); /* widen to 8 x u32 */
-return _mm256_mul_ps(_mm256_cvtepi32_ps(v32),
+return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), inv255);
 }

 static inline __m256 clamp01_ps(__m256 x) {
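To make the new gather helper easier to follow: with idx_rgb = {0, 3, ..., 21} and the block pointer offset by the channel index, each AVX2 lane picks one channel byte out of 8 consecutive interleaved RGB pixels and scales it to [0, 1]. A rough NumPy restatement of that per-block behaviour (the function name here is illustrative, not part of the extension):

import numpy as np

def gather_channel_block(rgb_bytes: np.ndarray, channel: int) -> np.ndarray:
    """Roughly what gather_u8_block_to_unit_f32_avx2 computes for one 8-pixel block.

    rgb_bytes is the flat uint8 buffer starting at the block (interleaved RGB);
    channel is 0, 1 or 2; the stride-3 indices mirror idx_rgb = 0, 3, ..., 21.
    """
    idx = np.arange(8) * 3 + channel
    return rgb_bytes[idx].astype(np.float32) / 255.0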
@@ -406,7 +412,9 @@ static inline void store_unit_f32_to_u8_rgb8_avx2(__m256 fr, __m256 fg, __m256 f
 static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
 const uint8_t *skin, const uint8_t *im_alpha,
 uint8_t *out, npy_intp pixels) {
-const
+const __m256 inv255 = _mm256_set1_ps(1.0f/255.0f);
+const __m256i mask_ff = _mm256_set1_epi32(0xFF);
+const __m256i idx_rgb = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
 const __m256 half = _mm256_set1_ps(0.5f);
 const __m256 one = _mm256_set1_ps(1.0f);
 const __m256 w = _mm256_set1_ps((float)SKIN_WEIGHT);
@@ -414,24 +422,28 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,

 npy_intp i = 0;
 for (; i + 8 <= pixels; i += 8) {
+const uint8_t *base_blk = base + 3*i;
+const uint8_t *tex_blk = texture + 3*i;
+const uint8_t *skin_blk = skin + 3*i;
+
 /* base RGB in [0,1] */
-__m256 fb_r =
-__m256 fb_g =
-__m256 fb_b =
+__m256 fb_r = gather_u8_block_to_unit_f32_avx2(base_blk + 0, idx_rgb, mask_ff, inv255);
+__m256 fb_g = gather_u8_block_to_unit_f32_avx2(base_blk + 1, idx_rgb, mask_ff, inv255);
+__m256 fb_b = gather_u8_block_to_unit_f32_avx2(base_blk + 2, idx_rgb, mask_ff, inv255);

 /* texture RGB in [0,1] */
-__m256 ft_r =
-__m256 ft_g =
-__m256 ft_b =
+__m256 ft_r = gather_u8_block_to_unit_f32_avx2(tex_blk + 0, idx_rgb, mask_ff, inv255);
+__m256 ft_g = gather_u8_block_to_unit_f32_avx2(tex_blk + 1, idx_rgb, mask_ff, inv255);
+__m256 ft_b = gather_u8_block_to_unit_f32_avx2(tex_blk + 2, idx_rgb, mask_ff, inv255);

 /* skin RGB in [0,1] */
-__m256 fs_r =
-__m256 fs_g =
-__m256 fs_b =
+__m256 fs_r = gather_u8_block_to_unit_f32_avx2(skin_blk + 0, idx_rgb, mask_ff, inv255);
+__m256 fs_g = gather_u8_block_to_unit_f32_avx2(skin_blk + 1, idx_rgb, mask_ff, inv255);
+__m256 fs_b = gather_u8_block_to_unit_f32_avx2(skin_blk + 2, idx_rgb, mask_ff, inv255);

 /* texture_alpha = im_alpha */
-__m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i);
-__m256 fit_a =
+__m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i, inv255);
+__m256 fit_a = fnmadd_ps256(fa_im, one, one);

 /* gm_out = clip(texture + skin - 0.5) */
 __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
@@ -439,14 +451,14 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
 __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));

 /* gm_out = gm_out * texture_alpha + texture * inverse_tpa */
-gm_r =
-gm_g =
-gm_b =
+gm_r = mul_add_ps256(gm_r, fa_im, _mm256_mul_ps(ft_r, fit_a));
+gm_g = mul_add_ps256(gm_g, fa_im, _mm256_mul_ps(ft_g, fit_a));
+gm_b = mul_add_ps256(gm_b, fa_im, _mm256_mul_ps(ft_b, fit_a));

 /* gm_out = gm_out * (1 - w) + skin * w */
-gm_r =
-gm_g =
-gm_b =
+gm_r = mul_add_ps256(gm_r, invw, _mm256_mul_ps(fs_r, w));
+gm_g = mul_add_ps256(gm_g, invw, _mm256_mul_ps(fs_g, w));
+gm_b = mul_add_ps256(gm_b, invw, _mm256_mul_ps(fs_b, w));

 /* nan_to_num */
 gm_r = nan_to_num_ps(gm_r);
@@ -454,15 +466,15 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
 gm_b = nan_to_num_ps(gm_b);

 /* n_out = gm_out * texture_alpha + base * inverse_tpa */
-__m256 fr =
-__m256 fg =
-__m256 fb =
+__m256 fr = mul_add_ps256(gm_r, fa_im, _mm256_mul_ps(fb_r, fit_a));
+__m256 fg = mul_add_ps256(gm_g, fa_im, _mm256_mul_ps(fb_g, fit_a));
+__m256 fb = mul_add_ps256(gm_b, fa_im, _mm256_mul_ps(fb_b, fit_a));

 store_unit_f32_to_u8_rgb8_avx2(fr, fg, fb, out, i);
 }

 if (i < pixels) {
-kernel_scalar_rgb(base + 3*i, texture + 3*i, skin +
+kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 3*i, im_alpha + i,
 out + 3*i, pixels - i);
 }
 }
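Pieced together from the formula comments in the kernels above (grain merge, the two alpha blends, and the SKIN_WEIGHT mix), the per-pixel pipeline amounts to roughly the following NumPy expression. This is only a readability aid, not the extension's reference implementation: SKIN_WEIGHT's value is not shown in this diff, so it is left as a parameter, and the final uint8 conversion is an assumption about what the store helper does.

import numpy as np

def ngm_reference(base, texture, skin, im_alpha, skin_weight):
    # uint8 -> float32 in [0, 1]
    fb = base.astype(np.float32) / 255.0
    ft = texture[..., :3].astype(np.float32) / 255.0
    fs = skin.astype(np.float32) / 255.0
    fa = im_alpha.astype(np.float32)[..., None] / 255.0
    if texture.shape[-1] == 4:   # RGBA branch: texture_alpha = texture[..., 3] * im_alpha
        fa = fa * (texture[..., 3:4].astype(np.float32) / 255.0)
    inv_tpa = 1.0 - fa

    gm = np.clip(ft + fs - 0.5, 0.0, 1.0)             # gm_out = clip(texture + skin - 0.5)
    gm = gm * fa + ft * inv_tpa                       # gm_out = gm_out * texture_alpha + texture * inverse_tpa
    gm = gm * (1.0 - skin_weight) + fs * skin_weight  # gm_out = gm_out * (1 - w) + skin * w
    gm = np.nan_to_num(gm)
    out = gm * fa + fb * inv_tpa                      # n_out = gm_out * texture_alpha + base * inverse_tpa
    return (np.clip(out, 0.0, 1.0) * 255.0).astype(np.uint8)  # assumed u8 store behaviour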
@@ -471,7 +483,10 @@ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
 static void kernel_avx2_rgba(const uint8_t *base, const uint8_t *texture,
 const uint8_t *skin, const uint8_t *im_alpha,
 uint8_t *out, npy_intp pixels) {
-const
+const __m256 inv255 = _mm256_set1_ps(1.0f/255.0f);
+const __m256i mask_ff = _mm256_set1_epi32(0xFF);
+const __m256i idx_rgb = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
+const __m256i idx_rgba = _mm256_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28);
 const __m256 half = _mm256_set1_ps(0.5f);
 const __m256 one = _mm256_set1_ps(1.0f);
 const __m256 w = _mm256_set1_ps((float)SKIN_WEIGHT);
@@ -479,48 +494,52 @@ static void kernel_avx2_rgba(const uint8_t *base, const uint8_t *texture,

 npy_intp i = 0;
 for (; i + 8 <= pixels; i += 8) {
-
-
-
+const uint8_t *base_blk = base + 3*i;
+const uint8_t *tex_blk = texture + 4*i;
+const uint8_t *skin_blk = skin + 3*i;

-__m256
-__m256
-__m256
-__m256 ft_a = gather_u8_to_unit_f32_avx2(texture+3, stride4, i); /* texture alpha */
+__m256 fb_r = gather_u8_block_to_unit_f32_avx2(base_blk + 0, idx_rgb, mask_ff, inv255);
+__m256 fb_g = gather_u8_block_to_unit_f32_avx2(base_blk + 1, idx_rgb, mask_ff, inv255);
+__m256 fb_b = gather_u8_block_to_unit_f32_avx2(base_blk + 2, idx_rgb, mask_ff, inv255);

-__m256
-__m256
-__m256
+__m256 ft_r = gather_u8_block_to_unit_f32_avx2(tex_blk + 0, idx_rgba, mask_ff, inv255);
+__m256 ft_g = gather_u8_block_to_unit_f32_avx2(tex_blk + 1, idx_rgba, mask_ff, inv255);
+__m256 ft_b = gather_u8_block_to_unit_f32_avx2(tex_blk + 2, idx_rgba, mask_ff, inv255);
+__m256 ft_a = gather_u8_block_to_unit_f32_avx2(tex_blk + 3, idx_rgba, mask_ff, inv255); /* texture alpha */

-__m256
+__m256 fs_r = gather_u8_block_to_unit_f32_avx2(skin_blk + 0, idx_rgb, mask_ff, inv255);
+__m256 fs_g = gather_u8_block_to_unit_f32_avx2(skin_blk + 1, idx_rgb, mask_ff, inv255);
+__m256 fs_b = gather_u8_block_to_unit_f32_avx2(skin_blk + 2, idx_rgb, mask_ff, inv255);
+
+__m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i, inv255);
 __m256 fta = _mm256_mul_ps(ft_a, fa_im); /* texture_alpha */
-__m256 fit_a =
+__m256 fit_a = fnmadd_ps256(fta, one, one); /* inverse_tpa */

 __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
 __m256 gm_g = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g, fs_g), half));
 __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));

-gm_r =
-gm_g =
-gm_b =
+gm_r = mul_add_ps256(gm_r, fta, _mm256_mul_ps(ft_r, fit_a));
+gm_g = mul_add_ps256(gm_g, fta, _mm256_mul_ps(ft_g, fit_a));
+gm_b = mul_add_ps256(gm_b, fta, _mm256_mul_ps(ft_b, fit_a));

-gm_r =
-gm_g =
-gm_b =
+gm_r = mul_add_ps256(gm_r, invw, _mm256_mul_ps(fs_r, w));
+gm_g = mul_add_ps256(gm_g, invw, _mm256_mul_ps(fs_g, w));
+gm_b = mul_add_ps256(gm_b, invw, _mm256_mul_ps(fs_b, w));

 gm_r = nan_to_num_ps(gm_r);
 gm_g = nan_to_num_ps(gm_g);
 gm_b = nan_to_num_ps(gm_b);

-__m256 fr =
-__m256 fg =
-__m256 fb =
+__m256 fr = mul_add_ps256(gm_r, fta, _mm256_mul_ps(fb_r, fit_a));
+__m256 fg = mul_add_ps256(gm_g, fta, _mm256_mul_ps(fb_g, fit_a));
+__m256 fb = mul_add_ps256(gm_b, fta, _mm256_mul_ps(fb_b, fit_a));

 store_unit_f32_to_u8_rgb8_avx2(fr, fg, fb, out, i);
 }

 if (i < pixels) {
-kernel_scalar_rgba(base + 3*i, texture + 4*i, skin +
+kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 3*i, im_alpha + i,
 out + 3*i, pixels - i);
 }
 }
@@ -550,6 +569,13 @@ static inline __m128 nan_to_num_ps128(__m128 x) {
 return _mm_blendv_ps(_mm_set1_ps(0.0f), x, cmp);
 }

+static inline __m128 mul_add_ps128(__m128 a, __m128 b, __m128 c) {
+#ifdef __FMA__
+return _mm_fmadd_ps(a, b, c);
+#else
+return _mm_add_ps(_mm_mul_ps(a, b), c);
+#endif
+}

 static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
 const uint8_t *skin, const uint8_t *im_alpha,
@@ -575,12 +601,12 @@ static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
 __m128 ft_b = u8x4_to_unit_f32(texture[3*(i+0)+2], texture[3*(i+1)+2],
 texture[3*(i+2)+2], texture[3*(i+3)+2]);

-__m128 fs_r = u8x4_to_unit_f32(skin[
-skin[
-__m128 fs_g = u8x4_to_unit_f32(skin[
-skin[
-__m128 fs_b = u8x4_to_unit_f32(skin[
-skin[
+__m128 fs_r = u8x4_to_unit_f32(skin[3*(i+0)+0], skin[3*(i+1)+0],
+skin[3*(i+2)+0], skin[3*(i+3)+0]);
+__m128 fs_g = u8x4_to_unit_f32(skin[3*(i+0)+1], skin[3*(i+1)+1],
+skin[3*(i+2)+1], skin[3*(i+3)+1]);
+__m128 fs_b = u8x4_to_unit_f32(skin[3*(i+0)+2], skin[3*(i+1)+2],
+skin[3*(i+2)+2], skin[3*(i+3)+2]);

 __m128 fa_im = load4_u8_to_unit_f32(im_alpha + i);
 __m128 fit_a = _mm_sub_ps(one, fa_im);
@@ -589,21 +615,21 @@ static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
 __m128 gm_g = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_g, fs_g), half));
 __m128 gm_b = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_b, fs_b), half));

-gm_r =
-gm_g =
-gm_b =
+gm_r = mul_add_ps128(gm_r, fa_im, _mm_mul_ps(ft_r, fit_a));
+gm_g = mul_add_ps128(gm_g, fa_im, _mm_mul_ps(ft_g, fit_a));
+gm_b = mul_add_ps128(gm_b, fa_im, _mm_mul_ps(ft_b, fit_a));

-gm_r =
-gm_g =
-gm_b =
+gm_r = mul_add_ps128(gm_r, invw, _mm_mul_ps(fs_r, w));
+gm_g = mul_add_ps128(gm_g, invw, _mm_mul_ps(fs_g, w));
+gm_b = mul_add_ps128(gm_b, invw, _mm_mul_ps(fs_b, w));

 gm_r = nan_to_num_ps128(gm_r);
 gm_g = nan_to_num_ps128(gm_g);
 gm_b = nan_to_num_ps128(gm_b);

-__m128 fr =
-__m128 fg =
-__m128 fb =
+__m128 fr = mul_add_ps128(gm_r, fa_im, _mm_mul_ps(fb_r, fit_a));
+__m128 fg = mul_add_ps128(gm_g, fa_im, _mm_mul_ps(fb_g, fit_a));
+__m128 fb = mul_add_ps128(gm_b, fa_im, _mm_mul_ps(fb_b, fit_a));

 float rr[4], gg[4], bb[4];
 _mm_storeu_ps(rr, fr);
@@ -621,7 +647,7 @@ static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
 }

 if (i < pixels) {
-kernel_scalar_rgb(base + 3*i, texture + 3*i, skin +
+kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 3*i, im_alpha + i,
 out + 3*i, pixels - i);
 }
 }
@@ -652,12 +678,12 @@ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
 __m128 ft_a = u8x4_to_unit_f32(texture[4*(i+0)+3], texture[4*(i+1)+3],
 texture[4*(i+2)+3], texture[4*(i+3)+3]);

-__m128 fs_r = u8x4_to_unit_f32(skin[
-skin[
-__m128 fs_g = u8x4_to_unit_f32(skin[
-skin[
-__m128 fs_b = u8x4_to_unit_f32(skin[
-skin[
+__m128 fs_r = u8x4_to_unit_f32(skin[3*(i+0)+0], skin[3*(i+1)+0],
+skin[3*(i+2)+0], skin[3*(i+3)+0]);
+__m128 fs_g = u8x4_to_unit_f32(skin[3*(i+0)+1], skin[3*(i+1)+1],
+skin[3*(i+2)+1], skin[3*(i+3)+1]);
+__m128 fs_b = u8x4_to_unit_f32(skin[3*(i+0)+2], skin[3*(i+1)+2],
+skin[3*(i+2)+2], skin[3*(i+3)+2]);

 __m128 fa_im = load4_u8_to_unit_f32(im_alpha + i);
 __m128 fta = _mm_mul_ps(ft_a, fa_im); /* texture_alpha */
@@ -667,21 +693,21 @@ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
 __m128 gm_g = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_g, fs_g), half));
 __m128 gm_b = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_b, fs_b), half));

-gm_r =
-gm_g =
-gm_b =
+gm_r = mul_add_ps128(gm_r, fta, _mm_mul_ps(ft_r, fit_a));
+gm_g = mul_add_ps128(gm_g, fta, _mm_mul_ps(ft_g, fit_a));
+gm_b = mul_add_ps128(gm_b, fta, _mm_mul_ps(ft_b, fit_a));

-gm_r =
-gm_g =
-gm_b =
+gm_r = mul_add_ps128(gm_r, invw, _mm_mul_ps(fs_r, w));
+gm_g = mul_add_ps128(gm_g, invw, _mm_mul_ps(fs_g, w));
+gm_b = mul_add_ps128(gm_b, invw, _mm_mul_ps(fs_b, w));

 gm_r = nan_to_num_ps128(gm_r);
 gm_g = nan_to_num_ps128(gm_g);
 gm_b = nan_to_num_ps128(gm_b);

-__m128 fr =
-__m128 fg =
-__m128 fb =
+__m128 fr = mul_add_ps128(gm_r, fta, _mm_mul_ps(fb_r, fit_a));
+__m128 fg = mul_add_ps128(gm_g, fta, _mm_mul_ps(fb_g, fit_a));
+__m128 fb = mul_add_ps128(gm_b, fta, _mm_mul_ps(fb_b, fit_a));

 float rr[4], gg[4], bb[4];
 _mm_storeu_ps(rr, fr);
@@ -699,7 +725,7 @@ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
 }

 if (i < pixels) {
-kernel_scalar_rgba(base + 3*i, texture + 4*i, skin +
+kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 3*i, im_alpha + i,
 out + 3*i, pixels - i);
 }
 }
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge/normal_grain_merge.pyi
RENAMED

@@ -16,7 +16,7 @@ def normal_grain_merge(
 Channel ordering doesn't matter as long as it is consistent.
 :param base: The base RGB image.
 :param texture: The texture, either RGB or RGBA.
-:param skin: The
+:param skin: The RGB skin cutout.
 :param im_alpha: The alpha from the cutout.
 :param kernel: Which kernel to use.
 The `auto` kernel chooses between avx2 and sse4.2 when compiled with gcc and uses `scaler` on Windows.
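Based on the signature and parameter descriptions above, and on how the tests call the extension, a minimal usage sketch follows. The import paths are an assumption from the package layout, and the file names are the ones the tests use, standing in for your own images:

import cv2
from normal_grain_merge import normal_grain_merge
from normal_grain_merge.kernel_kind import KernelKind

base = cv2.imread("base.png")                                    # (H, W, 3) uint8
texture = cv2.imread("texture.png")                              # (H, W, 3) or (H, W, 4) uint8
skin = base.copy()                                               # (H, W, 3) uint8 cutout
im_alpha = cv2.imread("skin.png", cv2.IMREAD_UNCHANGED)[..., 3]  # (H, W) uint8 alpha mask

result = normal_grain_merge(base, texture, skin, im_alpha, KernelKind.KERNEL_AVX2.value)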
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: normal_grain_merge
-Version: 0.0.2
+Version: 0.1.0
 Summary: Fused normal and grain merge C extension
 Author: Samuel Howard
 License: MIT

@@ -88,21 +88,22 @@ One of `KernelKind`.
 The entire reason for me writing this was NumPy being slow when this operation is in the hot path.
 So, I decided to write a SIMD version that does the type casting outside NumPy with only the intermediate values being in FP32.

-How much of a speedup is this? All numbers are from a Ryzen 7 4800H running
+How much of a speedup is this? All numbers are from a Ryzen 7 4800H running Ubuntu 24.04 and Python 3.12.3.

 | Method/Kernel     | Average Iteration Time |
 |-------------------|------------------------|
-| C scalar kernel   | 0.
-| C SSE4.2 kernel   | 0.
-| C AVX2 kernel     | 0.
-| NumPy version     | 0.
-| Old NumPy version | 0.
+| C scalar kernel   | 0.016007s |
+| C SSE4.2 kernel   | 0.011155s |
+| C AVX2 kernel     | 0.014575s |
+| NumPy version     | 0.190392s |
+| Old NumPy version | 0.274065s |

 | Method Comparison  | Speedup  |
 |--------------------|----------|
-| NumPy -> scalar    | 91.
-| NumPy -> SSE4.2    |
-| NumPy -> AVX2      | 92.
-| Old np -> SSE4.2   |
-|
-| C scalar ->
+| NumPy -> scalar    | 91.5927% |
+| NumPy -> SSE4.2    | 94.1409% |
+| NumPy -> AVX2      | 92.3448% |
+| Old np -> SSE4.2   | 95.9297% |
+| Old np -> AVX2     | 94.6819% |
+| C scalar -> SSE4.2 | 30.3086% |
+| C scalar -> AVX2   | 8.9448%  |
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/setup.py

@@ -12,7 +12,7 @@ if sys.platform == "win32":
 elif "arm" in arch or "aarch64" in arch:
 extra_compile_args += ["-O3"]
 else:
-extra_compile_args += ["-O3", "-march=x86-64", "-mavx2", "-msse4.2"]
+extra_compile_args += ["-O3", "-march=x86-64", "-mavx2", "-msse4.2", "-flto", "-mfma",]

 module = Extension(
 "normal_grain_merge.normal_grain_merge",

@@ -23,7 +23,7 @@ module = Extension(

 setup(
 name="normal_grain_merge",
-version="0.0.2",
+version="0.1.0",
 description="Normal grain merge C extension",
 ext_modules=[module],
 packages=["normal_grain_merge"],
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/tests/test_ngm.py

@@ -33,8 +33,8 @@ class TestNGM(unittest.TestCase):
 """
 self.base = cv2.imread("base.png")
 self.texture = cv2.imread("texture.png")
-self.skin =
-self.im_alpha =
+self.skin = self.base.copy()
+self.im_alpha = cv2.imread("skin.png", cv2.IMREAD_UNCHANGED)[..., 3]

 def test_dummy_arrays(self):
 """
@@ -42,7 +42,7 @@ class TestNGM(unittest.TestCase):
 """
 base = np.zeros((100, 100, 3), dtype=np.uint8)
 texture = np.zeros((100, 100, 3), dtype=np.uint8)
-skin = np.zeros((100, 100,
+skin = np.zeros((100, 100, 3), dtype=np.uint8)
 im_alpha = np.zeros((100, 100), dtype=np.uint8)

 result_scalar = normal_grain_merge(base, texture, skin, im_alpha, KernelKind.KERNEL_SCALAR.value)
@@ -53,15 +53,14 @@ class TestNGM(unittest.TestCase):
 """
 Test the common case; RGB versions of each kernel.
 """
-result_py = apply_texture(self.base, self.skin, self.texture, self.im_alpha)
+result_py = apply_texture(self.base, np.dstack([self.skin, self.im_alpha]), self.texture, self.im_alpha)
 self.skin = cv2.cvtColor(
 cv2.cvtColor(
-self.skin
+self.skin,
 cv2.COLOR_BGR2GRAY),
 cv2.COLOR_GRAY2BGR
 )
 # Skin is BGR at this point
-self.skin = np.dstack([self.skin, self.im_alpha])
 result_scalar = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SCALAR.value)
 result_sse = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SSE42.value)
 result_avx = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_AVX2.value)
@@ -81,15 +80,14 @@ class TestNGM(unittest.TestCase):
 """
 self.skin = cv2.cvtColor(
 cv2.cvtColor(
-self.skin
+self.skin,
 cv2.COLOR_BGR2GRAY),
 cv2.COLOR_GRAY2BGR
 )
 mask = vertical_fill(self.base.shape[0], self.base.shape[1], self.base.shape[1] // 2)
 new_alpha = np.bitwise_and(self.im_alpha, mask)
-self.skin = np.dstack((self.skin[..., :3], new_alpha))

-result_py = apply_texture(self.base, self.skin, self.texture, new_alpha)
+result_py = apply_texture(self.base, np.dstack((self.skin[..., :3], new_alpha)), self.texture, new_alpha)
 result_scalar = normal_grain_merge(self.base, self.texture, self.skin, new_alpha, KernelKind.KERNEL_SCALAR.value)
 result_sse = normal_grain_merge(self.base, self.texture, self.skin, new_alpha, KernelKind.KERNEL_SSE42.value)
 result_avx = normal_grain_merge(self.base, self.texture, self.skin, new_alpha, KernelKind.KERNEL_AVX2.value)
@@ -114,13 +112,12 @@ class TestNGM(unittest.TestCase):

 self.skin = cv2.cvtColor(
 cv2.cvtColor(
-self.skin
+self.skin,
 cv2.COLOR_BGR2GRAY),
 cv2.COLOR_GRAY2BGR
 )
 result_py = apply_texture(self.base, self.skin, self.texture, self.im_alpha)
 # Skin is BGR at this point
-self.skin = np.dstack([self.skin, self.im_alpha])
 result_scalar = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SCALAR.value)
 result_sse = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SSE42.value)
 result_avx = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_AVX2.value)
@@ -143,12 +140,11 @@ class TestNGM(unittest.TestCase):
 result_py = apply_texture(self.base, self.skin, self.texture, self.im_alpha)
 self.skin = cv2.cvtColor(
 cv2.cvtColor(
-self.skin
+self.skin,
 cv2.COLOR_BGR2GRAY),
 cv2.COLOR_GRAY2BGR
 )
 # Skin is BGR at this point
-self.skin = np.dstack([self.skin, self.im_alpha])
 result_scalar = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SCALAR.value)
 result_sse = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_SSE42.value)
 result_avx = normal_grain_merge(self.base, self.texture, self.skin, self.im_alpha, KernelKind.KERNEL_AVX2.value)
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/tests/test_speed.py

@@ -30,16 +30,15 @@ class TestNGM(unittest.TestCase):
 global_start = time.perf_counter()
 base = cv2.imread("base.png")
 texture = cv2.imread("texture.png")
-skin =
-im_alpha = skin[..., 3]
+skin = base.copy()
+im_alpha = cv2.imread("skin.png", cv2.IMREAD_UNCHANGED)[..., 3]
 skin = cv2.cvtColor(
 cv2.cvtColor(
-skin
+skin,
 cv2.COLOR_BGR2GRAY),
 cv2.COLOR_GRAY2BGR
 )
 # Skin is BGR at this point
-skin = np.dstack([skin, im_alpha])

 # Scaler kernel
 start_c_scalar = time.perf_counter()
@@ -60,7 +59,6 @@ class TestNGM(unittest.TestCase):
 end_c_avx = time.perf_counter()

 # NumPy "just do less" version.
-skin = skin[..., :3]
 start_py = time.perf_counter()
 for _ in range(ITERATIONS):
 result = normal_grain_merge_py(base, texture, skin, im_alpha)
@@ -90,6 +88,7 @@ class TestNGM(unittest.TestCase):
 f"NumPy -> SSE4.2: {percent_change(c_avg_sse, np_avg):.4f}%\n"
 f"NumPy -> AVX2: {percent_change(c_avg_avx, np_avg):.4f}%\n"
 f"Old np -> SSE: {percent_change(c_avg_sse, np_old_avg):.4f}%\n"
+f"Old np -> AVX2: {percent_change(c_avg_avx, np_old_avg):.4f}%\n"
 f"C scalar -> SSE: {percent_change(c_avg_sse, c_avg_scalar):.4f}%\n"
 f"C scalar -> AVX: {percent_change(c_avg_avx, c_avg_scalar):.4f}%\n")
 print(f"Test time: {end - global_start:.4f}s")
{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/SOURCES.txt
RENAMED
File without changes

{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/dependency_links.txt
File without changes

{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/requires.txt
RENAMED
File without changes

{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/normal_grain_merge.egg-info/top_level.txt
RENAMED
File without changes

{normal_grain_merge-0.0.2 → normal_grain_merge-0.1.0}/setup.cfg
File without changes