normal-grain-merge 0.0.0__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,2 @@
+ from .normal_grain_merge import normal_grain_merge
+ from .kernel_kind import KernelKind
@@ -0,0 +1,8 @@
+ from enum import Enum
+
+
+ class KernelKind(Enum):
+     KERNEL_AUTO = "auto"
+     KERNEL_SCALAR = "scalar"
+     KERNEL_SSE42 = "sse42"
+     KERNEL_AVX2 = "avx2"
@@ -0,0 +1,865 @@
1
+ #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
2
+ #include <stdio.h>
3
+ #include <math.h>
4
+ #include <float.h>
5
+ #include <Python.h>
6
+ #include <numpy/arrayobject.h>
7
+ #include <smmintrin.h>
8
+ #include <immintrin.h> /* AVX2 + SSE4.2 */
9
+
10
+ /* ----- Runtime CPU feature detection (GCC/Clang + MSVC) ----- */
11
+ #if defined(_MSC_VER)
12
+ #include <intrin.h>
13
+ static int os_supports_avx(void) {
14
+ /* Check OSXSAVE + XCR0[2:1] == 11b so OS saves YMM state */
15
+ int cpuInfo[4];
16
+ __cpuid(cpuInfo, 1);
17
+ int ecx = cpuInfo[2];
18
+ int osxsave = (ecx >> 27) & 1;
19
+ if (!osxsave) return 0;
20
+ unsigned long long xcr0 = _xgetbv(0);
21
+ return ((xcr0 & 0x6) == 0x6); /* XMM (bit1) and YMM (bit2) state enabled */
22
+ }
23
+
24
+ static int cpu_supports_avx2(void) {
25
+ int cpuInfo[4];
26
+ __cpuid(cpuInfo, 1);
27
+ int ecx = cpuInfo[2];
28
+ int avx = (ecx >> 28) & 1;
29
+ int osxsave = (ecx >> 27) & 1;
30
+ if (!(avx && osxsave && os_supports_avx())) return 0;
31
+
32
+ /* Leaf 7, subleaf 0: EBX bit 5 = AVX2 */
33
+ int ex[4];
34
+ __cpuidex(ex, 7, 0);
35
+ int ebx = ex[1];
36
+ return (ebx >> 5) & 1;
37
+ }
38
+
39
+ static int cpu_supports_sse42(void) {
40
+ int cpuInfo[4];
41
+ __cpuid(cpuInfo, 1);
42
+ int ecx = cpuInfo[2];
43
+ return (ecx >> 20) & 1; /* SSE4.2 */
44
+ }
45
+ #else
46
+ /* GCC/Clang path */
47
+ static int os_supports_avx(void) {
48
+ #if defined(__GNUC__) || defined(__clang__)
49
+ /* If we’re here, assume OS supports AVX when the CPU supports it.
50
+ For full rigor you can also call xgetbv via inline asm, but it’s uncommon to lack it. */
51
+ return 1;
52
+ #else
53
+ return 0;
54
+ #endif
55
+ }
56
+
57
+ static int cpu_supports_avx2(void) {
58
+ #if defined(__GNUC__) || defined(__clang__)
59
+ /* Requires -mavx2 at compile, but we only *call* the AVX2 kernel if true. */
60
+ return __builtin_cpu_supports("avx2");
61
+ #else
62
+ return 0;
63
+ #endif
64
+ }
65
+
66
+ static int cpu_supports_sse42(void) {
67
+ #if defined(__GNUC__) || defined(__clang__)
68
+ return __builtin_cpu_supports("sse4.2");
69
+ #else
70
+ return 0;
71
+ #endif
72
+ }
73
+ #endif
74
+
75
+ #define SKIN_WEIGHT 0.3f
76
+
77
+ typedef enum {
78
+ KERNEL_AUTO = 0,
79
+ KERNEL_SCALAR = 1,
80
+ KERNEL_SSE42 = 2,
81
+ KERNEL_AVX2 = 3
82
+ } kernel_kind;
83
+
84
+ /* ---------- Utility: safe views, shape checks ---------- */
85
+
86
+ /* Make a new uint8, C-contiguous, aligned view we own. Never DECREF the input obj. */
87
+ static inline int get_uint8_c_contig(PyObject *obj, PyArrayObject **out, const char *name) {
88
+ const int flags = NPY_ARRAY_ALIGNED | NPY_ARRAY_C_CONTIGUOUS;
89
+ PyArrayObject *arr = (PyArrayObject*)PyArray_FROM_OTF(obj, NPY_UINT8, flags);
90
+ if (!arr) {
91
+ PyErr_Format(PyExc_TypeError, "%s must be a uint8 ndarray", name);
92
+ return 0;
93
+ }
94
+ *out = arr; /* new reference */
95
+ return 1;
96
+ }
97
+
98
+ static inline int ensure_uint8_contig(PyArrayObject **arr, const char *name) {
99
+ PyArrayObject *tmp = (PyArrayObject*)PyArray_FROM_OTF(
100
+ (PyObject*)(*arr), NPY_UINT8, NPY_ARRAY_ALIGNED | NPY_ARRAY_C_CONTIGUOUS);
101
+ if (!tmp) return 0;
102
+ Py_XDECREF(*arr);
103
+ *arr = tmp;
104
+ return 1;
105
+ }
106
+
107
+ static inline int check_shape_requirements(PyArrayObject *base,
108
+ PyArrayObject *texture,
109
+ PyArrayObject *skin,
110
+ PyArrayObject *im_alpha,
111
+ int *texture_has_alpha,
112
+ npy_intp *height,
113
+ npy_intp *width) {
114
+ if (PyArray_NDIM(base) != 3 || PyArray_DIMS(base)[2] != 3) {
115
+ PyErr_SetString(PyExc_ValueError, "base must have shape (H, W, 3)");
116
+ return 0;
117
+ }
118
+ if (PyArray_NDIM(texture) != 3) {
119
+ PyErr_SetString(PyExc_ValueError, "texture must have shape (H, W, 3) or (H, W, 4)");
120
+ return 0;
121
+ }
122
+ npy_intp tc = PyArray_DIMS(texture)[2];
123
+ if (!(tc == 3 || tc == 4)) {
124
+ PyErr_SetString(PyExc_ValueError, "texture must have 3 or 4 channels");
125
+ return 0;
126
+ }
127
+ *texture_has_alpha = (tc == 4);
128
+
129
+ if (PyArray_NDIM(skin) != 3 || PyArray_DIMS(skin)[2] != 4) {
130
+ PyErr_SetString(PyExc_ValueError, "skin must have shape (H, W, 4)");
131
+ return 0;
132
+ }
133
+ if (PyArray_NDIM(im_alpha) != 2) {
134
+ PyErr_SetString(PyExc_ValueError, "im_alpha must have shape (H, W)");
135
+ return 0;
136
+ }
137
+
138
+ npy_intp h = PyArray_DIMS(base)[0], w = PyArray_DIMS(base)[1];
139
+ if (PyArray_DIMS(texture)[0] != h || PyArray_DIMS(texture)[1] != w ||
140
+ PyArray_DIMS(skin)[0] != h || PyArray_DIMS(skin)[1] != w ||
141
+ PyArray_DIMS(im_alpha)[0] != h|| PyArray_DIMS(im_alpha)[1] != w) {
142
+ PyErr_SetString(PyExc_ValueError, "All inputs must share the same H and W");
143
+ return 0;
144
+ }
145
+ *height = h; *width = w;
146
+ return 1;
147
+ }
148
+
149
+ /* ---------- Scalar reference kernel (clear, correct, easy to modify) ---------- */
150
+ /* Converts uint8 to float32 in [0,1], applies the blend, writes back to uint8. */
151
+ /* The SIMD kernels below implement the same math. */
152
+
153
+ /*
154
+ * Converts NaN to 0, positive infinity to 255, and negative infinity to 0.
155
+ */
156
+ static inline float nan_to_num(float x) {
157
+ if (isnan(x)) {
158
+ return 0.0f; // replace NaN with 0
159
+ }
160
+ if (isinf(x)) {
161
+ if (x > 0) {
162
+ return 255.0f; // positive infinity -> max uint8
163
+ } else {
164
+ return 0.0f; // negative infinity -> min uint8
165
+ }
166
+ }
167
+ else {
168
+ return x; // keep finite values as they are
169
+ }
170
+ }
171
+
172
+ /*
173
+ * Scalar kernel for RGB texture input.
174
+ */
175
+ static void kernel_scalar_rgb(const uint8_t *base, const uint8_t *texture,
176
+ const uint8_t *skin, const uint8_t *im_alpha,
177
+ uint8_t *out, npy_intp pixels) {
178
+ for (npy_intp i = 0; i < pixels; ++i) {
179
+ const uint8_t b_r = base[3*i+0];
180
+ const uint8_t b_g = base[3*i+1];
181
+ const uint8_t b_b = base[3*i+2];
182
+
183
+ const uint8_t t_r = texture[3*i+0];
184
+ const uint8_t t_g = texture[3*i+1];
185
+ const uint8_t t_b = texture[3*i+2];
186
+
187
+ const uint8_t s_r = skin[4*i+0];
188
+ const uint8_t s_g = skin[4*i+1];
189
+ const uint8_t s_b = skin[4*i+2];
190
+ const uint8_t s_a = skin[4*i+3];
191
+
192
+ const uint8_t a_im = im_alpha[i];
193
+
194
+ /* float32 intermediates in [0,1] */
195
+ const float fb_r = b_r * (1.0f/255.0f);
196
+ const float fb_g = b_g * (1.0f/255.0f);
197
+ const float fb_b = b_b * (1.0f/255.0f);
198
+
199
+ const float ft_r = t_r * (1.0f/255.0f);
200
+ const float ft_g = t_g * (1.0f/255.0f);
201
+ const float ft_b = t_b * (1.0f/255.0f);
202
+
203
+ const float fs_r = s_r * (1.0f/255.0f);
204
+ const float fs_g = s_g * (1.0f/255.0f);
205
+ const float fs_b = s_b * (1.0f/255.0f);
206
+ const float fs_a = s_a * (1.0f/255.0f);
207
+
208
+ const float fa_im = a_im * (1.0f/255.0f);
209
+
210
+ /*
211
+ **********************
212
+ * normal grain merge *
213
+ **********************
214
+ */
215
+
216
+ /* inverse_tpa */
217
+ float fit_a = 1.0f - fa_im;
218
+ /* gm_out = np.clip(texture + skin - 0.5, 0.0, 1.0) */
219
+ float fr = ft_r + fs_r - 0.5f;
220
+ float fg = ft_g + fs_g - 0.5f;
221
+ float fb = ft_b + fs_b - 0.5f;
222
+ /* np.clip */
223
+ fr = fr < 0.0f ? 0.0f : (fr > 1.0f ? 1.0f : fr);
224
+ fg = fg < 0.0f ? 0.0f : (fg > 1.0f ? 1.0f : fg);
225
+ fb = fb < 0.0f ? 0.0f : (fb > 1.0f ? 1.0f : fb);
226
+ /* gm_out = gm_out * texture_alpha + texture * inverse_tpa */
227
+ fr = fr * fa_im + ft_r * fit_a;
228
+ fg = fg * fa_im + ft_g * fit_a;
229
+ fb = fb * fa_im + ft_b * fit_a;
230
+
231
+ /* gm_out = gm_out * (1 - SKIN_WEIGHT) + (skin * SKIN_WEIGHT) */
232
+ fr = fr * (1.0f - SKIN_WEIGHT) + fs_r * SKIN_WEIGHT;
233
+ fg = fg * (1.0f - SKIN_WEIGHT) + fs_g * SKIN_WEIGHT;
234
+ fb = fb * (1.0f - SKIN_WEIGHT) + fs_b * SKIN_WEIGHT;
235
+
236
+ /* np.nan_to_num(gm_out, copy=False) */
237
+ fr = nan_to_num(fr);
238
+ fg = nan_to_num(fg);
239
+ fb = nan_to_num(fb);
240
+
241
+ /* Normal merge
242
+ * n_out = gm_out * texture_alpha + base * inverse_tpa
243
+ *
244
+ * In this case, texture_alpha is im_alpha (the skin cutout's alpha) since texture has no alpha channel here.
245
+ */
246
+ fr = fr * fa_im + fb_r * fit_a;
247
+ fg = fg * fa_im + fb_g * fit_a;
248
+ fb = fb * fa_im + fb_b * fit_a;
249
+
250
+
251
+ out[3*i+0] = (uint8_t)(fr * 255.0f);
252
+ out[3*i+1] = (uint8_t)(fg * 255.0f);
253
+ out[3*i+2] = (uint8_t)(fb * 255.0f);
254
+ }
255
+ }
256
+
257
+ static void kernel_scalar_rgba(const uint8_t *base, const uint8_t *texture,
258
+ const uint8_t *skin, const uint8_t *im_alpha,
259
+ uint8_t *out, npy_intp pixels) {
260
+ for (npy_intp i = 0; i < pixels; ++i) {
261
+ const uint8_t b_r = base[3*i+0];
262
+ const uint8_t b_g = base[3*i+1];
263
+ const uint8_t b_b = base[3*i+2];
264
+
265
+ const uint8_t t_r = texture[4*i+0];
266
+ const uint8_t t_g = texture[4*i+1];
267
+ const uint8_t t_b = texture[4*i+2];
268
+ const uint8_t t_a = texture[4*i+3]; /* present in RGBA branch */
269
+
270
+ const uint8_t s_r = skin[4*i+0];
271
+ const uint8_t s_g = skin[4*i+1];
272
+ const uint8_t s_b = skin[4*i+2];
273
+ const uint8_t s_a = skin[4*i+3];
274
+
275
+ const uint8_t a_im = im_alpha[i];
276
+
277
+ const float fb_r = b_r * (1.0f/255.0f);
278
+ const float fb_g = b_g * (1.0f/255.0f);
279
+ const float fb_b = b_b * (1.0f/255.0f);
280
+
281
+ const float ft_r = t_r * (1.0f/255.0f);
282
+ const float ft_g = t_g * (1.0f/255.0f);
283
+ const float ft_b = t_b * (1.0f/255.0f);
284
+ float ft_a = t_a * (1.0f/255.0f);
285
+
286
+ const float fs_r = s_r * (1.0f/255.0f);
287
+ const float fs_g = s_g * (1.0f/255.0f);
288
+ const float fs_b = s_b * (1.0f/255.0f);
289
+ const float fs_a = s_a * (1.0f/255.0f);
290
+
291
+ const float fa_im = a_im * (1.0f/255.0f);
292
+
293
+ /*
294
+ **********************
295
+ * normal grain merge *
296
+ **********************
297
+ */
298
+ /* Merge texture and skin alphas */
299
+
300
+ /* texture_alpha = texture[..., 3] * im_alpha*/
301
+ ft_a = ft_a * fa_im;
302
+ /* inverse_tpa = 1 - texture_alpha */
303
+ float fit_a = 1.0f - ft_a;
304
+
305
+ /* gm_out = np.clip(texture + skin - 0.5, 0.0, 1.0) */
306
+ float fr = ft_r + fs_r - 0.5f;
307
+ float fg = ft_g + fs_g - 0.5f;
308
+ float fb = ft_b + fs_b - 0.5f;
309
+ /* np.clip */
310
+ fr = fr < 0.0f ? 0.0f : (fr > 1.0f ? 1.0f : fr);
311
+ fg = fg < 0.0f ? 0.0f : (fg > 1.0f ? 1.0f : fg);
312
+ fb = fb < 0.0f ? 0.0f : (fb > 1.0f ? 1.0f : fb);
313
+
314
+ /* gm_out = gm_out * texture_alpha + texture * inverse_tpa */
315
+ fr = fr * ft_a + ft_r * fit_a;
316
+ fg = fg * ft_a + ft_g * fit_a;
317
+ fb = fb * ft_a + ft_b * fit_a;
318
+
319
+
320
+ /* gm_out = gm_out * (1 - SKIN_WEIGHT) + (skin * SKIN_WEIGHT) */
321
+ fr = fr * (1.0f - SKIN_WEIGHT) + fs_r * SKIN_WEIGHT;
322
+ fg = fg * (1.0f - SKIN_WEIGHT) + fs_g * SKIN_WEIGHT;
323
+ fb = fb * (1.0f - SKIN_WEIGHT) + fs_b * SKIN_WEIGHT;
324
+
325
+ /* np.nan_to_num(gm_out, copy=False) */
326
+ fr = nan_to_num(fr);
327
+ fg = nan_to_num(fg);
328
+ fb = nan_to_num(fb);
329
+
330
+ /* Normal merge
331
+ * n_out = gm_out * texture_alpha + base * inverse_tpa
332
+ */
333
+ fr = fr * ft_a + fb_r * fit_a;
334
+ fg = fg * ft_a + fb_g * fit_a;
335
+ fb = fb * ft_a + fb_b * fit_a;
336
+
337
+ out[3*i+0] = (uint8_t)(fr * 255.0f);
338
+ out[3*i+1] = (uint8_t)(fg * 255.0f);
339
+ out[3*i+2] = (uint8_t)(fb * 255.0f);
340
+ }
341
+ }
342
+
343
+ /* ---------- AVX2 helpers ----------
344
+ Interleaved RGB(A) is awkward for SIMD. For a skeleton that still uses AVX2, we use gathers
345
+ over a stride (3 or 4) to pull 8 pixels for a given channel into a vector.
346
+ You can later replace gathers with better deinterleaving if needed.
347
+ */
348
+
349
+ /* Convert 8 u8 interleaved channel samples (stride 3 or 4) to float32 in [0,1] via gather. */
350
+ static inline __m256 gather_u8_to_unit_f32_avx2(const uint8_t *base_ptr, int stride,
351
+ npy_intp start_idx) {
352
+ const int i0 = (int)((start_idx + 0) * stride);
353
+ const int i1 = (int)((start_idx + 1) * stride);
354
+ const int i2 = (int)((start_idx + 2) * stride);
355
+ const int i3 = (int)((start_idx + 3) * stride);
356
+ const int i4 = (int)((start_idx + 4) * stride);
357
+ const int i5 = (int)((start_idx + 5) * stride);
358
+ const int i6 = (int)((start_idx + 6) * stride);
359
+ const int i7 = (int)((start_idx + 7) * stride);
360
+
361
+ __m256i offs = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7);
362
+ __m256i v32 = _mm256_i32gather_epi32((const int*)base_ptr, offs, 1); /* read 8 x u8 as u32 */
363
+ v32 = _mm256_and_si256(v32, _mm256_set1_epi32(0xFF));
364
+ return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), _mm256_set1_ps(1.0f/255.0f));
365
+ }
366
+
367
+ /* Convert 8 consecutive u8 to float32 in [0,1] (for grayscale im_alpha). */
368
+ static inline __m256 load8_u8_to_unit_f32_avx2(const uint8_t *p) {
369
+ __m128i v8 = _mm_loadl_epi64((const __m128i*)p); /* 8 bytes -> XMM */
370
+ __m256i v32 = _mm256_cvtepu8_epi32(v8); /* widen to 8 x u32 */
371
+ return _mm256_mul_ps(_mm256_cvtepi32_ps(v32), _mm256_set1_ps(1.0f/255.0f));
372
+ }
373
+
374
+ static inline __m256 clamp01_ps(__m256 x) {
375
+ return _mm256_min_ps(_mm256_max_ps(x, _mm256_set1_ps(0.0f)), _mm256_set1_ps(1.0f));
376
+ }
377
+
378
+ /* Replace NaN with 0.0f (Inf is not expected from uint8-origin math). */
379
+ static inline __m256 nan_to_num_ps(__m256 x) {
380
+ __m256 cmp = _mm256_cmp_ps(x, x, _CMP_ORD_Q); /* 0 for NaN lanes */
381
+ return _mm256_blendv_ps(_mm256_set1_ps(0.0f), x, cmp);
382
+ }
383
+
384
+ /* Truncate [0,1] floats to uint8 and scatter to interleaved RGB output. */
385
+ static inline void store_unit_f32_to_u8_rgb8_avx2(__m256 fr, __m256 fg, __m256 fb,
386
+ uint8_t *out_ptr, npy_intp start_idx) {
387
+ __m256 scale = _mm256_set1_ps(255.0f);
388
+ __m256i ir = _mm256_cvttps_epi32(_mm256_mul_ps(fr, scale));
389
+ __m256i ig = _mm256_cvttps_epi32(_mm256_mul_ps(fg, scale));
390
+ __m256i ib = _mm256_cvttps_epi32(_mm256_mul_ps(fb, scale));
391
+
392
+ int r[8], g[8], b[8];
393
+ _mm256_storeu_si256((__m256i*)r, ir);
394
+ _mm256_storeu_si256((__m256i*)g, ig);
395
+ _mm256_storeu_si256((__m256i*)b, ib);
396
+
397
+ for (int k = 0; k < 8; ++k) {
398
+ const npy_intp p = start_idx + k;
399
+ out_ptr[3*p+0] = (uint8_t)(r[k] < 0 ? 0 : r[k] > 255 ? 255 : r[k]);
400
+ out_ptr[3*p+1] = (uint8_t)(g[k] < 0 ? 0 : g[k] > 255 ? 255 : g[k]);
401
+ out_ptr[3*p+2] = (uint8_t)(b[k] < 0 ? 0 : b[k] > 255 ? 255 : b[k]);
402
+ }
403
+ }
404
+
405
+ /* texture is RGB: texture_alpha = im_alpha broadcast, inverse_tpa = 1 - texture_alpha */
406
+ static void kernel_avx2_rgb(const uint8_t *base, const uint8_t *texture,
407
+ const uint8_t *skin, const uint8_t *im_alpha,
408
+ uint8_t *out, npy_intp pixels) {
409
+ const int stride3 = 3, stride4 = 4;
410
+ const __m256 half = _mm256_set1_ps(0.5f);
411
+ const __m256 one = _mm256_set1_ps(1.0f);
412
+ const __m256 w = _mm256_set1_ps((float)SKIN_WEIGHT);
413
+ const __m256 invw = _mm256_set1_ps(1.0f - (float)SKIN_WEIGHT);
414
+
415
+ npy_intp i = 0;
416
+ for (; i + 8 <= pixels; i += 8) {
417
+ /* base RGB in [0,1] */
418
+ __m256 fb_r = gather_u8_to_unit_f32_avx2(base+0, stride3, i);
419
+ __m256 fb_g = gather_u8_to_unit_f32_avx2(base+1, stride3, i);
420
+ __m256 fb_b = gather_u8_to_unit_f32_avx2(base+2, stride3, i);
421
+
422
+ /* texture RGB in [0,1] */
423
+ __m256 ft_r = gather_u8_to_unit_f32_avx2(texture+0, stride3, i);
424
+ __m256 ft_g = gather_u8_to_unit_f32_avx2(texture+1, stride3, i);
425
+ __m256 ft_b = gather_u8_to_unit_f32_avx2(texture+2, stride3, i);
426
+
427
+ /* skin RGB in [0,1] */
428
+ __m256 fs_r = gather_u8_to_unit_f32_avx2(skin+0, stride4, i);
429
+ __m256 fs_g = gather_u8_to_unit_f32_avx2(skin+1, stride4, i);
430
+ __m256 fs_b = gather_u8_to_unit_f32_avx2(skin+2, stride4, i);
431
+
432
+ /* texture_alpha = im_alpha */
433
+ __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i);
434
+ __m256 fit_a = _mm256_sub_ps(one, fa_im);
435
+
436
+ /* gm_out = clip(texture + skin - 0.5) */
437
+ __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
438
+ __m256 gm_g = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g, fs_g), half));
439
+ __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));
440
+
441
+ /* gm_out = gm_out * texture_alpha + texture * inverse_tpa */
442
+ gm_r = _mm256_add_ps(_mm256_mul_ps(gm_r, fa_im), _mm256_mul_ps(ft_r, fit_a));
443
+ gm_g = _mm256_add_ps(_mm256_mul_ps(gm_g, fa_im), _mm256_mul_ps(ft_g, fit_a));
444
+ gm_b = _mm256_add_ps(_mm256_mul_ps(gm_b, fa_im), _mm256_mul_ps(ft_b, fit_a));
445
+
446
+ /* gm_out = gm_out * (1 - w) + skin * w */
447
+ gm_r = _mm256_add_ps(_mm256_mul_ps(gm_r, invw), _mm256_mul_ps(fs_r, w));
448
+ gm_g = _mm256_add_ps(_mm256_mul_ps(gm_g, invw), _mm256_mul_ps(fs_g, w));
449
+ gm_b = _mm256_add_ps(_mm256_mul_ps(gm_b, invw), _mm256_mul_ps(fs_b, w));
450
+
451
+ /* nan_to_num */
452
+ gm_r = nan_to_num_ps(gm_r);
453
+ gm_g = nan_to_num_ps(gm_g);
454
+ gm_b = nan_to_num_ps(gm_b);
455
+
456
+ /* n_out = gm_out * texture_alpha + base * inverse_tpa */
457
+ __m256 fr = _mm256_add_ps(_mm256_mul_ps(gm_r, fa_im), _mm256_mul_ps(fb_r, fit_a));
458
+ __m256 fg = _mm256_add_ps(_mm256_mul_ps(gm_g, fa_im), _mm256_mul_ps(fb_g, fit_a));
459
+ __m256 fb = _mm256_add_ps(_mm256_mul_ps(gm_b, fa_im), _mm256_mul_ps(fb_b, fit_a));
460
+
461
+ store_unit_f32_to_u8_rgb8_avx2(fr, fg, fb, out, i);
462
+ }
463
+
464
+ if (i < pixels) {
465
+ kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 4*i, im_alpha + i,
466
+ out + 3*i, pixels - i);
467
+ }
468
+ }
469
+
470
+ /* texture is RGBA: texture_alpha = texture.A * im_alpha, inverse_tpa = 1 - texture_alpha */
471
+ static void kernel_avx2_rgba(const uint8_t *base, const uint8_t *texture,
472
+ const uint8_t *skin, const uint8_t *im_alpha,
473
+ uint8_t *out, npy_intp pixels) {
474
+ const int stride3 = 3, stride4 = 4;
475
+ const __m256 half = _mm256_set1_ps(0.5f);
476
+ const __m256 one = _mm256_set1_ps(1.0f);
477
+ const __m256 w = _mm256_set1_ps((float)SKIN_WEIGHT);
478
+ const __m256 invw = _mm256_set1_ps(1.0f - (float)SKIN_WEIGHT);
479
+
480
+ npy_intp i = 0;
481
+ for (; i + 8 <= pixels; i += 8) {
482
+ __m256 fb_r = gather_u8_to_unit_f32_avx2(base+0, stride3, i);
483
+ __m256 fb_g = gather_u8_to_unit_f32_avx2(base+1, stride3, i);
484
+ __m256 fb_b = gather_u8_to_unit_f32_avx2(base+2, stride3, i);
485
+
486
+ __m256 ft_r = gather_u8_to_unit_f32_avx2(texture+0, stride4, i);
487
+ __m256 ft_g = gather_u8_to_unit_f32_avx2(texture+1, stride4, i);
488
+ __m256 ft_b = gather_u8_to_unit_f32_avx2(texture+2, stride4, i);
489
+ __m256 ft_a = gather_u8_to_unit_f32_avx2(texture+3, stride4, i); /* texture alpha */
490
+
491
+ __m256 fs_r = gather_u8_to_unit_f32_avx2(skin+0, stride4, i);
492
+ __m256 fs_g = gather_u8_to_unit_f32_avx2(skin+1, stride4, i);
493
+ __m256 fs_b = gather_u8_to_unit_f32_avx2(skin+2, stride4, i);
494
+
495
+ __m256 fa_im = load8_u8_to_unit_f32_avx2(im_alpha + i);
496
+ __m256 fta = _mm256_mul_ps(ft_a, fa_im); /* texture_alpha */
497
+ __m256 fit_a = _mm256_sub_ps(one, fta); /* inverse_tpa */
498
+
499
+ __m256 gm_r = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_r, fs_r), half));
500
+ __m256 gm_g = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_g, fs_g), half));
501
+ __m256 gm_b = clamp01_ps(_mm256_sub_ps(_mm256_add_ps(ft_b, fs_b), half));
502
+
503
+ gm_r = _mm256_add_ps(_mm256_mul_ps(gm_r, fta), _mm256_mul_ps(ft_r, fit_a));
504
+ gm_g = _mm256_add_ps(_mm256_mul_ps(gm_g, fta), _mm256_mul_ps(ft_g, fit_a));
505
+ gm_b = _mm256_add_ps(_mm256_mul_ps(gm_b, fta), _mm256_mul_ps(ft_b, fit_a));
506
+
507
+ gm_r = _mm256_add_ps(_mm256_mul_ps(gm_r, invw), _mm256_mul_ps(fs_r, w));
508
+ gm_g = _mm256_add_ps(_mm256_mul_ps(gm_g, invw), _mm256_mul_ps(fs_g, w));
509
+ gm_b = _mm256_add_ps(_mm256_mul_ps(gm_b, invw), _mm256_mul_ps(fs_b, w));
510
+
511
+ gm_r = nan_to_num_ps(gm_r);
512
+ gm_g = nan_to_num_ps(gm_g);
513
+ gm_b = nan_to_num_ps(gm_b);
514
+
515
+ __m256 fr = _mm256_add_ps(_mm256_mul_ps(gm_r, fta), _mm256_mul_ps(fb_r, fit_a));
516
+ __m256 fg = _mm256_add_ps(_mm256_mul_ps(gm_g, fta), _mm256_mul_ps(fb_g, fit_a));
517
+ __m256 fb = _mm256_add_ps(_mm256_mul_ps(gm_b, fta), _mm256_mul_ps(fb_b, fit_a));
518
+
519
+ store_unit_f32_to_u8_rgb8_avx2(fr, fg, fb, out, i);
520
+ }
521
+
522
+ if (i < pixels) {
523
+ kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 4*i, im_alpha + i,
524
+ out + 3*i, pixels - i);
525
+ }
526
+ }
527
+
528
+ /* ---------- SSE4.2 skeleton (process 4 pixels via manual loads) ---------- */
529
+
530
+ /* 4-lane u8->f32 [0,1] from scalar bytes (works with interleaved strides) */
531
+ static inline __m128 u8x4_to_unit_f32(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
532
+ __m128i vi = _mm_setr_epi32((int)a, (int)b, (int)c, (int)d);
533
+ return _mm_mul_ps(_mm_cvtepi32_ps(vi), _mm_set1_ps(1.0f/255.0f));
534
+ }
535
+
536
+ static inline __m128 load4_u8_to_unit_f32(const uint8_t *p) {
537
+ /* p[0..3] are consecutive bytes (for im_alpha) */
538
+ __m128i v8 = _mm_cvtsi32_si128(*(const int*)p); /* 4 bytes into xmm */
539
+ __m128i v16 = _mm_cvtepu8_epi16(v8); /* widen to 8 x u16, we use low 4 */
540
+ __m128i v32 = _mm_cvtepu16_epi32(v16);
541
+ return _mm_mul_ps(_mm_cvtepi32_ps(v32), _mm_set1_ps(1.0f/255.0f));
542
+ }
543
+
544
+ static inline __m128 clamp01_ps128(__m128 x) {
545
+ return _mm_min_ps(_mm_max_ps(x, _mm_set1_ps(0.0f)), _mm_set1_ps(1.0f));
546
+ }
547
+
548
+ static inline __m128 nan_to_num_ps128(__m128 x) {
549
+ __m128 cmp = _mm_cmpord_ps(x, x); /* 0 for NaN lanes */
550
+ return _mm_blendv_ps(_mm_set1_ps(0.0f), x, cmp);
551
+ }
552
+
553
+
554
+ static void kernel_sse42_rgb(const uint8_t *base, const uint8_t *texture,
555
+ const uint8_t *skin, const uint8_t *im_alpha,
556
+ uint8_t *out, npy_intp pixels) {
557
+ const __m128 half = _mm_set1_ps(0.5f);
558
+ const __m128 one = _mm_set1_ps(1.0f);
559
+ const __m128 w = _mm_set1_ps((float)SKIN_WEIGHT);
560
+ const __m128 invw = _mm_set1_ps(1.0f - (float)SKIN_WEIGHT);
561
+
562
+ npy_intp i = 0;
563
+ for (; i + 4 <= pixels; i += 4) {
564
+ __m128 fb_r = u8x4_to_unit_f32(base[3*(i+0)+0], base[3*(i+1)+0],
565
+ base[3*(i+2)+0], base[3*(i+3)+0]);
566
+ __m128 fb_g = u8x4_to_unit_f32(base[3*(i+0)+1], base[3*(i+1)+1],
567
+ base[3*(i+2)+1], base[3*(i+3)+1]);
568
+ __m128 fb_b = u8x4_to_unit_f32(base[3*(i+0)+2], base[3*(i+1)+2],
569
+ base[3*(i+2)+2], base[3*(i+3)+2]);
570
+
571
+ __m128 ft_r = u8x4_to_unit_f32(texture[3*(i+0)+0], texture[3*(i+1)+0],
572
+ texture[3*(i+2)+0], texture[3*(i+3)+0]);
573
+ __m128 ft_g = u8x4_to_unit_f32(texture[3*(i+0)+1], texture[3*(i+1)+1],
574
+ texture[3*(i+2)+1], texture[3*(i+3)+1]);
575
+ __m128 ft_b = u8x4_to_unit_f32(texture[3*(i+0)+2], texture[3*(i+1)+2],
576
+ texture[3*(i+2)+2], texture[3*(i+3)+2]);
577
+
578
+ __m128 fs_r = u8x4_to_unit_f32(skin[4*(i+0)+0], skin[4*(i+1)+0],
579
+ skin[4*(i+2)+0], skin[4*(i+3)+0]);
580
+ __m128 fs_g = u8x4_to_unit_f32(skin[4*(i+0)+1], skin[4*(i+1)+1],
581
+ skin[4*(i+2)+1], skin[4*(i+3)+1]);
582
+ __m128 fs_b = u8x4_to_unit_f32(skin[4*(i+0)+2], skin[4*(i+1)+2],
583
+ skin[4*(i+2)+2], skin[4*(i+3)+2]);
584
+
585
+ __m128 fa_im = load4_u8_to_unit_f32(im_alpha + i);
586
+ __m128 fit_a = _mm_sub_ps(one, fa_im);
587
+
588
+ __m128 gm_r = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_r, fs_r), half));
589
+ __m128 gm_g = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_g, fs_g), half));
590
+ __m128 gm_b = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_b, fs_b), half));
591
+
592
+ gm_r = _mm_add_ps(_mm_mul_ps(gm_r, fa_im), _mm_mul_ps(ft_r, fit_a));
593
+ gm_g = _mm_add_ps(_mm_mul_ps(gm_g, fa_im), _mm_mul_ps(ft_g, fit_a));
594
+ gm_b = _mm_add_ps(_mm_mul_ps(gm_b, fa_im), _mm_mul_ps(ft_b, fit_a));
595
+
596
+ gm_r = _mm_add_ps(_mm_mul_ps(gm_r, invw), _mm_mul_ps(fs_r, w));
597
+ gm_g = _mm_add_ps(_mm_mul_ps(gm_g, invw), _mm_mul_ps(fs_g, w));
598
+ gm_b = _mm_add_ps(_mm_mul_ps(gm_b, invw), _mm_mul_ps(fs_b, w));
599
+
600
+ gm_r = nan_to_num_ps128(gm_r);
601
+ gm_g = nan_to_num_ps128(gm_g);
602
+ gm_b = nan_to_num_ps128(gm_b);
603
+
604
+ __m128 fr = _mm_add_ps(_mm_mul_ps(gm_r, fa_im), _mm_mul_ps(fb_r, fit_a));
605
+ __m128 fg = _mm_add_ps(_mm_mul_ps(gm_g, fa_im), _mm_mul_ps(fb_g, fit_a));
606
+ __m128 fb = _mm_add_ps(_mm_mul_ps(gm_b, fa_im), _mm_mul_ps(fb_b, fit_a));
607
+
608
+ float rr[4], gg[4], bb[4];
609
+ _mm_storeu_ps(rr, fr);
610
+ _mm_storeu_ps(gg, fg);
611
+ _mm_storeu_ps(bb, fb);
612
+
613
+ for (int k = 0; k < 4; ++k) {
614
+ int r = (int)(rr[k] * 255.0f);
615
+ int g = (int)(gg[k] * 255.0f);
616
+ int b = (int)(bb[k] * 255.0f);
617
+ out[3*(i+k)+0] = (uint8_t)(r < 0 ? 0 : r > 255 ? 255 : r);
618
+ out[3*(i+k)+1] = (uint8_t)(g < 0 ? 0 : g > 255 ? 255 : g);
619
+ out[3*(i+k)+2] = (uint8_t)(b < 0 ? 0 : b > 255 ? 255 : b);
620
+ }
621
+ }
622
+
623
+ if (i < pixels) {
624
+ kernel_scalar_rgb(base + 3*i, texture + 3*i, skin + 4*i, im_alpha + i,
625
+ out + 3*i, pixels - i);
626
+ }
627
+ }
628
+
629
+ static void kernel_sse42_rgba(const uint8_t *base, const uint8_t *texture,
630
+ const uint8_t *skin, const uint8_t *im_alpha,
631
+ uint8_t *out, npy_intp pixels) {
632
+ const __m128 half = _mm_set1_ps(0.5f);
633
+ const __m128 one = _mm_set1_ps(1.0f);
634
+ const __m128 w = _mm_set1_ps((float)SKIN_WEIGHT);
635
+ const __m128 invw = _mm_set1_ps(1.0f - (float)SKIN_WEIGHT);
636
+
637
+ npy_intp i = 0;
638
+ for (; i + 4 <= pixels; i += 4) {
639
+ __m128 fb_r = u8x4_to_unit_f32(base[3*(i+0)+0], base[3*(i+1)+0],
640
+ base[3*(i+2)+0], base[3*(i+3)+0]);
641
+ __m128 fb_g = u8x4_to_unit_f32(base[3*(i+0)+1], base[3*(i+1)+1],
642
+ base[3*(i+2)+1], base[3*(i+3)+1]);
643
+ __m128 fb_b = u8x4_to_unit_f32(base[3*(i+0)+2], base[3*(i+1)+2],
644
+ base[3*(i+2)+2], base[3*(i+3)+2]);
645
+
646
+ __m128 ft_r = u8x4_to_unit_f32(texture[4*(i+0)+0], texture[4*(i+1)+0],
647
+ texture[4*(i+2)+0], texture[4*(i+3)+0]);
648
+ __m128 ft_g = u8x4_to_unit_f32(texture[4*(i+0)+1], texture[4*(i+1)+1],
649
+ texture[4*(i+2)+1], texture[4*(i+3)+1]);
650
+ __m128 ft_b = u8x4_to_unit_f32(texture[4*(i+0)+2], texture[4*(i+1)+2],
651
+ texture[4*(i+2)+2], texture[4*(i+3)+2]);
652
+ __m128 ft_a = u8x4_to_unit_f32(texture[4*(i+0)+3], texture[4*(i+1)+3],
653
+ texture[4*(i+2)+3], texture[4*(i+3)+3]);
654
+
655
+ __m128 fs_r = u8x4_to_unit_f32(skin[4*(i+0)+0], skin[4*(i+1)+0],
656
+ skin[4*(i+2)+0], skin[4*(i+3)+0]);
657
+ __m128 fs_g = u8x4_to_unit_f32(skin[4*(i+0)+1], skin[4*(i+1)+1],
658
+ skin[4*(i+2)+1], skin[4*(i+3)+1]);
659
+ __m128 fs_b = u8x4_to_unit_f32(skin[4*(i+0)+2], skin[4*(i+1)+2],
660
+ skin[4*(i+2)+2], skin[4*(i+3)+2]);
661
+
662
+ __m128 fa_im = load4_u8_to_unit_f32(im_alpha + i);
663
+ __m128 fta = _mm_mul_ps(ft_a, fa_im); /* texture_alpha */
664
+ __m128 fit_a = _mm_sub_ps(one, fta);
665
+
666
+ __m128 gm_r = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_r, fs_r), half));
667
+ __m128 gm_g = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_g, fs_g), half));
668
+ __m128 gm_b = clamp01_ps128(_mm_sub_ps(_mm_add_ps(ft_b, fs_b), half));
669
+
670
+ gm_r = _mm_add_ps(_mm_mul_ps(gm_r, fta), _mm_mul_ps(ft_r, fit_a));
671
+ gm_g = _mm_add_ps(_mm_mul_ps(gm_g, fta), _mm_mul_ps(ft_g, fit_a));
672
+ gm_b = _mm_add_ps(_mm_mul_ps(gm_b, fta), _mm_mul_ps(ft_b, fit_a));
673
+
674
+ gm_r = _mm_add_ps(_mm_mul_ps(gm_r, invw), _mm_mul_ps(fs_r, w));
675
+ gm_g = _mm_add_ps(_mm_mul_ps(gm_g, invw), _mm_mul_ps(fs_g, w));
676
+ gm_b = _mm_add_ps(_mm_mul_ps(gm_b, invw), _mm_mul_ps(fs_b, w));
677
+
678
+ gm_r = nan_to_num_ps128(gm_r);
679
+ gm_g = nan_to_num_ps128(gm_g);
680
+ gm_b = nan_to_num_ps128(gm_b);
681
+
682
+ __m128 fr = _mm_add_ps(_mm_mul_ps(gm_r, fta), _mm_mul_ps(fb_r, fit_a));
683
+ __m128 fg = _mm_add_ps(_mm_mul_ps(gm_g, fta), _mm_mul_ps(fb_g, fit_a));
684
+ __m128 fb = _mm_add_ps(_mm_mul_ps(gm_b, fta), _mm_mul_ps(fb_b, fit_a));
685
+
686
+ float rr[4], gg[4], bb[4];
687
+ _mm_storeu_ps(rr, fr);
688
+ _mm_storeu_ps(gg, fg);
689
+ _mm_storeu_ps(bb, fb);
690
+
691
+ for (int k = 0; k < 4; ++k) {
692
+ int r = (int)(rr[k] * 255.0f);
693
+ int g = (int)(gg[k] * 255.0f);
694
+ int b = (int)(bb[k] * 255.0f);
695
+ out[3*(i+k)+0] = (uint8_t)(r < 0 ? 0 : r > 255 ? 255 : r);
696
+ out[3*(i+k)+1] = (uint8_t)(g < 0 ? 0 : g > 255 ? 255 : g);
697
+ out[3*(i+k)+2] = (uint8_t)(b < 0 ? 0 : b > 255 ? 255 : b);
698
+ }
699
+ }
700
+
701
+ if (i < pixels) {
702
+ kernel_scalar_rgba(base + 3*i, texture + 4*i, skin + 4*i, im_alpha + i,
703
+ out + 3*i, pixels - i);
704
+ }
705
+ }
706
+
707
+
708
+ /* ---------- Kernel dispatch ---------- */
709
+
710
+ static kernel_kind pick_kernel(const char *force_name) {
711
+ if (force_name) {
712
+ if (strcmp(force_name, "scalar") == 0) return KERNEL_SCALAR;
713
+ if (strcmp(force_name, "sse42") == 0) return KERNEL_SSE42;
714
+ if (strcmp(force_name, "avx2") == 0) return KERNEL_AVX2;
715
+ if (strcmp(force_name, "auto") == 0) {/* fall through */}
716
+ }
717
+ /* Auto: prefer AVX2, then SSE4.2, else scalar */
718
+ if (cpu_supports_avx2() && os_supports_avx()) return KERNEL_AVX2;
719
+ if (cpu_supports_sse42()) return KERNEL_SSE42;
720
+ return KERNEL_SCALAR;
721
+ }
722
+
723
+ /* ---------- Python binding ---------- */
724
+
725
+ /* Convert base (H,W,3 or H,W,4) -> packed RGB (H,W,3). Returns a NEW ref.
726
+ If base is already (H,W,3), this returns a new C-contig copy of it (to be safe). */
727
+ static PyArrayObject* ensure_base_rgb(PyArrayObject *base_in, const char *name) {
728
+ if (PyArray_NDIM(base_in) != 3) {
729
+ PyErr_Format(PyExc_ValueError, "%s must have shape (H, W, 3) or (H, W, 4)", name);
730
+ return NULL;
731
+ }
732
+ npy_intp const *dims_in = PyArray_DIMS(base_in);
733
+ npy_intp H = dims_in[0], W = dims_in[1], C = dims_in[2];
734
+ if (!(C == 3 || C == 4)) {
735
+ PyErr_Format(PyExc_ValueError, "%s must have 3 or 4 channels", name);
736
+ return NULL;
737
+ }
738
+
739
+ /* Always produce a fresh C-contiguous uint8 (H,W,3) we own. */
740
+ npy_intp dims_out[3] = {H, W, 3};
741
+ PyArrayObject *base_rgb = (PyArrayObject*)PyArray_SimpleNew(3, dims_out, NPY_UINT8);
742
+ if (!base_rgb) return NULL;
743
+
744
+ const uint8_t *src = (const uint8_t*)PyArray_DATA(base_in);
745
+ uint8_t *dst = (uint8_t*)PyArray_DATA(base_rgb);
746
+ const npy_intp pixels = H * W;
747
+
748
+ if (C == 3) {
749
+ /* Packed copy */
750
+ memcpy(dst, src, (size_t)(pixels * 3));
751
+ return base_rgb;
752
+ }
753
+
754
+ /* C == 4: strip alpha, keep RGB packed */
755
+ for (npy_intp i = 0; i < pixels; ++i) {
756
+ dst[3*i + 0] = src[4*i + 0];
757
+ dst[3*i + 1] = src[4*i + 1];
758
+ dst[3*i + 2] = src[4*i + 2];
759
+ }
760
+ return base_rgb;
761
+ }
762
+
763
+ static PyObject* py_normal_grain_merge(PyObject* self, PyObject* args, PyObject* kwargs) {
764
+ static char *kwlist[] = {"base", "texture", "skin", "im_alpha", "kernel", NULL};
765
+
766
+ PyObject *base_obj = NULL, *texture_obj = NULL, *skin_obj = NULL, *im_alpha_obj = NULL;
767
+ const char *kernel_name = "auto";
768
+
769
+ if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OOOO|s", kwlist,
770
+ &base_obj, &texture_obj, &skin_obj, &im_alpha_obj,
771
+ &kernel_name)) {
772
+ return NULL;
773
+ }
774
+
775
+ /* Materialize arrays we own. Do NOT decref the *_obj borrowed refs. */
776
+ /* Borrowed -> owned, uint8, C-contig (you already have get_uint8_c_contig) */
777
+ PyArrayObject *base_u8 = NULL, *texture = NULL, *skin = NULL, *im_alpha = NULL;
778
+ if (!get_uint8_c_contig(base_obj, &base_u8, "base") ||
779
+ !get_uint8_c_contig(texture_obj, &texture, "texture") ||
780
+ !get_uint8_c_contig(skin_obj, &skin, "skin") ||
781
+ !get_uint8_c_contig(im_alpha_obj, &im_alpha, "im_alpha")) {
782
+ Py_XDECREF(base_u8); Py_XDECREF(texture); Py_XDECREF(skin); Py_XDECREF(im_alpha);
783
+ return NULL;
784
+ }
785
+
786
+ /* If base is RGBA, pack to RGB; if it’s already RGB, make a packed copy */
787
+ PyArrayObject *base = ensure_base_rgb(base_u8, "base");
788
+ if (!base) {
789
+ Py_DECREF(base_u8); Py_DECREF(texture); Py_DECREF(skin); Py_DECREF(im_alpha);
790
+ return NULL;
791
+ }
792
+ Py_DECREF(base_u8); /* drop the intermediate reference, we own `base` now */
793
+
794
+ int texture_has_alpha = 0;
795
+ npy_intp H = 0, W = 0;
796
+ if (!check_shape_requirements(base, texture, skin, im_alpha,
797
+ &texture_has_alpha, &H, &W)) {
798
+ Py_DECREF(base); Py_DECREF(texture); Py_DECREF(skin); Py_DECREF(im_alpha);
799
+ return NULL;
800
+ }
801
+
802
+ /* Allocate output (H, W, 3) uint8 */
803
+ PyObject *out = PyArray_NewLikeArray(base, NPY_ANYORDER, NULL, 0);
804
+ if (!out) {
805
+ Py_XDECREF(base); Py_XDECREF(texture); Py_XDECREF(skin); Py_XDECREF(im_alpha);
806
+ return NULL;
807
+ }
808
+
809
+ const uint8_t *p_base = (const uint8_t*)PyArray_DATA(base);
810
+ const uint8_t *p_texture = (const uint8_t*)PyArray_DATA(texture);
811
+ const uint8_t *p_skin = (const uint8_t*)PyArray_DATA(skin);
812
+ const uint8_t *p_imalpha = (const uint8_t*)PyArray_DATA(im_alpha);
813
+ uint8_t *p_out = (uint8_t*)PyArray_DATA((PyArrayObject*)out);
814
+
815
+ const npy_intp pixels = H * W;
816
+
817
+ kernel_kind k = pick_kernel(kernel_name);
818
+
819
+ /* Optional: release the GIL around pure C loops. No Python API calls inside kernels. */
820
+ NPY_BEGIN_ALLOW_THREADS
821
+
822
+ if (!texture_has_alpha) {
823
+ if (k == KERNEL_AVX2) {
824
+ kernel_avx2_rgb(p_base, p_texture, p_skin, p_imalpha, p_out, pixels);
825
+ } else if (k == KERNEL_SSE42) {
826
+ kernel_sse42_rgb(p_base, p_texture, p_skin, p_imalpha, p_out, pixels);
827
+ } else {
828
+ kernel_scalar_rgb(p_base, p_texture, p_skin, p_imalpha, p_out, pixels);
829
+ }
830
+ } else {
831
+ if (k == KERNEL_AVX2) {
832
+ kernel_avx2_rgba(p_base, p_texture, p_skin, p_imalpha, p_out, pixels);
833
+ } else if (k == KERNEL_SSE42) {
834
+ kernel_sse42_rgba(p_base, p_texture, p_skin, p_imalpha, p_out, pixels);
835
+ } else {
836
+ kernel_scalar_rgba(p_base, p_texture, p_skin, p_imalpha, p_out, pixels);
837
+ }
838
+ }
839
+
840
+ NPY_END_ALLOW_THREADS
841
+
842
+ /* DECREF only what we own. */
843
+ Py_DECREF(base); Py_DECREF(texture); Py_DECREF(skin); Py_DECREF(im_alpha);
844
+ return out;
845
+ }
846
+
847
+ static PyMethodDef Methods[] = {
848
+ {"normal_grain_merge", (PyCFunction)py_normal_grain_merge, METH_VARARGS | METH_KEYWORDS,
849
+ "normal_grain_merge(base, texture, skin, im_alpha, kernel='auto') -> np.ndarray\n"
850
+ "kernel: 'auto', 'scalar', 'sse42', or 'avx2'"},
851
+ {NULL, NULL, 0, NULL}
852
+ };
853
+
854
+ static struct PyModuleDef moduledef = {
855
+ PyModuleDef_HEAD_INIT,
856
+ "normal_grain_merge",
857
+ "Normal Grain Merge Module",
858
+ -1,
859
+ Methods
860
+ };
861
+
862
+ PyMODINIT_FUNC PyInit_normal_grain_merge(void) {
863
+ import_array();
864
+ return PyModule_Create(&moduledef);
865
+ }
@@ -0,0 +1,28 @@
+ # ngm/normal_grain_merge.pyi
+ from typing import Literal
+ import numpy as np
+ from .kernel_kind import KernelKind
+
+ def normal_grain_merge(
+     base: np.ndarray,
+     texture: np.ndarray,
+     skin: np.ndarray,
+     im_alpha: np.ndarray,
+     kernel: Literal["auto", "scalar", "sse42", "avx2"] | KernelKind = "auto",
+ ) -> np.ndarray:
+     """
+     Performs a combined merge: grain merge of skin and texture,
+     then a normal merge of that result on base.
+     Channel ordering doesn't matter as long as it is consistent.
+     :param base: The base RGB image.
+     :param texture: The texture, either RGB or RGBA.
+     :param skin: The RGBA skin cutout.
+     :param im_alpha: The alpha from the cutout.
+     :param kernel: Which kernel to use.
+         The `auto` kernel prefers avx2, then sse4.2, and falls back to scalar, based on runtime CPU support.
+         The `scalar` kernel is a portable implementation that relies on the compiler for SIMD.
+         The `sse42` kernel uses SSE4.2 intrinsics.
+         The `avx2` kernel uses AVX2 intrinsics.
+     :return: RGB np.ndarray.
+     """
+     ...
@@ -0,0 +1,78 @@
+ Metadata-Version: 2.4
+ Name: normal_grain_merge
+ Version: 0.0.0
+ Summary: Fused normal and grain merge C extension
+ Author: Samuel Howard
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/samhaswon/normal_grain_merge
+ Project-URL: Bug Tracker, https://github.com/samhaswon/normal_grain_merge/issues
+ Keywords: image,processing
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: C
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.7
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: numpy<2.3.0
+ Dynamic: license-file
+
+ # normal_grain_merge
+
+ This implements a combined version of the blend modes normal and grain merge.
+ Grain merge is performed on *s* and *t* with the result normal-merged with *b*.
+ Subscripts indicate channels, with alpha (α) channels broadcast to three channels.
+
+ $$
+ (((\mathrm{t_{rgb}} + \mathrm{s_{rgb}} - 0.5) * \mathrm{t_\alpha} + \mathrm{t_{rgb}} * (1 - \mathrm{t_\alpha})) * (1 - 0.3) + \mathrm{s_{rgb}} * 0.3) * \mathrm{t_\alpha} + \mathrm{b_{rgb}} * (1 - \mathrm{t_\alpha})
+ $$
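+
+ For reference, here is a minimal NumPy sketch of the same computation (a hypothetical `reference_merge` helper, not part of the package), assuming float32 arrays scaled to [0, 1], an RGB texture, and a texture alpha `t_alpha` of shape (H, W, 1):
+
+ ```py
+ import numpy as np
+
+ SKIN_WEIGHT = 0.3
+
+ def reference_merge(base, texture, skin, t_alpha):
+     # Grain merge of texture and skin, clipped to [0, 1].
+     gm = np.clip(texture + skin - 0.5, 0.0, 1.0)
+     # Composite the grain merge over the texture by the texture alpha.
+     gm = gm * t_alpha + texture * (1.0 - t_alpha)
+     # Weighted mix with the skin.
+     gm = gm * (1.0 - SKIN_WEIGHT) + skin * SKIN_WEIGHT
+     gm = np.nan_to_num(gm, copy=False)
+     # Normal merge of the result onto the base.
+     return gm * t_alpha + base * (1.0 - t_alpha)
+ ```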
+
+ ## Usage
+ ```py
+ import numpy as np
+ from normal_grain_merge import normal_grain_merge, KernelKind
+
+
+ # Example arrays
+ base = np.zeros((100, 100, 3), dtype=np.uint8)
+ texture = np.zeros((100, 100, 3), dtype=np.uint8)
+ skin = np.zeros((100, 100, 4), dtype=np.uint8)
+ im_alpha = np.zeros((100, 100), dtype=np.uint8)
+
+ result_scalar = normal_grain_merge(base, texture, skin, im_alpha, KernelKind.KERNEL_SCALAR.value)
+ print(result_scalar.shape, result_scalar.dtype)
+ ```
+
+ Three computational kernels are implemented in this module, plus automatic selection; all are exposed through `KernelKind`.
+
+ - `KERNEL_AUTO`: Automatically chooses the kernel, preferring AVX2.
+ - `KERNEL_SCALAR`: Portable scalar implementation.
+ - `KERNEL_SSE42`: SSE4.2 intrinsics kernel. Likely better on AMD CPUs.
+ - `KERNEL_AVX2`: AVX2 intrinsics kernel. Likely better on Intel CPUs.
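+
+ Continuing the example above, a kernel can also be forced by name, either as a plain string or via a `KernelKind` member's value:
+
+ ```py
+ # Let the extension pick the fastest supported kernel at runtime.
+ result_auto = normal_grain_merge(base, texture, skin, im_alpha, kernel="auto")
+ # Force the AVX2 kernel via the enum's string value.
+ result_avx2 = normal_grain_merge(base, texture, skin, im_alpha, KernelKind.KERNEL_AVX2.value)
+ ```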
+
+ ### Parameters
+
+ All input matrices should have the same height and width.
+
+ #### `base`
+
+ RGB or RGBA, dropping the alpha channel if it exists.
+ The base image for application.
+
+ #### `texture`
+
+ RGB or RGBA, applying the alpha if it exists.
+ This is the texture to be applied.
+
+ #### `skin`
+
+ RGBA, the segmented portion of `base` to be textured.
+ The "skin" of the object the texture is to be applied to.
+
+ #### `im_alpha`
+
+ The alpha of parameter `skin`.
+ This is mostly a holdover from the Python implementation to deal with NumPy.
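+
+ Assuming `skin` carries the cutout's alpha in its fourth channel (it is RGBA), `im_alpha` can be taken directly from it:
+
+ ```py
+ # Alpha plane of the RGBA skin cutout, shape (H, W), dtype uint8.
+ im_alpha = skin[..., 3]
+ ```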
+
+ #### `kernel`
+
+ One of `KernelKind`.
@@ -0,0 +1,10 @@
+ normal_grain_merge/__init__.py,sha256=WBZRL-iqnn9JoPOA97-QgwKZamqCBJT1KtuLDv1UT80,87
+ normal_grain_merge/kernel_kind.py,sha256=tDzX9sYcNSRamZ0u4iKHp221TWQMmkgCvjAMreGNdY8,154
+ normal_grain_merge/normal_grain_merge.c,sha256=MABmEaUhQd15sZYPJSiIjk19aW_4JzqhgPoOekaQO24,35271
+ normal_grain_merge/normal_grain_merge.cp312-win_amd64.pyd,sha256=slG7L1tvmVXqqt4K6okpOdlbrMAVyqklwr33Wjlr9Ng,25088
+ normal_grain_merge/normal_grain_merge.pyi,sha256=Tz5RVlNbBqn_MsQ46WikaohEPctHdWsFxK3bloRZl1M,1090
+ normal_grain_merge-0.0.0.dist-info/licenses/LICENSE,sha256=n2zD0bJYpAHV1YwX4d792d2FiIAUkt4wymsuNA55e8E,1069
+ normal_grain_merge-0.0.0.dist-info/METADATA,sha256=lKUWM60R25LLySXCt5XJTYKH63tGG5ju-DPBPVEl0VQ,2525
+ normal_grain_merge-0.0.0.dist-info/WHEEL,sha256=8UP9x9puWI0P1V_d7K2oMTBqfeLNm21CTzZ_Ptr0NXU,101
+ normal_grain_merge-0.0.0.dist-info/top_level.txt,sha256=jfUAUKWrxBshHvZ0xTu3uF5VJsUpbWp5NkxUj8OXqu8,19
+ normal_grain_merge-0.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: false
+ Tag: cp312-cp312-win_amd64
+
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Samuel Howard
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1 @@
+ normal_grain_merge