oil 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1930 @@
1
+ /**
2
+ * Copyright (c) 2014-2019 Timothy Elliott
3
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ * of this software and associated documentation files (the "Software"), to deal
5
+ * in the Software without restriction, including without limitation the rights
6
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ * copies of the Software, and to permit persons to whom the Software is
8
+ * furnished to do so, subject to the following conditions:
9
+ *
10
+ * The above copyright notice and this permission notice shall be included in
11
+ * all copies or substantial portions of the Software.
12
+ *
13
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ * THE SOFTWARE.
20
+ */
21
+
22
#include "oil_resample_internal.h"

#include <immintrin.h>
#include <string.h>
24
+
25
/* Shift a 16-byte-aligned group of 4 floats down by one element:
 * f[0..2] take the old f[1..3] and f[3] becomes 0.0f (all-zero bits). */
void oil_shift_left_f_sse2(float *f)
{
	__m128i *group = (__m128i *)f;

	_mm_store_si128(group, _mm_srli_si128(_mm_load_si128(group), 4));
}
30
+
31
/* Emit one scan line of 8-bit samples from the vertical accumulator,
 * without gamma mapping.
 *
 * sums: len groups of 4 floats, 16-byte aligned; lane 0 of each group is
 *       the finished sample for this scan line.
 * len:  number of output samples.
 * out:  receives len bytes.
 *
 * After each sample is emitted its group is shifted down one float, so
 * the next scan line accumulates into the vacated slot. */
void oil_yscale_out_nonlinear_sse2(float *sums, int len, unsigned char *out)
{
	int i;
	__m128 vals, ab, cd, f0, f1, f2, f3;
	__m128 scale, half, zero, one;
	__m128i idx, v0, v1, v2, v3;

	scale = _mm_set1_ps(255.0f);
	half = _mm_set1_ps(0.5f);
	zero = _mm_setzero_ps();
	one = _mm_set1_ps(1.0f);

	/* vector path: four output samples per iteration */
	for (i=0; i+3<len; i+=4) {
		v0 = _mm_load_si128((__m128i *)sums);
		v1 = _mm_load_si128((__m128i *)(sums + 4));
		v2 = _mm_load_si128((__m128i *)(sums + 8));
		v3 = _mm_load_si128((__m128i *)(sums + 12));

		/* gather lane 0 of all four groups into vals */
		f0 = _mm_castsi128_ps(v0);
		f1 = _mm_castsi128_ps(v1);
		f2 = _mm_castsi128_ps(v2);
		f3 = _mm_castsi128_ps(v3);
		ab = _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(0, 0, 0, 0));
		cd = _mm_shuffle_ps(f2, f3, _MM_SHUFFLE(0, 0, 0, 0));
		vals = _mm_shuffle_ps(ab, cd, _MM_SHUFFLE(2, 0, 2, 0));

		/* clamp to [0, 1], then scale to 0..255 with round-half-up */
		vals = _mm_min_ps(_mm_max_ps(vals, zero), one);
		idx = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(vals, scale), half));

		out[i] = _mm_cvtsi128_si32(idx);
		out[i+1] = _mm_cvtsi128_si32(_mm_srli_si128(idx, 4));
		out[i+2] = _mm_cvtsi128_si32(_mm_srli_si128(idx, 8));
		out[i+3] = _mm_cvtsi128_si32(_mm_srli_si128(idx, 12));

		/* shift each group left one float for the next scan line */
		_mm_store_si128((__m128i *)sums, _mm_srli_si128(v0, 4));
		_mm_store_si128((__m128i *)(sums + 4), _mm_srli_si128(v1, 4));
		_mm_store_si128((__m128i *)(sums + 8), _mm_srli_si128(v2, 4));
		_mm_store_si128((__m128i *)(sums + 12), _mm_srli_si128(v3, 4));

		sums += 16;
	}

	/* scalar tail for the last 0-3 samples */
	for (; i<len; i++) {
		float v = *sums;
		if (v > 1.0f) v = 1.0f;
		else if (v < 0.0f) v = 0.0f;
		out[i] = (int)(v * 255.0f + 0.5f);
		oil_shift_left_f_sse2(sums);
		sums += 4;
	}
}
82
+
83
/* Emit one scan line of 8-bit samples from the vertical accumulator,
 * mapping linear-light values back through the l2s_map table.
 * Accumulator layout is the same as oil_yscale_out_nonlinear_sse2:
 * len aligned groups of 4 floats, finished sample in lane 0, groups
 * shifted down one float after use.
 *
 * NOTE(review): the sums are not clamped before indexing l2s_map, so an
 * out-of-range value would index the table out of bounds — this assumes
 * upstream stages keep sums within [0, 1]; confirm. */
void oil_yscale_out_linear_sse2(float *sums, int len, unsigned char *out)
{
	int i;
	__m128 scale, vals, ab, cd, f0, f1, f2, f3;
	__m128i idx, v0, v1, v2, v3;
	unsigned char *lut;

	lut = l2s_map;
	scale = _mm_set1_ps((float)(l2s_len - 1));

	/* vector path: four output samples per iteration */
	for (i=0; i+3<len; i+=4) {
		v0 = _mm_load_si128((__m128i *)sums);
		v1 = _mm_load_si128((__m128i *)(sums + 4));
		v2 = _mm_load_si128((__m128i *)(sums + 8));
		v3 = _mm_load_si128((__m128i *)(sums + 12));

		/* gather lane 0 of all four groups into vals */
		f0 = _mm_castsi128_ps(v0);
		f1 = _mm_castsi128_ps(v1);
		f2 = _mm_castsi128_ps(v2);
		f3 = _mm_castsi128_ps(v3);
		ab = _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(0, 0, 0, 0));
		cd = _mm_shuffle_ps(f2, f3, _MM_SHUFFLE(0, 0, 0, 0));
		vals = _mm_shuffle_ps(ab, cd, _MM_SHUFFLE(2, 0, 2, 0));

		/* scale to table indices and truncate */
		idx = _mm_cvttps_epi32(_mm_mul_ps(vals, scale));

		out[i] = lut[_mm_cvtsi128_si32(idx)];
		out[i+1] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
		out[i+2] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
		out[i+3] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 12))];

		/* shift each group left one float for the next scan line */
		_mm_store_si128((__m128i *)sums, _mm_srli_si128(v0, 4));
		_mm_store_si128((__m128i *)(sums + 4), _mm_srli_si128(v1, 4));
		_mm_store_si128((__m128i *)(sums + 8), _mm_srli_si128(v2, 4));
		_mm_store_si128((__m128i *)(sums + 12), _mm_srli_si128(v3, 4));

		sums += 16;
	}

	/* scalar tail for the last 0-3 samples */
	for (; i<len; i++) {
		out[i] = lut[(int)(*sums * (l2s_len - 1))];
		oil_shift_left_f_sse2(sums);
		sums += 4;
	}
}
128
+
129
/* Emit one scan line of gray+alpha byte pairs from the vertical
 * accumulator.  Each pixel occupies two aligned 4-float groups (gray,
 * then alpha); gray is stored premultiplied by alpha and is divided
 * back out before quantization.  Both groups are shifted down one float
 * after the pixel is emitted. */
void oil_yscale_out_ga_sse2(float *sums, int width, unsigned char *out)
{
	int i;
	__m128i v0, v1;
	float gray, alpha;

	for (i=0; i<width; i++) {
		v0 = _mm_load_si128((__m128i *)sums);
		v1 = _mm_load_si128((__m128i *)(sums + 4));

		/* lane 0 of the second group is the finished alpha */
		alpha = _mm_cvtss_f32(_mm_castsi128_ps(v1));
		if (alpha > 1.0f) alpha = 1.0f;
		else if (alpha < 0.0f) alpha = 0.0f;

		/* un-premultiply gray; skip the divide when alpha is zero
		 * (gray is clamped afterwards either way) */
		gray = _mm_cvtss_f32(_mm_castsi128_ps(v0));
		if (alpha != 0) {
			gray /= alpha;
		}
		if (gray > 1.0f) gray = 1.0f;
		else if (gray < 0.0f) gray = 0.0f;

		/* quantize with round-half-up */
		out[0] = (int)(gray * 255.0f + 0.5f);
		out[1] = (int)(alpha * 255.0f + 0.5f);

		/* shift both groups left one float for the next scan line */
		_mm_store_si128((__m128i *)sums, _mm_srli_si128(v0, 4));
		_mm_store_si128((__m128i *)(sums + 4), _mm_srli_si128(v1, 4));

		sums += 8;
		out += 2;
	}
}
160
+
161
/* Emit one scan line of RGBX pixels from the vertical accumulator.
 * Each pixel occupies three aligned 4-float groups (R, G, B); linear
 * values are mapped through l2s_map and the X byte is forced to 255.
 *
 * NOTE(review): no clamp before the table lookup — assumes the sums lie
 * in [0, 1] so the computed index stays inside l2s_map; confirm. */
void oil_yscale_out_rgbx_sse2(float *sums, int width, unsigned char *out)
{
	int i;
	__m128 scale, vals, ab, cd, f0, f1, f2;
	__m128i idx, v0, v1, v2;
	unsigned char *lut;

	lut = l2s_map;
	scale = _mm_set1_ps((float)(l2s_len - 1));

	for (i=0; i<width; i++) {
		v0 = _mm_load_si128((__m128i *)sums);
		v1 = _mm_load_si128((__m128i *)(sums + 4));
		v2 = _mm_load_si128((__m128i *)(sums + 8));

		/* gather lane 0 of the R, G and B groups into vals */
		f0 = _mm_castsi128_ps(v0);
		f1 = _mm_castsi128_ps(v1);
		f2 = _mm_castsi128_ps(v2);
		ab = _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(0, 0, 0, 0));
		cd = _mm_shuffle_ps(f2, f2, _MM_SHUFFLE(0, 0, 0, 0));
		vals = _mm_shuffle_ps(ab, cd, _MM_SHUFFLE(2, 0, 2, 0));

		/* scale to table indices and truncate */
		idx = _mm_cvttps_epi32(_mm_mul_ps(vals, scale));

		out[0] = lut[_mm_cvtsi128_si32(idx)];
		out[1] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
		out[2] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
		out[3] = 255;

		/* shift the three groups left one float for the next scan line */
		_mm_store_si128((__m128i *)sums, _mm_srli_si128(v0, 4));
		_mm_store_si128((__m128i *)(sums + 4), _mm_srli_si128(v1, 4));
		_mm_store_si128((__m128i *)(sums + 8), _mm_srli_si128(v2, 4));

		sums += 12;
		out += 4;
	}
}
198
+
199
+ void oil_xscale_up_g_sse2(unsigned char *in, int width_in, float *out,
200
+ float *coeff_buf, int *border_buf)
201
+ {
202
+ int i, j;
203
+ __m128 smp, newval, hi, coeffs, prod, t1, t2;
204
+
205
+ smp = _mm_setzero_ps();
206
+
207
+ for (i=0; i<width_in; i++) {
208
+ /* push_f: shift left, insert new value at position 3 */
209
+ smp = (__m128)_mm_srli_si128((__m128i)smp, 4);
210
+ newval = _mm_set_ss(i2f_map[in[i]]);
211
+ hi = _mm_shuffle_ps(smp, newval, _MM_SHUFFLE(0, 0, 3, 2));
212
+ smp = _mm_shuffle_ps(smp, hi, _MM_SHUFFLE(2, 0, 1, 0));
213
+
214
+ j = border_buf[i];
215
+
216
+ /* process pairs of outputs */
217
+ while (j >= 2) {
218
+ __m128 c0 = _mm_load_ps(coeff_buf);
219
+ __m128 c1 = _mm_load_ps(coeff_buf + 4);
220
+ __m128 p0 = _mm_mul_ps(smp, c0);
221
+ __m128 p1 = _mm_mul_ps(smp, c1);
222
+ __m128 lo = _mm_unpacklo_ps(p0, p1);
223
+ __m128 hh = _mm_unpackhi_ps(p0, p1);
224
+ __m128 sum = _mm_add_ps(lo, hh);
225
+ t1 = _mm_movehl_ps(sum, sum);
226
+ t2 = _mm_add_ps(sum, t1);
227
+ out[0] = _mm_cvtss_f32(t2);
228
+ out[1] = _mm_cvtss_f32(
229
+ _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1)));
230
+ out += 2;
231
+ coeff_buf += 8;
232
+ j -= 2;
233
+ }
234
+
235
+ /* process remaining single output */
236
+ if (j) {
237
+ coeffs = _mm_load_ps(coeff_buf);
238
+ prod = _mm_mul_ps(smp, coeffs);
239
+ t1 = _mm_movehl_ps(prod, prod);
240
+ t2 = _mm_add_ps(prod, t1);
241
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
242
+ t2 = _mm_add_ss(t2, prod);
243
+ out[0] = _mm_cvtss_f32(t2);
244
+ out += 1;
245
+ coeff_buf += 4;
246
+ }
247
+ }
248
+ }
249
+
250
+ void oil_xscale_up_ga_sse2(unsigned char *in, int width_in, float *out,
251
+ float *coeff_buf, int *border_buf)
252
+ {
253
+ int i, j;
254
+ __m128 smp_g, smp_a, newval, hi;
255
+
256
+ smp_g = _mm_setzero_ps();
257
+ smp_a = _mm_setzero_ps();
258
+
259
+ for (i=0; i<width_in; i++) {
260
+ /* push_f for alpha: shift left, insert new alpha at position 3 */
261
+ float alpha_new = in[1] / 255.0f;
262
+ smp_a = (__m128)_mm_srli_si128((__m128i)smp_a, 4);
263
+ newval = _mm_set_ss(alpha_new);
264
+ hi = _mm_shuffle_ps(smp_a, newval, _MM_SHUFFLE(0, 0, 3, 2));
265
+ smp_a = _mm_shuffle_ps(smp_a, hi, _MM_SHUFFLE(2, 0, 1, 0));
266
+
267
+ /* push_f for gray: premultiplied by new alpha */
268
+ smp_g = (__m128)_mm_srli_si128((__m128i)smp_g, 4);
269
+ newval = _mm_set_ss(alpha_new * i2f_map[in[0]]);
270
+ hi = _mm_shuffle_ps(smp_g, newval, _MM_SHUFFLE(0, 0, 3, 2));
271
+ smp_g = _mm_shuffle_ps(smp_g, hi, _MM_SHUFFLE(2, 0, 1, 0));
272
+
273
+ j = border_buf[i];
274
+
275
+ /* process pairs of outputs */
276
+ while (j >= 2) {
277
+ __m128 c0 = _mm_load_ps(coeff_buf);
278
+ __m128 c1 = _mm_load_ps(coeff_buf + 4);
279
+
280
+ /* gray dot products for 2 outputs */
281
+ __m128 pg0 = _mm_mul_ps(smp_g, c0);
282
+ __m128 pg1 = _mm_mul_ps(smp_g, c1);
283
+ __m128 lo = _mm_unpacklo_ps(pg0, pg1);
284
+ __m128 hh = _mm_unpackhi_ps(pg0, pg1);
285
+ __m128 sum_g = _mm_add_ps(lo, hh);
286
+ __m128 t1 = _mm_movehl_ps(sum_g, sum_g);
287
+ __m128 t2_g = _mm_add_ps(sum_g, t1);
288
+
289
+ /* alpha dot products for 2 outputs */
290
+ __m128 pa0 = _mm_mul_ps(smp_a, c0);
291
+ __m128 pa1 = _mm_mul_ps(smp_a, c1);
292
+ lo = _mm_unpacklo_ps(pa0, pa1);
293
+ hh = _mm_unpackhi_ps(pa0, pa1);
294
+ __m128 sum_a = _mm_add_ps(lo, hh);
295
+ t1 = _mm_movehl_ps(sum_a, sum_a);
296
+ __m128 t2_a = _mm_add_ps(sum_a, t1);
297
+
298
+ /* interleave: [gray0, alpha0, gray1, alpha1] */
299
+ _mm_storeu_ps(out, _mm_unpacklo_ps(t2_g, t2_a));
300
+ out += 4;
301
+ coeff_buf += 8;
302
+ j -= 2;
303
+ }
304
+
305
+ /* process remaining single output */
306
+ if (j) {
307
+ __m128 coeffs = _mm_load_ps(coeff_buf);
308
+
309
+ __m128 prod_g = _mm_mul_ps(smp_g, coeffs);
310
+ __m128 t1 = _mm_movehl_ps(prod_g, prod_g);
311
+ __m128 t2 = _mm_add_ps(prod_g, t1);
312
+ prod_g = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
313
+ t2 = _mm_add_ss(t2, prod_g);
314
+ out[0] = _mm_cvtss_f32(t2);
315
+
316
+ __m128 prod_a = _mm_mul_ps(smp_a, coeffs);
317
+ t1 = _mm_movehl_ps(prod_a, prod_a);
318
+ t2 = _mm_add_ps(prod_a, t1);
319
+ prod_a = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
320
+ t2 = _mm_add_ss(t2, prod_a);
321
+ out[1] = _mm_cvtss_f32(t2);
322
+
323
+ out += 2;
324
+ coeff_buf += 4;
325
+ }
326
+
327
+ in += 2;
328
+ }
329
+ }
330
+
331
/* Helper: convert one vector of two GA pixels [g0, a0, g1, a1] (gray
 * premultiplied) into rounded 0..255 values as four 32-bit lanes:
 * clamp alpha, divide gray by alpha (substituting 1.0 where alpha is
 * zero so the division is safe), clamp gray, then scale with
 * round-half-up. */
static __m128i oil_ga_px_to_bytes(__m128 sum, __m128 zero, __m128 one,
	__m128 scale, __m128 half, __m128 blend_mask)
{
	__m128 alpha_spread, nz_mask, safe_alpha, divided, gray_clamped, result;

	/* spread alpha (lanes 1, 3) across its gray lane as well */
	alpha_spread = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1));
	alpha_spread = _mm_min_ps(_mm_max_ps(alpha_spread, zero), one);
	nz_mask = _mm_cmpneq_ps(alpha_spread, zero);
	safe_alpha = _mm_or_ps(
		_mm_and_ps(nz_mask, alpha_spread),
		_mm_andnot_ps(nz_mask, one));
	divided = _mm_div_ps(sum, safe_alpha);
	gray_clamped = _mm_min_ps(_mm_max_ps(divided, zero), one);
	/* gray lanes from the unpremultiplied values, alpha lanes as-is */
	result = _mm_or_ps(
		_mm_andnot_ps(blend_mask, gray_clamped),
		_mm_and_ps(blend_mask, alpha_spread));
	return _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(result, scale), half));
}

/* Vertically scale one gray+alpha scan line: 4-tap weighted sum of four
 * source rows (interleaved [gray, alpha] floats, gray premultiplied),
 * un-premultiplied, clamped and quantized to interleaved bytes.
 *
 * in:     four source rows of len floats each.
 * len:    number of floats (2 per pixel).
 * coeffs: the 4 row weights.
 * out:    receives len bytes.
 *
 * Fix: the 2-pixel path previously stored 4 bytes with
 * *(int *)(out + i) = ..., an unaligned write through an incompatible
 * pointer type (strict-aliasing UB); it now uses memcpy, which compiles
 * to the same single store. */
void oil_yscale_up_ga_sse2(float **in, int len, float *coeffs,
	unsigned char *out)
{
	int i;
	__m128 c0, c1, c2, c3;
	__m128 v0, v1, v2, v3, sum, sum2;
	__m128 scale, half, zero, one;
	__m128 blend_mask;
	__m128i idx;

	c0 = _mm_set1_ps(coeffs[0]);
	c1 = _mm_set1_ps(coeffs[1]);
	c2 = _mm_set1_ps(coeffs[2]);
	c3 = _mm_set1_ps(coeffs[3]);
	scale = _mm_set1_ps(255.0f);
	half = _mm_set1_ps(0.5f);
	zero = _mm_setzero_ps();
	one = _mm_set1_ps(1.0f);
	/* mask: 0 for gray positions (0,2), all-ones for alpha positions (1,3) */
	blend_mask = _mm_castsi128_ps(_mm_set_epi32(-1, 0, -1, 0));

	/* Process 4 GA pixels (8 floats) at a time */
	for (i=0; i+7<len; i+=8) {
		__m128i idx2;

		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));

		v0 = _mm_loadu_ps(in[0] + i + 4);
		v1 = _mm_loadu_ps(in[1] + i + 4);
		v2 = _mm_loadu_ps(in[2] + i + 4);
		v3 = _mm_loadu_ps(in[3] + i + 4);
		sum2 = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));

		/* sum = [g0, a0, g1, a1], sum2 = [g2, a2, g3, a3] */
		idx = oil_ga_px_to_bytes(sum, zero, one, scale, half, blend_mask);
		idx2 = oil_ga_px_to_bytes(sum2, zero, one, scale, half, blend_mask);

		/* Pack 8 ints -> 8 bytes */
		idx = _mm_packs_epi32(idx, idx2);
		idx = _mm_packus_epi16(idx, idx);
		_mm_storel_epi64((__m128i *)(out + i), idx);
	}

	/* Process 2 GA pixels (4 floats) at a time */
	for (; i+3<len; i+=4) {
		int packed;

		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));

		idx = oil_ga_px_to_bytes(sum, zero, one, scale, half, blend_mask);
		idx = _mm_packs_epi32(idx, idx);
		idx = _mm_packus_epi16(idx, idx);
		packed = _mm_cvtsi128_si32(idx);
		memcpy(out + i, &packed, sizeof(packed));
	}

	/* Scalar tail for a final single GA pixel */
	for (; i<len; i+=2) {
		float gray, alpha_f;
		gray = coeffs[0] * in[0][i] + coeffs[1] * in[1][i] +
			coeffs[2] * in[2][i] + coeffs[3] * in[3][i];
		alpha_f = coeffs[0] * in[0][i+1] + coeffs[1] * in[1][i+1] +
			coeffs[2] * in[2][i+1] + coeffs[3] * in[3][i+1];
		if (alpha_f > 1.0f) alpha_f = 1.0f;
		else if (alpha_f < 0.0f) alpha_f = 0.0f;
		if (alpha_f != 0) gray /= alpha_f;
		if (gray > 1.0f) gray = 1.0f;
		else if (gray < 0.0f) gray = 0.0f;
		out[i] = (int)(gray * 255.0f + 0.5f);
		out[i+1] = (int)(alpha_f * 255.0f + 0.5f);
	}
}
451
+
452
/* Vertically scale one RGB scan line: 4-tap weighted sum of four source
 * rows of linear floats, mapped to sRGB bytes through l2s_map.
 *
 * in:     four source rows of len floats each.
 * len:    number of samples.
 * coeffs: the 4 row weights.
 * out:    receives len bytes.
 *
 * NOTE(review): the weighted sums are not clamped before indexing
 * l2s_map — assumes they stay within [0, 1]; confirm upstream. */
void oil_yscale_up_rgb_sse2(float **in, int len, float *coeffs,
	unsigned char *out)
{
	int i;
	__m128 c0, c1, c2, c3;
	__m128 v0, v1, v2, v3, sum;
	__m128 scale;
	__m128i idx;
	unsigned char *lut;

	c0 = _mm_set1_ps(coeffs[0]);
	c1 = _mm_set1_ps(coeffs[1]);
	c2 = _mm_set1_ps(coeffs[2]);
	c3 = _mm_set1_ps(coeffs[3]);
	lut = l2s_map;
	scale = _mm_set1_ps((float)(l2s_len - 1));

	/* 8 samples per iteration (two 4-wide sums) */
	for (i=0; i+7<len; i+=8) {
		__m128i idx2;
		__m128 sum2;

		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		idx = _mm_cvttps_epi32(_mm_mul_ps(sum, scale));

		v0 = _mm_loadu_ps(in[0] + i + 4);
		v1 = _mm_loadu_ps(in[1] + i + 4);
		v2 = _mm_loadu_ps(in[2] + i + 4);
		v3 = _mm_loadu_ps(in[3] + i + 4);
		sum2 = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		idx2 = _mm_cvttps_epi32(_mm_mul_ps(sum2, scale));

		out[i] = lut[_mm_cvtsi128_si32(idx)];
		out[i+1] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
		out[i+2] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
		out[i+3] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 12))];
		out[i+4] = lut[_mm_cvtsi128_si32(idx2)];
		out[i+5] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx2, 4))];
		out[i+6] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx2, 8))];
		out[i+7] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx2, 12))];
	}

	/* 4 samples per iteration */
	for (; i+3<len; i+=4) {
		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		idx = _mm_cvttps_epi32(_mm_mul_ps(sum, scale));
		out[i] = lut[_mm_cvtsi128_si32(idx)];
		out[i+1] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
		out[i+2] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
		out[i+3] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 12))];
	}

	/* scalar tail */
	for (; i<len; i++) {
		out[i] = lut[(int)(
			(coeffs[0] * in[0][i] + coeffs[1] * in[1][i] +
			coeffs[2] * in[2][i] + coeffs[3] * in[3][i]) * (l2s_len - 1))];
	}
}
522
+
523
/* Vertically scale one RGBX scan line: 4-tap weighted sum of four
 * source rows; R, G, B map through l2s_map and the X byte is forced to
 * 255 (the weighted X lane is computed but discarded).
 *
 * NOTE(review): there is no tail loop after the 4-wide loop — assumes
 * len is a multiple of 4 (one group of 4 floats per RGBX pixel), and
 * that sums lie in [0, 1] so lut indexing stays in bounds; confirm. */
void oil_yscale_up_rgbx_sse2(float **in, int len, float *coeffs,
	unsigned char *out)
{
	int i;
	__m128 c0, c1, c2, c3;
	__m128 v0, v1, v2, v3, sum;
	__m128 scale;
	__m128i idx;
	unsigned char *lut;

	c0 = _mm_set1_ps(coeffs[0]);
	c1 = _mm_set1_ps(coeffs[1]);
	c2 = _mm_set1_ps(coeffs[2]);
	c3 = _mm_set1_ps(coeffs[3]);
	lut = l2s_map;
	scale = _mm_set1_ps((float)(l2s_len - 1));

	/* two RGBX pixels per iteration */
	for (i=0; i+7<len; i+=8) {
		/* Pixel 0: 4 floats [R, G, B, X] */
		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		idx = _mm_cvttps_epi32(_mm_mul_ps(sum, scale));

		out[i] = lut[_mm_cvtsi128_si32(idx)];
		out[i+1] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
		out[i+2] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
		out[i+3] = 255;

		/* Pixel 1: next 4 floats */
		v0 = _mm_loadu_ps(in[0] + i + 4);
		v1 = _mm_loadu_ps(in[1] + i + 4);
		v2 = _mm_loadu_ps(in[2] + i + 4);
		v3 = _mm_loadu_ps(in[3] + i + 4);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		idx = _mm_cvttps_epi32(_mm_mul_ps(sum, scale));

		out[i+4] = lut[_mm_cvtsi128_si32(idx)];
		out[i+5] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
		out[i+6] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
		out[i+7] = 255;
	}

	/* final single RGBX pixel */
	for (; i+3<len; i+=4) {
		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		idx = _mm_cvttps_epi32(_mm_mul_ps(sum, scale));

		out[i] = lut[_mm_cvtsi128_si32(idx)];
		out[i+1] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
		out[i+2] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
		out[i+3] = 255;
	}
}
588
+
589
+ void oil_xscale_up_rgb_sse2(unsigned char *in, int width_in, float *out,
590
+ float *coeff_buf, int *border_buf)
591
+ {
592
+ int i, j;
593
+ __m128 smp_r, smp_g, smp_b, newval, hi;
594
+
595
+ smp_r = _mm_setzero_ps();
596
+ smp_g = _mm_setzero_ps();
597
+ smp_b = _mm_setzero_ps();
598
+
599
+ for (i=0; i<width_in; i++) {
600
+ /* push_f for R: shift left, insert new value at position 3 */
601
+ smp_r = (__m128)_mm_srli_si128((__m128i)smp_r, 4);
602
+ newval = _mm_set_ss(s2l_map[in[0]]);
603
+ hi = _mm_shuffle_ps(smp_r, newval, _MM_SHUFFLE(0, 0, 3, 2));
604
+ smp_r = _mm_shuffle_ps(smp_r, hi, _MM_SHUFFLE(2, 0, 1, 0));
605
+
606
+ /* push_f for G */
607
+ smp_g = (__m128)_mm_srli_si128((__m128i)smp_g, 4);
608
+ newval = _mm_set_ss(s2l_map[in[1]]);
609
+ hi = _mm_shuffle_ps(smp_g, newval, _MM_SHUFFLE(0, 0, 3, 2));
610
+ smp_g = _mm_shuffle_ps(smp_g, hi, _MM_SHUFFLE(2, 0, 1, 0));
611
+
612
+ /* push_f for B */
613
+ smp_b = (__m128)_mm_srli_si128((__m128i)smp_b, 4);
614
+ newval = _mm_set_ss(s2l_map[in[2]]);
615
+ hi = _mm_shuffle_ps(smp_b, newval, _MM_SHUFFLE(0, 0, 3, 2));
616
+ smp_b = _mm_shuffle_ps(smp_b, hi, _MM_SHUFFLE(2, 0, 1, 0));
617
+
618
+ j = border_buf[i];
619
+
620
+ /* process pairs of outputs */
621
+ while (j >= 2) {
622
+ __m128 c0 = _mm_load_ps(coeff_buf);
623
+ __m128 c1 = _mm_load_ps(coeff_buf + 4);
624
+
625
+ /* R dot products for 2 outputs */
626
+ __m128 pr0 = _mm_mul_ps(smp_r, c0);
627
+ __m128 pr1 = _mm_mul_ps(smp_r, c1);
628
+ __m128 lo = _mm_unpacklo_ps(pr0, pr1);
629
+ __m128 hh = _mm_unpackhi_ps(pr0, pr1);
630
+ __m128 sum = _mm_add_ps(lo, hh);
631
+ __m128 t1 = _mm_movehl_ps(sum, sum);
632
+ __m128 t2_r = _mm_add_ps(sum, t1);
633
+
634
+ /* G dot products for 2 outputs */
635
+ __m128 pg0 = _mm_mul_ps(smp_g, c0);
636
+ __m128 pg1 = _mm_mul_ps(smp_g, c1);
637
+ lo = _mm_unpacklo_ps(pg0, pg1);
638
+ hh = _mm_unpackhi_ps(pg0, pg1);
639
+ sum = _mm_add_ps(lo, hh);
640
+ t1 = _mm_movehl_ps(sum, sum);
641
+ __m128 t2_g = _mm_add_ps(sum, t1);
642
+
643
+ /* B dot products for 2 outputs */
644
+ __m128 pb0 = _mm_mul_ps(smp_b, c0);
645
+ __m128 pb1 = _mm_mul_ps(smp_b, c1);
646
+ lo = _mm_unpacklo_ps(pb0, pb1);
647
+ hh = _mm_unpackhi_ps(pb0, pb1);
648
+ sum = _mm_add_ps(lo, hh);
649
+ t1 = _mm_movehl_ps(sum, sum);
650
+ __m128 t2_b = _mm_add_ps(sum, t1);
651
+
652
+ /* Store interleaved: [R0, G0, B0, R1, G1, B1] */
653
+ out[0] = _mm_cvtss_f32(t2_r);
654
+ out[1] = _mm_cvtss_f32(t2_g);
655
+ out[2] = _mm_cvtss_f32(t2_b);
656
+ out[3] = _mm_cvtss_f32(
657
+ _mm_shuffle_ps(t2_r, t2_r, _MM_SHUFFLE(1,1,1,1)));
658
+ out[4] = _mm_cvtss_f32(
659
+ _mm_shuffle_ps(t2_g, t2_g, _MM_SHUFFLE(1,1,1,1)));
660
+ out[5] = _mm_cvtss_f32(
661
+ _mm_shuffle_ps(t2_b, t2_b, _MM_SHUFFLE(1,1,1,1)));
662
+
663
+ out += 6;
664
+ coeff_buf += 8;
665
+ j -= 2;
666
+ }
667
+
668
+ /* process remaining single output */
669
+ if (j) {
670
+ __m128 coeffs = _mm_load_ps(coeff_buf);
671
+
672
+ __m128 prod = _mm_mul_ps(smp_r, coeffs);
673
+ __m128 t1 = _mm_movehl_ps(prod, prod);
674
+ __m128 t2 = _mm_add_ps(prod, t1);
675
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
676
+ t2 = _mm_add_ss(t2, prod);
677
+ out[0] = _mm_cvtss_f32(t2);
678
+
679
+ prod = _mm_mul_ps(smp_g, coeffs);
680
+ t1 = _mm_movehl_ps(prod, prod);
681
+ t2 = _mm_add_ps(prod, t1);
682
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
683
+ t2 = _mm_add_ss(t2, prod);
684
+ out[1] = _mm_cvtss_f32(t2);
685
+
686
+ prod = _mm_mul_ps(smp_b, coeffs);
687
+ t1 = _mm_movehl_ps(prod, prod);
688
+ t2 = _mm_add_ps(prod, t1);
689
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
690
+ t2 = _mm_add_ss(t2, prod);
691
+ out[2] = _mm_cvtss_f32(t2);
692
+
693
+ out += 3;
694
+ coeff_buf += 4;
695
+ }
696
+
697
+ in += 3;
698
+ }
699
+ }
700
+
701
+ void oil_xscale_up_rgbx_sse2(unsigned char *in, int width_in, float *out,
702
+ float *coeff_buf, int *border_buf)
703
+ {
704
+ int i, j;
705
+ __m128 smp_r, smp_g, smp_b, smp_x, newval, hi;
706
+
707
+ smp_r = _mm_setzero_ps();
708
+ smp_g = _mm_setzero_ps();
709
+ smp_b = _mm_setzero_ps();
710
+ smp_x = _mm_setzero_ps();
711
+
712
+ for (i=0; i<width_in; i++) {
713
+ /* push_f for R */
714
+ smp_r = (__m128)_mm_srli_si128((__m128i)smp_r, 4);
715
+ newval = _mm_set_ss(s2l_map[in[0]]);
716
+ hi = _mm_shuffle_ps(smp_r, newval, _MM_SHUFFLE(0, 0, 3, 2));
717
+ smp_r = _mm_shuffle_ps(smp_r, hi, _MM_SHUFFLE(2, 0, 1, 0));
718
+
719
+ /* push_f for G */
720
+ smp_g = (__m128)_mm_srli_si128((__m128i)smp_g, 4);
721
+ newval = _mm_set_ss(s2l_map[in[1]]);
722
+ hi = _mm_shuffle_ps(smp_g, newval, _MM_SHUFFLE(0, 0, 3, 2));
723
+ smp_g = _mm_shuffle_ps(smp_g, hi, _MM_SHUFFLE(2, 0, 1, 0));
724
+
725
+ /* push_f for B */
726
+ smp_b = (__m128)_mm_srli_si128((__m128i)smp_b, 4);
727
+ newval = _mm_set_ss(s2l_map[in[2]]);
728
+ hi = _mm_shuffle_ps(smp_b, newval, _MM_SHUFFLE(0, 0, 3, 2));
729
+ smp_b = _mm_shuffle_ps(smp_b, hi, _MM_SHUFFLE(2, 0, 1, 0));
730
+
731
+ /* push_f for X (always 1.0f) */
732
+ smp_x = (__m128)_mm_srli_si128((__m128i)smp_x, 4);
733
+ newval = _mm_set_ss(1.0f);
734
+ hi = _mm_shuffle_ps(smp_x, newval, _MM_SHUFFLE(0, 0, 3, 2));
735
+ smp_x = _mm_shuffle_ps(smp_x, hi, _MM_SHUFFLE(2, 0, 1, 0));
736
+
737
+ j = border_buf[i];
738
+
739
+ /* process pairs of outputs */
740
+ while (j >= 2) {
741
+ __m128 c0 = _mm_load_ps(coeff_buf);
742
+ __m128 c1 = _mm_load_ps(coeff_buf + 4);
743
+
744
+ /* R dot products for 2 outputs */
745
+ __m128 pr0 = _mm_mul_ps(smp_r, c0);
746
+ __m128 pr1 = _mm_mul_ps(smp_r, c1);
747
+ __m128 lo = _mm_unpacklo_ps(pr0, pr1);
748
+ __m128 hh = _mm_unpackhi_ps(pr0, pr1);
749
+ __m128 sum = _mm_add_ps(lo, hh);
750
+ __m128 t1 = _mm_movehl_ps(sum, sum);
751
+ __m128 t2_r = _mm_add_ps(sum, t1);
752
+
753
+ /* G dot products for 2 outputs */
754
+ __m128 pg0 = _mm_mul_ps(smp_g, c0);
755
+ __m128 pg1 = _mm_mul_ps(smp_g, c1);
756
+ lo = _mm_unpacklo_ps(pg0, pg1);
757
+ hh = _mm_unpackhi_ps(pg0, pg1);
758
+ sum = _mm_add_ps(lo, hh);
759
+ t1 = _mm_movehl_ps(sum, sum);
760
+ __m128 t2_g = _mm_add_ps(sum, t1);
761
+
762
+ /* B dot products for 2 outputs */
763
+ __m128 pb0 = _mm_mul_ps(smp_b, c0);
764
+ __m128 pb1 = _mm_mul_ps(smp_b, c1);
765
+ lo = _mm_unpacklo_ps(pb0, pb1);
766
+ hh = _mm_unpackhi_ps(pb0, pb1);
767
+ sum = _mm_add_ps(lo, hh);
768
+ t1 = _mm_movehl_ps(sum, sum);
769
+ __m128 t2_b = _mm_add_ps(sum, t1);
770
+
771
+ /* X dot products for 2 outputs */
772
+ __m128 px0 = _mm_mul_ps(smp_x, c0);
773
+ __m128 px1 = _mm_mul_ps(smp_x, c1);
774
+ lo = _mm_unpacklo_ps(px0, px1);
775
+ hh = _mm_unpackhi_ps(px0, px1);
776
+ sum = _mm_add_ps(lo, hh);
777
+ t1 = _mm_movehl_ps(sum, sum);
778
+ __m128 t2_x = _mm_add_ps(sum, t1);
779
+
780
+ /* Store interleaved: [R0, G0, B0, X0, R1, G1, B1, X1] */
781
+ out[0] = _mm_cvtss_f32(t2_r);
782
+ out[1] = _mm_cvtss_f32(t2_g);
783
+ out[2] = _mm_cvtss_f32(t2_b);
784
+ out[3] = _mm_cvtss_f32(t2_x);
785
+ out[4] = _mm_cvtss_f32(
786
+ _mm_shuffle_ps(t2_r, t2_r, _MM_SHUFFLE(1,1,1,1)));
787
+ out[5] = _mm_cvtss_f32(
788
+ _mm_shuffle_ps(t2_g, t2_g, _MM_SHUFFLE(1,1,1,1)));
789
+ out[6] = _mm_cvtss_f32(
790
+ _mm_shuffle_ps(t2_b, t2_b, _MM_SHUFFLE(1,1,1,1)));
791
+ out[7] = _mm_cvtss_f32(
792
+ _mm_shuffle_ps(t2_x, t2_x, _MM_SHUFFLE(1,1,1,1)));
793
+
794
+ out += 8;
795
+ coeff_buf += 8;
796
+ j -= 2;
797
+ }
798
+
799
+ /* process remaining single output */
800
+ if (j) {
801
+ __m128 coeffs = _mm_load_ps(coeff_buf);
802
+
803
+ __m128 prod = _mm_mul_ps(smp_r, coeffs);
804
+ __m128 t1 = _mm_movehl_ps(prod, prod);
805
+ __m128 t2 = _mm_add_ps(prod, t1);
806
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
807
+ t2 = _mm_add_ss(t2, prod);
808
+ out[0] = _mm_cvtss_f32(t2);
809
+
810
+ prod = _mm_mul_ps(smp_g, coeffs);
811
+ t1 = _mm_movehl_ps(prod, prod);
812
+ t2 = _mm_add_ps(prod, t1);
813
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
814
+ t2 = _mm_add_ss(t2, prod);
815
+ out[1] = _mm_cvtss_f32(t2);
816
+
817
+ prod = _mm_mul_ps(smp_b, coeffs);
818
+ t1 = _mm_movehl_ps(prod, prod);
819
+ t2 = _mm_add_ps(prod, t1);
820
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
821
+ t2 = _mm_add_ss(t2, prod);
822
+ out[2] = _mm_cvtss_f32(t2);
823
+
824
+ prod = _mm_mul_ps(smp_x, coeffs);
825
+ t1 = _mm_movehl_ps(prod, prod);
826
+ t2 = _mm_add_ps(prod, t1);
827
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
828
+ t2 = _mm_add_ss(t2, prod);
829
+ out[3] = _mm_cvtss_f32(t2);
830
+
831
+ out += 4;
832
+ coeff_buf += 4;
833
+ }
834
+
835
+ in += 4;
836
+ }
837
+ }
838
+
839
/* Horizontally scale up one CMYK scan line.  Channels stay interleaved:
 * each smpN holds one tap position's [C, M, Y, K] quad, normalized to
 * [0, 1]; outputs are [C, M, Y, K] float quads.
 * coeff_buf/border_buf have the same contract as oil_xscale_up_g_sse2.
 *
 * Fix: load the 4 input bytes with memcpy instead of *(int *)in — in is
 * a byte buffer with no alignment guarantee, so the old read was an
 * unaligned access through an incompatible pointer type (strict-aliasing
 * UB).  memcpy compiles to the same single 32-bit load. */
void oil_xscale_up_cmyk_sse2(unsigned char *in, int width_in, float *out,
	float *coeff_buf, int *border_buf)
{
	int i, j;
	__m128 smp0, smp1, smp2, smp3, inv255;
	__m128i zero_i;

	/* Interleaved layout: each smpN = [C, M, Y, K] for one tap position */
	smp0 = _mm_setzero_ps();
	smp1 = _mm_setzero_ps();
	smp2 = _mm_setzero_ps();
	smp3 = _mm_setzero_ps();
	inv255 = _mm_set1_ps(1.0f / 255.0f);
	zero_i = _mm_setzero_si128();

	for (i=0; i<width_in; i++) {
		int px32;
		__m128i px;

		/* Push new pixel: load 4 bytes [C,M,Y,K], widen to 32-bit
		 * lanes, convert to floats and normalize */
		memcpy(&px32, in, sizeof(px32));
		px = _mm_cvtsi32_si128(px32);
		px = _mm_unpacklo_epi8(px, zero_i);
		px = _mm_unpacklo_epi16(px, zero_i);
		smp0 = smp1;
		smp1 = smp2;
		smp2 = smp3;
		smp3 = _mm_mul_ps(_mm_cvtepi32_ps(px), inv255);

		j = border_buf[i];

		/* two outputs at a time */
		while (j >= 2) {
			__m128 coeffs0 = _mm_load_ps(coeff_buf);
			__m128 coeffs1 = _mm_load_ps(coeff_buf + 4);

			/* First output: broadcast each coeff and multiply */
			__m128 result0 = _mm_add_ps(
				_mm_add_ps(
					_mm_mul_ps(smp0, _mm_shuffle_ps(coeffs0, coeffs0, _MM_SHUFFLE(0,0,0,0))),
					_mm_mul_ps(smp1, _mm_shuffle_ps(coeffs0, coeffs0, _MM_SHUFFLE(1,1,1,1)))),
				_mm_add_ps(
					_mm_mul_ps(smp2, _mm_shuffle_ps(coeffs0, coeffs0, _MM_SHUFFLE(2,2,2,2))),
					_mm_mul_ps(smp3, _mm_shuffle_ps(coeffs0, coeffs0, _MM_SHUFFLE(3,3,3,3)))));

			/* Second output */
			__m128 result1 = _mm_add_ps(
				_mm_add_ps(
					_mm_mul_ps(smp0, _mm_shuffle_ps(coeffs1, coeffs1, _MM_SHUFFLE(0,0,0,0))),
					_mm_mul_ps(smp1, _mm_shuffle_ps(coeffs1, coeffs1, _MM_SHUFFLE(1,1,1,1)))),
				_mm_add_ps(
					_mm_mul_ps(smp2, _mm_shuffle_ps(coeffs1, coeffs1, _MM_SHUFFLE(2,2,2,2))),
					_mm_mul_ps(smp3, _mm_shuffle_ps(coeffs1, coeffs1, _MM_SHUFFLE(3,3,3,3)))));

			_mm_storeu_ps(out, result0);
			_mm_storeu_ps(out + 4, result1);

			out += 8;
			coeff_buf += 8;
			j -= 2;
		}

		/* odd remaining output */
		if (j) {
			__m128 coeffs = _mm_load_ps(coeff_buf);

			__m128 result = _mm_add_ps(
				_mm_add_ps(
					_mm_mul_ps(smp0, _mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(0,0,0,0))),
					_mm_mul_ps(smp1, _mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(1,1,1,1)))),
				_mm_add_ps(
					_mm_mul_ps(smp2, _mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(2,2,2,2))),
					_mm_mul_ps(smp3, _mm_shuffle_ps(coeffs, coeffs, _MM_SHUFFLE(3,3,3,3)))));

			_mm_storeu_ps(out, result);

			out += 4;
			coeff_buf += 4;
		}

		in += 4;
	}
}
918
+
919
/**
 * Vertically scale four rows of floats into one row of bytes (scale-up path).
 *
 * For each column i: s = coeffs[0]*in[0][i] + ... + coeffs[3]*in[3][i],
 * clamped to [0, 1], then rounded to a byte as (int)(s * 255 + 0.5).
 *
 * in     - four source rows, each at least len floats (unaligned OK).
 * len    - number of columns to produce.
 * coeffs - the four vertical filter taps.
 * out    - destination row of len bytes.
 *
 * Processes 16, then 8, then 4 columns per SSE2 iteration, finishing with a
 * scalar loop, so any len is handled.
 */
void oil_yscale_up_g_cmyk_sse2(float **in, int len, float *coeffs,
	unsigned char *out)
{
	int i;
	__m128 c0, c1, c2, c3;
	__m128 v0, v1, v2, v3, sum;
	__m128 scale, half, zero, one;
	__m128i idx;

	c0 = _mm_set1_ps(coeffs[0]);
	c1 = _mm_set1_ps(coeffs[1]);
	c2 = _mm_set1_ps(coeffs[2]);
	c3 = _mm_set1_ps(coeffs[3]);
	scale = _mm_set1_ps(255.0f);
	half = _mm_set1_ps(0.5f);
	zero = _mm_setzero_ps();
	one = _mm_set1_ps(1.0f);

	/* 16 columns per iteration: four 4-wide sums packed down to 16 bytes */
	for (i=0; i+15<len; i+=16) {
		__m128i idx2, idx3, idx4;
		__m128 sum2;

		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		sum = _mm_min_ps(_mm_max_ps(sum, zero), one);
		idx = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(sum, scale), half));

		v0 = _mm_loadu_ps(in[0] + i + 4);
		v1 = _mm_loadu_ps(in[1] + i + 4);
		v2 = _mm_loadu_ps(in[2] + i + 4);
		v3 = _mm_loadu_ps(in[3] + i + 4);
		sum2 = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		sum2 = _mm_min_ps(_mm_max_ps(sum2, zero), one);
		idx2 = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(sum2, scale), half));

		v0 = _mm_loadu_ps(in[0] + i + 8);
		v1 = _mm_loadu_ps(in[1] + i + 8);
		v2 = _mm_loadu_ps(in[2] + i + 8);
		v3 = _mm_loadu_ps(in[3] + i + 8);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		sum = _mm_min_ps(_mm_max_ps(sum, zero), one);
		idx3 = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(sum, scale), half));

		v0 = _mm_loadu_ps(in[0] + i + 12);
		v1 = _mm_loadu_ps(in[1] + i + 12);
		v2 = _mm_loadu_ps(in[2] + i + 12);
		v3 = _mm_loadu_ps(in[3] + i + 12);
		sum2 = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		sum2 = _mm_min_ps(_mm_max_ps(sum2, zero), one);
		idx4 = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(sum2, scale), half));

		/* 32-bit -> 16-bit -> 8-bit, preserving column order */
		idx = _mm_packs_epi32(idx, idx2);
		idx3 = _mm_packs_epi32(idx3, idx4);
		idx = _mm_packus_epi16(idx, idx3);
		_mm_storeu_si128((__m128i *)(out + i), idx);
	}

	for (; i+7<len; i+=8) {
		__m128i idx2;
		__m128 sum2;

		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		sum = _mm_min_ps(_mm_max_ps(sum, zero), one);
		idx = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(sum, scale), half));

		v0 = _mm_loadu_ps(in[0] + i + 4);
		v1 = _mm_loadu_ps(in[1] + i + 4);
		v2 = _mm_loadu_ps(in[2] + i + 4);
		v3 = _mm_loadu_ps(in[3] + i + 4);
		sum2 = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		sum2 = _mm_min_ps(_mm_max_ps(sum2, zero), one);
		idx2 = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(sum2, scale), half));

		idx = _mm_packs_epi32(idx, idx2);
		idx = _mm_packus_epi16(idx, idx);
		_mm_storel_epi64((__m128i *)(out + i), idx);
	}

	for (; i+3<len; i+=4) {
		unsigned packed;

		v0 = _mm_loadu_ps(in[0] + i);
		v1 = _mm_loadu_ps(in[1] + i);
		v2 = _mm_loadu_ps(in[2] + i);
		v3 = _mm_loadu_ps(in[3] + i);
		sum = _mm_add_ps(
			_mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
			_mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
		sum = _mm_min_ps(_mm_max_ps(sum, zero), one);
		idx = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(sum, scale), half));
		idx = _mm_packs_epi32(idx, idx);
		idx = _mm_packus_epi16(idx, idx);
		/* Write the 4 packed bytes individually instead of the former
		 * *(int *)(out + i) store, which type-punned an unsigned char
		 * buffer through int (strict-aliasing/alignment UB). SSE2
		 * implies little-endian x86, so the low byte is column i. */
		packed = (unsigned)_mm_cvtsi128_si32(idx);
		out[i] = packed & 0xff;
		out[i+1] = (packed >> 8) & 0xff;
		out[i+2] = (packed >> 16) & 0xff;
		out[i+3] = (packed >> 24) & 0xff;
	}

	/* scalar tail for the last 0-3 columns */
	for (; i<len; i++) {
		float s = coeffs[0] * in[0][i] + coeffs[1] * in[1][i] +
			coeffs[2] * in[2][i] + coeffs[3] * in[3][i];
		if (s > 1.0f) s = 1.0f;
		else if (s < 0.0f) s = 0.0f;
		out[i] = (int)(s * 255.0f + 0.5f);
	}
}
1039
+
1040
+ void oil_scale_down_g_sse2(unsigned char *in, float *sums_y_out,
1041
+ int out_width, float *coeffs_x_f, int *border_buf, float *coeffs_y_f)
1042
+ {
1043
+ int i, j;
1044
+ __m128 coeffs_x, sample_x, sum;
1045
+ __m128 coeffs_y, sums_y, sample_y;
1046
+
1047
+ coeffs_y = _mm_load_ps(coeffs_y_f);
1048
+ sum = _mm_setzero_ps();
1049
+
1050
+ for (i=0; i<out_width; i++) {
1051
+ for (j=0; j<border_buf[i]; j++) {
1052
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1053
+ sample_x = _mm_set1_ps(i2f_map[in[0]]);
1054
+ sum = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum);
1055
+ in += 1;
1056
+ coeffs_x_f += 4;
1057
+ }
1058
+
1059
+ sums_y = _mm_load_ps(sums_y_out);
1060
+ sample_y = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 0, 0));
1061
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1062
+ _mm_store_ps(sums_y_out, sums_y);
1063
+ sums_y_out += 4;
1064
+
1065
+ sum = (__m128)_mm_srli_si128(_mm_castps_si128(sum), 4);
1066
+ }
1067
+ }
1068
+
1069
+ void oil_scale_down_ga_sse2(unsigned char *in, float *sums_y_out,
1070
+ int out_width, float *coeffs_x_f, int *border_buf, float *coeffs_y_f)
1071
+ {
1072
+ int i, j;
1073
+ float alpha;
1074
+ __m128 coeffs_x, coeffs_x2, sample_x, sum_g, sum_a;
1075
+ __m128 sum_g2, sum_a2;
1076
+ __m128 coeffs_y, sums_y, sample_y;
1077
+
1078
+ coeffs_y = _mm_load_ps(coeffs_y_f);
1079
+
1080
+ sum_g = _mm_setzero_ps();
1081
+ sum_a = _mm_setzero_ps();
1082
+
1083
+ for (i=0; i<out_width; i++) {
1084
+ if (border_buf[i] >= 4) {
1085
+ sum_g2 = _mm_setzero_ps();
1086
+ sum_a2 = _mm_setzero_ps();
1087
+
1088
+ for (j=0; j+1<border_buf[i]; j+=2) {
1089
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1090
+ coeffs_x2 = _mm_load_ps(coeffs_x_f + 4);
1091
+
1092
+ alpha = i2f_map[in[1]];
1093
+ sample_x = _mm_set1_ps(i2f_map[in[0]] * alpha);
1094
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1095
+ sample_x = _mm_set1_ps(alpha);
1096
+ sum_a = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_a);
1097
+
1098
+ alpha = i2f_map[in[3]];
1099
+ sample_x = _mm_set1_ps(i2f_map[in[2]] * alpha);
1100
+ sum_g2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_g2);
1101
+ sample_x = _mm_set1_ps(alpha);
1102
+ sum_a2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_a2);
1103
+
1104
+ in += 4;
1105
+ coeffs_x_f += 8;
1106
+ }
1107
+
1108
+ for (; j<border_buf[i]; j++) {
1109
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1110
+ alpha = i2f_map[in[1]];
1111
+ sample_x = _mm_set1_ps(i2f_map[in[0]] * alpha);
1112
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1113
+ sample_x = _mm_set1_ps(alpha);
1114
+ sum_a = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_a);
1115
+ in += 2;
1116
+ coeffs_x_f += 4;
1117
+ }
1118
+
1119
+ sum_g = _mm_add_ps(sum_g, sum_g2);
1120
+ sum_a = _mm_add_ps(sum_a, sum_a2);
1121
+ } else {
1122
+ for (j=0; j<border_buf[i]; j++) {
1123
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1124
+ alpha = i2f_map[in[1]];
1125
+ sample_x = _mm_set1_ps(i2f_map[in[0]] * alpha);
1126
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1127
+ sample_x = _mm_set1_ps(alpha);
1128
+ sum_a = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_a);
1129
+ in += 2;
1130
+ coeffs_x_f += 4;
1131
+ }
1132
+ }
1133
+
1134
+ sums_y = _mm_load_ps(sums_y_out);
1135
+ sample_y = _mm_shuffle_ps(sum_g, sum_g, _MM_SHUFFLE(0, 0, 0, 0));
1136
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1137
+ _mm_store_ps(sums_y_out, sums_y);
1138
+ sums_y_out += 4;
1139
+
1140
+ sums_y = _mm_load_ps(sums_y_out);
1141
+ sample_y = _mm_shuffle_ps(sum_a, sum_a, _MM_SHUFFLE(0, 0, 0, 0));
1142
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1143
+ _mm_store_ps(sums_y_out, sums_y);
1144
+ sums_y_out += 4;
1145
+
1146
+ sum_g = (__m128)_mm_srli_si128(_mm_castps_si128(sum_g), 4);
1147
+ sum_a = (__m128)_mm_srli_si128(_mm_castps_si128(sum_a), 4);
1148
+ }
1149
+ }
1150
+
1151
+ void oil_scale_down_rgb_sse2(unsigned char *in, float *sums_y_out,
1152
+ int out_width, float *coeffs_x_f, int *border_buf, float *coeffs_y_f)
1153
+ {
1154
+ int i, j;
1155
+ __m128 coeffs_x, coeffs_x2, sample_x, sum_r, sum_g, sum_b;
1156
+ __m128 sum_r2, sum_g2, sum_b2;
1157
+ __m128 coeffs_y, sums_y, sample_y;
1158
+
1159
+ coeffs_y = _mm_load_ps(coeffs_y_f);
1160
+
1161
+ sum_r = _mm_setzero_ps();
1162
+ sum_g = _mm_setzero_ps();
1163
+ sum_b = _mm_setzero_ps();
1164
+
1165
+ for (i=0; i<out_width; i++) {
1166
+ if (border_buf[i] >= 4) {
1167
+ sum_r2 = _mm_setzero_ps();
1168
+ sum_g2 = _mm_setzero_ps();
1169
+ sum_b2 = _mm_setzero_ps();
1170
+
1171
+ for (j=0; j+1<border_buf[i]; j+=2) {
1172
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1173
+ coeffs_x2 = _mm_load_ps(coeffs_x_f + 4);
1174
+
1175
+ sample_x = _mm_set1_ps(s2l_map[in[0]]);
1176
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_r);
1177
+
1178
+ sample_x = _mm_set1_ps(s2l_map[in[1]]);
1179
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1180
+
1181
+ sample_x = _mm_set1_ps(s2l_map[in[2]]);
1182
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_b);
1183
+
1184
+ sample_x = _mm_set1_ps(s2l_map[in[3]]);
1185
+ sum_r2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_r2);
1186
+
1187
+ sample_x = _mm_set1_ps(s2l_map[in[4]]);
1188
+ sum_g2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_g2);
1189
+
1190
+ sample_x = _mm_set1_ps(s2l_map[in[5]]);
1191
+ sum_b2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_b2);
1192
+
1193
+ in += 6;
1194
+ coeffs_x_f += 8;
1195
+ }
1196
+
1197
+ for (; j<border_buf[i]; j++) {
1198
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1199
+
1200
+ sample_x = _mm_set1_ps(s2l_map[in[0]]);
1201
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_r);
1202
+
1203
+ sample_x = _mm_set1_ps(s2l_map[in[1]]);
1204
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1205
+
1206
+ sample_x = _mm_set1_ps(s2l_map[in[2]]);
1207
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_b);
1208
+
1209
+ in += 3;
1210
+ coeffs_x_f += 4;
1211
+ }
1212
+
1213
+ sum_r = _mm_add_ps(sum_r, sum_r2);
1214
+ sum_g = _mm_add_ps(sum_g, sum_g2);
1215
+ sum_b = _mm_add_ps(sum_b, sum_b2);
1216
+ } else {
1217
+ for (j=0; j<border_buf[i]; j++) {
1218
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1219
+
1220
+ sample_x = _mm_set1_ps(s2l_map[in[0]]);
1221
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_r);
1222
+
1223
+ sample_x = _mm_set1_ps(s2l_map[in[1]]);
1224
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1225
+
1226
+ sample_x = _mm_set1_ps(s2l_map[in[2]]);
1227
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_b);
1228
+
1229
+ in += 3;
1230
+ coeffs_x_f += 4;
1231
+ }
1232
+ }
1233
+
1234
+ sums_y = _mm_load_ps(sums_y_out);
1235
+ sample_y = _mm_shuffle_ps(sum_r, sum_r, _MM_SHUFFLE(0, 0, 0, 0));
1236
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1237
+ _mm_store_ps(sums_y_out, sums_y);
1238
+ sums_y_out += 4;
1239
+
1240
+ sums_y = _mm_load_ps(sums_y_out);
1241
+ sample_y = _mm_shuffle_ps(sum_g, sum_g, _MM_SHUFFLE(0, 0, 0, 0));
1242
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1243
+ _mm_store_ps(sums_y_out, sums_y);
1244
+ sums_y_out += 4;
1245
+
1246
+ sums_y = _mm_load_ps(sums_y_out);
1247
+ sample_y = _mm_shuffle_ps(sum_b, sum_b, _MM_SHUFFLE(0, 0, 0, 0));
1248
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1249
+ _mm_store_ps(sums_y_out, sums_y);
1250
+ sums_y_out += 4;
1251
+
1252
+ sum_r = (__m128)_mm_srli_si128(_mm_castps_si128(sum_r), 4);
1253
+ sum_g = (__m128)_mm_srli_si128(_mm_castps_si128(sum_g), 4);
1254
+ sum_b = (__m128)_mm_srli_si128(_mm_castps_si128(sum_b), 4);
1255
+ }
1256
+ }
1257
+
1258
+ void oil_yscale_out_rgba_sse2(float *sums, int width, unsigned char *out)
1259
+ {
1260
+ int i;
1261
+ __m128 scale, one, zero;
1262
+ __m128 f0, f1, f2, f3, ab, cd, vals, alpha_v;
1263
+ __m128i idx, v0, v1, v2, v3;
1264
+ float alpha;
1265
+ unsigned char *lut;
1266
+
1267
+ lut = l2s_map;
1268
+ scale = _mm_set1_ps((float)(l2s_len - 1));
1269
+ one = _mm_set1_ps(1.0f);
1270
+ zero = _mm_setzero_ps();
1271
+
1272
+
1273
+ for (i=0; i<width; i++) {
1274
+ v0 = _mm_load_si128((__m128i *)sums);
1275
+ v1 = _mm_load_si128((__m128i *)(sums + 4));
1276
+ v2 = _mm_load_si128((__m128i *)(sums + 8));
1277
+ v3 = _mm_load_si128((__m128i *)(sums + 12));
1278
+
1279
+ /* Gather first element of each accumulator: {R, G, B, A} */
1280
+ f0 = _mm_castsi128_ps(v0);
1281
+ f1 = _mm_castsi128_ps(v1);
1282
+ f2 = _mm_castsi128_ps(v2);
1283
+ f3 = _mm_castsi128_ps(v3);
1284
+ ab = _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(0, 0, 0, 0));
1285
+ cd = _mm_shuffle_ps(f2, f3, _MM_SHUFFLE(0, 0, 0, 0));
1286
+ vals = _mm_shuffle_ps(ab, cd, _MM_SHUFFLE(2, 0, 2, 0));
1287
+
1288
+ /* Clamp alpha to [0, 1] */
1289
+ alpha_v = _mm_shuffle_ps(vals, vals, _MM_SHUFFLE(3, 3, 3, 3));
1290
+ alpha_v = _mm_min_ps(_mm_max_ps(alpha_v, zero), one);
1291
+ alpha = _mm_cvtss_f32(alpha_v);
1292
+
1293
+ /* Divide RGB by alpha (skip if alpha == 0) */
1294
+ if (alpha != 0) {
1295
+ vals = _mm_mul_ps(vals, _mm_rcp_ps(alpha_v));
1296
+ }
1297
+
1298
+ /* Clamp RGB to [0, 1] and compute l2s_map indices */
1299
+ vals = _mm_min_ps(_mm_max_ps(vals, zero), one);
1300
+ idx = _mm_cvttps_epi32(_mm_mul_ps(vals, scale));
1301
+
1302
+ out[0] = lut[_mm_cvtsi128_si32(idx)];
1303
+ out[1] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
1304
+ out[2] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
1305
+ out[3] = (int)(alpha * 255.0f + 0.5f);
1306
+
1307
+ _mm_store_si128((__m128i *)sums, _mm_srli_si128(v0, 4));
1308
+ _mm_store_si128((__m128i *)(sums + 4), _mm_srli_si128(v1, 4));
1309
+ _mm_store_si128((__m128i *)(sums + 8), _mm_srli_si128(v2, 4));
1310
+ _mm_store_si128((__m128i *)(sums + 12), _mm_srli_si128(v3, 4));
1311
+
1312
+ sums += 16;
1313
+ out += 4;
1314
+ }
1315
+ }
1316
+
1317
+ void oil_yscale_up_rgba_sse2(float **in, int len, float *coeffs,
1318
+ unsigned char *out)
1319
+ {
1320
+ int i;
1321
+ __m128 c0, c1, c2, c3;
1322
+ __m128 v0, v1, v2, v3, sum;
1323
+ __m128 scale, one, zero;
1324
+ __m128 alpha_v, clamped;
1325
+ __m128i idx;
1326
+ unsigned char *lut;
1327
+ float alpha;
1328
+
1329
+ c0 = _mm_set1_ps(coeffs[0]);
1330
+ c1 = _mm_set1_ps(coeffs[1]);
1331
+ c2 = _mm_set1_ps(coeffs[2]);
1332
+ c3 = _mm_set1_ps(coeffs[3]);
1333
+ lut = l2s_map;
1334
+ scale = _mm_set1_ps((float)(l2s_len - 1));
1335
+ one = _mm_set1_ps(1.0f);
1336
+ zero = _mm_setzero_ps();
1337
+
1338
+ for (i=0; i<len; i+=4) {
1339
+ v0 = _mm_loadu_ps(in[0] + i);
1340
+ v1 = _mm_loadu_ps(in[1] + i);
1341
+ v2 = _mm_loadu_ps(in[2] + i);
1342
+ v3 = _mm_loadu_ps(in[3] + i);
1343
+ sum = _mm_add_ps(
1344
+ _mm_add_ps(_mm_mul_ps(c0, v0), _mm_mul_ps(c1, v1)),
1345
+ _mm_add_ps(_mm_mul_ps(c2, v2), _mm_mul_ps(c3, v3)));
1346
+
1347
+ /* Clamp alpha to [0, 1] */
1348
+ alpha_v = _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 3, 3));
1349
+ alpha_v = _mm_min_ps(_mm_max_ps(alpha_v, zero), one);
1350
+ alpha = _mm_cvtss_f32(alpha_v);
1351
+
1352
+ /* Divide RGB by alpha (skip if alpha == 0) */
1353
+ if (alpha != 0) {
1354
+ sum = _mm_mul_ps(sum, _mm_rcp_ps(alpha_v));
1355
+ }
1356
+
1357
+ /* Clamp to [0, 1] and compute l2s_map indices */
1358
+ clamped = _mm_min_ps(_mm_max_ps(sum, zero), one);
1359
+ idx = _mm_cvttps_epi32(_mm_mul_ps(clamped, scale));
1360
+
1361
+ out[i] = lut[_mm_cvtsi128_si32(idx)];
1362
+ out[i+1] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 4))];
1363
+ out[i+2] = lut[_mm_cvtsi128_si32(_mm_srli_si128(idx, 8))];
1364
+ out[i+3] = (int)(alpha * 255.0f + 0.5f);
1365
+ }
1366
+ }
1367
+
1368
+ void oil_xscale_up_rgba_sse2(unsigned char *in, int width_in, float *out,
1369
+ float *coeff_buf, int *border_buf)
1370
+ {
1371
+ int i, j;
1372
+ __m128 smp_r, smp_g, smp_b, smp_a, newval, hi;
1373
+ float *sl;
1374
+
1375
+ sl = s2l_map;
1376
+ smp_r = _mm_setzero_ps();
1377
+ smp_g = _mm_setzero_ps();
1378
+ smp_b = _mm_setzero_ps();
1379
+ smp_a = _mm_setzero_ps();
1380
+
1381
+ for (i=0; i<width_in; i++) {
1382
+ float alpha_new = in[3] / 255.0f;
1383
+
1384
+ /* push_f for A */
1385
+ smp_a = (__m128)_mm_srli_si128((__m128i)smp_a, 4);
1386
+ newval = _mm_set_ss(alpha_new);
1387
+ hi = _mm_shuffle_ps(smp_a, newval, _MM_SHUFFLE(0, 0, 3, 2));
1388
+ smp_a = _mm_shuffle_ps(smp_a, hi, _MM_SHUFFLE(2, 0, 1, 0));
1389
+
1390
+ /* push_f for R: premultiplied by alpha */
1391
+ smp_r = (__m128)_mm_srli_si128((__m128i)smp_r, 4);
1392
+ newval = _mm_set_ss(alpha_new * sl[in[0]]);
1393
+ hi = _mm_shuffle_ps(smp_r, newval, _MM_SHUFFLE(0, 0, 3, 2));
1394
+ smp_r = _mm_shuffle_ps(smp_r, hi, _MM_SHUFFLE(2, 0, 1, 0));
1395
+
1396
+ /* push_f for G: premultiplied by alpha */
1397
+ smp_g = (__m128)_mm_srli_si128((__m128i)smp_g, 4);
1398
+ newval = _mm_set_ss(alpha_new * sl[in[1]]);
1399
+ hi = _mm_shuffle_ps(smp_g, newval, _MM_SHUFFLE(0, 0, 3, 2));
1400
+ smp_g = _mm_shuffle_ps(smp_g, hi, _MM_SHUFFLE(2, 0, 1, 0));
1401
+
1402
+ /* push_f for B: premultiplied by alpha */
1403
+ smp_b = (__m128)_mm_srli_si128((__m128i)smp_b, 4);
1404
+ newval = _mm_set_ss(alpha_new * sl[in[2]]);
1405
+ hi = _mm_shuffle_ps(smp_b, newval, _MM_SHUFFLE(0, 0, 3, 2));
1406
+ smp_b = _mm_shuffle_ps(smp_b, hi, _MM_SHUFFLE(2, 0, 1, 0));
1407
+
1408
+ j = border_buf[i];
1409
+
1410
+ /* process pairs of outputs */
1411
+ while (j >= 2) {
1412
+ __m128 c0 = _mm_load_ps(coeff_buf);
1413
+ __m128 c1 = _mm_load_ps(coeff_buf + 4);
1414
+
1415
+ /* R dot products for 2 outputs */
1416
+ __m128 pr0 = _mm_mul_ps(smp_r, c0);
1417
+ __m128 pr1 = _mm_mul_ps(smp_r, c1);
1418
+ __m128 lo = _mm_unpacklo_ps(pr0, pr1);
1419
+ __m128 hh = _mm_unpackhi_ps(pr0, pr1);
1420
+ __m128 sum = _mm_add_ps(lo, hh);
1421
+ __m128 t1 = _mm_movehl_ps(sum, sum);
1422
+ __m128 t2_r = _mm_add_ps(sum, t1);
1423
+
1424
+ /* G dot products for 2 outputs */
1425
+ __m128 pg0 = _mm_mul_ps(smp_g, c0);
1426
+ __m128 pg1 = _mm_mul_ps(smp_g, c1);
1427
+ lo = _mm_unpacklo_ps(pg0, pg1);
1428
+ hh = _mm_unpackhi_ps(pg0, pg1);
1429
+ sum = _mm_add_ps(lo, hh);
1430
+ t1 = _mm_movehl_ps(sum, sum);
1431
+ __m128 t2_g = _mm_add_ps(sum, t1);
1432
+
1433
+ /* B dot products for 2 outputs */
1434
+ __m128 pb0 = _mm_mul_ps(smp_b, c0);
1435
+ __m128 pb1 = _mm_mul_ps(smp_b, c1);
1436
+ lo = _mm_unpacklo_ps(pb0, pb1);
1437
+ hh = _mm_unpackhi_ps(pb0, pb1);
1438
+ sum = _mm_add_ps(lo, hh);
1439
+ t1 = _mm_movehl_ps(sum, sum);
1440
+ __m128 t2_b = _mm_add_ps(sum, t1);
1441
+
1442
+ /* A dot products for 2 outputs */
1443
+ __m128 pa0 = _mm_mul_ps(smp_a, c0);
1444
+ __m128 pa1 = _mm_mul_ps(smp_a, c1);
1445
+ lo = _mm_unpacklo_ps(pa0, pa1);
1446
+ hh = _mm_unpackhi_ps(pa0, pa1);
1447
+ sum = _mm_add_ps(lo, hh);
1448
+ t1 = _mm_movehl_ps(sum, sum);
1449
+ __m128 t2_a = _mm_add_ps(sum, t1);
1450
+
1451
+ /* Store interleaved: [R0, G0, B0, A0, R1, G1, B1, A1] */
1452
+ out[0] = _mm_cvtss_f32(t2_r);
1453
+ out[1] = _mm_cvtss_f32(t2_g);
1454
+ out[2] = _mm_cvtss_f32(t2_b);
1455
+ out[3] = _mm_cvtss_f32(t2_a);
1456
+ out[4] = _mm_cvtss_f32(
1457
+ _mm_shuffle_ps(t2_r, t2_r, _MM_SHUFFLE(1,1,1,1)));
1458
+ out[5] = _mm_cvtss_f32(
1459
+ _mm_shuffle_ps(t2_g, t2_g, _MM_SHUFFLE(1,1,1,1)));
1460
+ out[6] = _mm_cvtss_f32(
1461
+ _mm_shuffle_ps(t2_b, t2_b, _MM_SHUFFLE(1,1,1,1)));
1462
+ out[7] = _mm_cvtss_f32(
1463
+ _mm_shuffle_ps(t2_a, t2_a, _MM_SHUFFLE(1,1,1,1)));
1464
+
1465
+ out += 8;
1466
+ coeff_buf += 8;
1467
+ j -= 2;
1468
+ }
1469
+
1470
+ /* process remaining single output */
1471
+ if (j) {
1472
+ __m128 coeffs = _mm_load_ps(coeff_buf);
1473
+
1474
+ __m128 prod = _mm_mul_ps(smp_r, coeffs);
1475
+ __m128 t1 = _mm_movehl_ps(prod, prod);
1476
+ __m128 t2 = _mm_add_ps(prod, t1);
1477
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
1478
+ t2 = _mm_add_ss(t2, prod);
1479
+ out[0] = _mm_cvtss_f32(t2);
1480
+
1481
+ prod = _mm_mul_ps(smp_g, coeffs);
1482
+ t1 = _mm_movehl_ps(prod, prod);
1483
+ t2 = _mm_add_ps(prod, t1);
1484
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
1485
+ t2 = _mm_add_ss(t2, prod);
1486
+ out[1] = _mm_cvtss_f32(t2);
1487
+
1488
+ prod = _mm_mul_ps(smp_b, coeffs);
1489
+ t1 = _mm_movehl_ps(prod, prod);
1490
+ t2 = _mm_add_ps(prod, t1);
1491
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
1492
+ t2 = _mm_add_ss(t2, prod);
1493
+ out[2] = _mm_cvtss_f32(t2);
1494
+
1495
+ prod = _mm_mul_ps(smp_a, coeffs);
1496
+ t1 = _mm_movehl_ps(prod, prod);
1497
+ t2 = _mm_add_ps(prod, t1);
1498
+ prod = _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,1,1,1));
1499
+ t2 = _mm_add_ss(t2, prod);
1500
+ out[3] = _mm_cvtss_f32(t2);
1501
+
1502
+ out += 4;
1503
+ coeff_buf += 4;
1504
+ }
1505
+
1506
+ in += 4;
1507
+ }
1508
+ }
1509
+
1510
+ void oil_scale_down_rgba_sse2(unsigned char *in, float *sums_y_out,
1511
+ int out_width, float *coeffs_x_f, int *border_buf, float *coeffs_y_f)
1512
+ {
1513
+ int i, j;
1514
+ __m128 coeffs_x, coeffs_x2, coeffs_x_a, coeffs_x2_a, sample_x;
1515
+ __m128 sum_r, sum_g, sum_b, sum_a;
1516
+ __m128 sum_r2, sum_g2, sum_b2, sum_a2;
1517
+ __m128 coeffs_y, sums_y, sample_y;
1518
+ float *sl;
1519
+
1520
+ sl = s2l_map;
1521
+ coeffs_y = _mm_load_ps(coeffs_y_f);
1522
+
1523
+ sum_r = _mm_setzero_ps();
1524
+ sum_g = _mm_setzero_ps();
1525
+ sum_b = _mm_setzero_ps();
1526
+ sum_a = _mm_setzero_ps();
1527
+
1528
+ for (i=0; i<out_width; i++) {
1529
+ if (border_buf[i] >= 4) {
1530
+ sum_r2 = _mm_setzero_ps();
1531
+ sum_g2 = _mm_setzero_ps();
1532
+ sum_b2 = _mm_setzero_ps();
1533
+ sum_a2 = _mm_setzero_ps();
1534
+
1535
+ for (j=0; j+1<border_buf[i]; j+=2) {
1536
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1537
+ coeffs_x2 = _mm_load_ps(coeffs_x_f + 4);
1538
+
1539
+ coeffs_x_a = _mm_mul_ps(coeffs_x, _mm_set1_ps(i2f_map[in[3]]));
1540
+
1541
+ sample_x = _mm_set1_ps(sl[in[0]]);
1542
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_r);
1543
+
1544
+ sample_x = _mm_set1_ps(sl[in[1]]);
1545
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_g);
1546
+
1547
+ sample_x = _mm_set1_ps(sl[in[2]]);
1548
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_b);
1549
+
1550
+ sum_a = _mm_add_ps(coeffs_x_a, sum_a);
1551
+
1552
+ coeffs_x2_a = _mm_mul_ps(coeffs_x2, _mm_set1_ps(i2f_map[in[7]]));
1553
+
1554
+ sample_x = _mm_set1_ps(sl[in[4]]);
1555
+ sum_r2 = _mm_add_ps(_mm_mul_ps(coeffs_x2_a, sample_x), sum_r2);
1556
+
1557
+ sample_x = _mm_set1_ps(sl[in[5]]);
1558
+ sum_g2 = _mm_add_ps(_mm_mul_ps(coeffs_x2_a, sample_x), sum_g2);
1559
+
1560
+ sample_x = _mm_set1_ps(sl[in[6]]);
1561
+ sum_b2 = _mm_add_ps(_mm_mul_ps(coeffs_x2_a, sample_x), sum_b2);
1562
+
1563
+ sum_a2 = _mm_add_ps(coeffs_x2_a, sum_a2);
1564
+
1565
+ in += 8;
1566
+ coeffs_x_f += 8;
1567
+ }
1568
+
1569
+ for (; j<border_buf[i]; j++) {
1570
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1571
+
1572
+ coeffs_x_a = _mm_mul_ps(coeffs_x, _mm_set1_ps(i2f_map[in[3]]));
1573
+
1574
+ sample_x = _mm_set1_ps(sl[in[0]]);
1575
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_r);
1576
+
1577
+ sample_x = _mm_set1_ps(sl[in[1]]);
1578
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_g);
1579
+
1580
+ sample_x = _mm_set1_ps(sl[in[2]]);
1581
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_b);
1582
+
1583
+ sum_a = _mm_add_ps(coeffs_x_a, sum_a);
1584
+
1585
+ in += 4;
1586
+ coeffs_x_f += 4;
1587
+ }
1588
+
1589
+ sum_r = _mm_add_ps(sum_r, sum_r2);
1590
+ sum_g = _mm_add_ps(sum_g, sum_g2);
1591
+ sum_b = _mm_add_ps(sum_b, sum_b2);
1592
+ sum_a = _mm_add_ps(sum_a, sum_a2);
1593
+ } else {
1594
+ for (j=0; j<border_buf[i]; j++) {
1595
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1596
+
1597
+ coeffs_x_a = _mm_mul_ps(coeffs_x, _mm_set1_ps(i2f_map[in[3]]));
1598
+
1599
+ sample_x = _mm_set1_ps(sl[in[0]]);
1600
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_r);
1601
+
1602
+ sample_x = _mm_set1_ps(sl[in[1]]);
1603
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_g);
1604
+
1605
+ sample_x = _mm_set1_ps(sl[in[2]]);
1606
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x_a, sample_x), sum_b);
1607
+
1608
+ sum_a = _mm_add_ps(coeffs_x_a, sum_a);
1609
+
1610
+ in += 4;
1611
+ coeffs_x_f += 4;
1612
+ }
1613
+ }
1614
+
1615
+ sums_y = _mm_load_ps(sums_y_out);
1616
+ sample_y = _mm_shuffle_ps(sum_r, sum_r, _MM_SHUFFLE(0, 0, 0, 0));
1617
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1618
+ _mm_store_ps(sums_y_out, sums_y);
1619
+ sums_y_out += 4;
1620
+
1621
+ sums_y = _mm_load_ps(sums_y_out);
1622
+ sample_y = _mm_shuffle_ps(sum_g, sum_g, _MM_SHUFFLE(0, 0, 0, 0));
1623
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1624
+ _mm_store_ps(sums_y_out, sums_y);
1625
+ sums_y_out += 4;
1626
+
1627
+ sums_y = _mm_load_ps(sums_y_out);
1628
+ sample_y = _mm_shuffle_ps(sum_b, sum_b, _MM_SHUFFLE(0, 0, 0, 0));
1629
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1630
+ _mm_store_ps(sums_y_out, sums_y);
1631
+ sums_y_out += 4;
1632
+
1633
+ sums_y = _mm_load_ps(sums_y_out);
1634
+ sample_y = _mm_shuffle_ps(sum_a, sum_a, _MM_SHUFFLE(0, 0, 0, 0));
1635
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1636
+ _mm_store_ps(sums_y_out, sums_y);
1637
+ sums_y_out += 4;
1638
+
1639
+ sum_r = (__m128)_mm_srli_si128(_mm_castps_si128(sum_r), 4);
1640
+ sum_g = (__m128)_mm_srli_si128(_mm_castps_si128(sum_g), 4);
1641
+ sum_b = (__m128)_mm_srli_si128(_mm_castps_si128(sum_b), 4);
1642
+ sum_a = (__m128)_mm_srli_si128(_mm_castps_si128(sum_a), 4);
1643
+ }
1644
+ }
1645
+
1646
/**
 * Emit output bytes from the vertical accumulators (CMYK / non-LUT path).
 *
 * sums holds 4-lane accumulator groups, one group per output byte; lane 0 of
 * each group is this row's value. Each value is clamped to [0, 1], scaled to
 * [0, 255] with rounding, and after emission the group is shifted down one
 * lane for the next row.
 *
 * sums must be 16-byte aligned (aligned loads/stores are used).
 */
void oil_yscale_out_cmyk_sse2(float *sums, int len, unsigned char *out)
{
	int i;
	__m128 scale, vals, ab, cd, f0, f1, f2, f3;
	__m128i idx, clamped, v0, v1, v2, v3;

	scale = _mm_set1_ps(255.0f);

	/* four output bytes (16 accumulator floats) per iteration */
	for (i=0; i+3<len; i+=4) {
		unsigned packed;

		v0 = _mm_load_si128((__m128i *)sums);
		v1 = _mm_load_si128((__m128i *)(sums + 4));
		v2 = _mm_load_si128((__m128i *)(sums + 8));
		v3 = _mm_load_si128((__m128i *)(sums + 12));

		/* gather lane 0 of each group */
		f0 = _mm_castsi128_ps(v0);
		f1 = _mm_castsi128_ps(v1);
		f2 = _mm_castsi128_ps(v2);
		f3 = _mm_castsi128_ps(v3);
		ab = _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(0, 0, 0, 0));
		cd = _mm_shuffle_ps(f2, f3, _MM_SHUFFLE(0, 0, 0, 0));
		vals = _mm_shuffle_ps(ab, cd, _MM_SHUFFLE(2, 0, 2, 0));

		/* clamp to [0, 1] then scale to [0, 255] */
		vals = _mm_min_ps(_mm_max_ps(vals, _mm_setzero_ps()), _mm_set1_ps(1.0f));
		idx = _mm_cvttps_epi32(_mm_add_ps(_mm_mul_ps(vals, scale), _mm_set1_ps(0.5f)));

		/* Pack 32-bit ints to 16-bit then to 8-bit */
		clamped = _mm_packs_epi32(idx, idx);
		clamped = _mm_packus_epi16(clamped, clamped);

		/* Write the 4 packed bytes individually instead of the former
		 * *(int *)&out[i] store, which type-punned an unsigned char
		 * buffer through int (strict-aliasing/alignment UB). SSE2
		 * implies little-endian x86, so the low byte is out[i]. */
		packed = (unsigned)_mm_cvtsi128_si32(clamped);
		out[i] = packed & 0xff;
		out[i+1] = (packed >> 8) & 0xff;
		out[i+2] = (packed >> 16) & 0xff;
		out[i+3] = (packed >> 24) & 0xff;

		/* shift each group down one lane for the next row */
		_mm_store_si128((__m128i *)sums, _mm_srli_si128(v0, 4));
		_mm_store_si128((__m128i *)(sums + 4), _mm_srli_si128(v1, 4));
		_mm_store_si128((__m128i *)(sums + 8), _mm_srli_si128(v2, 4));
		_mm_store_si128((__m128i *)(sums + 12), _mm_srli_si128(v3, 4));

		sums += 16;
	}

	/* scalar tail: one group per remaining byte */
	for (; i<len; i++) {
		float v = *sums;
		if (v < 0.0f) v = 0.0f;
		if (v > 1.0f) v = 1.0f;
		out[i] = (int)(v * 255.0f + 0.5f);
		oil_shift_left_f_sse2(sums);
		sums += 4;
	}
}
1695
+
1696
+ void oil_scale_down_cmyk_sse2(unsigned char *in, float *sums_y_out,
1697
+ int out_width, float *coeffs_x_f, int *border_buf, float *coeffs_y_f)
1698
+ {
1699
+ int i, j;
1700
+ __m128 coeffs_x, coeffs_x2, sample_x, sum_c, sum_m, sum_y, sum_k;
1701
+ __m128 sum_c2, sum_m2, sum_y2, sum_k2;
1702
+ __m128 coeffs_y, sums_y, sample_y;
1703
+
1704
+ coeffs_y = _mm_load_ps(coeffs_y_f);
1705
+
1706
+ sum_c = _mm_setzero_ps();
1707
+ sum_m = _mm_setzero_ps();
1708
+ sum_y = _mm_setzero_ps();
1709
+ sum_k = _mm_setzero_ps();
1710
+
1711
+ for (i=0; i<out_width; i++) {
1712
+ if (border_buf[i] >= 2) {
1713
+ sum_c2 = _mm_setzero_ps();
1714
+ sum_m2 = _mm_setzero_ps();
1715
+ sum_y2 = _mm_setzero_ps();
1716
+ sum_k2 = _mm_setzero_ps();
1717
+
1718
+ for (j=0; j+1<border_buf[i]; j+=2) {
1719
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1720
+ coeffs_x2 = _mm_load_ps(coeffs_x_f + 4);
1721
+
1722
+ sample_x = _mm_set1_ps(i2f_map[in[0]]);
1723
+ sum_c = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_c);
1724
+
1725
+ sample_x = _mm_set1_ps(i2f_map[in[1]]);
1726
+ sum_m = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_m);
1727
+
1728
+ sample_x = _mm_set1_ps(i2f_map[in[2]]);
1729
+ sum_y = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_y);
1730
+
1731
+ sample_x = _mm_set1_ps(i2f_map[in[3]]);
1732
+ sum_k = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_k);
1733
+
1734
+ sample_x = _mm_set1_ps(i2f_map[in[4]]);
1735
+ sum_c2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_c2);
1736
+
1737
+ sample_x = _mm_set1_ps(i2f_map[in[5]]);
1738
+ sum_m2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_m2);
1739
+
1740
+ sample_x = _mm_set1_ps(i2f_map[in[6]]);
1741
+ sum_y2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_y2);
1742
+
1743
+ sample_x = _mm_set1_ps(i2f_map[in[7]]);
1744
+ sum_k2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_k2);
1745
+
1746
+ in += 8;
1747
+ coeffs_x_f += 8;
1748
+ }
1749
+
1750
+ for (; j<border_buf[i]; j++) {
1751
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1752
+
1753
+ sample_x = _mm_set1_ps(i2f_map[in[0]]);
1754
+ sum_c = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_c);
1755
+
1756
+ sample_x = _mm_set1_ps(i2f_map[in[1]]);
1757
+ sum_m = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_m);
1758
+
1759
+ sample_x = _mm_set1_ps(i2f_map[in[2]]);
1760
+ sum_y = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_y);
1761
+
1762
+ sample_x = _mm_set1_ps(i2f_map[in[3]]);
1763
+ sum_k = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_k);
1764
+
1765
+ in += 4;
1766
+ coeffs_x_f += 4;
1767
+ }
1768
+
1769
+ sum_c = _mm_add_ps(sum_c, sum_c2);
1770
+ sum_m = _mm_add_ps(sum_m, sum_m2);
1771
+ sum_y = _mm_add_ps(sum_y, sum_y2);
1772
+ sum_k = _mm_add_ps(sum_k, sum_k2);
1773
+ } else {
1774
+ for (j=0; j<border_buf[i]; j++) {
1775
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1776
+
1777
+ sample_x = _mm_set1_ps(i2f_map[in[0]]);
1778
+ sum_c = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_c);
1779
+
1780
+ sample_x = _mm_set1_ps(i2f_map[in[1]]);
1781
+ sum_m = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_m);
1782
+
1783
+ sample_x = _mm_set1_ps(i2f_map[in[2]]);
1784
+ sum_y = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_y);
1785
+
1786
+ sample_x = _mm_set1_ps(i2f_map[in[3]]);
1787
+ sum_k = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_k);
1788
+
1789
+ in += 4;
1790
+ coeffs_x_f += 4;
1791
+ }
1792
+ }
1793
+
1794
+ sums_y = _mm_load_ps(sums_y_out);
1795
+ sample_y = _mm_shuffle_ps(sum_c, sum_c, _MM_SHUFFLE(0, 0, 0, 0));
1796
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1797
+ _mm_store_ps(sums_y_out, sums_y);
1798
+ sums_y_out += 4;
1799
+
1800
+ sums_y = _mm_load_ps(sums_y_out);
1801
+ sample_y = _mm_shuffle_ps(sum_m, sum_m, _MM_SHUFFLE(0, 0, 0, 0));
1802
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1803
+ _mm_store_ps(sums_y_out, sums_y);
1804
+ sums_y_out += 4;
1805
+
1806
+ sums_y = _mm_load_ps(sums_y_out);
1807
+ sample_y = _mm_shuffle_ps(sum_y, sum_y, _MM_SHUFFLE(0, 0, 0, 0));
1808
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1809
+ _mm_store_ps(sums_y_out, sums_y);
1810
+ sums_y_out += 4;
1811
+
1812
+ sums_y = _mm_load_ps(sums_y_out);
1813
+ sample_y = _mm_shuffle_ps(sum_k, sum_k, _MM_SHUFFLE(0, 0, 0, 0));
1814
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1815
+ _mm_store_ps(sums_y_out, sums_y);
1816
+ sums_y_out += 4;
1817
+
1818
+ sum_c = (__m128)_mm_srli_si128(_mm_castps_si128(sum_c), 4);
1819
+ sum_m = (__m128)_mm_srli_si128(_mm_castps_si128(sum_m), 4);
1820
+ sum_y = (__m128)_mm_srli_si128(_mm_castps_si128(sum_y), 4);
1821
+ sum_k = (__m128)_mm_srli_si128(_mm_castps_si128(sum_k), 4);
1822
+ }
1823
+ }
1824
+
1825
+ void oil_scale_down_rgbx_sse2(unsigned char *in, float *sums_y_out,
1826
+ int out_width, float *coeffs_x_f, int *border_buf, float *coeffs_y_f)
1827
+ {
1828
+ int i, j;
1829
+ __m128 coeffs_x, coeffs_x2, sample_x, sum_r, sum_g, sum_b;
1830
+ __m128 sum_r2, sum_g2, sum_b2;
1831
+ __m128 coeffs_y, sums_y, sample_y;
1832
+
1833
+ coeffs_y = _mm_load_ps(coeffs_y_f);
1834
+
1835
+ sum_r = _mm_setzero_ps();
1836
+ sum_g = _mm_setzero_ps();
1837
+ sum_b = _mm_setzero_ps();
1838
+
1839
+ for (i=0; i<out_width; i++) {
1840
+ if (border_buf[i] >= 2) {
1841
+ sum_r2 = _mm_setzero_ps();
1842
+ sum_g2 = _mm_setzero_ps();
1843
+ sum_b2 = _mm_setzero_ps();
1844
+
1845
+ for (j=0; j+1<border_buf[i]; j+=2) {
1846
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1847
+ coeffs_x2 = _mm_load_ps(coeffs_x_f + 4);
1848
+
1849
+ sample_x = _mm_set1_ps(s2l_map[in[0]]);
1850
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_r);
1851
+
1852
+ sample_x = _mm_set1_ps(s2l_map[in[1]]);
1853
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1854
+
1855
+ sample_x = _mm_set1_ps(s2l_map[in[2]]);
1856
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_b);
1857
+
1858
+ sample_x = _mm_set1_ps(s2l_map[in[4]]);
1859
+ sum_r2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_r2);
1860
+
1861
+ sample_x = _mm_set1_ps(s2l_map[in[5]]);
1862
+ sum_g2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_g2);
1863
+
1864
+ sample_x = _mm_set1_ps(s2l_map[in[6]]);
1865
+ sum_b2 = _mm_add_ps(_mm_mul_ps(coeffs_x2, sample_x), sum_b2);
1866
+
1867
+ in += 8;
1868
+ coeffs_x_f += 8;
1869
+ }
1870
+
1871
+ for (; j<border_buf[i]; j++) {
1872
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1873
+
1874
+ sample_x = _mm_set1_ps(s2l_map[in[0]]);
1875
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_r);
1876
+
1877
+ sample_x = _mm_set1_ps(s2l_map[in[1]]);
1878
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1879
+
1880
+ sample_x = _mm_set1_ps(s2l_map[in[2]]);
1881
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_b);
1882
+
1883
+ in += 4;
1884
+ coeffs_x_f += 4;
1885
+ }
1886
+
1887
+ sum_r = _mm_add_ps(sum_r, sum_r2);
1888
+ sum_g = _mm_add_ps(sum_g, sum_g2);
1889
+ sum_b = _mm_add_ps(sum_b, sum_b2);
1890
+ } else {
1891
+ for (j=0; j<border_buf[i]; j++) {
1892
+ coeffs_x = _mm_load_ps(coeffs_x_f);
1893
+
1894
+ sample_x = _mm_set1_ps(s2l_map[in[0]]);
1895
+ sum_r = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_r);
1896
+
1897
+ sample_x = _mm_set1_ps(s2l_map[in[1]]);
1898
+ sum_g = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_g);
1899
+
1900
+ sample_x = _mm_set1_ps(s2l_map[in[2]]);
1901
+ sum_b = _mm_add_ps(_mm_mul_ps(coeffs_x, sample_x), sum_b);
1902
+
1903
+ in += 4;
1904
+ coeffs_x_f += 4;
1905
+ }
1906
+ }
1907
+
1908
+ sums_y = _mm_load_ps(sums_y_out);
1909
+ sample_y = _mm_shuffle_ps(sum_r, sum_r, _MM_SHUFFLE(0, 0, 0, 0));
1910
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1911
+ _mm_store_ps(sums_y_out, sums_y);
1912
+ sums_y_out += 4;
1913
+
1914
+ sums_y = _mm_load_ps(sums_y_out);
1915
+ sample_y = _mm_shuffle_ps(sum_g, sum_g, _MM_SHUFFLE(0, 0, 0, 0));
1916
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1917
+ _mm_store_ps(sums_y_out, sums_y);
1918
+ sums_y_out += 4;
1919
+
1920
+ sums_y = _mm_load_ps(sums_y_out);
1921
+ sample_y = _mm_shuffle_ps(sum_b, sum_b, _MM_SHUFFLE(0, 0, 0, 0));
1922
+ sums_y = _mm_add_ps(_mm_mul_ps(coeffs_y, sample_y), sums_y);
1923
+ _mm_store_ps(sums_y_out, sums_y);
1924
+ sums_y_out += 4;
1925
+
1926
+ sum_r = (__m128)_mm_srli_si128(_mm_castps_si128(sum_r), 4);
1927
+ sum_g = (__m128)_mm_srli_si128(_mm_castps_si128(sum_g), 4);
1928
+ sum_b = (__m128)_mm_srli_si128(_mm_castps_si128(sum_b), 4);
1929
+ }
1930
+ }