fast_resize 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +7 -0
  2. data/CMakeLists.txt +308 -0
  3. data/LICENSE +29 -0
  4. data/README.md +426 -0
  5. data/VERSION +1 -0
  6. data/bindings/ruby/ext/fastresize/extconf.rb +152 -0
  7. data/bindings/ruby/ext/fastresize/fastresize_ext.cpp +377 -0
  8. data/bindings/ruby/lib/fastresize/platform.rb +106 -0
  9. data/bindings/ruby/lib/fastresize/version.rb +5 -0
  10. data/bindings/ruby/lib/fastresize.rb +13 -0
  11. data/bindings/ruby/prebuilt/linux-aarch64/bin/fast_resize +0 -0
  12. data/bindings/ruby/prebuilt/linux-aarch64/include/fastresize.h +189 -0
  13. data/bindings/ruby/prebuilt/linux-aarch64/include/stb_image.h +7988 -0
  14. data/bindings/ruby/prebuilt/linux-aarch64/include/stb_image_resize2.h +10651 -0
  15. data/bindings/ruby/prebuilt/linux-aarch64/include/stb_image_write.h +1724 -0
  16. data/bindings/ruby/prebuilt/linux-aarch64/lib/libfastresize.a +0 -0
  17. data/bindings/ruby/prebuilt/linux-aarch64.tar.gz +0 -0
  18. data/bindings/ruby/prebuilt/linux-x86_64/bin/fast_resize +0 -0
  19. data/bindings/ruby/prebuilt/linux-x86_64/include/fastresize.h +189 -0
  20. data/bindings/ruby/prebuilt/linux-x86_64/include/stb_image.h +7988 -0
  21. data/bindings/ruby/prebuilt/linux-x86_64/include/stb_image_resize2.h +10651 -0
  22. data/bindings/ruby/prebuilt/linux-x86_64/include/stb_image_write.h +1724 -0
  23. data/bindings/ruby/prebuilt/linux-x86_64/lib/libfastresize.a +0 -0
  24. data/bindings/ruby/prebuilt/linux-x86_64.tar.gz +0 -0
  25. data/bindings/ruby/prebuilt/macos-arm64/bin/fast_resize +0 -0
  26. data/bindings/ruby/prebuilt/macos-arm64/include/fastresize.h +189 -0
  27. data/bindings/ruby/prebuilt/macos-arm64/include/stb_image.h +7988 -0
  28. data/bindings/ruby/prebuilt/macos-arm64/include/stb_image_resize2.h +10651 -0
  29. data/bindings/ruby/prebuilt/macos-arm64/include/stb_image_write.h +1724 -0
  30. data/bindings/ruby/prebuilt/macos-arm64/lib/libfastresize.a +0 -0
  31. data/bindings/ruby/prebuilt/macos-arm64.tar.gz +0 -0
  32. data/bindings/ruby/prebuilt/macos-x86_64/bin/fast_resize +0 -0
  33. data/bindings/ruby/prebuilt/macos-x86_64/include/fastresize.h +189 -0
  34. data/bindings/ruby/prebuilt/macos-x86_64/include/stb_image.h +7988 -0
  35. data/bindings/ruby/prebuilt/macos-x86_64/include/stb_image_resize2.h +10651 -0
  36. data/bindings/ruby/prebuilt/macos-x86_64/include/stb_image_write.h +1724 -0
  37. data/bindings/ruby/prebuilt/macos-x86_64/lib/libfastresize.a +0 -0
  38. data/bindings/ruby/prebuilt/macos-x86_64.tar.gz +0 -0
  39. data/include/fastresize.h +189 -0
  40. data/include/stb_image.h +7988 -0
  41. data/include/stb_image_resize2.h +10651 -0
  42. data/include/stb_image_write.h +1724 -0
  43. data/src/cli.cpp +540 -0
  44. data/src/decoder.cpp +647 -0
  45. data/src/encoder.cpp +376 -0
  46. data/src/fastresize.cpp +445 -0
  47. data/src/internal.h +108 -0
  48. data/src/pipeline.cpp +284 -0
  49. data/src/pipeline.h +175 -0
  50. data/src/resizer.cpp +199 -0
  51. data/src/simd_resize.cpp +384 -0
  52. data/src/simd_resize.h +72 -0
  53. data/src/simd_utils.h +127 -0
  54. data/src/thread_pool.cpp +232 -0
  55. metadata +158 -0
data/src/simd_resize.cpp ADDED
@@ -0,0 +1,384 @@
+ /*
+  * FastResize - The Fastest Image Resizing Library On The Planet
+  * Copyright (C) 2025 Tran Huu Canh (0xTh3OKrypt) and FastResize Contributors
+  *
+  * Resize 1,000 images in 2 seconds. Up to 2.9x faster than libvips,
+  * 3.1x faster than imageflow. Uses 3-4x less RAM than alternatives.
+  *
+  * Author: Tran Huu Canh (0xTh3OKrypt)
+  * Email: tranhuucanh39@gmail.com
+  * Homepage: https://github.com/tranhuucanh/fast_resize
+  *
+  * BSD 3-Clause License
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *
+  * 1. Redistributions of source code must retain the above copyright notice,
+  *    this list of conditions and the following disclaimer.
+  *
+  * 2. Redistributions in binary form must reproduce the above copyright notice,
+  *    this list of conditions and the following disclaimer in the documentation
+  *    and/or other materials provided with the distribution.
+  *
+  * 3. Neither the name of the copyright holder nor the names of its
+  *    contributors may be used to endorse or promote products derived from
+  *    this software without specific prior written permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  * THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ 
+ #include "simd_resize.h"
+ #include <cstring>
+ #include <cmath>
+ #include <algorithm>
+ 
+ #if defined(__ARM_NEON) || defined(__aarch64__)
+ #define USE_NEON 1
+ #include <arm_neon.h>
+ #elif defined(__AVX2__)
+ #define USE_AVX2 1
+ #include <immintrin.h>
+ #elif defined(__SSE2__) || defined(__x86_64__) || defined(_M_X64)
+ #define USE_SSE2 1
+ #include <emmintrin.h>
+ #endif
+ 
+ namespace fastresize {
+ namespace internal {
+ 
+ static void resize_bilinear_scalar(
+     const uint8_t* src, int src_w, int src_h, int channels,
+     uint8_t* dst, int dst_w, int dst_h
+ ) {
+     const int FRAC_BITS = 16;
+     const int FRAC_ONE = 1 << FRAC_BITS;
+ 
+     // 16.16 fixed-point stepping (assumes source dimensions < 32768).
+     int x_ratio = ((src_w - 1) << FRAC_BITS) / dst_w;
+     int y_ratio = ((src_h - 1) << FRAC_BITS) / dst_h;
+ 
+     int src_stride = src_w * channels;
+     int dst_stride = dst_w * channels;
+ 
+     for (int y = 0; y < dst_h; y++) {
+         int src_y_fp = y * y_ratio;
+         int y1 = src_y_fp >> FRAC_BITS;
+         int y2 = std::min(y1 + 1, src_h - 1);
+         int y_frac = src_y_fp & (FRAC_ONE - 1);
+         int y_frac_inv = FRAC_ONE - y_frac;
+ 
+         const uint8_t* row1 = src + y1 * src_stride;
+         const uint8_t* row2 = src + y2 * src_stride;
+         uint8_t* out_row = dst + y * dst_stride;
+ 
+         for (int x = 0; x < dst_w; x++) {
+             int src_x_fp = x * x_ratio;
+             int x1 = src_x_fp >> FRAC_BITS;
+             int x2 = std::min(x1 + 1, src_w - 1);
+             int x_frac = src_x_fp & (FRAC_ONE - 1);
+             int x_frac_inv = FRAC_ONE - x_frac;
+ 
+             const uint8_t* p1 = row1 + x1 * channels;
+             const uint8_t* p2 = row1 + x2 * channels;
+             const uint8_t* p3 = row2 + x1 * channels;
+             const uint8_t* p4 = row2 + x2 * channels;
+ 
+             // Widen before multiplying: x_frac_inv and y_frac_inv can both
+             // reach 65536, so a plain 32-bit product overflows at
+             // grid-aligned pixels.
+             int w1 = (int)(((int64_t)x_frac_inv * y_frac_inv) >> FRAC_BITS);
+             int w2 = (int)(((int64_t)x_frac * y_frac_inv) >> FRAC_BITS);
+             int w3 = (int)(((int64_t)x_frac_inv * y_frac) >> FRAC_BITS);
+             int w4 = (int)(((int64_t)x_frac * y_frac) >> FRAC_BITS);
+ 
+             uint8_t* out = out_row + x * channels;
+ 
+             for (int c = 0; c < channels; c++) {
+                 int val = (p1[c] * w1 + p2[c] * w2 + p3[c] * w3 + p4[c] * w4) >> FRAC_BITS;
+                 out[c] = (uint8_t)std::min(val, 255);
+             }
+         }
+     }
+ }
+ 
+ #ifdef USE_NEON
+ 
+ static void resize_bilinear_neon_rgba(
+     const uint8_t* __restrict src, int src_w, int src_h,
+     uint8_t* __restrict dst, int dst_w, int dst_h,
+     int channels
+ ) {
+     const int FRAC_BITS = 8;
+     const int FRAC_ONE = 1 << FRAC_BITS;
+ 
+     int x_ratio_fp = ((src_w - 1) << 16) / dst_w;
+     int y_ratio_fp = ((src_h - 1) << 16) / dst_h;
+ 
+     int src_stride = src_w * channels;
+     int dst_stride = dst_w * channels;
+ 
+     for (int y = 0; y < dst_h; y++) {
+         int src_y_fp = (y * y_ratio_fp) >> 8;
+         int y1 = src_y_fp >> FRAC_BITS;
+         int y2 = std::min(y1 + 1, src_h - 1);
+         int y_frac = src_y_fp & (FRAC_ONE - 1);
+ 
+         // Unsigned 16-bit lanes: the worst-case product 255 * 256 = 65280
+         // fits in uint16 but would overflow signed int16 lanes.
+         uint16x8_t wy2_vec = vdupq_n_u16((uint16_t)y_frac);
+         uint16x8_t wy1_vec = vdupq_n_u16((uint16_t)(FRAC_ONE - y_frac));
+ 
+         const uint8_t* row1 = src + y1 * src_stride;
+         const uint8_t* row2 = src + y2 * src_stride;
+         uint8_t* out_row = dst + y * dst_stride;
+ 
+         int x = 0;
+ 
+         for (; x + 4 <= dst_w; x += 4) {
+             int src_x_fp[4];
+             int x1_arr[4], x2_arr[4], x_frac_arr[4];
+ 
+             for (int i = 0; i < 4; i++) {
+                 src_x_fp[i] = ((x + i) * x_ratio_fp) >> 8;
+                 x1_arr[i] = src_x_fp[i] >> FRAC_BITS;
+                 x2_arr[i] = std::min(x1_arr[i] + 1, src_w - 1);
+                 x_frac_arr[i] = src_x_fp[i] & (FRAC_ONE - 1);
+             }
+ 
+             if (channels == 4) {
+                 for (int i = 0; i < 4; i++) {
+                     // Load exactly one 4-byte RGBA pixel into the low half of
+                     // each vector; an 8-byte vld1_u8 would read past the end
+                     // of the buffer at the bottom-right pixel.
+                     uint8x8_t tl = vreinterpret_u8_u32(vld1_dup_u32((const uint32_t*)(row1 + x1_arr[i] * 4)));
+                     uint8x8_t tr = vreinterpret_u8_u32(vld1_dup_u32((const uint32_t*)(row1 + x2_arr[i] * 4)));
+                     uint8x8_t bl = vreinterpret_u8_u32(vld1_dup_u32((const uint32_t*)(row2 + x1_arr[i] * 4)));
+                     uint8x8_t br = vreinterpret_u8_u32(vld1_dup_u32((const uint32_t*)(row2 + x2_arr[i] * 4)));
+ 
+                     uint16x8_t tl16 = vmovl_u8(tl);
+                     uint16x8_t tr16 = vmovl_u8(tr);
+                     uint16x8_t bl16 = vmovl_u8(bl);
+                     uint16x8_t br16 = vmovl_u8(br);
+ 
+                     uint16_t wx2 = (uint16_t)x_frac_arr[i];
+                     uint16_t wx1 = (uint16_t)(FRAC_ONE - wx2);
+                     uint16x8_t wx1_vec = vdupq_n_u16(wx1);
+                     uint16x8_t wx2_vec = vdupq_n_u16(wx2);
+ 
+                     uint16x8_t top = vaddq_u16(
+                         vmulq_u16(tl16, wx1_vec),
+                         vmulq_u16(tr16, wx2_vec)
+                     );
+                     uint16x8_t bottom = vaddq_u16(
+                         vmulq_u16(bl16, wx1_vec),
+                         vmulq_u16(br16, wx2_vec)
+                     );
+ 
+                     top = vshrq_n_u16(top, FRAC_BITS);
+                     bottom = vshrq_n_u16(bottom, FRAC_BITS);
+ 
+                     uint16x8_t result = vaddq_u16(
+                         vmulq_u16(top, wy1_vec),
+                         vmulq_u16(bottom, wy2_vec)
+                     );
+                     result = vshrq_n_u16(result, FRAC_BITS);
+ 
+                     uint8x8_t result8 = vqmovn_u16(result);
+ 
+                     vst1_lane_u32((uint32_t*)(out_row + (x + i) * 4),
+                                   vreinterpret_u32_u8(result8), 0);
+                 }
+             } else if (channels == 3) {
+                 for (int i = 0; i < 4; i++) {
+                     const uint8_t* p_tl = row1 + x1_arr[i] * 3;
+                     const uint8_t* p_tr = row1 + x2_arr[i] * 3;
+                     const uint8_t* p_bl = row2 + x1_arr[i] * 3;
+                     const uint8_t* p_br = row2 + x2_arr[i] * 3;
+ 
+                     int16_t wx2 = x_frac_arr[i];
+                     int16_t wx1 = FRAC_ONE - wx2;
+                     int16_t wy1 = FRAC_ONE - y_frac;
+                     int16_t wy2 = y_frac;
+ 
+                     uint8_t* out = out_row + (x + i) * 3;
+ 
+                     for (int c = 0; c < 3; c++) {
+                         int top = (p_tl[c] * wx1 + p_tr[c] * wx2) >> FRAC_BITS;
+                         int bottom = (p_bl[c] * wx1 + p_br[c] * wx2) >> FRAC_BITS;
+                         int val = (top * wy1 + bottom * wy2) >> FRAC_BITS;
+                         out[c] = (uint8_t)std::min(std::max(val, 0), 255);
+                     }
+                 }
+             }
+         }
+ 
+         for (; x < dst_w; x++) {
+             int src_x_fp = (x * x_ratio_fp) >> 8;
+             int x1 = src_x_fp >> FRAC_BITS;
+             int x2 = std::min(x1 + 1, src_w - 1);
+             int x_frac = src_x_fp & (FRAC_ONE - 1);
+             int x_frac_inv = FRAC_ONE - x_frac;
+             int y_frac_inv = FRAC_ONE - y_frac;
+ 
+             const uint8_t* p1 = row1 + x1 * channels;
+             const uint8_t* p2 = row1 + x2 * channels;
+             const uint8_t* p3 = row2 + x1 * channels;
+             const uint8_t* p4 = row2 + x2 * channels;
+ 
+             int w1 = (x_frac_inv * y_frac_inv) >> FRAC_BITS;
+             int w2 = (x_frac * y_frac_inv) >> FRAC_BITS;
+             int w3 = (x_frac_inv * y_frac) >> FRAC_BITS;
+             int w4 = (x_frac * y_frac) >> FRAC_BITS;
+ 
+             uint8_t* out = out_row + x * channels;
+ 
+             for (int c = 0; c < channels; c++) {
+                 int val = (p1[c] * w1 + p2[c] * w2 + p3[c] * w3 + p4[c] * w4) >> FRAC_BITS;
+                 out[c] = (uint8_t)std::min(val, 255);
+             }
+         }
+     }
+ }
+ 
+ static void resize_area_neon(
+     const uint8_t* __restrict src, int src_w, int src_h,
+     uint8_t* __restrict dst, int dst_w, int dst_h,
+     int channels
+ ) {
+     float x_scale = (float)src_w / dst_w;
+     float y_scale = (float)src_h / dst_h;
+ 
+     int src_stride = src_w * channels;
+     int dst_stride = dst_w * channels;
+ 
+     for (int dy = 0; dy < dst_h; dy++) {
+         int sy_start = (int)(dy * y_scale);
+         int sy_end = std::min((int)((dy + 1) * y_scale), src_h);
+         int y_count = sy_end - sy_start;
+         if (y_count < 1) y_count = 1;
+ 
+         uint8_t* out_row = dst + dy * dst_stride;
+ 
+         for (int dx = 0; dx < dst_w; dx++) {
+             int sx_start = (int)(dx * x_scale);
+             int sx_end = std::min((int)((dx + 1) * x_scale), src_w);
+             int x_count = sx_end - sx_start;
+             if (x_count < 1) x_count = 1;
+ 
+             int pixel_count = x_count * y_count;
+ 
+             if (channels == 4) {
+                 uint32x4_t sum = vdupq_n_u32(0);
+ 
+                 for (int sy = sy_start; sy < sy_end; sy++) {
+                     const uint8_t* src_row = src + sy * src_stride + sx_start * 4;
+ 
+                     int sx = 0;
+                     for (; sx + 4 <= x_count; sx += 4) {
+                         uint8x16_t pixels = vld1q_u8(src_row + sx * 4);
+ 
+                         uint16x8_t lo = vmovl_u8(vget_low_u8(pixels));
+                         uint16x8_t hi = vmovl_u8(vget_high_u8(pixels));
+ 
+                         sum = vaddq_u32(sum, vmovl_u16(vget_low_u16(lo)));
+                         sum = vaddq_u32(sum, vmovl_u16(vget_high_u16(lo)));
+                         sum = vaddq_u32(sum, vmovl_u16(vget_low_u16(hi)));
+                         sum = vaddq_u32(sum, vmovl_u16(vget_high_u16(hi)));
+                     }
+ 
+                     for (; sx < x_count; sx++) {
+                         const uint8_t* p = src_row + sx * 4;
+                         uint32_t vals[4] = {p[0], p[1], p[2], p[3]};
+                         sum = vaddq_u32(sum, vld1q_u32(vals));
+                     }
+                 }
+ 
+                 uint32_t sums[4];
+                 vst1q_u32(sums, sum);
+ 
+                 uint8_t* out = out_row + dx * 4;
+                 out[0] = sums[0] / pixel_count;
+                 out[1] = sums[1] / pixel_count;
+                 out[2] = sums[2] / pixel_count;
+                 out[3] = sums[3] / pixel_count;
+ 
+             } else if (channels == 3) {
+                 uint32_t sum_r = 0, sum_g = 0, sum_b = 0;
+ 
+                 for (int sy = sy_start; sy < sy_end; sy++) {
+                     const uint8_t* src_row = src + sy * src_stride + sx_start * 3;
+                     for (int sx = 0; sx < x_count; sx++) {
+                         const uint8_t* p = src_row + sx * 3;
+                         sum_r += p[0];
+                         sum_g += p[1];
+                         sum_b += p[2];
+                     }
+                 }
+ 
+                 uint8_t* out = out_row + dx * 3;
+                 out[0] = sum_r / pixel_count;
+                 out[1] = sum_g / pixel_count;
+                 out[2] = sum_b / pixel_count;
+ 
+             } else {
+                 uint32_t sums[4] = {0, 0, 0, 0};
+ 
+                 for (int sy = sy_start; sy < sy_end; sy++) {
+                     const uint8_t* src_row = src + sy * src_stride + sx_start * channels;
+                     for (int sx = 0; sx < x_count; sx++) {
+                         const uint8_t* p = src_row + sx * channels;
+                         for (int c = 0; c < channels; c++) {
+                             sums[c] += p[c];
+                         }
+                     }
+                 }
+ 
+                 uint8_t* out = out_row + dx * channels;
+                 for (int c = 0; c < channels; c++) {
+                     out[c] = sums[c] / pixel_count;
+                 }
+             }
+         }
+     }
+ }
+ 
+ #endif
+ 
+ bool simd_resize(
+     const uint8_t* src, int src_w, int src_h, int channels,
+     uint8_t* dst, int dst_w, int dst_h,
+     ResizeQuality quality
+ ) {
+     if (!src || !dst || src_w <= 0 || src_h <= 0 ||
+         dst_w <= 0 || dst_h <= 0 || channels < 1 || channels > 4) {
+         return false;
+     }
+ 
+     if (quality == ResizeQuality::BEST) {
+         return false;
+     }
+ 
+     float x_scale = (float)src_w / dst_w;
+     float y_scale = (float)src_h / dst_h;
+     float max_scale = std::max(x_scale, y_scale);
+ 
+ #ifdef USE_NEON
+     if (max_scale > 3.0f && (channels == 3 || channels == 4)) {
+         resize_area_neon(src, src_w, src_h, dst, dst_w, dst_h, channels);
+         return true;
+     }
+ 
+     if (channels == 3 || channels == 4) {
+         resize_bilinear_neon_rgba(src, src_w, src_h, dst, dst_w, dst_h, channels);
+         return true;
+     }
+ #endif
+ 
+     resize_bilinear_scalar(src, src_w, src_h, channels, dst, dst_w, dst_h);
+     return true;
+ }
+ 
+ }
+ }
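
Note on the dispatch above: simd_resize validates its arguments (non-null buffers, positive dimensions, 1-4 channels) and deliberately returns false for ResizeQuality::BEST, leaving the highest-quality path to the caller; the bundled stb_image_resize2.h is presumably that fallback. Only NEON kernels exist in this file: USE_AVX2 and USE_SSE2 are defined but never referenced again, so x86 builds always take the scalar bilinear path. A minimal caller-side sketch, assuming only the declarations in simd_resize.h (halve_rgba and its buffers are illustrative, not part of the package):

#include <cstdint>
#include <vector>
#include "simd_resize.h"

// Halve an interleaved 8-bit RGBA image. Returns an empty vector when
// simd_resize declines the job (bad arguments or BEST quality), so the
// caller can route the work to a high-quality resizer instead.
std::vector<uint8_t> halve_rgba(const std::vector<uint8_t>& src,
                                int src_w, int src_h) {
    using namespace fastresize::internal;
    const int channels = 4;
    const int dst_w = src_w / 2;
    const int dst_h = src_h / 2;

    std::vector<uint8_t> dst(simd_resize_buffer_size(dst_w, dst_h, channels));
    if (!simd_resize(src.data(), src_w, src_h, channels,
                     dst.data(), dst_w, dst_h, ResizeQuality::FAST)) {
        dst.clear();  // e.g. fall back to stb_image_resize2 here
    }
    return dst;
}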
data/src/simd_resize.h ADDED
@@ -0,0 +1,72 @@
+ /*
+  * FastResize - The Fastest Image Resizing Library On The Planet
+  * Copyright (C) 2025 Tran Huu Canh (0xTh3OKrypt) and FastResize Contributors
+  *
+  * Resize 1,000 images in 2 seconds. Up to 2.9x faster than libvips,
+  * 3.1x faster than imageflow. Uses 3-4x less RAM than alternatives.
+  *
+  * Author: Tran Huu Canh (0xTh3OKrypt)
+  * Email: tranhuucanh39@gmail.com
+  * Homepage: https://github.com/tranhuucanh/fast_resize
+  *
+  * BSD 3-Clause License
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *
+  * 1. Redistributions of source code must retain the above copyright notice,
+  *    this list of conditions and the following disclaimer.
+  *
+  * 2. Redistributions in binary form must reproduce the above copyright notice,
+  *    this list of conditions and the following disclaimer in the documentation
+  *    and/or other materials provided with the distribution.
+  *
+  * 3. Neither the name of the copyright holder nor the names of its
+  *    contributors may be used to endorse or promote products derived from
+  *    this software without specific prior written permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  * THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ 
+ #ifndef FASTRESIZE_SIMD_RESIZE_H
+ #define FASTRESIZE_SIMD_RESIZE_H
+ 
+ #include <cstddef>
+ #include <cstdint>
+ 
+ namespace fastresize {
+ namespace internal {
+ 
+ enum class ResizeQuality {
+     FAST,
+     GOOD,
+     BEST
+ };
+ 
+ bool simd_resize(
+     const uint8_t* src,
+     int src_w, int src_h,
+     int channels,
+     uint8_t* dst,
+     int dst_w, int dst_h,
+     ResizeQuality quality = ResizeQuality::FAST
+ );
+ 
+ inline size_t simd_resize_buffer_size(int dst_w, int dst_h, int channels) {
+     return static_cast<size_t>(dst_w) * dst_h * channels;
+ }
+ 
+ }
+ }
+ 
+ #endif
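
One detail worth calling out in this header: simd_resize_buffer_size widens dst_w to size_t before multiplying, so the byte count for large outputs cannot wrap on 64-bit targets the way a 32-bit product would. A small self-contained illustration (the dimensions are deliberately oversized and hypothetical):

#include <cstdio>
#include "simd_resize.h"

int main() {
    // A 40000 x 40000 RGBA output needs 6.4 GB; a 32-bit product wraps
    // modulo 2^32 and silently reports about 2.1 GB instead.
    unsigned int naive = 40000u * 40000u * 4u;
    size_t exact = fastresize::internal::simd_resize_buffer_size(40000, 40000, 4);
    std::printf("naive=%u exact=%zu\n", naive, exact);
    return 0;
}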
data/src/simd_utils.h ADDED
@@ -0,0 +1,127 @@
+ /*
+  * FastResize - The Fastest Image Resizing Library On The Planet
+  * Copyright (C) 2025 Tran Huu Canh (0xTh3OKrypt) and FastResize Contributors
+  *
+  * Resize 1,000 images in 2 seconds. Up to 2.9x faster than libvips,
+  * 3.1x faster than imageflow. Uses 3-4x less RAM than alternatives.
+  *
+  * Author: Tran Huu Canh (0xTh3OKrypt)
+  * Email: tranhuucanh39@gmail.com
+  * Homepage: https://github.com/tranhuucanh/fast_resize
+  *
+  * BSD 3-Clause License
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions are met:
+  *
+  * 1. Redistributions of source code must retain the above copyright notice,
+  *    this list of conditions and the following disclaimer.
+  *
+  * 2. Redistributions in binary form must reproduce the above copyright notice,
+  *    this list of conditions and the following disclaimer in the documentation
+  *    and/or other materials provided with the distribution.
+  *
+  * 3. Neither the name of the copyright holder nor the names of its
+  *    contributors may be used to endorse or promote products derived from
+  *    this software without specific prior written permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  * THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ 
+ #pragma once
+ 
+ #include <cstddef>
+ #include <cstdint>
+ #include <cstring>
+ 
+ #ifdef __AVX2__
+ #include <immintrin.h>
+ #endif
+ 
+ #ifdef __ARM_NEON
+ #include <arm_neon.h>
+ #endif
+ 
+ namespace fastresize {
+ namespace internal {
+ 
+ inline void fast_copy_aligned(void* dst, const void* src, size_t size) {
+ #ifdef __AVX2__
+     uint8_t* d = (uint8_t*)dst;
+     const uint8_t* s = (const uint8_t*)src;
+     size_t i = 0;
+ 
+     for (; i + 32 <= size; i += 32) {
+         __m256i data = _mm256_loadu_si256((const __m256i*)(s + i));
+         _mm256_storeu_si256((__m256i*)(d + i), data);
+     }
+ 
+     for (; i < size; i++) {
+         d[i] = s[i];
+     }
+ #elif defined(__ARM_NEON)
+     uint8_t* d = (uint8_t*)dst;
+     const uint8_t* s = (const uint8_t*)src;
+     size_t i = 0;
+ 
+     for (; i + 16 <= size; i += 16) {
+         uint8x16_t data = vld1q_u8(s + i);
+         vst1q_u8(d + i, data);
+     }
+ 
+     for (; i < size; i++) {
+         d[i] = s[i];
+     }
+ #else
+     memcpy(dst, src, size);
+ #endif
+ }
+ 
+ inline void fast_copy_pixels(unsigned char* dst, const unsigned char* src,
+                              int width, int height, int channels) {
+     size_t total_bytes = static_cast<size_t>(width) * height * channels;
+     fast_copy_aligned(dst, src, total_bytes);
+ }
+ 
+ inline void fast_zero(void* dst, size_t size) {
+ #ifdef __AVX2__
+     uint8_t* d = (uint8_t*)dst;
+     size_t i = 0;
+     __m256i zero = _mm256_setzero_si256();
+ 
+     for (; i + 32 <= size; i += 32) {
+         _mm256_storeu_si256((__m256i*)(d + i), zero);
+     }
+ 
+     for (; i < size; i++) {
+         d[i] = 0;
+     }
+ #elif defined(__ARM_NEON)
+     uint8_t* d = (uint8_t*)dst;
+     size_t i = 0;
+     uint8x16_t zero = vdupq_n_u8(0);
+ 
+     for (; i + 16 <= size; i += 16) {
+         vst1q_u8(d + i, zero);
+     }
+ 
+     for (; i < size; i++) {
+         d[i] = 0;
+     }
+ #else
+     memset(dst, 0, size);
+ #endif
+ }
+ 
+ }
+ }
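
Despite the _aligned suffix, fast_copy_aligned imposes no alignment requirement: the AVX2 path uses the unaligned _mm256_loadu_si256/_mm256_storeu_si256 intrinsics, NEON vld1q_u8/vst1q_u8 accept arbitrary addresses, and a scalar loop covers the tail bytes in every case. Both helpers are therefore drop-in memcpy/memset replacements for any byte buffer. A minimal usage sketch (the frame dimensions and fill value are placeholders):

#include <vector>
#include "simd_utils.h"

int main() {
    using namespace fastresize::internal;

    // A 3-channel 640 x 480 placeholder image: zero it, then duplicate it.
    std::vector<unsigned char> frame(640 * 480 * 3, 0x7f);
    std::vector<unsigned char> copy(frame.size());

    fast_zero(frame.data(), frame.size());                     // SIMD memset(0)
    fast_copy_pixels(copy.data(), frame.data(), 640, 480, 3);  // SIMD memcpy
    return 0;
}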