image_pack 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +20 -0
  3. data/README.md +75 -16
  4. data/ext/image_pack/extconf.rb +41 -126
  5. data/ext/image_pack/image_pack.c +1151 -596
  6. data/ext/image_pack/mozjpeg_sources.rb +178 -0
  7. data/ext/image_pack/vendor/mozjpeg/BUILDING.md +744 -0
  8. data/ext/image_pack/vendor/mozjpeg/CODE_OF_CONDUCT.md +15 -0
  9. data/ext/image_pack/vendor/mozjpeg/ChangeLog.md +1996 -0
  10. data/lib/image_pack/configuration.rb +54 -8
  11. data/lib/image_pack/version.rb +1 -1
  12. data/lib/image_pack.rb +124 -41
  13. metadata +13 -79
  14. data/ext/image_pack/vendor/mozjpeg/README.ijg +0 -258
  15. data/ext/image_pack/vendor/mozjpeg/cdjpeg.c +0 -156
  16. data/ext/image_pack/vendor/mozjpeg/cjpeg.c +0 -961
  17. data/ext/image_pack/vendor/mozjpeg/djpeg.c +0 -855
  18. data/ext/image_pack/vendor/mozjpeg/jaricom.c +0 -157
  19. data/ext/image_pack/vendor/mozjpeg/jcarith.c +0 -972
  20. data/ext/image_pack/vendor/mozjpeg/jcstest.c +0 -126
  21. data/ext/image_pack/vendor/mozjpeg/jdarith.c +0 -782
  22. data/ext/image_pack/vendor/mozjpeg/jdatadst-tj.c +0 -198
  23. data/ext/image_pack/vendor/mozjpeg/jdatasrc-tj.c +0 -194
  24. data/ext/image_pack/vendor/mozjpeg/jpegtran.c +0 -827
  25. data/ext/image_pack/vendor/mozjpeg/jpegyuv.c +0 -172
  26. data/ext/image_pack/vendor/mozjpeg/rdbmp.c +0 -690
  27. data/ext/image_pack/vendor/mozjpeg/rdcolmap.c +0 -253
  28. data/ext/image_pack/vendor/mozjpeg/rdgif.c +0 -720
  29. data/ext/image_pack/vendor/mozjpeg/rdjpeg.c +0 -160
  30. data/ext/image_pack/vendor/mozjpeg/rdjpgcom.c +0 -494
  31. data/ext/image_pack/vendor/mozjpeg/rdpng.c +0 -194
  32. data/ext/image_pack/vendor/mozjpeg/rdppm.c +0 -781
  33. data/ext/image_pack/vendor/mozjpeg/rdswitch.c +0 -642
  34. data/ext/image_pack/vendor/mozjpeg/rdtarga.c +0 -508
  35. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jccolext-neon.c +0 -148
  36. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jchuff-neon.c +0 -334
  37. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd.c +0 -976
  38. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimd.c +0 -1312
  39. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd.c +0 -1143
  40. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolext-mmi.c +0 -455
  41. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolor-mmi.c +0 -148
  42. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgray-mmi.c +0 -132
  43. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgryext-mmi.c +0 -374
  44. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample-mmi.c +0 -98
  45. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolext-mmi.c +0 -415
  46. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolor-mmi.c +0 -139
  47. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmerge-mmi.c +0 -149
  48. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmrgext-mmi.c +0 -615
  49. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdsample-mmi.c +0 -304
  50. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctfst-mmi.c +0 -255
  51. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctint-mmi.c +0 -398
  52. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctfst-mmi.c +0 -395
  53. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctint-mmi.c +0 -571
  54. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jquanti-mmi.c +0 -124
  55. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd.c +0 -866
  56. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolext-altivec.c +0 -269
  57. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolor-altivec.c +0 -116
  58. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgray-altivec.c +0 -111
  59. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgryext-altivec.c +0 -228
  60. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample-altivec.c +0 -159
  61. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolext-altivec.c +0 -276
  62. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolor-altivec.c +0 -106
  63. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmerge-altivec.c +0 -130
  64. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmrgext-altivec.c +0 -329
  65. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdsample-altivec.c +0 -400
  66. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctfst-altivec.c +0 -154
  67. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctint-altivec.c +0 -258
  68. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctfst-altivec.c +0 -255
  69. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctint-altivec.c +0 -357
  70. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jquanti-altivec.c +0 -250
  71. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd.c +0 -884
  72. data/ext/image_pack/vendor/mozjpeg/strtest.c +0 -170
  73. data/ext/image_pack/vendor/mozjpeg/tjbench.c +0 -1044
  74. data/ext/image_pack/vendor/mozjpeg/tjexample.c +0 -406
  75. data/ext/image_pack/vendor/mozjpeg/tjunittest.c +0 -961
  76. data/ext/image_pack/vendor/mozjpeg/tjutil.c +0 -70
  77. data/ext/image_pack/vendor/mozjpeg/transupp.c +0 -2373
  78. data/ext/image_pack/vendor/mozjpeg/turbojpeg-jni.c +0 -1259
  79. data/ext/image_pack/vendor/mozjpeg/turbojpeg.c +0 -2320
  80. data/ext/image_pack/vendor/mozjpeg/wrbmp.c +0 -552
  81. data/ext/image_pack/vendor/mozjpeg/wrgif.c +0 -580
  82. data/ext/image_pack/vendor/mozjpeg/wrjpgcom.c +0 -577
  83. data/ext/image_pack/vendor/mozjpeg/wrppm.c +0 -366
  84. data/ext/image_pack/vendor/mozjpeg/wrtarga.c +0 -258
  85. data/ext/image_pack/vendor/mozjpeg/yuvjpeg.c +0 -268
  86. data/lib/image_pack/backend.rb +0 -8
@@ -1,615 +0,0 @@
1
- /*
2
- * Loongson MMI optimizations for libjpeg-turbo
3
- *
4
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5
- * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
6
- * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
7
- * All Rights Reserved.
8
- * Authors: ZhangLixia <zhanglixia-hf@loongson.cn>
9
- *
10
- * Based on the x86 SIMD extension for IJG JPEG library
11
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
12
- *
13
- * This software is provided 'as-is', without any express or implied
14
- * warranty. In no event will the authors be held liable for any damages
15
- * arising from the use of this software.
16
- *
17
- * Permission is granted to anyone to use this software for any purpose,
18
- * including commercial applications, and to alter it and redistribute it
19
- * freely, subject to the following restrictions:
20
- *
21
- * 1. The origin of this software must not be misrepresented; you must not
22
- * claim that you wrote the original software. If you use this software
23
- * in a product, an acknowledgment in the product documentation would be
24
- * appreciated but is not required.
25
- * 2. Altered source versions must be plainly marked as such, and must not be
26
- * misrepresented as being the original software.
27
- * 3. This notice may not be removed or altered from any source distribution.
28
- */
29
-
30
- /* This file is included by jdmerge-mmi.c */
31
-
32
-
33
/*
 * Channel aliases.  mmA/mmB name the even/odd-pixel registers of whichever
 * channel has byte offset 0 in the output pixel, mmC/mmD offset 1, mmE/mmF
 * offset 2 and mmG/mmH offset 3.  The RGB_RED / RGB_GREEN / RGB_BLUE offsets
 * select re/ro, ge/go or be/bo; any offset not claimed by a colour channel
 * falls through to the filler registers xe/xo.
 */
- #if RGB_RED == 0
34
- #define mmA re
35
- #define mmB ro
36
- #elif RGB_GREEN == 0
37
- #define mmA ge
38
- #define mmB go
39
- #elif RGB_BLUE == 0
40
- #define mmA be
41
- #define mmB bo
42
- #else
43
- #define mmA xe
44
- #define mmB xo
45
- #endif
46
-
47
/* Byte offset 1 of the output pixel. */
- #if RGB_RED == 1
48
- #define mmC re
49
- #define mmD ro
50
- #elif RGB_GREEN == 1
51
- #define mmC ge
52
- #define mmD go
53
- #elif RGB_BLUE == 1
54
- #define mmC be
55
- #define mmD bo
56
- #else
57
- #define mmC xe
58
- #define mmD xo
59
- #endif
60
-
61
/* Byte offset 2 of the output pixel. */
- #if RGB_RED == 2
62
- #define mmE re
63
- #define mmF ro
64
- #elif RGB_GREEN == 2
65
- #define mmE ge
66
- #define mmF go
67
- #elif RGB_BLUE == 2
68
- #define mmE be
69
- #define mmF bo
70
- #else
71
- #define mmE xe
72
- #define mmF xo
73
- #endif
74
-
75
/* Byte offset 3 of the output pixel (only meaningful for 4-byte pixels). */
- #if RGB_RED == 3
76
- #define mmG re
77
- #define mmH ro
78
- #elif RGB_GREEN == 3
79
- #define mmG ge
80
- #define mmH go
81
- #elif RGB_BLUE == 3
82
- #define mmG be
83
- #define mmH bo
84
- #else
85
- #define mmG xe
86
- #define mmH xo
87
- #endif
88
-
89
-
90
/*
 * Merged upsampling + YCbCr -> RGB conversion for one output row (h2v1).
 *
 * Reads one luma row (input_buf[0][in_row_group_ctr]) plus one row each of
 * Cb (input_buf[1][in_row_group_ctr]) and Cr (input_buf[2][in_row_group_ctr]).
 * Each chroma sample is combined with two horizontally adjacent luma samples.
 * The interleaved pixels (RGB_PIXELSIZE bytes each, channel order chosen by
 * the mmA..mmH aliases) are written to output_buf[0].
 *
 * output_width      number of output pixels in the row
 * input_buf         component planes: [0] = Y, [1] = Cb, [2] = Cr
 * in_row_group_ctr  row index applied to all three planes
 * output_buf        only output_buf[0] is written by this function
 *
 * NOTE(review): every loop iteration loads 8 chroma bytes and 16 luma bytes,
 * including a final partial iteration -- presumably the row buffers are
 * padded far enough for that; confirm against the caller.
 */
- void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
91
- JSAMPIMAGE input_buf,
92
- JDIMENSION in_row_group_ctr,
93
- JSAMPARRAY output_buf)
94
- {
95
- JSAMPROW outptr, inptr0, inptr1, inptr2;
96
- int num_cols, col;
97
- __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
98
- __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
99
- __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
100
- __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
101
- __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
102
- __m64 decenter, mask, zero = 0.0;
103
- #if RGB_PIXELSIZE == 4
104
- __m64 mm8, mm9;
105
- #endif
106
-
107
/* inptr0 = Y row, inptr1 = Cb row, inptr2 = Cr row, outptr = output row. */
- inptr0 = input_buf[0][in_row_group_ctr];
108
- inptr1 = input_buf[1][in_row_group_ctr];
109
- inptr2 = input_buf[2][in_row_group_ctr];
110
- outptr = output_buf[0];
111
-
112
/*
 * num_cols counts the chroma columns still to process (output_width / 2).
 * Each iteration consumes 8 Cb, 8 Cr and 16 Y samples.
 */
- for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
113
- inptr0 += 16, inptr1 += 8, inptr2 += 8) {
114
-
115
- cb = _mm_load_si64((__m64 *)inptr1);
116
- cr = _mm_load_si64((__m64 *)inptr2);
117
- ythis = _mm_load_si64((__m64 *)inptr0);
118
- ynext = _mm_load_si64((__m64 *)inptr0 + 1);
119
-
120
- mask = decenter = 0.0;
121
- mask = _mm_cmpeq_pi16(mask, mask);
122
- decenter = _mm_cmpeq_pi16(decenter, decenter);
123
- mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
124
- decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
125
-
126
- cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
127
- cbh = _mm_unpackhi_pi8(cb, zero); /* Cb(4567) */
128
- crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
129
- crh = _mm_unpackhi_pi8(cr, zero); /* Cr(4567) */
130
- cbl = _mm_add_pi16(cbl, decenter);
131
- cbh = _mm_add_pi16(cbh, decenter);
132
- crl = _mm_add_pi16(crl, decenter);
133
- crh = _mm_add_pi16(crh, decenter);
134
-
135
- /* (Original)
136
- * R = Y + 1.40200 * Cr
137
- * G = Y - 0.34414 * Cb - 0.71414 * Cr
138
- * B = Y + 1.77200 * Cb
139
- *
140
- * (This implementation)
141
- * R = Y + 0.40200 * Cr + Cr
142
- * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
143
- * B = Y - 0.22800 * Cb + Cb + Cb
144
- */
145
-
146
- cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
147
- cbh2 = _mm_add_pi16(cbh, cbh); /* 2*CbH */
148
- crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
149
- crh2 = _mm_add_pi16(crh, crh); /* 2*CrH */
150
-
151
- bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800)) */
152
- bh = _mm_mulhi_pi16(cbh2, PW_MF0228); /* (2*CbH * -FIX(0.22800)) */
153
- rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
154
- rh = _mm_mulhi_pi16(crh2, PW_F0402); /* (2*CrH * FIX(0.40200)) */
155
-
156
- bl = _mm_add_pi16(bl, PW_ONE);
157
- bh = _mm_add_pi16(bh, PW_ONE);
158
- bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
159
- bh = _mm_srai_pi16(bh, 1); /* (CbH * -FIX(0.22800)) */
160
- rl = _mm_add_pi16(rl, PW_ONE);
161
- rh = _mm_add_pi16(rh, PW_ONE);
162
- rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
163
- rh = _mm_srai_pi16(rh, 1); /* (CrH * FIX(0.40200)) */
164
-
165
- bl = _mm_add_pi16(bl, cbl);
166
- bh = _mm_add_pi16(bh, cbh);
167
- bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
168
- bh = _mm_add_pi16(bh, cbh); /* (CbH * FIX(1.77200))=(B-Y)H */
169
- rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
170
- rh = _mm_add_pi16(rh, crh); /* (CrH * FIX(1.40200))=(R-Y)H */
171
-
172
- ga = _mm_unpacklo_pi16(cbl, crl);
173
- gb = _mm_unpackhi_pi16(cbl, crl);
174
- ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
175
- gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
176
- gc = _mm_unpacklo_pi16(cbh, crh);
177
- gd = _mm_unpackhi_pi16(cbh, crh);
178
- gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
179
- gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
180
-
181
- ga = _mm_add_pi32(ga, PD_ONEHALF);
182
- gb = _mm_add_pi32(gb, PD_ONEHALF);
183
- ga = _mm_srai_pi32(ga, SCALEBITS);
184
- gb = _mm_srai_pi32(gb, SCALEBITS);
185
- gc = _mm_add_pi32(gc, PD_ONEHALF);
186
- gd = _mm_add_pi32(gd, PD_ONEHALF);
187
- gc = _mm_srai_pi32(gc, SCALEBITS);
188
- gd = _mm_srai_pi32(gd, SCALEBITS);
189
-
190
- gl = _mm_packs_pi32(ga, gb); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
191
- gh = _mm_packs_pi32(gc, gd); /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
192
- gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
193
- gh = _mm_sub_pi16(gh, crh); /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
194
-
195
/* Split luma into even and odd samples so each chroma value is used twice. */
- ythise = _mm_and_si64(mask, ythis); /* Y(0246) */
196
- ythiso = _mm_srli_pi16(ythis, BYTE_BIT); /* Y(1357) */
197
- ynexte = _mm_and_si64(mask, ynext); /* Y(8ACE) */
198
- ynexto = _mm_srli_pi16(ynext, BYTE_BIT); /* Y(9BDF) */
199
-
200
- rle = _mm_add_pi16(rl, ythise); /* (R0 R2 R4 R6) */
201
- rlo = _mm_add_pi16(rl, ythiso); /* (R1 R3 R5 R7) */
202
- rhe = _mm_add_pi16(rh, ynexte); /* (R8 RA RC RE) */
203
- rho = _mm_add_pi16(rh, ynexto); /* (R9 RB RD RF) */
204
- re = _mm_packs_pu16(rle, rhe); /* (R0 R2 R4 R6 R8 RA RC RE) */
205
- ro = _mm_packs_pu16(rlo, rho); /* (R1 R3 R5 R7 R9 RB RD RF) */
206
-
207
- gle = _mm_add_pi16(gl, ythise); /* (G0 G2 G4 G6) */
208
- glo = _mm_add_pi16(gl, ythiso); /* (G1 G3 G5 G7) */
209
- ghe = _mm_add_pi16(gh, ynexte); /* (G8 GA GC GE) */
210
- gho = _mm_add_pi16(gh, ynexto); /* (G9 GB GD GF) */
211
- ge = _mm_packs_pu16(gle, ghe); /* (G0 G2 G4 G6 G8 GA GC GE) */
212
- go = _mm_packs_pu16(glo, gho); /* (G1 G3 G5 G7 G9 GB GD GF) */
213
-
214
- ble = _mm_add_pi16(bl, ythise); /* (B0 B2 B4 B6) */
215
- blo = _mm_add_pi16(bl, ythiso); /* (B1 B3 B5 B7) */
216
- bhe = _mm_add_pi16(bh, ynexte); /* (B8 BA BC BE) */
217
- bho = _mm_add_pi16(bh, ynexto); /* (B9 BB BD BF) */
218
- be = _mm_packs_pu16(ble, bhe); /* (B0 B2 B4 B6 B8 BA BC BE) */
219
- bo = _mm_packs_pu16(blo, bho); /* (B1 B3 B5 B7 B9 BB BD BF) */
220
-
221
- #if RGB_PIXELSIZE == 3
222
-
223
- /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
224
- /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
225
- /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
226
- mmG = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
227
- mmA = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
228
- mmH = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
229
- mmE = _mm_unpackhi_pi8(mmE, mmB); /* (28 09 2A 0B 2C 0D 2E 0F) */
230
- mmC = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
231
- mmD = _mm_unpackhi_pi8(mmD, mmF); /* (19 29 1B 2B 1D 2D 1F 2F) */
232
-
233
- mmB = _mm_unpacklo_pi16(mmG, mmA); /* (00 10 08 18 02 12 0A 1A) */
234
- mmA = _mm_unpackhi_pi16(mmG, mmA); /* (04 14 0C 1C 06 16 0E 1E) */
235
- mmF = _mm_unpacklo_pi16(mmH, mmE); /* (20 01 28 09 22 03 2A 0B) */
236
- mmE = _mm_unpackhi_pi16(mmH, mmE); /* (24 05 2C 0D 26 07 2E 0F) */
237
- mmH = _mm_unpacklo_pi16(mmC, mmD); /* (11 21 19 29 13 23 1B 2B) */
238
- mmG = _mm_unpackhi_pi16(mmC, mmD); /* (15 25 1D 2D 17 27 1F 2F) */
239
-
240
- mmC = _mm_unpacklo_pi16(mmB, mmF); /* (00 10 20 01 08 18 28 09) */
241
- mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
242
- mmB = _mm_unpacklo_pi16(mmH, mmB); /* (11 21 02 12 19 29 0A 1A) */
243
- mmD = _mm_unpackhi_pi16(mmF, mmH); /* (22 03 13 23 2A 0B 1B 2B) */
244
- mmF = _mm_unpacklo_pi16(mmA, mmE); /* (04 14 24 05 0C 1C 2C 0D) */
245
- mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
246
- mmH = _mm_unpacklo_pi16(mmG, mmA); /* (15 25 06 16 1D 2D 0E 1E) */
247
- mmG = _mm_unpackhi_pi16(mmE, mmG); /* (26 07 17 27 2E 0F 1F 2F) */
248
-
249
- mmA = _mm_unpacklo_pi32(mmC, mmB); /* (00 10 20 01 11 21 02 12) */
250
- mmE = _mm_unpackhi_pi32(mmC, mmB); /* (08 18 28 09 19 29 0A 1A) */
251
- mmB = _mm_unpacklo_pi32(mmD, mmF); /* (22 03 13 23 04 14 24 05) */
252
- mmF = _mm_unpackhi_pi32(mmD, mmF); /* (2A 0B 1B 2B 0C 1C 2C 0D) */
253
- mmC = _mm_unpacklo_pi32(mmH, mmG); /* (15 25 06 16 26 07 17 27) */
254
- mmG = _mm_unpackhi_pi32(mmH, mmG); /* (1D 2D 0E 1E 2E 0F 1F 2F) */
255
-
256
/*
 * Full group of 16 pixels: write all six 8-byte registers (48 bytes),
 * choosing aligned or unaligned stores from the low 3 bits of outptr.
 */
- if (num_cols >= 8) {
257
- if (!(((long)outptr) & 7)) {
258
- _mm_store_si64((__m64 *)outptr, mmA);
259
- _mm_store_si64((__m64 *)(outptr + 8), mmB);
260
- _mm_store_si64((__m64 *)(outptr + 16), mmC);
261
- _mm_store_si64((__m64 *)(outptr + 24), mmE);
262
- _mm_store_si64((__m64 *)(outptr + 32), mmF);
263
- _mm_store_si64((__m64 *)(outptr + 40), mmG);
264
- } else {
265
- _mm_storeu_si64((__m64 *)outptr, mmA);
266
- _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
267
- _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
268
- _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
269
- _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
270
- _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
271
- }
272
- outptr += RGB_PIXELSIZE * 16;
273
- } else {
/*
 * Partial final group.  col is the number of BYTES left to write:
 * 6 per remaining chroma column (2 pixels x 3 bytes), plus 3 more for
 * the trailing pixel when output_width is odd.  The asm below stores
 * them in chunks of 24, 16, 8, 4, 2 and 1 bytes.
 */
274
- if (output_width & 1)
275
- col = num_cols * 6 + 3;
276
- else
277
- col = num_cols * 6;
278
-
279
- asm(".set noreorder\r\n" /* st24 */
280
-
281
- "li $8, 24\r\n"
282
- "move $9, %7\r\n"
283
- "mov.s $f4, %1\r\n"
284
- "mov.s $f6, %2\r\n"
285
- "mov.s $f8, %3\r\n"
286
- "move $10, %8\r\n"
287
- "bltu $9, $8, 1f\r\n"
288
- "nop \r\n"
289
- "gssdlc1 $f4, 7($10)\r\n"
290
- "gssdrc1 $f4, 0($10)\r\n"
291
- "gssdlc1 $f6, 7+8($10)\r\n"
292
- "gssdrc1 $f6, 8($10)\r\n"
293
- "gssdlc1 $f8, 7+16($10)\r\n"
294
- "gssdrc1 $f8, 16($10)\r\n"
295
- "mov.s $f4, %4\r\n"
296
- "mov.s $f6, %5\r\n"
297
- "mov.s $f8, %6\r\n"
298
- "subu $9, $9, 24\r\n"
299
- PTR_ADDU "$10, $10, 24\r\n"
300
-
301
- "1: \r\n"
302
- "li $8, 16\r\n" /* st16 */
303
- "bltu $9, $8, 2f\r\n"
304
- "nop \r\n"
305
- "gssdlc1 $f4, 7($10)\r\n"
306
- "gssdrc1 $f4, 0($10)\r\n"
307
- "gssdlc1 $f6, 7+8($10)\r\n"
308
- "gssdrc1 $f6, 8($10)\r\n"
309
- "mov.s $f4, $f8\r\n"
310
- "subu $9, $9, 16\r\n"
311
- PTR_ADDU "$10, $10, 16\r\n"
312
-
313
- "2: \r\n"
314
- "li $8, 8\r\n" /* st8 */
315
- "bltu $9, $8, 3f\r\n"
316
- "nop \r\n"
317
- "gssdlc1 $f4, 7($10)\r\n"
318
- "gssdrc1 $f4, 0($10)\r\n"
319
- "mov.s $f4, $f6\r\n"
320
- "subu $9, $9, 8\r\n"
321
- PTR_ADDU "$10, $10, 8\r\n"
322
-
323
- "3: \r\n"
324
- "li $8, 4\r\n" /* st4 */
325
- "mfc1 $11, $f4\r\n"
326
- "bltu $9, $8, 4f\r\n"
327
- "nop \r\n"
328
- "swl $11, 3($10)\r\n"
329
- "swr $11, 0($10)\r\n"
330
- "li $8, 32\r\n"
331
- "mtc1 $8, $f6\r\n"
332
- "dsrl $f4, $f4, $f6\r\n"
333
- "mfc1 $11, $f4\r\n"
334
- "subu $9, $9, 4\r\n"
335
- PTR_ADDU "$10, $10, 4\r\n"
336
-
337
- "4: \r\n"
338
- "li $8, 2\r\n" /* st2 */
339
- "bltu $9, $8, 5f\r\n"
340
- "nop \r\n"
341
- "ush $11, 0($10)\r\n"
342
- "srl $11, 16\r\n"
343
- "subu $9, $9, 2\r\n"
344
- PTR_ADDU "$10, $10, 2\r\n"
345
-
346
- "5: \r\n"
347
- "li $8, 1\r\n" /* st1 */
348
- "bltu $9, $8, 6f\r\n"
349
- "nop \r\n"
350
- "sb $11, 0($10)\r\n"
351
-
352
- "6: \r\n"
353
- "nop \r\n" /* end */
354
- : "=m" (*outptr)
355
- : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
356
- "f" (mmG), "r" (col), "r" (outptr)
357
- : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
358
- );
359
- }
360
-
361
- #else /* RGB_PIXELSIZE == 4 */
362
-
/* Fill the unused fourth channel with all ones or all zeros. */
363
- #ifdef RGBX_FILLER_0XFF
364
- xe = _mm_cmpeq_pi8(xe, xe);
365
- xo = _mm_cmpeq_pi8(xo, xo);
366
- #else
367
- xe = _mm_xor_si64(xe, xe);
368
- xo = _mm_xor_si64(xo, xo);
369
- #endif
370
- /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
371
- /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
372
- /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
373
- /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
374
-
375
- mm8 = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
376
- mm9 = _mm_unpackhi_pi8(mmA, mmC); /* (08 18 0A 1A 0C 1C 0E 1E) */
377
- mmA = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
378
- mmE = _mm_unpackhi_pi8(mmE, mmG); /* (28 38 2A 3A 2C 3C 2E 3E) */
379
-
380
- mmG = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
381
- mmB = _mm_unpackhi_pi8(mmB, mmD); /* (09 19 0B 1B 0D 1D 0F 1F) */
382
- mmD = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
383
- mmF = _mm_unpackhi_pi8(mmF, mmH); /* (29 39 2B 3B 2D 3D 2F 3F) */
384
-
385
- mmH = _mm_unpacklo_pi16(mm8, mmA); /* (00 10 20 30 02 12 22 32) */
386
- mm8 = _mm_unpackhi_pi16(mm8, mmA); /* (04 14 24 34 06 16 26 36) */
387
- mmA = _mm_unpacklo_pi16(mmG, mmD); /* (01 11 21 31 03 13 23 33) */
388
- mmD = _mm_unpackhi_pi16(mmG, mmD); /* (05 15 25 35 07 17 27 37) */
389
-
390
- mmG = _mm_unpackhi_pi16(mm9, mmE); /* (0C 1C 2C 3C 0E 1E 2E 3E) */
391
- mm9 = _mm_unpacklo_pi16(mm9, mmE); /* (08 18 28 38 0A 1A 2A 3A) */
392
- mmE = _mm_unpacklo_pi16(mmB, mmF); /* (09 19 29 39 0B 1B 2B 3B) */
393
- mmF = _mm_unpackhi_pi16(mmB, mmF); /* (0D 1D 2D 3D 0F 1F 2F 3F) */
394
-
395
- mmB = _mm_unpackhi_pi32(mmH, mmA); /* (02 12 22 32 03 13 23 33) */
396
- mmA = _mm_unpacklo_pi32(mmH, mmA); /* (00 10 20 30 01 11 21 31) */
397
- mmC = _mm_unpacklo_pi32(mm8, mmD); /* (04 14 24 34 05 15 25 35) */
398
- mmD = _mm_unpackhi_pi32(mm8, mmD); /* (06 16 26 36 07 17 27 37) */
399
-
400
- mmH = _mm_unpackhi_pi32(mmG, mmF); /* (0E 1E 2E 3E 0F 1F 2F 3F) */
401
- mmG = _mm_unpacklo_pi32(mmG, mmF); /* (0C 1C 2C 3C 0D 1D 2D 3D) */
402
- mmF = _mm_unpackhi_pi32(mm9, mmE); /* (0A 1A 2A 3A 0B 1B 2B 3B) */
403
- mmE = _mm_unpacklo_pi32(mm9, mmE); /* (08 18 28 38 09 19 29 39) */
404
-
405
/*
 * Full group of 16 pixels: write all eight 8-byte registers (64 bytes),
 * choosing aligned or unaligned stores from the low 3 bits of outptr.
 */
- if (num_cols >= 8) {
406
- if (!(((long)outptr) & 7)) {
407
- _mm_store_si64((__m64 *)outptr, mmA);
408
- _mm_store_si64((__m64 *)(outptr + 8), mmB);
409
- _mm_store_si64((__m64 *)(outptr + 16), mmC);
410
- _mm_store_si64((__m64 *)(outptr + 24), mmD);
411
- _mm_store_si64((__m64 *)(outptr + 32), mmE);
412
- _mm_store_si64((__m64 *)(outptr + 40), mmF);
413
- _mm_store_si64((__m64 *)(outptr + 48), mmG);
414
- _mm_store_si64((__m64 *)(outptr + 56), mmH);
415
- } else {
416
- _mm_storeu_si64((__m64 *)outptr, mmA);
417
- _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
418
- _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
419
- _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
420
- _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
421
- _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
422
- _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
423
- _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
424
- }
425
- outptr += RGB_PIXELSIZE * 16;
426
- } else {
/*
 * Partial final group.  Unlike the 3-byte path, col here is the number
 * of PIXELS left (2 per remaining chroma column, plus 1 when
 * output_width is odd).  The asm compares it against 8, 4, 2 and 1 and
 * advances the destination by 32, 16 and 8 bytes respectively.
 */
427
- if (output_width & 1)
428
- col = num_cols * 2 + 1;
429
- else
430
- col = num_cols * 2;
431
- asm(".set noreorder\r\n" /* st32 */
432
-
433
- "li $8, 8\r\n"
434
- "move $9, %10\r\n"
435
- "move $10, %11\r\n"
436
- "mov.s $f4, %2\r\n"
437
- "mov.s $f6, %3\r\n"
438
- "mov.s $f8, %4\r\n"
439
- "mov.s $f10, %5\r\n"
440
- "bltu $9, $8, 1f\r\n"
441
- "nop \r\n"
442
- "gssdlc1 $f4, 7($10)\r\n"
443
- "gssdrc1 $f4, 0($10)\r\n"
444
- "gssdlc1 $f6, 7+8($10)\r\n"
445
- "gssdrc1 $f6, 8($10)\r\n"
446
- "gssdlc1 $f8, 7+16($10)\r\n"
447
- "gssdrc1 $f8, 16($10)\r\n"
448
- "gssdlc1 $f10, 7+24($10)\r\n"
449
- "gssdrc1 $f10, 24($10)\r\n"
450
- "mov.s $f4, %6\r\n"
451
- "mov.s $f6, %7\r\n"
452
- "mov.s $f8, %8\r\n"
453
- "mov.s $f10, %9\r\n"
454
- "subu $9, $9, 8\r\n"
455
- PTR_ADDU "$10, $10, 32\r\n"
456
-
457
- "1: \r\n"
458
- "li $8, 4\r\n" /* st16 */
459
- "bltu $9, $8, 2f\r\n"
460
- "nop \r\n"
461
- "gssdlc1 $f4, 7($10)\r\n"
462
- "gssdrc1 $f4, 0($10)\r\n"
463
- "gssdlc1 $f6, 7+8($10)\r\n"
464
- "gssdrc1 $f6, 8($10)\r\n"
465
- "mov.s $f4, $f8\r\n"
466
- "mov.s $f6, $f10\r\n"
467
- "subu $9, $9, 4\r\n"
468
- PTR_ADDU "$10, $10, 16\r\n"
469
-
470
- "2: \r\n"
471
- "li $8, 2\r\n" /* st8 */
472
- "bltu $9, $8, 3f\r\n"
473
- "nop \r\n"
474
- "gssdlc1 $f4, 7($10)\r\n"
475
- "gssdrc1 $f4, 0($10)\r\n"
476
- "mov.s $f4, $f6\r\n"
477
- "subu $9, $9, 2\r\n"
478
- PTR_ADDU "$10, $10, 8\r\n"
479
-
480
- "3: \r\n"
481
- "li $8, 1\r\n" /* st4 */
482
- "bltu $9, $8, 4f\r\n"
483
- "nop \r\n"
484
- "gsswlc1 $f4, 3($10)\r\n"
485
- "gsswrc1 $f4, 0($10)\r\n"
486
-
487
- "4: \r\n"
488
- "li %1, 0\r\n" /* end */
489
- : "=m" (*outptr), "=r" (col)
490
- : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
491
- "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
492
- : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
493
- );
494
- }
495
-
496
- #endif
497
-
498
- }
499
-
/*
 * Trailing odd pixel.  When output_width / 2 is a multiple of 8 the loop
 * above only ever took the full-store path, so an odd final pixel has not
 * been written yet; convert it here and store exactly one pixel.
 */
500
- if (!((output_width >> 1) & 7)) {
501
- if (output_width & 1) {
502
- cb = _mm_load_si64((__m64 *)inptr1);
503
- cr = _mm_load_si64((__m64 *)inptr2);
504
- y = _mm_load_si64((__m64 *)inptr0);
505
-
506
- decenter = 0.0;
507
- decenter = _mm_cmpeq_pi16(decenter, decenter);
508
- decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
509
-
510
- cbl = _mm_unpacklo_pi8(cb, zero); /* Cb(0123) */
511
- crl = _mm_unpacklo_pi8(cr, zero); /* Cr(0123) */
512
- cbl = _mm_add_pi16(cbl, decenter);
513
- crl = _mm_add_pi16(crl, decenter);
514
-
515
- cbl2 = _mm_add_pi16(cbl, cbl); /* 2*CbL */
516
- crl2 = _mm_add_pi16(crl, crl); /* 2*CrL */
517
- bl = _mm_mulhi_pi16(cbl2, PW_MF0228); /* (2*CbL * -FIX(0.22800)) */
518
- rl = _mm_mulhi_pi16(crl2, PW_F0402); /* (2*CrL * FIX(0.40200)) */
519
-
520
- bl = _mm_add_pi16(bl, PW_ONE);
521
- bl = _mm_srai_pi16(bl, 1); /* (CbL * -FIX(0.22800)) */
522
- rl = _mm_add_pi16(rl, PW_ONE);
523
- rl = _mm_srai_pi16(rl, 1); /* (CrL * FIX(0.40200)) */
524
-
525
- bl = _mm_add_pi16(bl, cbl);
526
- bl = _mm_add_pi16(bl, cbl); /* (CbL * FIX(1.77200))=(B-Y)L */
527
- rl = _mm_add_pi16(rl, crl); /* (CrL * FIX(1.40200))=(R-Y)L */
528
-
529
- gl = _mm_unpacklo_pi16(cbl, crl);
530
- gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
531
- gl = _mm_add_pi32(gl, PD_ONEHALF);
532
- gl = _mm_srai_pi32(gl, SCALEBITS);
533
- gl = _mm_packs_pi32(gl, zero); /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
534
- gl = _mm_sub_pi16(gl, crl); /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
535
-
536
- yl = _mm_unpacklo_pi8(y, zero); /* Y(0123) */
537
- rl = _mm_add_pi16(rl, yl); /* (R0 R1 R2 R3) */
538
- gl = _mm_add_pi16(gl, yl); /* (G0 G1 G2 G3) */
539
- bl = _mm_add_pi16(bl, yl); /* (B0 B1 B2 B3) */
540
- re = _mm_packs_pu16(rl, rl);
541
- ge = _mm_packs_pu16(gl, gl);
542
- be = _mm_packs_pu16(bl, bl);
543
- #if RGB_PIXELSIZE == 3
/* Interleave so the low 3 bytes of mmA are pixel 0; store 2 + 1 bytes. */
544
- mmA = _mm_unpacklo_pi8(mmA, mmC);
545
- mmA = _mm_unpacklo_pi16(mmA, mmE);
546
- asm(".set noreorder\r\n"
547
-
548
- "move $8, %2\r\n"
549
- "mov.s $f4, %1\r\n"
550
- "mfc1 $9, $f4\r\n"
551
- "ush $9, 0($8)\r\n"
552
- "srl $9, 16\r\n"
553
- "sb $9, 2($8)\r\n"
554
- : "=m" (*outptr)
555
- : "f" (mmA), "r" (outptr)
556
- : "$f4", "$8", "$9", "memory"
557
- );
558
- #else /* RGB_PIXELSIZE == 4 */
559
-
560
- #ifdef RGBX_FILLER_0XFF
561
- xe = _mm_cmpeq_pi8(xe, xe);
562
- #else
563
- xe = _mm_xor_si64(xe, xe);
564
- #endif
/* Interleave so the low 4 bytes of mmA are pixel 0; store one 32-bit word. */
565
- mmA = _mm_unpacklo_pi8(mmA, mmC);
566
- mmE = _mm_unpacklo_pi8(mmE, mmG);
567
- mmA = _mm_unpacklo_pi16(mmA, mmE);
568
- asm(".set noreorder\r\n"
569
-
570
- "move $8, %2\r\n"
571
- "mov.s $f4, %1\r\n"
572
- "gsswlc1 $f4, 3($8)\r\n"
573
- "gsswrc1 $f4, 0($8)\r\n"
574
- : "=m" (*outptr)
575
- : "f" (mmA), "r" (outptr)
576
- : "$f4", "$8", "memory"
577
- );
578
- #endif
579
- }
580
- }
581
- }
582
-
583
-
584
/*
 * Merged upsampling + colour conversion for two output rows (h2v2).
 *
 * Implemented as two calls to jsimd_h2v1_merged_upsample_mmi().  That helper
 * reads its luma row from input_buf[0][in_row_group_ctr], so this function
 * temporarily points that slot at luma row in_row_group_ctr * 2 and then at
 * row in_row_group_ctr * 2 + 1.  Both calls use the same Cb/Cr rows.  The
 * first call writes to output_buf[0]; for the second, output_buf[0] is
 * temporarily replaced by output_buf[1].
 *
 * The caller's input_buf[0][in_row_group_ctr] and output_buf[0] entries are
 * modified during the call and restored before returning.
 */
- void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
585
- JSAMPIMAGE input_buf,
586
- JDIMENSION in_row_group_ctr,
587
- JSAMPARRAY output_buf)
588
- {
589
- JSAMPROW inptr, outptr;
590
-
591
/* Remember the two pointers that are overwritten below. */
- inptr = input_buf[0][in_row_group_ctr];
592
- outptr = output_buf[0];
593
-
594
/* First output row: luma row 2 * ctr, written to output_buf[0]. */
- input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
595
- jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
596
- output_buf);
597
-
598
/* Second output row: luma row 2 * ctr + 1, written to output_buf[1]. */
- input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
599
- output_buf[0] = output_buf[1];
600
- jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
601
- output_buf);
602
-
603
/* Put the caller's pointer arrays back the way they were. */
- input_buf[0][in_row_group_ctr] = inptr;
604
- output_buf[0] = outptr;
605
- }
606
-
607
-
608
/*
 * Drop the channel aliases defined at the top of this file.
 * NOTE(review): presumably this lets jdmerge-mmi.c include the file again
 * with a different RGB_* layout -- confirm in the including file.
 */
- #undef mmA
609
- #undef mmB
610
- #undef mmC
611
- #undef mmD
612
- #undef mmE
613
- #undef mmF
614
- #undef mmG
615
- #undef mmH