image_pack 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +18 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +140 -0
  5. data/THIRD_PARTY_NOTICES.md +8 -0
  6. data/ext/image_pack/extconf.rb +515 -0
  7. data/ext/image_pack/image_pack.c +1618 -0
  8. data/ext/image_pack/vendor/.vendored +1 -0
  9. data/ext/image_pack/vendor/mozjpeg/BUILDING.txt +902 -0
  10. data/ext/image_pack/vendor/mozjpeg/CMakeLists.txt +1593 -0
  11. data/ext/image_pack/vendor/mozjpeg/LICENSE.md +132 -0
  12. data/ext/image_pack/vendor/mozjpeg/README-mozilla.txt +194 -0
  13. data/ext/image_pack/vendor/mozjpeg/README-turbo.txt +346 -0
  14. data/ext/image_pack/vendor/mozjpeg/README.ijg +258 -0
  15. data/ext/image_pack/vendor/mozjpeg/README.md +29 -0
  16. data/ext/image_pack/vendor/mozjpeg/cderror.h +128 -0
  17. data/ext/image_pack/vendor/mozjpeg/cdjpeg.c +156 -0
  18. data/ext/image_pack/vendor/mozjpeg/cdjpeg.h +171 -0
  19. data/ext/image_pack/vendor/mozjpeg/cjpeg.c +961 -0
  20. data/ext/image_pack/vendor/mozjpeg/cmyk.h +60 -0
  21. data/ext/image_pack/vendor/mozjpeg/coderules.txt +78 -0
  22. data/ext/image_pack/vendor/mozjpeg/croptest.in +95 -0
  23. data/ext/image_pack/vendor/mozjpeg/djpeg.c +855 -0
  24. data/ext/image_pack/vendor/mozjpeg/example.txt +464 -0
  25. data/ext/image_pack/vendor/mozjpeg/jaricom.c +157 -0
  26. data/ext/image_pack/vendor/mozjpeg/jcapimin.c +307 -0
  27. data/ext/image_pack/vendor/mozjpeg/jcapistd.c +168 -0
  28. data/ext/image_pack/vendor/mozjpeg/jcarith.c +972 -0
  29. data/ext/image_pack/vendor/mozjpeg/jccoefct.c +609 -0
  30. data/ext/image_pack/vendor/mozjpeg/jccolext.c +144 -0
  31. data/ext/image_pack/vendor/mozjpeg/jccolor.c +721 -0
  32. data/ext/image_pack/vendor/mozjpeg/jcdctmgr.c +1776 -0
  33. data/ext/image_pack/vendor/mozjpeg/jcext.c +219 -0
  34. data/ext/image_pack/vendor/mozjpeg/jchuff.c +1146 -0
  35. data/ext/image_pack/vendor/mozjpeg/jchuff.h +57 -0
  36. data/ext/image_pack/vendor/mozjpeg/jcicc.c +105 -0
  37. data/ext/image_pack/vendor/mozjpeg/jcinit.c +82 -0
  38. data/ext/image_pack/vendor/mozjpeg/jcmainct.c +162 -0
  39. data/ext/image_pack/vendor/mozjpeg/jcmarker.c +844 -0
  40. data/ext/image_pack/vendor/mozjpeg/jcmaster.c +958 -0
  41. data/ext/image_pack/vendor/mozjpeg/jcmaster.h +56 -0
  42. data/ext/image_pack/vendor/mozjpeg/jcomapi.c +109 -0
  43. data/ext/image_pack/vendor/mozjpeg/jconfig.h.in +37 -0
  44. data/ext/image_pack/vendor/mozjpeg/jconfig.txt +93 -0
  45. data/ext/image_pack/vendor/mozjpeg/jconfigint.h.in +44 -0
  46. data/ext/image_pack/vendor/mozjpeg/jcparam.c +991 -0
  47. data/ext/image_pack/vendor/mozjpeg/jcphuff.c +1123 -0
  48. data/ext/image_pack/vendor/mozjpeg/jcprepct.c +351 -0
  49. data/ext/image_pack/vendor/mozjpeg/jcsample.c +522 -0
  50. data/ext/image_pack/vendor/mozjpeg/jcstest.c +126 -0
  51. data/ext/image_pack/vendor/mozjpeg/jctrans.c +408 -0
  52. data/ext/image_pack/vendor/mozjpeg/jdapimin.c +407 -0
  53. data/ext/image_pack/vendor/mozjpeg/jdapistd.c +691 -0
  54. data/ext/image_pack/vendor/mozjpeg/jdarith.c +782 -0
  55. data/ext/image_pack/vendor/mozjpeg/jdatadst-tj.c +198 -0
  56. data/ext/image_pack/vendor/mozjpeg/jdatadst.c +299 -0
  57. data/ext/image_pack/vendor/mozjpeg/jdatasrc-tj.c +194 -0
  58. data/ext/image_pack/vendor/mozjpeg/jdatasrc.c +295 -0
  59. data/ext/image_pack/vendor/mozjpeg/jdcoefct.c +881 -0
  60. data/ext/image_pack/vendor/mozjpeg/jdcoefct.h +83 -0
  61. data/ext/image_pack/vendor/mozjpeg/jdcol565.c +384 -0
  62. data/ext/image_pack/vendor/mozjpeg/jdcolext.c +141 -0
  63. data/ext/image_pack/vendor/mozjpeg/jdcolor.c +881 -0
  64. data/ext/image_pack/vendor/mozjpeg/jdct.h +208 -0
  65. data/ext/image_pack/vendor/mozjpeg/jddctmgr.c +367 -0
  66. data/ext/image_pack/vendor/mozjpeg/jdhuff.c +834 -0
  67. data/ext/image_pack/vendor/mozjpeg/jdhuff.h +247 -0
  68. data/ext/image_pack/vendor/mozjpeg/jdicc.c +167 -0
  69. data/ext/image_pack/vendor/mozjpeg/jdinput.c +408 -0
  70. data/ext/image_pack/vendor/mozjpeg/jdmainct.c +460 -0
  71. data/ext/image_pack/vendor/mozjpeg/jdmainct.h +71 -0
  72. data/ext/image_pack/vendor/mozjpeg/jdmarker.c +1374 -0
  73. data/ext/image_pack/vendor/mozjpeg/jdmaster.c +727 -0
  74. data/ext/image_pack/vendor/mozjpeg/jdmaster.h +33 -0
  75. data/ext/image_pack/vendor/mozjpeg/jdmerge.c +587 -0
  76. data/ext/image_pack/vendor/mozjpeg/jdmerge.h +47 -0
  77. data/ext/image_pack/vendor/mozjpeg/jdmrg565.c +354 -0
  78. data/ext/image_pack/vendor/mozjpeg/jdmrgext.c +184 -0
  79. data/ext/image_pack/vendor/mozjpeg/jdphuff.c +679 -0
  80. data/ext/image_pack/vendor/mozjpeg/jdpostct.c +294 -0
  81. data/ext/image_pack/vendor/mozjpeg/jdsample.c +524 -0
  82. data/ext/image_pack/vendor/mozjpeg/jdsample.h +50 -0
  83. data/ext/image_pack/vendor/mozjpeg/jdtrans.c +156 -0
  84. data/ext/image_pack/vendor/mozjpeg/jerror.c +251 -0
  85. data/ext/image_pack/vendor/mozjpeg/jerror.h +335 -0
  86. data/ext/image_pack/vendor/mozjpeg/jfdctflt.c +169 -0
  87. data/ext/image_pack/vendor/mozjpeg/jfdctfst.c +227 -0
  88. data/ext/image_pack/vendor/mozjpeg/jfdctint.c +288 -0
  89. data/ext/image_pack/vendor/mozjpeg/jidctflt.c +240 -0
  90. data/ext/image_pack/vendor/mozjpeg/jidctfst.c +371 -0
  91. data/ext/image_pack/vendor/mozjpeg/jidctint.c +2627 -0
  92. data/ext/image_pack/vendor/mozjpeg/jidctred.c +409 -0
  93. data/ext/image_pack/vendor/mozjpeg/jinclude.h +147 -0
  94. data/ext/image_pack/vendor/mozjpeg/jmemmgr.c +1180 -0
  95. data/ext/image_pack/vendor/mozjpeg/jmemnobs.c +110 -0
  96. data/ext/image_pack/vendor/mozjpeg/jmemsys.h +178 -0
  97. data/ext/image_pack/vendor/mozjpeg/jmorecfg.h +382 -0
  98. data/ext/image_pack/vendor/mozjpeg/jpeg_nbits_table.h +4098 -0
  99. data/ext/image_pack/vendor/mozjpeg/jpegcomp.h +32 -0
  100. data/ext/image_pack/vendor/mozjpeg/jpegint.h +453 -0
  101. data/ext/image_pack/vendor/mozjpeg/jpeglib.h +1211 -0
  102. data/ext/image_pack/vendor/mozjpeg/jpegtran.c +827 -0
  103. data/ext/image_pack/vendor/mozjpeg/jpegyuv.c +172 -0
  104. data/ext/image_pack/vendor/mozjpeg/jquant1.c +856 -0
  105. data/ext/image_pack/vendor/mozjpeg/jquant2.c +1286 -0
  106. data/ext/image_pack/vendor/mozjpeg/jsimd.h +123 -0
  107. data/ext/image_pack/vendor/mozjpeg/jsimd_none.c +431 -0
  108. data/ext/image_pack/vendor/mozjpeg/jsimddct.h +70 -0
  109. data/ext/image_pack/vendor/mozjpeg/jstdhuff.c +144 -0
  110. data/ext/image_pack/vendor/mozjpeg/jutils.c +133 -0
  111. data/ext/image_pack/vendor/mozjpeg/jversion.h.in +56 -0
  112. data/ext/image_pack/vendor/mozjpeg/libjpeg.map.in +11 -0
  113. data/ext/image_pack/vendor/mozjpeg/libjpeg.txt +3150 -0
  114. data/ext/image_pack/vendor/mozjpeg/rdbmp.c +690 -0
  115. data/ext/image_pack/vendor/mozjpeg/rdcolmap.c +253 -0
  116. data/ext/image_pack/vendor/mozjpeg/rdgif.c +720 -0
  117. data/ext/image_pack/vendor/mozjpeg/rdjpeg.c +160 -0
  118. data/ext/image_pack/vendor/mozjpeg/rdjpgcom.c +494 -0
  119. data/ext/image_pack/vendor/mozjpeg/rdpng.c +194 -0
  120. data/ext/image_pack/vendor/mozjpeg/rdppm.c +781 -0
  121. data/ext/image_pack/vendor/mozjpeg/rdswitch.c +642 -0
  122. data/ext/image_pack/vendor/mozjpeg/rdtarga.c +508 -0
  123. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jccolext-neon.c +148 -0
  124. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jchuff-neon.c +334 -0
  125. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd.c +976 -0
  126. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd_neon.S +1200 -0
  127. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jccolext-neon.c +316 -0
  128. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jchuff-neon.c +411 -0
  129. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd.c +1053 -0
  130. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd_neon.S +2254 -0
  131. data/ext/image_pack/vendor/mozjpeg/simd/arm/align.h +28 -0
  132. data/ext/image_pack/vendor/mozjpeg/simd/arm/jccolor-neon.c +160 -0
  133. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgray-neon.c +120 -0
  134. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgryext-neon.c +106 -0
  135. data/ext/image_pack/vendor/mozjpeg/simd/arm/jchuff.h +131 -0
  136. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcphuff-neon.c +623 -0
  137. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcsample-neon.c +192 -0
  138. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolext-neon.c +374 -0
  139. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolor-neon.c +141 -0
  140. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmerge-neon.c +144 -0
  141. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmrgext-neon.c +723 -0
  142. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdsample-neon.c +569 -0
  143. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctfst-neon.c +214 -0
  144. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctint-neon.c +376 -0
  145. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctfst-neon.c +472 -0
  146. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctint-neon.c +801 -0
  147. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctred-neon.c +486 -0
  148. data/ext/image_pack/vendor/mozjpeg/simd/arm/jquanti-neon.c +193 -0
  149. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h +26 -0
  150. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h.in +37 -0
  151. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-avx2.asm +578 -0
  152. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-mmx.asm +476 -0
  153. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-sse2.asm +503 -0
  154. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-avx2.asm +121 -0
  155. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-mmx.asm +121 -0
  156. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-sse2.asm +120 -0
  157. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-avx2.asm +113 -0
  158. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-mmx.asm +113 -0
  159. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-sse2.asm +112 -0
  160. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-avx2.asm +457 -0
  161. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-mmx.asm +355 -0
  162. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-sse2.asm +382 -0
  163. data/ext/image_pack/vendor/mozjpeg/simd/i386/jchuff-sse2.asm +761 -0
  164. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcphuff-sse2.asm +662 -0
  165. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-avx2.asm +388 -0
  166. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-mmx.asm +324 -0
  167. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-sse2.asm +351 -0
  168. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-avx2.asm +515 -0
  169. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-mmx.asm +404 -0
  170. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-sse2.asm +458 -0
  171. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-avx2.asm +118 -0
  172. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-mmx.asm +117 -0
  173. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-sse2.asm +117 -0
  174. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-avx2.asm +136 -0
  175. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-mmx.asm +123 -0
  176. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-sse2.asm +135 -0
  177. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-avx2.asm +575 -0
  178. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-mmx.asm +460 -0
  179. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-sse2.asm +517 -0
  180. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-avx2.asm +760 -0
  181. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-mmx.asm +731 -0
  182. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-sse2.asm +724 -0
  183. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-3dn.asm +318 -0
  184. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-sse.asm +369 -0
  185. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-mmx.asm +395 -0
  186. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-sse2.asm +403 -0
  187. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-avx2.asm +331 -0
  188. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-mmx.asm +620 -0
  189. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-sse2.asm +633 -0
  190. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-3dn.asm +451 -0
  191. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse.asm +571 -0
  192. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse2.asm +497 -0
  193. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-mmx.asm +499 -0
  194. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-sse2.asm +501 -0
  195. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-avx2.asm +453 -0
  196. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-mmx.asm +851 -0
  197. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-sse2.asm +858 -0
  198. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-mmx.asm +704 -0
  199. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-sse2.asm +592 -0
  200. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-3dn.asm +230 -0
  201. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-mmx.asm +276 -0
  202. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-sse.asm +208 -0
  203. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquantf-sse2.asm +168 -0
  204. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-avx2.asm +188 -0
  205. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-sse2.asm +201 -0
  206. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimd.c +1312 -0
  207. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimdcpu.asm +135 -0
  208. data/ext/image_pack/vendor/mozjpeg/simd/jsimd.h +1258 -0
  209. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd.c +1143 -0
  210. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2.S +4543 -0
  211. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2_asm.h +292 -0
  212. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolext-mmi.c +455 -0
  213. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolor-mmi.c +148 -0
  214. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgray-mmi.c +132 -0
  215. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgryext-mmi.c +374 -0
  216. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample-mmi.c +98 -0
  217. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample.h +28 -0
  218. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolext-mmi.c +415 -0
  219. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolor-mmi.c +139 -0
  220. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmerge-mmi.c +149 -0
  221. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmrgext-mmi.c +615 -0
  222. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdsample-mmi.c +304 -0
  223. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctfst-mmi.c +255 -0
  224. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctint-mmi.c +398 -0
  225. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctfst-mmi.c +395 -0
  226. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctint-mmi.c +571 -0
  227. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jquanti-mmi.c +124 -0
  228. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd.c +866 -0
  229. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd_mmi.h +69 -0
  230. data/ext/image_pack/vendor/mozjpeg/simd/mips64/loongson-mmintrin.h +1334 -0
  231. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jcolsamp.inc +135 -0
  232. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jdct.inc +31 -0
  233. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc +93 -0
  234. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc.h +133 -0
  235. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdext.inc +520 -0
  236. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolext-altivec.c +269 -0
  237. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolor-altivec.c +116 -0
  238. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgray-altivec.c +111 -0
  239. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgryext-altivec.c +228 -0
  240. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample-altivec.c +159 -0
  241. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample.h +28 -0
  242. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolext-altivec.c +276 -0
  243. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolor-altivec.c +106 -0
  244. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmerge-altivec.c +130 -0
  245. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmrgext-altivec.c +329 -0
  246. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdsample-altivec.c +400 -0
  247. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctfst-altivec.c +154 -0
  248. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctint-altivec.c +258 -0
  249. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctfst-altivec.c +255 -0
  250. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctint-altivec.c +357 -0
  251. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jquanti-altivec.c +250 -0
  252. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd.c +884 -0
  253. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd_altivec.h +98 -0
  254. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-avx2.asm +559 -0
  255. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-sse2.asm +484 -0
  256. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-avx2.asm +121 -0
  257. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-sse2.asm +120 -0
  258. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-avx2.asm +113 -0
  259. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-sse2.asm +112 -0
  260. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-avx2.asm +438 -0
  261. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-sse2.asm +363 -0
  262. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jchuff-sse2.asm +583 -0
  263. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcphuff-sse2.asm +639 -0
  264. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-avx2.asm +367 -0
  265. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-sse2.asm +330 -0
  266. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-avx2.asm +496 -0
  267. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-sse2.asm +439 -0
  268. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-avx2.asm +118 -0
  269. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-sse2.asm +117 -0
  270. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-avx2.asm +136 -0
  271. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-sse2.asm +135 -0
  272. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-avx2.asm +596 -0
  273. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-sse2.asm +538 -0
  274. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-avx2.asm +696 -0
  275. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-sse2.asm +665 -0
  276. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctflt-sse.asm +355 -0
  277. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctfst-sse2.asm +389 -0
  278. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-avx2.asm +320 -0
  279. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-sse2.asm +619 -0
  280. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctflt-sse2.asm +482 -0
  281. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctfst-sse2.asm +491 -0
  282. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-avx2.asm +418 -0
  283. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-sse2.asm +847 -0
  284. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctred-sse2.asm +574 -0
  285. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquantf-sse2.asm +155 -0
  286. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-avx2.asm +163 -0
  287. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-sse2.asm +188 -0
  288. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimd.c +1110 -0
  289. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimdcpu.asm +86 -0
  290. data/ext/image_pack/vendor/mozjpeg/strtest.c +170 -0
  291. data/ext/image_pack/vendor/mozjpeg/structure.txt +900 -0
  292. data/ext/image_pack/vendor/mozjpeg/tjbench.c +1044 -0
  293. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.in +256 -0
  294. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.java.in +215 -0
  295. data/ext/image_pack/vendor/mozjpeg/tjexample.c +406 -0
  296. data/ext/image_pack/vendor/mozjpeg/tjexampletest.in +149 -0
  297. data/ext/image_pack/vendor/mozjpeg/tjexampletest.java.in +151 -0
  298. data/ext/image_pack/vendor/mozjpeg/tjunittest.c +961 -0
  299. data/ext/image_pack/vendor/mozjpeg/tjutil.c +70 -0
  300. data/ext/image_pack/vendor/mozjpeg/tjutil.h +53 -0
  301. data/ext/image_pack/vendor/mozjpeg/transupp.c +2373 -0
  302. data/ext/image_pack/vendor/mozjpeg/transupp.h +243 -0
  303. data/ext/image_pack/vendor/mozjpeg/turbojpeg-jni.c +1259 -0
  304. data/ext/image_pack/vendor/mozjpeg/turbojpeg.c +2320 -0
  305. data/ext/image_pack/vendor/mozjpeg/turbojpeg.h +1784 -0
  306. data/ext/image_pack/vendor/mozjpeg/usage.txt +679 -0
  307. data/ext/image_pack/vendor/mozjpeg/wizard.txt +220 -0
  308. data/ext/image_pack/vendor/mozjpeg/wrbmp.c +552 -0
  309. data/ext/image_pack/vendor/mozjpeg/wrgif.c +580 -0
  310. data/ext/image_pack/vendor/mozjpeg/wrjpgcom.c +577 -0
  311. data/ext/image_pack/vendor/mozjpeg/wrppm.c +366 -0
  312. data/ext/image_pack/vendor/mozjpeg/wrtarga.c +258 -0
  313. data/ext/image_pack/vendor/mozjpeg/yuvjpeg.c +268 -0
  314. data/lib/image_pack/backend.rb +8 -0
  315. data/lib/image_pack/configuration.rb +23 -0
  316. data/lib/image_pack/errors.rb +13 -0
  317. data/lib/image_pack/version.rb +5 -0
  318. data/lib/image_pack.rb +208 -0
  319. metadata +433 -0
@@ -0,0 +1,696 @@
1
+ ;
2
+ ; jdsample.asm - upsampling (64-bit AVX2)
3
+ ;
4
+ ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5
+ ; Copyright (C) 2009, 2016, D. R. Commander.
6
+ ; Copyright (C) 2015, Intel Corporation.
7
+ ; Copyright (C) 2018, Matthias Räncker.
8
+ ;
9
+ ; Based on the x86 SIMD extension for IJG JPEG library
10
+ ; Copyright (C) 1999-2006, MIYASAKA Masaru.
11
+ ; For conditions of distribution and use, see copyright notice in jsimdext.inc
12
+ ;
13
+ ; This file should be assembled with NASM (Netwide Assembler),
14
+ ; and can *not* be assembled with Microsoft's MASM or any compatible
15
+ ; assembler (including Borland's Turbo Assembler).
16
+ ; NASM is available from http://nasm.sourceforge.net/ or
17
+ ; http://sourceforge.net/project/showfiles.php?group_id=6208
18
+
19
+ %include "jsimdext.inc"
20
+
21
+ ; --------------------------------------------------------------------------
22
+ SECTION SEG_CONST ; constant data for the fancy-upsample routines in this file
23
+
24
+ alignz 32 ; 32-byte alignment; alignz is a macro from jsimdext.inc (not shown here)
25
+ GLOBAL_DATA(jconst_fancy_upsample_avx2)
26
+
27
+ EXTN(jconst_fancy_upsample_avx2): ; exported label marking the start of the table below
28
+
29
+ PW_ONE times 16 dw 1 ; 16 words of 1 (one YMM vector); h2v1 adds it to the left-neighbour term before >>2
30
+ PW_TWO times 16 dw 2 ; 16 words of 2; h2v1 adds it to the right-neighbour term before >>2
31
+ PW_THREE times 16 dw 3 ; 16 words of 3; weight applied to the nearer (current) sample
32
+ PW_SEVEN times 16 dw 7 ; 16 words of 7; NOTE(review): not referenced in the visible part of the file, presumably an h2v2 rounding term - confirm
33
+ PW_EIGHT times 16 dw 8 ; 16 words of 8; NOTE(review): not referenced in the visible part of the file, presumably an h2v2 rounding term - confirm
34
+
35
+ alignz 32 ; pad the end of the table to a 32-byte boundary
36
+
37
+ ; --------------------------------------------------------------------------
38
+ SECTION SEG_TEXT
39
+ BITS 64
40
+ ;
41
+ ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
42
+ ;
43
+ ; The upsampling algorithm is linear interpolation between pixel centers,
44
+ ; also known as a "triangle filter". This is a good compromise between
45
+ ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
46
+ ; of the way between input pixel centers.
47
+ ;
48
+ ; GLOBAL(void)
49
+ ; jsimd_h2v1_fancy_upsample_avx2(int max_v_samp_factor,
50
+ ; JDIMENSION downsampled_width,
51
+ ; JSAMPARRAY input_data,
52
+ ; JSAMPARRAY *output_data_ptr);
53
+ ;
54
+
55
+ ; r10 = int max_v_samp_factor
56
+ ; r11d = JDIMENSION downsampled_width
57
+ ; r12 = JSAMPARRAY input_data
58
+ ; r13 = JSAMPARRAY *output_data_ptr
59
+
60
+ align 32
61
+ GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_avx2)
62
+
63
+ EXTN(jsimd_h2v1_fancy_upsample_avx2): ; per input sample, emits two outputs: (3*cur + left + 1)>>2 and (3*cur + right + 2)>>2
64
+ push rbp ; save caller's rbp (restored by "pop rbp" at .return)
65
+ mov rax, rsp ; rax = rsp after the push; NOTE(review): presumably consumed by collect_args (jsimdext.inc, not shown) - confirm
66
+ mov rbp, rsp
67
+ push_xmm 3 ; macro from jsimdext.inc (not shown); paired with pop_xmm 3 at .return
68
+ collect_args 4 ; macro from jsimdext.inc (not shown); per the header comment the 4 args end up in r10-r13
69
+
70
+ mov eax, r11d ; colctr = downsampled_width (32-bit move zero-extends into rax)
71
+ test rax, rax
72
+ jz near .return ; zero width: nothing to do
73
+
74
+ mov rcx, r10 ; rowctr = max_v_samp_factor
75
+ test rcx, rcx
76
+ jz near .return ; zero rows: nothing to do
77
+
78
+ mov rsi, r12 ; input_data
79
+ mov rdi, r13 ; rdi = output_data_ptr
80
+ mov rdip, JSAMPARRAY [rdi] ; output_data
81
+
82
+ vpxor ymm0, ymm0, ymm0 ; ymm0=(all 0's)
83
+ vpcmpeqb xmm9, xmm9, xmm9 ; xmm9=(all 1's)
84
+ vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-1) ; (ff -- -- -- ... -- --) LSB is ff
85
+
86
+ vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-1) ; only the top byte of xmm9 stays ff
87
+ vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ff) MSB is ff
88
+
89
+ .rowloop: ; one pass per row; rax=colctr, rcx=rowctr, rsi/rdi=input/output row-pointer arrays
90
+ push rax ; colctr
91
+ push rdi ; keep the row-array pointers; rsi/rdi are reused as sample pointers below
92
+ push rsi
93
+
94
+ mov rsip, JSAMPROW [rsi] ; inptr
95
+ mov rdip, JSAMPROW [rdi] ; outptr
96
+
97
+ test rax, SIZEOF_YMMWORD-1
98
+ jz short .skip ; width is a whole number of YMMWORDs: no dummy sample needed
99
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE] ; dl = last real sample of the row
100
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample (copy of the last one, stored just past the row end); NOTE(review): assumes the input row has a spare byte there - confirm
101
+ .skip:
102
+ vpand ymm7, ymm10, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm7=( 0 -- -- ... -- -- --) first sample doubles as its own left neighbour "-1"
103
+
104
+ add rax, byte SIZEOF_YMMWORD-1
105
+ and rax, byte -SIZEOF_YMMWORD ; round colctr up to a multiple of SIZEOF_YMMWORD
106
+ cmp rax, byte SIZEOF_YMMWORD
107
+ ja short .columnloop ; more than one block left: right neighbour comes from the next block
108
+
109
+ .columnloop_last: ; final block of the row
110
+ vpand ymm6, ymm9, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm6=(-- -- ... -- -- 31) block's own top byte stands in for right neighbour "32"
111
+ jmp short .upsample
112
+
113
+ .columnloop: ; non-final block
114
+ vmovdqu ymm6, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; ymm6=next block
115
+ vperm2i128 ymm6, ymm0, ymm6, 0x20 ; low lane=zeros, high lane=low lane of next block
116
+ vpslldq ymm6, ymm6, 15 ; ymm6=(-- -- ... -- -- 32) first byte of next block moved to the top byte
117
+
118
+ .upsample: ; expects ymm7=left-edge sample "-1" in the lowest byte, ymm6=right-edge sample "32" in the highest byte
119
+ vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm1=( 0 1 2 ... 29 30 31)
120
+
121
+ vperm2i128 ymm2, ymm0, ymm1, 0x20 ; ymm2=(zeros | low lane of ymm1), helper for the cross-lane byte shift
122
+ vpalignr ymm2, ymm1, ymm2, 15 ; ymm2=(-- 0 1 ... 28 29 30)
123
+ vperm2i128 ymm4, ymm0, ymm1, 0x03 ; ymm4=(high lane of ymm1 | zeros)
124
+ vpalignr ymm3, ymm4, ymm1, 1 ; ymm3=( 1 2 3 ... 30 31 --)
125
+
126
+ vpor ymm2, ymm2, ymm7 ; ymm2=(-1 0 1 ... 28 29 30)
127
+ vpor ymm3, ymm3, ymm6 ; ymm3=( 1 2 3 ... 30 31 32)
128
+
129
+ vpsrldq ymm7, ymm4, (SIZEOF_XMMWORD-1) ; ymm7=(31 -- -- ... -- -- --) carried into the next block as its "-1"
130
+
131
+ vpunpckhbw ymm4, ymm1, ymm0 ; ymm4=( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
132
+ vpunpcklbw ymm5, ymm1, ymm0 ; ymm5=( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
133
+ vperm2i128 ymm1, ymm5, ymm4, 0x20 ; ymm1=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
134
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
135
+
136
+ vpunpckhbw ymm5, ymm2, ymm0 ; ymm5=( 7 8 9 10 11 12 13 14 23 24 25 26 27 28 29 30)
137
+ vpunpcklbw ymm6, ymm2, ymm0 ; ymm6=(-1 0 1 2 3 4 5 6 15 16 17 18 19 20 21 22)
138
+ vperm2i128 ymm2, ymm6, ymm5, 0x20 ; ymm2=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
139
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
140
+
141
+ vpunpckhbw ymm6, ymm3, ymm0 ; ymm6=( 9 10 11 12 13 14 15 16 25 26 27 28 29 30 31 32)
142
+ vpunpcklbw ymm8, ymm3, ymm0 ; ymm8=( 1 2 3 4 5 6 7 8 17 18 19 20 21 22 23 24)
143
+ vperm2i128 ymm3, ymm8, ymm6, 0x20 ; ymm3=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
144
+ vperm2i128 ymm6, ymm8, ymm6, 0x31 ; ymm6=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
145
+
146
+ vpmullw ymm1, ymm1, [rel PW_THREE] ; 3 * current sample (samples 0-15)
147
+ vpmullw ymm4, ymm4, [rel PW_THREE] ; 3 * current sample (samples 16-31)
148
+ vpaddw ymm2, ymm2, [rel PW_ONE] ; left neighbour + 1
149
+ vpaddw ymm5, ymm5, [rel PW_ONE]
150
+ vpaddw ymm3, ymm3, [rel PW_TWO] ; right neighbour + 2
151
+ vpaddw ymm6, ymm6, [rel PW_TWO]
152
+
153
+ vpaddw ymm2, ymm2, ymm1 ; 3*cur + left + 1
154
+ vpaddw ymm5, ymm5, ymm4
155
+ vpsrlw ymm2, ymm2, 2 ; ymm2=OutLE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
156
+ vpsrlw ymm5, ymm5, 2 ; ymm5=OutHE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
157
+ vpaddw ymm3, ymm3, ymm1 ; 3*cur + right + 2
158
+ vpaddw ymm6, ymm6, ymm4
159
+ vpsrlw ymm3, ymm3, 2 ; ymm3=OutLO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
160
+ vpsrlw ymm6, ymm6, 2 ; ymm6=OutHO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
161
+
162
+ vpsllw ymm3, ymm3, BYTE_BIT ; shift odd outputs up by BYTE_BIT so they interleave with the even outputs
163
+ vpsllw ymm6, ymm6, BYTE_BIT
164
+ vpor ymm2, ymm2, ymm3 ; ymm2=OutL=( 0 1 2 ... 29 30 31)
165
+ vpor ymm5, ymm5, ymm6 ; ymm5=OutH=(32 33 34 ... 61 62 63)
166
+
167
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2 ; store two output YMMWORDs per input YMMWORD
168
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm5
169
+
170
+ sub rax, byte SIZEOF_YMMWORD ; one input block consumed
171
+ add rsi, byte 1*SIZEOF_YMMWORD ; inptr
172
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
173
+ cmp rax, byte SIZEOF_YMMWORD
174
+ ja near .columnloop ; more than one block left
175
+ test eax, eax
176
+ jnz near .columnloop_last ; exactly one block left
177
+
178
+ pop rsi ; restore the row-array pointers and colctr saved at .rowloop
179
+ pop rdi
180
+ pop rax
181
+
182
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
183
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
184
+ dec rcx ; rowctr
185
+ jg near .rowloop ; loop while rowctr > 0 (signed compare)
186
+
187
+ .return:
188
+ vzeroupper ; clear upper YMM halves before returning to non-AVX code
189
+ uncollect_args 4 ; macro from jsimdext.inc (not shown); counterpart of collect_args 4
190
+ pop_xmm 3 ; macro from jsimdext.inc (not shown); counterpart of push_xmm 3
191
+ pop rbp
192
+ ret
193
+
194
+ ; --------------------------------------------------------------------------
195
+ ;
196
+ ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
197
+ ; Again a triangle filter; see comments for h2v1 case, above.
198
+ ;
199
+ ; GLOBAL(void)
200
+ ; jsimd_h2v2_fancy_upsample_avx2(int max_v_samp_factor,
201
+ ; JDIMENSION downsampled_width,
202
+ ; JSAMPARRAY input_data,
203
+ ; JSAMPARRAY *output_data_ptr);
204
+ ;
205
+
206
+ ; r10 = int max_v_samp_factor
207
+ ; r11d = JDIMENSION downsampled_width
208
+ ; r12 = JSAMPARRAY input_data
209
+ ; r13 = JSAMPARRAY *output_data_ptr
210
+
211
+ %define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD ; ymmword wk[WK_NUM]
212
+ %define WK_NUM 4
213
+
214
+ align 32
215
+ GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_avx2)
216
+
217
+ EXTN(jsimd_h2v2_fancy_upsample_avx2):
218
+ push rbp
219
+ mov rax, rsp ; rax = original rbp
220
+ sub rsp, byte 4
221
+ and rsp, byte (-SIZEOF_YMMWORD) ; align to 256 bits
222
+ mov [rsp], rax
223
+ mov rbp, rsp ; rbp = aligned rbp
224
+ lea rsp, [wk(0)]
225
+ push_xmm 3
226
+ collect_args 4
227
+ push rbx
228
+
229
+ mov eax, r11d ; colctr
230
+ test rax, rax
231
+ jz near .return
232
+
233
+ mov rcx, r10 ; rowctr
234
+ test rcx, rcx
235
+ jz near .return
236
+
237
+ mov rsi, r12 ; input_data
238
+ mov rdi, r13
239
+ mov rdip, JSAMPARRAY [rdi] ; output_data
240
+ .rowloop:
241
+ push rax ; colctr
242
+ push rcx
243
+ push rdi
244
+ push rsi
245
+
246
+ mov rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW] ; inptr1(above)
247
+ mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
248
+ mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1(below)
249
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
250
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
251
+
252
+ vpxor ymm8, ymm8, ymm8 ; ymm8=(all 0's)
253
+ vpcmpeqb xmm9, xmm9, xmm9
254
+ vpsrldq xmm10, xmm9, (SIZEOF_XMMWORD-2) ; (ffff ---- ---- ... ---- ----) LSB is ffff
255
+ vpslldq xmm9, xmm9, (SIZEOF_XMMWORD-2)
256
+ vperm2i128 ymm9, ymm9, ymm9, 1 ; (---- ---- ... ---- ---- ffff) MSB is ffff
257
+
258
+ test rax, SIZEOF_YMMWORD-1
259
+ jz short .skip
260
+ push rdx
261
+ mov dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
262
+ mov JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
263
+ mov dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
264
+ mov JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
265
+ mov dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
266
+ mov JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl ; insert a dummy sample
267
+ pop rdx
268
+ .skip:
269
+ ; -- process the first column block
270
+
271
+ vmovdqu ymm0, YMMWORD [rbx+0*SIZEOF_YMMWORD] ; ymm0=row[ 0][0]
272
+ vmovdqu ymm1, YMMWORD [rcx+0*SIZEOF_YMMWORD] ; ymm1=row[-1][0]
273
+ vmovdqu ymm2, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; ymm2=row[+1][0]
274
+
275
+ vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
276
+ vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
277
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
278
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
279
+
280
+ vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
281
+ vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
282
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
283
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
284
+
285
+ vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
286
+ vpunpcklbw ymm3, ymm2, ymm8 ; ymm3=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
287
+ vperm2i128 ymm2, ymm3, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
288
+ vperm2i128 ymm6, ymm3, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
289
+
290
+ vpmullw ymm0, ymm0, [rel PW_THREE]
291
+ vpmullw ymm4, ymm4, [rel PW_THREE]
292
+
293
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
294
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
295
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
296
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
297
+
298
+ vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1 ; temporarily save
299
+ vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5 ; the intermediate data
300
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm2
301
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm6
302
+
303
+ vpand ymm1, ymm1, ymm10 ; ymm1=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
304
+ vpand ymm2, ymm2, ymm10 ; ymm2=( 0 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
305
+
306
+ vmovdqa YMMWORD [wk(0)], ymm1
307
+ vmovdqa YMMWORD [wk(1)], ymm2
308
+
309
+ add rax, byte SIZEOF_YMMWORD-1
310
+ and rax, byte -SIZEOF_YMMWORD
311
+ cmp rax, byte SIZEOF_YMMWORD
312
+ ja short .columnloop
313
+
314
+ .columnloop_last:
315
+ ; -- process the last column block
316
+
317
+ vpand ymm1, ymm9, YMMWORD [rdx+1*SIZEOF_YMMWORD]
318
+ vpand ymm2, ymm9, YMMWORD [rdi+1*SIZEOF_YMMWORD]
319
+
320
+ vmovdqa YMMWORD [wk(2)], ymm1 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
321
+ vmovdqa YMMWORD [wk(3)], ymm2 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 31)
322
+
323
+ jmp near .upsample
324
+
325
+ .columnloop:
326
+ ; -- process the next column block
327
+
328
+ vmovdqu ymm0, YMMWORD [rbx+1*SIZEOF_YMMWORD] ; ymm0=row[ 0][1]
329
+ vmovdqu ymm1, YMMWORD [rcx+1*SIZEOF_YMMWORD] ; ymm1=row[-1][1]
330
+ vmovdqu ymm2, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; ymm2=row[+1][1]
331
+
332
+ vpunpckhbw ymm4, ymm0, ymm8 ; ymm4=row[ 0]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
333
+ vpunpcklbw ymm5, ymm0, ymm8 ; ymm5=row[ 0]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
334
+ vperm2i128 ymm0, ymm5, ymm4, 0x20 ; ymm0=row[ 0]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
335
+ vperm2i128 ymm4, ymm5, ymm4, 0x31 ; ymm4=row[ 0](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
336
+
337
+ vpunpckhbw ymm5, ymm1, ymm8 ; ymm5=row[-1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
338
+ vpunpcklbw ymm6, ymm1, ymm8 ; ymm6=row[-1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
339
+ vperm2i128 ymm1, ymm6, ymm5, 0x20 ; ymm1=row[-1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
340
+ vperm2i128 ymm5, ymm6, ymm5, 0x31 ; ymm5=row[-1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
341
+
342
+ vpunpckhbw ymm6, ymm2, ymm8 ; ymm6=row[+1]( 8 9 10 11 12 13 14 15 24 25 26 27 28 29 30 31)
343
+ vpunpcklbw ymm7, ymm2, ymm8 ; ymm7=row[+1]( 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23)
344
+ vperm2i128 ymm2, ymm7, ymm6, 0x20 ; ymm2=row[+1]( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
345
+ vperm2i128 ymm6, ymm7, ymm6, 0x31 ; ymm6=row[+1](16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
346
+
347
+ vpmullw ymm0, ymm0, [rel PW_THREE]
348
+ vpmullw ymm4, ymm4, [rel PW_THREE]
349
+
350
+ vpaddw ymm1, ymm1, ymm0 ; ymm1=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
351
+ vpaddw ymm5, ymm5, ymm4 ; ymm5=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
352
+ vpaddw ymm2, ymm2, ymm0 ; ymm2=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
353
+ vpaddw ymm6, ymm6, ymm4 ; ymm6=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
354
+
355
+ vmovdqu YMMWORD [rdx+2*SIZEOF_YMMWORD], ymm1 ; temporarily save
356
+ vmovdqu YMMWORD [rdx+3*SIZEOF_YMMWORD], ymm5 ; the intermediate data
357
+ vmovdqu YMMWORD [rdi+2*SIZEOF_YMMWORD], ymm2
358
+ vmovdqu YMMWORD [rdi+3*SIZEOF_YMMWORD], ymm6
359
+
360
+ vperm2i128 ymm1, ymm8, ymm1, 0x20
361
+ vpslldq ymm1, ymm1, 14 ; ymm1=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
362
+ vperm2i128 ymm2, ymm8, ymm2, 0x20
363
+ vpslldq ymm2, ymm2, 14 ; ymm2=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 0)
364
+
365
+ vmovdqa YMMWORD [wk(2)], ymm1
366
+ vmovdqa YMMWORD [wk(3)], ymm2
367
+
368
+ .upsample:
369
+ ; -- process the upper row
370
+
371
+ vmovdqu ymm7, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; ymm7=Int0L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
372
+ vmovdqu ymm3, YMMWORD [rdx+1*SIZEOF_YMMWORD] ; ymm3=Int0H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
373
+
374
+ vperm2i128 ymm0, ymm8, ymm7, 0x03
375
+ vpalignr ymm0, ymm0, ymm7, 2 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
376
+ vperm2i128 ymm4, ymm8, ymm3, 0x20
377
+ vpslldq ymm4, ymm4, 14 ; ymm4=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
378
+
379
+ vperm2i128 ymm5, ymm8, ymm7, 0x03
380
+ vpsrldq ymm5, ymm5, 14 ; ymm5=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
381
+ vperm2i128 ymm6, ymm8, ymm3, 0x20
382
+ vpalignr ymm6, ymm3, ymm6, 14 ; ymm6=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
383
+
384
+ vpor ymm0, ymm0, ymm4 ; ymm0=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
385
+ vpor ymm5, ymm5, ymm6 ; ymm5=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
386
+
387
+ vperm2i128 ymm2, ymm8, ymm3, 0x03
388
+ vpalignr ymm2, ymm2, ymm3, 2 ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
389
+ vperm2i128 ymm4, ymm8, ymm3, 0x03
390
+ vpsrldq ymm4, ymm4, 14 ; ymm4=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
391
+ vperm2i128 ymm1, ymm8, ymm7, 0x20
392
+ vpalignr ymm1, ymm7, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
393
+
394
+ vpor ymm1, ymm1, YMMWORD [wk(0)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
395
+ vpor ymm2, ymm2, YMMWORD [wk(2)] ; ymm2=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
396
+
397
+ vmovdqa YMMWORD [wk(0)], ymm4
398
+
399
+ vpmullw ymm7, ymm7, [rel PW_THREE]
400
+ vpmullw ymm3, ymm3, [rel PW_THREE]
401
+ vpaddw ymm1, ymm1, [rel PW_EIGHT]
402
+ vpaddw ymm5, ymm5, [rel PW_EIGHT]
403
+ vpaddw ymm0, ymm0, [rel PW_SEVEN]
404
+ vpaddw ymm2, [rel PW_SEVEN]
405
+
406
+ vpaddw ymm1, ymm1, ymm7
407
+ vpaddw ymm5, ymm5, ymm3
408
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out0LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
409
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out0HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
410
+ vpaddw ymm0, ymm0, ymm7
411
+ vpaddw ymm2, ymm2, ymm3
412
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out0LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
413
+ vpsrlw ymm2, ymm2, 4 ; ymm2=Out0HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
414
+
415
+ vpsllw ymm0, ymm0, BYTE_BIT
416
+ vpsllw ymm2, ymm2, BYTE_BIT
417
+ vpor ymm1, ymm1, ymm0 ; ymm1=Out0L=( 0 1 2 ... 29 30 31)
418
+ vpor ymm5, ymm5, ymm2 ; ymm5=Out0H=(32 33 34 ... 61 62 63)
419
+
420
+ vmovdqu YMMWORD [rdx+0*SIZEOF_YMMWORD], ymm1
421
+ vmovdqu YMMWORD [rdx+1*SIZEOF_YMMWORD], ymm5
422
+
423
+ ; -- process the lower row
424
+
425
+ vmovdqu ymm6, YMMWORD [rdi+0*SIZEOF_YMMWORD] ; ymm6=Int1L=( 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15)
426
+ vmovdqu ymm4, YMMWORD [rdi+1*SIZEOF_YMMWORD] ; ymm4=Int1H=(16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31)
427
+
428
+ vperm2i128 ymm7, ymm8, ymm6, 0x03
429
+ vpalignr ymm7, ymm7, ymm6, 2 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 --)
430
+ vperm2i128 ymm3, ymm8, ymm4, 0x20
431
+ vpslldq ymm3, ymm3, 14 ; ymm3=(-- -- -- -- -- -- -- -- -- -- -- -- -- -- -- 16)
432
+
433
+ vperm2i128 ymm0, ymm8, ymm6, 0x03
434
+ vpsrldq ymm0, ymm0, 14 ; ymm0=(15 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
435
+ vperm2i128 ymm2, ymm8, ymm4, 0x20
436
+ vpalignr ymm2, ymm4, ymm2, 14 ; ymm2=(-- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
437
+
438
+ vpor ymm7, ymm7, ymm3 ; ymm7=( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16)
439
+ vpor ymm0, ymm0, ymm2 ; ymm0=(15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30)
440
+
441
+ vperm2i128 ymm5, ymm8, ymm4, 0x03
442
+ vpalignr ymm5, ymm5, ymm4, 2 ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 --)
443
+ vperm2i128 ymm3, ymm8, ymm4, 0x03
444
+ vpsrldq ymm3, ymm3, 14 ; ymm3=(31 -- -- -- -- -- -- -- -- -- -- -- -- -- -- --)
445
+ vperm2i128 ymm1, ymm8, ymm6, 0x20
446
+ vpalignr ymm1, ymm6, ymm1, 14 ; ymm1=(-- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
447
+
448
+ vpor ymm1, ymm1, YMMWORD [wk(1)] ; ymm1=(-1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14)
449
+ vpor ymm5, ymm5, YMMWORD [wk(3)] ; ymm5=(17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32)
450
+
451
+ vmovdqa YMMWORD [wk(1)], ymm3
452
+
453
+ vpmullw ymm6, ymm6, [rel PW_THREE]
454
+ vpmullw ymm4, ymm4, [rel PW_THREE]
455
+ vpaddw ymm1, ymm1, [rel PW_EIGHT]
456
+ vpaddw ymm0, ymm0, [rel PW_EIGHT]
457
+ vpaddw ymm7, ymm7, [rel PW_SEVEN]
458
+ vpaddw ymm5, ymm5, [rel PW_SEVEN]
459
+
460
+ vpaddw ymm1, ymm1, ymm6
461
+ vpaddw ymm0, ymm0, ymm4
462
+ vpsrlw ymm1, ymm1, 4 ; ymm1=Out1LE=( 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30)
463
+ vpsrlw ymm0, ymm0, 4 ; ymm0=Out1HE=(32 34 36 38 40 42 44 46 48 50 52 54 56 58 60 62)
464
+ vpaddw ymm7, ymm7, ymm6
465
+ vpaddw ymm5, ymm5, ymm4
466
+ vpsrlw ymm7, ymm7, 4 ; ymm7=Out1LO=( 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31)
467
+ vpsrlw ymm5, ymm5, 4 ; ymm5=Out1HO=(33 35 37 39 41 43 45 47 49 51 53 55 57 59 61 63)
468
+
469
+ vpsllw ymm7, ymm7, BYTE_BIT
470
+ vpsllw ymm5, ymm5, BYTE_BIT
471
+ vpor ymm1, ymm1, ymm7 ; ymm1=Out1L=( 0 1 2 ... 29 30 31)
472
+ vpor ymm0, ymm0, ymm5 ; ymm0=Out1H=(32 33 34 ... 61 62 63)
473
+
474
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm1
475
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm0
476
+
477
+ sub rax, byte SIZEOF_YMMWORD
478
+ add rcx, byte 1*SIZEOF_YMMWORD ; inptr1(above)
479
+ add rbx, byte 1*SIZEOF_YMMWORD ; inptr0
480
+ add rsi, byte 1*SIZEOF_YMMWORD ; inptr1(below)
481
+ add rdx, byte 2*SIZEOF_YMMWORD ; outptr0
482
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr1
483
+ cmp rax, byte SIZEOF_YMMWORD
484
+ ja near .columnloop
485
+ test rax, rax
486
+ jnz near .columnloop_last
487
+
488
+ pop rsi
489
+ pop rdi
490
+ pop rcx
491
+ pop rax
492
+
493
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
494
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
495
+ sub rcx, byte 2 ; rowctr
496
+ jg near .rowloop
497
+
498
+ .return:
499
+ pop rbx
500
+ vzeroupper
501
+ uncollect_args 4
502
+ pop_xmm 3
503
+ mov rsp, rbp ; rsp <- aligned rbp
504
+ pop rsp ; rsp <- original rbp
505
+ pop rbp
506
+ ret
507
+
508
+ ; --------------------------------------------------------------------------
508
+ ;
509
+ ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
510
+ ; It's still a box filter: every input sample is copied to two adjacent output columns of one output row.
511
+ ;
512
+ ; GLOBAL(void)
513
+ ; jsimd_h2v1_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
514
+ ; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
515
+ ; Returns nothing; exits immediately when output_width or max_v_samp_factor is zero.
516
+
517
+ ; r10 = int max_v_samp_factor (used as the row counter; one output row per input row)
518
+ ; r11d = JDIMENSION output_width (only the low 32 bits are read)
519
+ ; r12 = JSAMPARRAY input_data (array of input row pointers)
520
+ ; r13 = JSAMPARRAY *output_data_ptr (dereferenced once to get the output row-pointer array)
521
+
522
+ align 32
523
+ GLOBAL_FUNCTION(jsimd_h2v1_upsample_avx2)
524
+
525
+ EXTN(jsimd_h2v1_upsample_avx2):
526
+ push rbp ; save caller's frame pointer
527
+ mov rax, rsp ; rax = entry rsp; not read by the visible code before being overwritten (possibly consumed by collect_args -- TODO confirm)
528
+ mov rbp, rsp
529
+ collect_args 4 ; macro defined elsewhere; per the table above the arguments are in r10-r13 afterwards
530
+
531
+ mov edx, r11d ; 32-bit move zero-extends output_width into rdx
532
+ add rdx, byte (SIZEOF_YMMWORD-1)
533
+ and rdx, -SIZEOF_YMMWORD ; rdx = output_width rounded up to a multiple of SIZEOF_YMMWORD
534
+ jz near .return ; rounded width is 0 only when output_width == 0
535
+
536
+ mov rcx, r10 ; rowctr
537
+ test rcx, rcx
538
+ jz short .return ; no rows requested
539
+
540
+ mov rsi, r12 ; input_data
541
+ mov rdi, r13
542
+ mov rdip, JSAMPARRAY [rdi] ; output_data
543
+ .rowloop: ; one iteration per row; rsi/rdi walk the row-pointer arrays
544
+ push rdi ; keep the row-array cursors; rsi/rdi become sample pointers below
545
+ push rsi
546
+
547
+ mov rsip, JSAMPROW [rsi] ; inptr
548
+ mov rdip, JSAMPROW [rdi] ; outptr
549
+ mov rax, rdx ; colctr = output columns still to write (rounded-up width)
550
+ .columnloop:
551
+
552
+ cmp rax, byte SIZEOF_YMMWORD
553
+ ja near .above_16 ; more than one YMMWORD of output left -> 256-bit path
554
+
555
+ vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_YMMWORD] ; tail: load one XMMWORD of input samples (offset is 0)
556
+ vpunpckhbw xmm1, xmm0, xmm0 ; xmm1 = upper 8 input bytes, each duplicated
557
+ vpunpcklbw xmm0, xmm0, xmm0 ; xmm0 = lower 8 input bytes, each duplicated
558
+
559
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 ; writes two full XMMWORDs even if fewer columns are needed
560
+ vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1 ; NOTE(review): assumes row buffers are padded to the rounded-up width -- confirm against the allocator
561
+
562
+ jmp short .nextrow ; the tail block is always the last one in a row
563
+
564
+ .above_16:
565
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; load one YMMWORD of input samples
566
+
567
+ vpermq ymm0, ymm0, 0xd8 ; reorder qwords to (0,2,1,3) so the per-lane unpacks below emit samples in order
568
+ vpunpckhbw ymm1, ymm0, ymm0 ; ymm1 = input qwords 2 and 3, each byte duplicated
569
+ vpunpcklbw ymm0, ymm0, ymm0 ; ymm0 = input qwords 0 and 1, each byte duplicated
570
+
571
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
572
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
573
+
574
+ sub rax, byte 2*SIZEOF_YMMWORD ; two YMMWORDs of output were just written
575
+ jz short .nextrow ; row finished exactly
576
+
577
+ add rsi, byte SIZEOF_YMMWORD ; inptr
578
+ add rdi, byte 2*SIZEOF_YMMWORD ; outptr
579
+ jmp short .columnloop
580
+
581
+ .nextrow:
582
+ pop rsi ; restore the row-array cursors saved at .rowloop
583
+ pop rdi
584
+
585
+ add rsi, byte SIZEOF_JSAMPROW ; input_data
586
+ add rdi, byte SIZEOF_JSAMPROW ; output_data
587
+ dec rcx ; rowctr
588
+ jg short .rowloop ; signed test: repeat while rowctr > 0
589
+
590
+ .return:
591
+ vzeroupper ; clear upper YMM halves before returning to non-AVX code
592
+ uncollect_args 4 ; macro defined elsewhere; presumably undoes collect_args -- TODO confirm
593
+ pop rbp
594
+ ret
596
+
597
+ ; --------------------------------------------------------------------------
598
+ ;
599
+ ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
600
+ ; It's still a box filter: every input sample is copied to two adjacent columns in each of two output rows.
601
+ ;
602
+ ; GLOBAL(void)
603
+ ; jsimd_h2v2_upsample_avx2(int max_v_samp_factor, JDIMENSION output_width,
604
+ ; JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
605
+ ; Returns nothing; exits immediately when output_width or max_v_samp_factor is zero.
606
+
607
+ ; r10 = int max_v_samp_factor (used as the output-row counter; decremented by 2 per input row)
608
+ ; r11d = JDIMENSION output_width (only the low 32 bits are read)
609
+ ; r12 = JSAMPARRAY input_data (array of input row pointers)
610
+ ; r13 = JSAMPARRAY *output_data_ptr (dereferenced once to get the output row-pointer array)
611
+
612
+ align 32
613
+ GLOBAL_FUNCTION(jsimd_h2v2_upsample_avx2)
614
+
615
+ EXTN(jsimd_h2v2_upsample_avx2):
616
+ push rbp ; save caller's frame pointer
617
+ mov rax, rsp ; rax = entry rsp; not read by the visible code before being overwritten (possibly consumed by collect_args -- TODO confirm)
618
+ mov rbp, rsp
619
+ collect_args 4 ; macro defined elsewhere; per the table above the arguments are in r10-r13 afterwards
620
+ push rbx ; rbx holds outptr0 below; popped at .return on every exit path
621
+
622
+ mov edx, r11d ; 32-bit move zero-extends output_width into rdx
623
+ add rdx, byte (SIZEOF_YMMWORD-1)
624
+ and rdx, -SIZEOF_YMMWORD ; rdx = output_width rounded up to a multiple of SIZEOF_YMMWORD
625
+ jz near .return ; rounded width is 0 only when output_width == 0
626
+
627
+ mov rcx, r10 ; rowctr
628
+ test rcx, rcx
629
+ jz near .return ; no rows requested
630
+
631
+ mov rsi, r12 ; input_data
632
+ mov rdi, r13
633
+ mov rdip, JSAMPARRAY [rdi] ; output_data
634
+ .rowloop: ; one iteration per input row, producing two output rows
635
+ push rdi ; keep the row-array cursors; rsi/rdi become sample pointers below
636
+ push rsi
637
+
638
+ mov rsip, JSAMPROW [rsi] ; inptr
639
+ mov rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
640
+ mov rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
641
+ mov rax, rdx ; colctr = output columns still to write (rounded-up width)
642
+ .columnloop:
643
+
644
+ cmp rax, byte SIZEOF_YMMWORD
645
+ ja short .above_16 ; more than one YMMWORD of output left -> 256-bit path
646
+
647
+ vmovdqu xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD] ; tail: load one XMMWORD of input samples
648
+ vpunpckhbw xmm1, xmm0, xmm0 ; xmm1 = upper 8 input bytes, each duplicated
649
+ vpunpcklbw xmm0, xmm0, xmm0 ; xmm0 = lower 8 input bytes, each duplicated
650
+
651
+ vmovdqu XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0 ; same doubled samples go to both output rows
652
+ vmovdqu XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
653
+ vmovdqu XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0 ; NOTE(review): two full XMMWORDs are written per row; assumes rows are padded to the rounded-up width -- confirm
654
+ vmovdqu XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
655
+
656
+ jmp near .nextrow ; the tail block is always the last one in a row
657
+
658
+ .above_16:
659
+ vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; load one YMMWORD of input samples
660
+
661
+ vpermq ymm0, ymm0, 0xd8 ; reorder qwords to (0,2,1,3) so the per-lane unpacks below emit samples in order
662
+ vpunpckhbw ymm1, ymm0, ymm0 ; ymm1 = input qwords 2 and 3, each byte duplicated
663
+ vpunpcklbw ymm0, ymm0, ymm0 ; ymm0 = input qwords 0 and 1, each byte duplicated
664
+
665
+ vmovdqu YMMWORD [rbx+0*SIZEOF_YMMWORD], ymm0 ; same doubled samples go to both output rows
666
+ vmovdqu YMMWORD [rbx+1*SIZEOF_YMMWORD], ymm1
667
+ vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
668
+ vmovdqu YMMWORD [rdi+1*SIZEOF_YMMWORD], ymm1
669
+
670
+ sub rax, byte 2*SIZEOF_YMMWORD ; two YMMWORDs of output were just written per row
671
+ jz short .nextrow ; row pair finished exactly
672
+
673
+ add rsi, byte SIZEOF_YMMWORD ; inptr
674
+ add rbx, 2*SIZEOF_YMMWORD ; outptr0
675
+ add rdi, 2*SIZEOF_YMMWORD ; outptr1
676
+ jmp short .columnloop
677
+
678
+ .nextrow:
679
+ pop rsi ; restore the row-array cursors saved at .rowloop
680
+ pop rdi
681
+
682
+ add rsi, byte 1*SIZEOF_JSAMPROW ; input_data
683
+ add rdi, byte 2*SIZEOF_JSAMPROW ; output_data
684
+ sub rcx, byte 2 ; rowctr
685
+ jg near .rowloop ; signed test: repeat while rowctr > 0
686
+
687
+ .return:
688
+ pop rbx ; matches the push after collect_args
689
+ vzeroupper ; clear upper YMM halves before returning to non-AVX code
690
+ uncollect_args 4 ; macro defined elsewhere; presumably undoes collect_args -- TODO confirm
691
+ pop rbp
692
+ ret
693
+
694
+ ; For some reason, the OS X linker does not honor the request to align the
695
+ ; segment unless we do this.
696
+ align 32