image_pack 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +18 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +140 -0
  5. data/THIRD_PARTY_NOTICES.md +8 -0
  6. data/ext/image_pack/extconf.rb +515 -0
  7. data/ext/image_pack/image_pack.c +1618 -0
  8. data/ext/image_pack/vendor/.vendored +1 -0
  9. data/ext/image_pack/vendor/mozjpeg/BUILDING.txt +902 -0
  10. data/ext/image_pack/vendor/mozjpeg/CMakeLists.txt +1593 -0
  11. data/ext/image_pack/vendor/mozjpeg/LICENSE.md +132 -0
  12. data/ext/image_pack/vendor/mozjpeg/README-mozilla.txt +194 -0
  13. data/ext/image_pack/vendor/mozjpeg/README-turbo.txt +346 -0
  14. data/ext/image_pack/vendor/mozjpeg/README.ijg +258 -0
  15. data/ext/image_pack/vendor/mozjpeg/README.md +29 -0
  16. data/ext/image_pack/vendor/mozjpeg/cderror.h +128 -0
  17. data/ext/image_pack/vendor/mozjpeg/cdjpeg.c +156 -0
  18. data/ext/image_pack/vendor/mozjpeg/cdjpeg.h +171 -0
  19. data/ext/image_pack/vendor/mozjpeg/cjpeg.c +961 -0
  20. data/ext/image_pack/vendor/mozjpeg/cmyk.h +60 -0
  21. data/ext/image_pack/vendor/mozjpeg/coderules.txt +78 -0
  22. data/ext/image_pack/vendor/mozjpeg/croptest.in +95 -0
  23. data/ext/image_pack/vendor/mozjpeg/djpeg.c +855 -0
  24. data/ext/image_pack/vendor/mozjpeg/example.txt +464 -0
  25. data/ext/image_pack/vendor/mozjpeg/jaricom.c +157 -0
  26. data/ext/image_pack/vendor/mozjpeg/jcapimin.c +307 -0
  27. data/ext/image_pack/vendor/mozjpeg/jcapistd.c +168 -0
  28. data/ext/image_pack/vendor/mozjpeg/jcarith.c +972 -0
  29. data/ext/image_pack/vendor/mozjpeg/jccoefct.c +609 -0
  30. data/ext/image_pack/vendor/mozjpeg/jccolext.c +144 -0
  31. data/ext/image_pack/vendor/mozjpeg/jccolor.c +721 -0
  32. data/ext/image_pack/vendor/mozjpeg/jcdctmgr.c +1776 -0
  33. data/ext/image_pack/vendor/mozjpeg/jcext.c +219 -0
  34. data/ext/image_pack/vendor/mozjpeg/jchuff.c +1146 -0
  35. data/ext/image_pack/vendor/mozjpeg/jchuff.h +57 -0
  36. data/ext/image_pack/vendor/mozjpeg/jcicc.c +105 -0
  37. data/ext/image_pack/vendor/mozjpeg/jcinit.c +82 -0
  38. data/ext/image_pack/vendor/mozjpeg/jcmainct.c +162 -0
  39. data/ext/image_pack/vendor/mozjpeg/jcmarker.c +844 -0
  40. data/ext/image_pack/vendor/mozjpeg/jcmaster.c +958 -0
  41. data/ext/image_pack/vendor/mozjpeg/jcmaster.h +56 -0
  42. data/ext/image_pack/vendor/mozjpeg/jcomapi.c +109 -0
  43. data/ext/image_pack/vendor/mozjpeg/jconfig.h.in +37 -0
  44. data/ext/image_pack/vendor/mozjpeg/jconfig.txt +93 -0
  45. data/ext/image_pack/vendor/mozjpeg/jconfigint.h.in +44 -0
  46. data/ext/image_pack/vendor/mozjpeg/jcparam.c +991 -0
  47. data/ext/image_pack/vendor/mozjpeg/jcphuff.c +1123 -0
  48. data/ext/image_pack/vendor/mozjpeg/jcprepct.c +351 -0
  49. data/ext/image_pack/vendor/mozjpeg/jcsample.c +522 -0
  50. data/ext/image_pack/vendor/mozjpeg/jcstest.c +126 -0
  51. data/ext/image_pack/vendor/mozjpeg/jctrans.c +408 -0
  52. data/ext/image_pack/vendor/mozjpeg/jdapimin.c +407 -0
  53. data/ext/image_pack/vendor/mozjpeg/jdapistd.c +691 -0
  54. data/ext/image_pack/vendor/mozjpeg/jdarith.c +782 -0
  55. data/ext/image_pack/vendor/mozjpeg/jdatadst-tj.c +198 -0
  56. data/ext/image_pack/vendor/mozjpeg/jdatadst.c +299 -0
  57. data/ext/image_pack/vendor/mozjpeg/jdatasrc-tj.c +194 -0
  58. data/ext/image_pack/vendor/mozjpeg/jdatasrc.c +295 -0
  59. data/ext/image_pack/vendor/mozjpeg/jdcoefct.c +881 -0
  60. data/ext/image_pack/vendor/mozjpeg/jdcoefct.h +83 -0
  61. data/ext/image_pack/vendor/mozjpeg/jdcol565.c +384 -0
  62. data/ext/image_pack/vendor/mozjpeg/jdcolext.c +141 -0
  63. data/ext/image_pack/vendor/mozjpeg/jdcolor.c +881 -0
  64. data/ext/image_pack/vendor/mozjpeg/jdct.h +208 -0
  65. data/ext/image_pack/vendor/mozjpeg/jddctmgr.c +367 -0
  66. data/ext/image_pack/vendor/mozjpeg/jdhuff.c +834 -0
  67. data/ext/image_pack/vendor/mozjpeg/jdhuff.h +247 -0
  68. data/ext/image_pack/vendor/mozjpeg/jdicc.c +167 -0
  69. data/ext/image_pack/vendor/mozjpeg/jdinput.c +408 -0
  70. data/ext/image_pack/vendor/mozjpeg/jdmainct.c +460 -0
  71. data/ext/image_pack/vendor/mozjpeg/jdmainct.h +71 -0
  72. data/ext/image_pack/vendor/mozjpeg/jdmarker.c +1374 -0
  73. data/ext/image_pack/vendor/mozjpeg/jdmaster.c +727 -0
  74. data/ext/image_pack/vendor/mozjpeg/jdmaster.h +33 -0
  75. data/ext/image_pack/vendor/mozjpeg/jdmerge.c +587 -0
  76. data/ext/image_pack/vendor/mozjpeg/jdmerge.h +47 -0
  77. data/ext/image_pack/vendor/mozjpeg/jdmrg565.c +354 -0
  78. data/ext/image_pack/vendor/mozjpeg/jdmrgext.c +184 -0
  79. data/ext/image_pack/vendor/mozjpeg/jdphuff.c +679 -0
  80. data/ext/image_pack/vendor/mozjpeg/jdpostct.c +294 -0
  81. data/ext/image_pack/vendor/mozjpeg/jdsample.c +524 -0
  82. data/ext/image_pack/vendor/mozjpeg/jdsample.h +50 -0
  83. data/ext/image_pack/vendor/mozjpeg/jdtrans.c +156 -0
  84. data/ext/image_pack/vendor/mozjpeg/jerror.c +251 -0
  85. data/ext/image_pack/vendor/mozjpeg/jerror.h +335 -0
  86. data/ext/image_pack/vendor/mozjpeg/jfdctflt.c +169 -0
  87. data/ext/image_pack/vendor/mozjpeg/jfdctfst.c +227 -0
  88. data/ext/image_pack/vendor/mozjpeg/jfdctint.c +288 -0
  89. data/ext/image_pack/vendor/mozjpeg/jidctflt.c +240 -0
  90. data/ext/image_pack/vendor/mozjpeg/jidctfst.c +371 -0
  91. data/ext/image_pack/vendor/mozjpeg/jidctint.c +2627 -0
  92. data/ext/image_pack/vendor/mozjpeg/jidctred.c +409 -0
  93. data/ext/image_pack/vendor/mozjpeg/jinclude.h +147 -0
  94. data/ext/image_pack/vendor/mozjpeg/jmemmgr.c +1180 -0
  95. data/ext/image_pack/vendor/mozjpeg/jmemnobs.c +110 -0
  96. data/ext/image_pack/vendor/mozjpeg/jmemsys.h +178 -0
  97. data/ext/image_pack/vendor/mozjpeg/jmorecfg.h +382 -0
  98. data/ext/image_pack/vendor/mozjpeg/jpeg_nbits_table.h +4098 -0
  99. data/ext/image_pack/vendor/mozjpeg/jpegcomp.h +32 -0
  100. data/ext/image_pack/vendor/mozjpeg/jpegint.h +453 -0
  101. data/ext/image_pack/vendor/mozjpeg/jpeglib.h +1211 -0
  102. data/ext/image_pack/vendor/mozjpeg/jpegtran.c +827 -0
  103. data/ext/image_pack/vendor/mozjpeg/jpegyuv.c +172 -0
  104. data/ext/image_pack/vendor/mozjpeg/jquant1.c +856 -0
  105. data/ext/image_pack/vendor/mozjpeg/jquant2.c +1286 -0
  106. data/ext/image_pack/vendor/mozjpeg/jsimd.h +123 -0
  107. data/ext/image_pack/vendor/mozjpeg/jsimd_none.c +431 -0
  108. data/ext/image_pack/vendor/mozjpeg/jsimddct.h +70 -0
  109. data/ext/image_pack/vendor/mozjpeg/jstdhuff.c +144 -0
  110. data/ext/image_pack/vendor/mozjpeg/jutils.c +133 -0
  111. data/ext/image_pack/vendor/mozjpeg/jversion.h.in +56 -0
  112. data/ext/image_pack/vendor/mozjpeg/libjpeg.map.in +11 -0
  113. data/ext/image_pack/vendor/mozjpeg/libjpeg.txt +3150 -0
  114. data/ext/image_pack/vendor/mozjpeg/rdbmp.c +690 -0
  115. data/ext/image_pack/vendor/mozjpeg/rdcolmap.c +253 -0
  116. data/ext/image_pack/vendor/mozjpeg/rdgif.c +720 -0
  117. data/ext/image_pack/vendor/mozjpeg/rdjpeg.c +160 -0
  118. data/ext/image_pack/vendor/mozjpeg/rdjpgcom.c +494 -0
  119. data/ext/image_pack/vendor/mozjpeg/rdpng.c +194 -0
  120. data/ext/image_pack/vendor/mozjpeg/rdppm.c +781 -0
  121. data/ext/image_pack/vendor/mozjpeg/rdswitch.c +642 -0
  122. data/ext/image_pack/vendor/mozjpeg/rdtarga.c +508 -0
  123. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jccolext-neon.c +148 -0
  124. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jchuff-neon.c +334 -0
  125. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd.c +976 -0
  126. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd_neon.S +1200 -0
  127. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jccolext-neon.c +316 -0
  128. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jchuff-neon.c +411 -0
  129. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd.c +1053 -0
  130. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd_neon.S +2254 -0
  131. data/ext/image_pack/vendor/mozjpeg/simd/arm/align.h +28 -0
  132. data/ext/image_pack/vendor/mozjpeg/simd/arm/jccolor-neon.c +160 -0
  133. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgray-neon.c +120 -0
  134. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgryext-neon.c +106 -0
  135. data/ext/image_pack/vendor/mozjpeg/simd/arm/jchuff.h +131 -0
  136. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcphuff-neon.c +623 -0
  137. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcsample-neon.c +192 -0
  138. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolext-neon.c +374 -0
  139. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolor-neon.c +141 -0
  140. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmerge-neon.c +144 -0
  141. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmrgext-neon.c +723 -0
  142. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdsample-neon.c +569 -0
  143. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctfst-neon.c +214 -0
  144. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctint-neon.c +376 -0
  145. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctfst-neon.c +472 -0
  146. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctint-neon.c +801 -0
  147. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctred-neon.c +486 -0
  148. data/ext/image_pack/vendor/mozjpeg/simd/arm/jquanti-neon.c +193 -0
  149. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h +26 -0
  150. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h.in +37 -0
  151. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-avx2.asm +578 -0
  152. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-mmx.asm +476 -0
  153. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-sse2.asm +503 -0
  154. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-avx2.asm +121 -0
  155. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-mmx.asm +121 -0
  156. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-sse2.asm +120 -0
  157. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-avx2.asm +113 -0
  158. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-mmx.asm +113 -0
  159. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-sse2.asm +112 -0
  160. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-avx2.asm +457 -0
  161. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-mmx.asm +355 -0
  162. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-sse2.asm +382 -0
  163. data/ext/image_pack/vendor/mozjpeg/simd/i386/jchuff-sse2.asm +761 -0
  164. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcphuff-sse2.asm +662 -0
  165. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-avx2.asm +388 -0
  166. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-mmx.asm +324 -0
  167. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-sse2.asm +351 -0
  168. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-avx2.asm +515 -0
  169. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-mmx.asm +404 -0
  170. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-sse2.asm +458 -0
  171. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-avx2.asm +118 -0
  172. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-mmx.asm +117 -0
  173. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-sse2.asm +117 -0
  174. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-avx2.asm +136 -0
  175. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-mmx.asm +123 -0
  176. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-sse2.asm +135 -0
  177. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-avx2.asm +575 -0
  178. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-mmx.asm +460 -0
  179. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-sse2.asm +517 -0
  180. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-avx2.asm +760 -0
  181. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-mmx.asm +731 -0
  182. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-sse2.asm +724 -0
  183. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-3dn.asm +318 -0
  184. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-sse.asm +369 -0
  185. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-mmx.asm +395 -0
  186. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-sse2.asm +403 -0
  187. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-avx2.asm +331 -0
  188. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-mmx.asm +620 -0
  189. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-sse2.asm +633 -0
  190. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-3dn.asm +451 -0
  191. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse.asm +571 -0
  192. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse2.asm +497 -0
  193. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-mmx.asm +499 -0
  194. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-sse2.asm +501 -0
  195. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-avx2.asm +453 -0
  196. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-mmx.asm +851 -0
  197. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-sse2.asm +858 -0
  198. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-mmx.asm +704 -0
  199. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-sse2.asm +592 -0
  200. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-3dn.asm +230 -0
  201. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-mmx.asm +276 -0
  202. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-sse.asm +208 -0
  203. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquantf-sse2.asm +168 -0
  204. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-avx2.asm +188 -0
  205. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-sse2.asm +201 -0
  206. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimd.c +1312 -0
  207. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimdcpu.asm +135 -0
  208. data/ext/image_pack/vendor/mozjpeg/simd/jsimd.h +1258 -0
  209. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd.c +1143 -0
  210. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2.S +4543 -0
  211. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2_asm.h +292 -0
  212. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolext-mmi.c +455 -0
  213. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolor-mmi.c +148 -0
  214. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgray-mmi.c +132 -0
  215. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgryext-mmi.c +374 -0
  216. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample-mmi.c +98 -0
  217. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample.h +28 -0
  218. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolext-mmi.c +415 -0
  219. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolor-mmi.c +139 -0
  220. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmerge-mmi.c +149 -0
  221. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmrgext-mmi.c +615 -0
  222. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdsample-mmi.c +304 -0
  223. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctfst-mmi.c +255 -0
  224. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctint-mmi.c +398 -0
  225. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctfst-mmi.c +395 -0
  226. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctint-mmi.c +571 -0
  227. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jquanti-mmi.c +124 -0
  228. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd.c +866 -0
  229. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd_mmi.h +69 -0
  230. data/ext/image_pack/vendor/mozjpeg/simd/mips64/loongson-mmintrin.h +1334 -0
  231. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jcolsamp.inc +135 -0
  232. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jdct.inc +31 -0
  233. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc +93 -0
  234. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc.h +133 -0
  235. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdext.inc +520 -0
  236. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolext-altivec.c +269 -0
  237. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolor-altivec.c +116 -0
  238. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgray-altivec.c +111 -0
  239. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgryext-altivec.c +228 -0
  240. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample-altivec.c +159 -0
  241. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample.h +28 -0
  242. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolext-altivec.c +276 -0
  243. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolor-altivec.c +106 -0
  244. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmerge-altivec.c +130 -0
  245. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmrgext-altivec.c +329 -0
  246. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdsample-altivec.c +400 -0
  247. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctfst-altivec.c +154 -0
  248. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctint-altivec.c +258 -0
  249. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctfst-altivec.c +255 -0
  250. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctint-altivec.c +357 -0
  251. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jquanti-altivec.c +250 -0
  252. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd.c +884 -0
  253. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd_altivec.h +98 -0
  254. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-avx2.asm +559 -0
  255. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-sse2.asm +484 -0
  256. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-avx2.asm +121 -0
  257. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-sse2.asm +120 -0
  258. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-avx2.asm +113 -0
  259. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-sse2.asm +112 -0
  260. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-avx2.asm +438 -0
  261. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-sse2.asm +363 -0
  262. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jchuff-sse2.asm +583 -0
  263. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcphuff-sse2.asm +639 -0
  264. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-avx2.asm +367 -0
  265. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-sse2.asm +330 -0
  266. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-avx2.asm +496 -0
  267. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-sse2.asm +439 -0
  268. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-avx2.asm +118 -0
  269. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-sse2.asm +117 -0
  270. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-avx2.asm +136 -0
  271. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-sse2.asm +135 -0
  272. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-avx2.asm +596 -0
  273. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-sse2.asm +538 -0
  274. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-avx2.asm +696 -0
  275. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-sse2.asm +665 -0
  276. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctflt-sse.asm +355 -0
  277. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctfst-sse2.asm +389 -0
  278. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-avx2.asm +320 -0
  279. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-sse2.asm +619 -0
  280. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctflt-sse2.asm +482 -0
  281. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctfst-sse2.asm +491 -0
  282. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-avx2.asm +418 -0
  283. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-sse2.asm +847 -0
  284. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctred-sse2.asm +574 -0
  285. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquantf-sse2.asm +155 -0
  286. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-avx2.asm +163 -0
  287. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-sse2.asm +188 -0
  288. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimd.c +1110 -0
  289. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimdcpu.asm +86 -0
  290. data/ext/image_pack/vendor/mozjpeg/strtest.c +170 -0
  291. data/ext/image_pack/vendor/mozjpeg/structure.txt +900 -0
  292. data/ext/image_pack/vendor/mozjpeg/tjbench.c +1044 -0
  293. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.in +256 -0
  294. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.java.in +215 -0
  295. data/ext/image_pack/vendor/mozjpeg/tjexample.c +406 -0
  296. data/ext/image_pack/vendor/mozjpeg/tjexampletest.in +149 -0
  297. data/ext/image_pack/vendor/mozjpeg/tjexampletest.java.in +151 -0
  298. data/ext/image_pack/vendor/mozjpeg/tjunittest.c +961 -0
  299. data/ext/image_pack/vendor/mozjpeg/tjutil.c +70 -0
  300. data/ext/image_pack/vendor/mozjpeg/tjutil.h +53 -0
  301. data/ext/image_pack/vendor/mozjpeg/transupp.c +2373 -0
  302. data/ext/image_pack/vendor/mozjpeg/transupp.h +243 -0
  303. data/ext/image_pack/vendor/mozjpeg/turbojpeg-jni.c +1259 -0
  304. data/ext/image_pack/vendor/mozjpeg/turbojpeg.c +2320 -0
  305. data/ext/image_pack/vendor/mozjpeg/turbojpeg.h +1784 -0
  306. data/ext/image_pack/vendor/mozjpeg/usage.txt +679 -0
  307. data/ext/image_pack/vendor/mozjpeg/wizard.txt +220 -0
  308. data/ext/image_pack/vendor/mozjpeg/wrbmp.c +552 -0
  309. data/ext/image_pack/vendor/mozjpeg/wrgif.c +580 -0
  310. data/ext/image_pack/vendor/mozjpeg/wrjpgcom.c +577 -0
  311. data/ext/image_pack/vendor/mozjpeg/wrppm.c +366 -0
  312. data/ext/image_pack/vendor/mozjpeg/wrtarga.c +258 -0
  313. data/ext/image_pack/vendor/mozjpeg/yuvjpeg.c +268 -0
  314. data/lib/image_pack/backend.rb +8 -0
  315. data/lib/image_pack/configuration.rb +23 -0
  316. data/lib/image_pack/errors.rb +13 -0
  317. data/lib/image_pack/version.rb +5 -0
  318. data/lib/image_pack.rb +208 -0
  319. metadata +433 -0
@@ -0,0 +1,847 @@
1
+ ;
2
+ ; jidctint-sse2.asm - accurate integer IDCT (64-bit SSE2)
3
+ ;
4
+ ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5
+ ; Copyright (C) 2009, 2016, 2020, D. R. Commander.
6
+ ; Copyright (C) 2018, Matthias Räncker.
7
+ ;
8
+ ; Based on the x86 SIMD extension for IJG JPEG library
9
+ ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10
+ ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11
+ ;
12
+ ; This file should be assembled with NASM (Netwide Assembler),
13
+ ; and can *not* be assembled with Microsoft's MASM or any compatible
14
+ ; assembler (including Borland's Turbo Assembler).
15
+ ; NASM is available from http://nasm.sourceforge.net/ or
16
+ ; http://sourceforge.net/project/showfiles.php?group_id=6208
17
+ ;
18
+ ; This file contains a slower but more accurate integer implementation of the
19
+ ; inverse DCT (Discrete Cosine Transform). The following code is based
20
+ ; directly on the IJG's original jidctint.c; see jidctint.c for
21
+ ; more details.
22
+
23
+ %include "jsimdext.inc"
24
+ %include "jdct.inc"
25
+
26
+ ; --------------------------------------------------------------------------
27
+
28
+ %define CONST_BITS 13 ; fractional bits of the F_* fixed-point multipliers below (FIX(x) = x * 2^CONST_BITS, rounded)
29
+ %define PASS1_BITS 2 ; pass-1 results are kept scaled up by 2^PASS1_BITS (see the psllw in the all-zero-AC shortcut)
30
+
31
+ %define DESCALE_P1 (CONST_BITS - PASS1_BITS) ; presumably the pass-1 descale shift; its use lies outside this excerpt
32
+ %define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3) ; presumably the pass-2 descale shift; its use lies outside this excerpt
33
+
34
+ %if CONST_BITS == 13
35
+ F_0_298 equ 2446 ; FIX(0.298631336)
36
+ F_0_390 equ 3196 ; FIX(0.390180644)
37
+ F_0_541 equ 4433 ; FIX(0.541196100)
38
+ F_0_765 equ 6270 ; FIX(0.765366865)
39
+ F_0_899 equ 7373 ; FIX(0.899976223)
40
+ F_1_175 equ 9633 ; FIX(1.175875602)
41
+ F_1_501 equ 12299 ; FIX(1.501321110)
42
+ F_1_847 equ 15137 ; FIX(1.847759065)
43
+ F_1_961 equ 16069 ; FIX(1.961570560)
44
+ F_2_053 equ 16819 ; FIX(2.053119869)
45
+ F_2_562 equ 20995 ; FIX(2.562915447)
46
+ F_3_072 equ 25172 ; FIX(3.072711026)
47
+ %else
48
+ ; NASM cannot do compile-time arithmetic on floating-point constants, so each value is written pre-scaled by 2^30 and reduced to CONST_BITS fractional bits with DESCALE.
49
+ %define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n)) ; right shift by n with round-to-nearest (adds half of 2^n first)
50
+ F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS) ; FIX(0.298631336)
51
+ F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS) ; FIX(0.390180644)
52
+ F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS) ; FIX(0.541196100)
53
+ F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS) ; FIX(0.765366865)
54
+ F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS) ; FIX(0.899976223)
55
+ F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS) ; FIX(1.175875602)
56
+ F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS) ; FIX(1.501321110)
57
+ F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS) ; FIX(1.847759065)
58
+ F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS) ; FIX(1.961570560)
59
+ F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS) ; FIX(2.053119869)
60
+ F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS) ; FIX(2.562915447)
61
+ F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS) ; FIX(3.072711026)
62
+ %endif
63
+
64
+ ; --------------------------------------------------------------------------
65
+ SECTION SEG_CONST ; constant table for jsimd_idct_islow_sse2; the PW_* rows are 16-bit multiplier pairs consumed by pmaddwd
66
+
67
+ alignz 32
68
+ GLOBAL_DATA(jconst_idct_islow_sse2)
69
+
70
+ EXTN(jconst_idct_islow_sse2):
71
+
72
+ PW_F130_F054 times 4 dw (F_0_541 + F_0_765), F_0_541 ; even part: tmp3 = z2 * (0.541 + 0.765) + z3 * 0.541
73
+ PW_F054_MF130 times 4 dw F_0_541, (F_0_541 - F_1_847) ; even part: tmp2 = z2 * 0.541 + z3 * (0.541 - 1.848)
74
+ PW_MF078_F117 times 4 dw (F_1_175 - F_1_961), F_1_175 ; odd part: z3 = z3 * (1.176 - 1.962) + z4 * 1.176
75
+ PW_F117_F078 times 4 dw F_1_175, (F_1_175 - F_0_390) ; odd part: z4 = z3 * 1.176 + z4 * (1.176 - 0.390)
76
+ PW_MF060_MF089 times 4 dw (F_0_298 - F_0_899), -F_0_899 ; odd part: tmp0 = tmp0 * (0.299 - 0.900) + tmp3 * -0.900
77
+ PW_MF089_F060 times 4 dw -F_0_899, (F_1_501 - F_0_899) ; odd part: tmp3 = tmp0 * -0.900 + tmp3 * (1.501 - 0.900)
78
+ PW_MF050_MF256 times 4 dw (F_2_053 - F_2_562), -F_2_562 ; odd part: tmp1 = tmp1 * (2.053 - 2.563) + tmp2 * -2.563
79
+ PW_MF256_F050 times 4 dw -F_2_562, (F_3_072 - F_2_562) ; odd part: tmp2 = tmp1 * -2.563 + tmp2 * (3.073 - 2.563)
80
+ PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1 - 1) ; four dwords of 2^(DESCALE_P1-1); presumably the rounding bias added before a DESCALE_P1 shift (use not in this excerpt)
81
+ PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2 - 1) ; four dwords of 2^(DESCALE_P2-1); presumably the rounding bias added before a DESCALE_P2 shift (use not in this excerpt)
82
+ PB_CENTERJSAMP times 16 db CENTERJSAMPLE ; sixteen bytes of CENTERJSAMPLE (defined outside this file excerpt)
83
+
84
+ alignz 32
85
+
86
+ ; --------------------------------------------------------------------------
87
+ SECTION SEG_TEXT
88
+ BITS 64
89
+ ;
90
+ ; Perform dequantization and inverse DCT on one block of coefficients.
91
+ ;
92
+ ; GLOBAL(void)
93
+ ; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
94
+ ; JSAMPARRAY output_buf, JDIMENSION output_col)
95
+ ;
96
+
97
+ ; r10 = void *dct_table
98
+ ; r11 = JCOEFPTR coef_block
99
+ ; r12 = JSAMPARRAY output_buf
100
+ ; r13d = JDIMENSION output_col
101
+
102
+ %define original_rbp rbp + 0 ; slot at the aligned rbp; the prologue stores the pre-alignment stack pointer here (mov [rsp], rax)
103
+ %define wk(i) rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD ; address of scratch slot i, counted upward from wk(0) toward rbp
104
+ ; xmmword wk[WK_NUM]: XMMWORD-sized scratch slots directly below the aligned rbp; wk(0) is the lowest address and becomes rsp
105
+ %define WK_NUM 12 ; slot count: wk(0) .. wk(11)
106
+
107
+ align 32
108
+ GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
109
+
110
+ EXTN(jsimd_idct_islow_sse2):
111
+ push rbp
112
+ mov rax, rsp ; rax = original rbp
113
+ sub rsp, byte 4
114
+ and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
115
+ mov [rsp], rax
116
+ mov rbp, rsp ; rbp = aligned rbp
117
+ lea rsp, [wk(0)]
118
+ collect_args 4
119
+
120
+ ; ---- Pass 1: process columns from input.
121
+
122
+ mov rdx, r10 ; quantptr
123
+ mov rsi, r11 ; inptr
124
+
125
+ %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
126
+ mov eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
127
+ or eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
128
+ jnz near .columnDCT
129
+
130
+ movdqa xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
131
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
132
+ por xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
133
+ por xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
134
+ por xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
135
+ por xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
136
+ por xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
137
+ por xmm1, xmm0
138
+ packsswb xmm1, xmm1
139
+ packsswb xmm1, xmm1
140
+ movd eax, xmm1
141
+ test rax, rax
142
+ jnz short .columnDCT
143
+
144
+ ; -- AC terms all zero
145
+
146
+ movdqa xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
147
+ pmullw xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
148
+
149
+ psllw xmm5, PASS1_BITS
150
+
151
+ movdqa xmm4, xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07)
152
+ punpcklwd xmm5, xmm5 ; xmm5=(00 00 01 01 02 02 03 03)
153
+ punpckhwd xmm4, xmm4 ; xmm4=(04 04 05 05 06 06 07 07)
154
+
155
+ pshufd xmm7, xmm5, 0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00)
156
+ pshufd xmm6, xmm5, 0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01)
157
+ pshufd xmm1, xmm5, 0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02)
158
+ pshufd xmm5, xmm5, 0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03)
159
+ pshufd xmm0, xmm4, 0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04)
160
+ pshufd xmm3, xmm4, 0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05)
161
+ pshufd xmm2, xmm4, 0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06)
162
+ pshufd xmm4, xmm4, 0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07)
163
+
164
+ movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1
165
+ movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3
166
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
167
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
168
+ jmp near .column_end
169
+ %endif
170
+ .columnDCT:
171
+
172
+ ; -- Even part
173
+
174
+ movdqa xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
175
+ movdqa xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
176
+ pmullw xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
177
+ pmullw xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
178
+ movdqa xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
179
+ movdqa xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
180
+ pmullw xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
181
+ pmullw xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
182
+
183
+ ; (Original)
184
+ ; z1 = (z2 + z3) * 0.541196100;
185
+ ; tmp2 = z1 + z3 * -1.847759065;
186
+ ; tmp3 = z1 + z2 * 0.765366865;
187
+ ;
188
+ ; (This implementation)
189
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
190
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
191
+
192
+ movdqa xmm4, xmm1 ; xmm1=in2=z2
193
+ movdqa xmm5, xmm1
194
+ punpcklwd xmm4, xmm3 ; xmm3=in6=z3
195
+ punpckhwd xmm5, xmm3
196
+ movdqa xmm1, xmm4
197
+ movdqa xmm3, xmm5
198
+ pmaddwd xmm4, [rel PW_F130_F054] ; xmm4=tmp3L
199
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
200
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
201
+ pmaddwd xmm3, [rel PW_F054_MF130] ; xmm3=tmp2H
202
+
203
+ movdqa xmm6, xmm0
204
+ paddw xmm0, xmm2 ; xmm0=in0+in4
205
+ psubw xmm6, xmm2 ; xmm6=in0-in4
206
+
207
+ pxor xmm7, xmm7
208
+ pxor xmm2, xmm2
209
+ punpcklwd xmm7, xmm0 ; xmm7=tmp0L
210
+ punpckhwd xmm2, xmm0 ; xmm2=tmp0H
211
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
212
+ psrad xmm2, (16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS
213
+
214
+ movdqa xmm0, xmm7
215
+ paddd xmm7, xmm4 ; xmm7=tmp10L
216
+ psubd xmm0, xmm4 ; xmm0=tmp13L
217
+ movdqa xmm4, xmm2
218
+ paddd xmm2, xmm5 ; xmm2=tmp10H
219
+ psubd xmm4, xmm5 ; xmm4=tmp13H
220
+
221
+ movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L
222
+ movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H
223
+ movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L
224
+ movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H
225
+
226
+ pxor xmm5, xmm5
227
+ pxor xmm7, xmm7
228
+ punpcklwd xmm5, xmm6 ; xmm5=tmp1L
229
+ punpckhwd xmm7, xmm6 ; xmm7=tmp1H
230
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
231
+ psrad xmm7, (16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS
232
+
233
+ movdqa xmm2, xmm5
234
+ paddd xmm5, xmm1 ; xmm5=tmp11L
235
+ psubd xmm2, xmm1 ; xmm2=tmp12L
236
+ movdqa xmm0, xmm7
237
+ paddd xmm7, xmm3 ; xmm7=tmp11H
238
+ psubd xmm0, xmm3 ; xmm0=tmp12H
239
+
240
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
241
+ movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H
242
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L
243
+ movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H
244
+
245
+ ; -- Odd part
246
+
247
+ movdqa xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
248
+ movdqa xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
249
+ pmullw xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
250
+ pmullw xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
251
+ movdqa xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
252
+ movdqa xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
253
+ pmullw xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
254
+ pmullw xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
255
+
256
+ movdqa xmm5, xmm6
257
+ movdqa xmm7, xmm4
258
+ paddw xmm5, xmm3 ; xmm5=z3
259
+ paddw xmm7, xmm1 ; xmm7=z4
260
+
261
+ ; (Original)
262
+ ; z5 = (z3 + z4) * 1.175875602;
263
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
264
+ ; z3 += z5; z4 += z5;
265
+ ;
266
+ ; (This implementation)
267
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
268
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
269
+
270
+ movdqa xmm2, xmm5
271
+ movdqa xmm0, xmm5
272
+ punpcklwd xmm2, xmm7
273
+ punpckhwd xmm0, xmm7
274
+ movdqa xmm5, xmm2
275
+ movdqa xmm7, xmm0
276
+ pmaddwd xmm2, [rel PW_MF078_F117] ; xmm2=z3L
277
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3H
278
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
279
+ pmaddwd xmm7, [rel PW_F117_F078] ; xmm7=z4H
280
+
281
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L
282
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H
283
+
284
+ ; (Original)
285
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
286
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
287
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
288
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
289
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
290
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
291
+ ;
292
+ ; (This implementation)
293
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
294
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
295
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
296
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
297
+ ; tmp0 += z3; tmp1 += z4;
298
+ ; tmp2 += z3; tmp3 += z4;
299
+
300
+ movdqa xmm2, xmm3
301
+ movdqa xmm0, xmm3
302
+ punpcklwd xmm2, xmm4
303
+ punpckhwd xmm0, xmm4
304
+ movdqa xmm3, xmm2
305
+ movdqa xmm4, xmm0
306
+ pmaddwd xmm2, [rel PW_MF060_MF089] ; xmm2=tmp0L
307
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0H
308
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3L
309
+ pmaddwd xmm4, [rel PW_MF089_F060] ; xmm4=tmp3H
310
+
311
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L
312
+ paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H
313
+ paddd xmm3, xmm5 ; xmm3=tmp3L
314
+ paddd xmm4, xmm7 ; xmm4=tmp3H
315
+
316
+ movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L
317
+ movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H
318
+
319
+ movdqa xmm2, xmm1
320
+ movdqa xmm0, xmm1
321
+ punpcklwd xmm2, xmm6
322
+ punpckhwd xmm0, xmm6
323
+ movdqa xmm1, xmm2
324
+ movdqa xmm6, xmm0
325
+ pmaddwd xmm2, [rel PW_MF050_MF256] ; xmm2=tmp1L
326
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1H
327
+ pmaddwd xmm1, [rel PW_MF256_F050] ; xmm1=tmp2L
328
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
329
+
330
+ paddd xmm2, xmm5 ; xmm2=tmp1L
331
+ paddd xmm0, xmm7 ; xmm0=tmp1H
332
+ paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L
333
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
334
+
335
+ movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L
336
+ movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H
337
+
338
+ ; -- Final output stage
339
+
340
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
341
+ movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H
342
+
343
+ movdqa xmm2, xmm5
344
+ movdqa xmm0, xmm7
345
+ paddd xmm5, xmm3 ; xmm5=data0L
346
+ paddd xmm7, xmm4 ; xmm7=data0H
347
+ psubd xmm2, xmm3 ; xmm2=data7L
348
+ psubd xmm0, xmm4 ; xmm0=data7H
349
+
350
+ movdqa xmm3, [rel PD_DESCALE_P1] ; xmm3=[rel PD_DESCALE_P1]
351
+
352
+ paddd xmm5, xmm3
353
+ paddd xmm7, xmm3
354
+ psrad xmm5, DESCALE_P1
355
+ psrad xmm7, DESCALE_P1
356
+ paddd xmm2, xmm3
357
+ paddd xmm0, xmm3
358
+ psrad xmm2, DESCALE_P1
359
+ psrad xmm0, DESCALE_P1
360
+
361
+ packssdw xmm5, xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07)
362
+ packssdw xmm2, xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77)
363
+
364
+ movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L
365
+ movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H
366
+
367
+ movdqa xmm7, xmm4
368
+ movdqa xmm0, xmm3
369
+ paddd xmm4, xmm1 ; xmm4=data1L
370
+ paddd xmm3, xmm6 ; xmm3=data1H
371
+ psubd xmm7, xmm1 ; xmm7=data6L
372
+ psubd xmm0, xmm6 ; xmm0=data6H
373
+
374
+ movdqa xmm1, [rel PD_DESCALE_P1] ; xmm1=[rel PD_DESCALE_P1]
375
+
376
+ paddd xmm4, xmm1
377
+ paddd xmm3, xmm1
378
+ psrad xmm4, DESCALE_P1
379
+ psrad xmm3, DESCALE_P1
380
+ paddd xmm7, xmm1
381
+ paddd xmm0, xmm1
382
+ psrad xmm7, DESCALE_P1
383
+ psrad xmm0, DESCALE_P1
384
+
385
+ packssdw xmm4, xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17)
386
+ packssdw xmm7, xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67)
387
+
388
+ movdqa xmm6, xmm5 ; transpose coefficients(phase 1)
389
+ punpcklwd xmm5, xmm4 ; xmm5=(00 10 01 11 02 12 03 13)
390
+ punpckhwd xmm6, xmm4 ; xmm6=(04 14 05 15 06 16 07 17)
391
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 1)
392
+ punpcklwd xmm7, xmm2 ; xmm7=(60 70 61 71 62 72 63 73)
393
+ punpckhwd xmm1, xmm2 ; xmm1=(64 74 65 75 66 76 67 77)
394
+
395
+ movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L
396
+ movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H
397
+ movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L
398
+ movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H
399
+
400
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13)
401
+ movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17)
402
+ movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73)
403
+ movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77)
404
+
405
+ movdqa xmm5, xmm3
406
+ movdqa xmm6, xmm0
407
+ paddd xmm3, xmm4 ; xmm3=data2L
408
+ paddd xmm0, xmm2 ; xmm0=data2H
409
+ psubd xmm5, xmm4 ; xmm5=data5L
410
+ psubd xmm6, xmm2 ; xmm6=data5H
411
+
412
+ movdqa xmm7, [rel PD_DESCALE_P1] ; xmm7=[rel PD_DESCALE_P1]
413
+
414
+ paddd xmm3, xmm7
415
+ paddd xmm0, xmm7
416
+ psrad xmm3, DESCALE_P1
417
+ psrad xmm0, DESCALE_P1
418
+ paddd xmm5, xmm7
419
+ paddd xmm6, xmm7
420
+ psrad xmm5, DESCALE_P1
421
+ psrad xmm6, DESCALE_P1
422
+
423
+ packssdw xmm3, xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27)
424
+ packssdw xmm5, xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57)
425
+
426
+ movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L
427
+ movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H
428
+ movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L
429
+ movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H
430
+
431
+ movdqa xmm0, xmm1
432
+ movdqa xmm6, xmm4
433
+ paddd xmm1, xmm2 ; xmm1=data3L
434
+ paddd xmm4, xmm7 ; xmm4=data3H
435
+ psubd xmm0, xmm2 ; xmm0=data4L
436
+ psubd xmm6, xmm7 ; xmm6=data4H
437
+
438
+ movdqa xmm2, [rel PD_DESCALE_P1] ; xmm2=[rel PD_DESCALE_P1]
439
+
440
+ paddd xmm1, xmm2
441
+ paddd xmm4, xmm2
442
+ psrad xmm1, DESCALE_P1
443
+ psrad xmm4, DESCALE_P1
444
+ paddd xmm0, xmm2
445
+ paddd xmm6, xmm2
446
+ psrad xmm0, DESCALE_P1
447
+ psrad xmm6, DESCALE_P1
448
+
449
+ packssdw xmm1, xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37)
450
+ packssdw xmm0, xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47)
451
+
452
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13)
453
+ movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17)
454
+
455
+ movdqa xmm4, xmm3 ; transpose coefficients(phase 1)
456
+ punpcklwd xmm3, xmm1 ; xmm3=(20 30 21 31 22 32 23 33)
457
+ punpckhwd xmm4, xmm1 ; xmm4=(24 34 25 35 26 36 27 37)
458
+ movdqa xmm6, xmm0 ; transpose coefficients(phase 1)
459
+ punpcklwd xmm0, xmm5 ; xmm0=(40 50 41 51 42 52 43 53)
460
+ punpckhwd xmm6, xmm5 ; xmm6=(44 54 45 55 46 56 47 57)
461
+
462
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 2)
463
+ punpckldq xmm7, xmm3 ; xmm7=(00 10 20 30 01 11 21 31)
464
+ punpckhdq xmm1, xmm3 ; xmm1=(02 12 22 32 03 13 23 33)
465
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
466
+ punpckldq xmm2, xmm4 ; xmm2=(04 14 24 34 05 15 25 35)
467
+ punpckhdq xmm5, xmm4 ; xmm5=(06 16 26 36 07 17 27 37)
468
+
469
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73)
470
+ movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77)
471
+
472
+ movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35)
473
+ movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37)
474
+
475
+ movdqa xmm2, xmm0 ; transpose coefficients(phase 2)
476
+ punpckldq xmm0, xmm3 ; xmm0=(40 50 60 70 41 51 61 71)
477
+ punpckhdq xmm2, xmm3 ; xmm2=(42 52 62 72 43 53 63 73)
478
+ movdqa xmm5, xmm6 ; transpose coefficients(phase 2)
479
+ punpckldq xmm6, xmm4 ; xmm6=(44 54 64 74 45 55 65 75)
480
+ punpckhdq xmm5, xmm4 ; xmm5=(46 56 66 76 47 57 67 77)
481
+
482
+ movdqa xmm3, xmm7 ; transpose coefficients(phase 3)
483
+ punpcklqdq xmm7, xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70)
484
+ punpckhqdq xmm3, xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71)
485
+ movdqa xmm4, xmm1 ; transpose coefficients(phase 3)
486
+ punpcklqdq xmm1, xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72)
487
+ punpckhqdq xmm4, xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73)
488
+
489
+ movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35)
490
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37)
491
+
492
+ movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1
493
+ movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3
494
+
495
+ movdqa xmm3, xmm0 ; transpose coefficients(phase 3)
496
+ punpcklqdq xmm0, xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74)
497
+ punpckhqdq xmm3, xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75)
498
+ movdqa xmm4, xmm2 ; transpose coefficients(phase 3)
499
+ punpcklqdq xmm2, xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76)
500
+ punpckhqdq xmm4, xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77)
501
+
502
+ movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5
503
+ movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7
504
+ .column_end:
505
+
506
+ ; -- Prefetch the next coefficient block
507
+
508
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
509
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
510
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
511
+ prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
512
+
513
+ ; ---- Pass 2: process rows from work array, store into output array.
514
+
515
+ mov rax, [original_rbp]
516
+ mov rdi, r12 ; (JSAMPROW *)
517
+ mov eax, r13d
518
+
519
+ ; -- Even part
520
+
521
+ ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
522
+
523
+ ; (Original)
524
+ ; z1 = (z2 + z3) * 0.541196100;
525
+ ; tmp2 = z1 + z3 * -1.847759065;
526
+ ; tmp3 = z1 + z2 * 0.765366865;
527
+ ;
528
+ ; (This implementation)
529
+ ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
530
+ ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
531
+
532
+ movdqa xmm6, xmm1 ; xmm1=in2=z2
533
+ movdqa xmm5, xmm1
534
+ punpcklwd xmm6, xmm2 ; xmm2=in6=z3
535
+ punpckhwd xmm5, xmm2
536
+ movdqa xmm1, xmm6
537
+ movdqa xmm2, xmm5
538
+ pmaddwd xmm6, [rel PW_F130_F054] ; xmm6=tmp3L
539
+ pmaddwd xmm5, [rel PW_F130_F054] ; xmm5=tmp3H
540
+ pmaddwd xmm1, [rel PW_F054_MF130] ; xmm1=tmp2L
541
+ pmaddwd xmm2, [rel PW_F054_MF130] ; xmm2=tmp2H
542
+
543
+ movdqa xmm3, xmm7
544
+ paddw xmm7, xmm0 ; xmm7=in0+in4
545
+ psubw xmm3, xmm0 ; xmm3=in0-in4
546
+
547
+ pxor xmm4, xmm4
548
+ pxor xmm0, xmm0
549
+ punpcklwd xmm4, xmm7 ; xmm4=tmp0L
550
+ punpckhwd xmm0, xmm7 ; xmm0=tmp0H
551
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
552
+ psrad xmm0, (16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS
553
+
554
+ movdqa xmm7, xmm4
555
+ paddd xmm4, xmm6 ; xmm4=tmp10L
556
+ psubd xmm7, xmm6 ; xmm7=tmp13L
557
+ movdqa xmm6, xmm0
558
+ paddd xmm0, xmm5 ; xmm0=tmp10H
559
+ psubd xmm6, xmm5 ; xmm6=tmp13H
560
+
561
+ movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L
562
+ movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H
563
+ movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L
564
+ movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H
565
+
566
+ pxor xmm5, xmm5
567
+ pxor xmm4, xmm4
568
+ punpcklwd xmm5, xmm3 ; xmm5=tmp1L
569
+ punpckhwd xmm4, xmm3 ; xmm4=tmp1H
570
+ psrad xmm5, (16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS
571
+ psrad xmm4, (16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS
572
+
573
+ movdqa xmm0, xmm5
574
+ paddd xmm5, xmm1 ; xmm5=tmp11L
575
+ psubd xmm0, xmm1 ; xmm0=tmp12L
576
+ movdqa xmm7, xmm4
577
+ paddd xmm4, xmm2 ; xmm4=tmp11H
578
+ psubd xmm7, xmm2 ; xmm7=tmp12H
579
+
580
+ movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L
581
+ movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H
582
+ movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L
583
+ movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H
584
+
585
+ ; -- Odd part
586
+
587
+ movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3
588
+ movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1
589
+ movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7
590
+ movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5
591
+
592
+ movdqa xmm5, xmm6
593
+ movdqa xmm4, xmm3
594
+ paddw xmm5, xmm1 ; xmm5=z3
595
+ paddw xmm4, xmm2 ; xmm4=z4
596
+
597
+ ; (Original)
598
+ ; z5 = (z3 + z4) * 1.175875602;
599
+ ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
600
+ ; z3 += z5; z4 += z5;
601
+ ;
602
+ ; (This implementation)
603
+ ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
604
+ ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
605
+
606
+ movdqa xmm0, xmm5
607
+ movdqa xmm7, xmm5
608
+ punpcklwd xmm0, xmm4
609
+ punpckhwd xmm7, xmm4
610
+ movdqa xmm5, xmm0
611
+ movdqa xmm4, xmm7
612
+ pmaddwd xmm0, [rel PW_MF078_F117] ; xmm0=z3L
613
+ pmaddwd xmm7, [rel PW_MF078_F117] ; xmm7=z3H
614
+ pmaddwd xmm5, [rel PW_F117_F078] ; xmm5=z4L
615
+ pmaddwd xmm4, [rel PW_F117_F078] ; xmm4=z4H
616
+
617
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L
618
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H
619
+
620
+ ; (Original)
621
+ ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2;
622
+ ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869;
623
+ ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110;
624
+ ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
625
+ ; tmp0 += z1 + z3; tmp1 += z2 + z4;
626
+ ; tmp2 += z2 + z3; tmp3 += z1 + z4;
627
+ ;
628
+ ; (This implementation)
629
+ ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
630
+ ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
631
+ ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
632
+ ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
633
+ ; tmp0 += z3; tmp1 += z4;
634
+ ; tmp2 += z3; tmp3 += z4;
635
+
636
+ movdqa xmm0, xmm1
637
+ movdqa xmm7, xmm1
638
+ punpcklwd xmm0, xmm3
639
+ punpckhwd xmm7, xmm3
640
+ movdqa xmm1, xmm0
641
+ movdqa xmm3, xmm7
642
+ pmaddwd xmm0, [rel PW_MF060_MF089] ; xmm0=tmp0L
643
+ pmaddwd xmm7, [rel PW_MF060_MF089] ; xmm7=tmp0H
644
+ pmaddwd xmm1, [rel PW_MF089_F060] ; xmm1=tmp3L
645
+ pmaddwd xmm3, [rel PW_MF089_F060] ; xmm3=tmp3H
646
+
647
+ paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L
648
+ paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H
649
+ paddd xmm1, xmm5 ; xmm1=tmp3L
650
+ paddd xmm3, xmm4 ; xmm3=tmp3H
651
+
652
+ movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L
653
+ movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H
654
+
655
+ movdqa xmm0, xmm2
656
+ movdqa xmm7, xmm2
657
+ punpcklwd xmm0, xmm6
658
+ punpckhwd xmm7, xmm6
659
+ movdqa xmm2, xmm0
660
+ movdqa xmm6, xmm7
661
+ pmaddwd xmm0, [rel PW_MF050_MF256] ; xmm0=tmp1L
662
+ pmaddwd xmm7, [rel PW_MF050_MF256] ; xmm7=tmp1H
663
+ pmaddwd xmm2, [rel PW_MF256_F050] ; xmm2=tmp2L
664
+ pmaddwd xmm6, [rel PW_MF256_F050] ; xmm6=tmp2H
665
+
666
+ paddd xmm0, xmm5 ; xmm0=tmp1L
667
+ paddd xmm7, xmm4 ; xmm7=tmp1H
668
+ paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L
669
+ paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H
670
+
671
+ movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L
672
+ movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H
673
+
674
+ ; -- Final output stage
675
+
676
+ movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L
677
+ movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H
678
+
679
+ movdqa xmm0, xmm5
680
+ movdqa xmm7, xmm4
681
+ paddd xmm5, xmm1 ; xmm5=data0L
682
+ paddd xmm4, xmm3 ; xmm4=data0H
683
+ psubd xmm0, xmm1 ; xmm0=data7L
684
+ psubd xmm7, xmm3 ; xmm7=data7H
685
+
686
+ movdqa xmm1, [rel PD_DESCALE_P2] ; xmm1=[rel PD_DESCALE_P2]
687
+
688
+ paddd xmm5, xmm1
689
+ paddd xmm4, xmm1
690
+ psrad xmm5, DESCALE_P2
691
+ psrad xmm4, DESCALE_P2
692
+ paddd xmm0, xmm1
693
+ paddd xmm7, xmm1
694
+ psrad xmm0, DESCALE_P2
695
+ psrad xmm7, DESCALE_P2
696
+
697
+ packssdw xmm5, xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70)
698
+ packssdw xmm0, xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77)
699
+
700
+ movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L
701
+ movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H
702
+
703
+ movdqa xmm4, xmm3
704
+ movdqa xmm7, xmm1
705
+ paddd xmm3, xmm2 ; xmm3=data1L
706
+ paddd xmm1, xmm6 ; xmm1=data1H
707
+ psubd xmm4, xmm2 ; xmm4=data6L
708
+ psubd xmm7, xmm6 ; xmm7=data6H
709
+
710
+ movdqa xmm2, [rel PD_DESCALE_P2] ; xmm2=[rel PD_DESCALE_P2]
711
+
712
+ paddd xmm3, xmm2
713
+ paddd xmm1, xmm2
714
+ psrad xmm3, DESCALE_P2
715
+ psrad xmm1, DESCALE_P2
716
+ paddd xmm4, xmm2
717
+ paddd xmm7, xmm2
718
+ psrad xmm4, DESCALE_P2
719
+ psrad xmm7, DESCALE_P2
720
+
721
+ packssdw xmm3, xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71)
722
+ packssdw xmm4, xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76)
723
+
724
+ packsswb xmm5, xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
725
+ packsswb xmm3, xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
726
+
727
+ movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L
728
+ movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H
729
+ movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L
730
+ movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H
731
+
732
+ movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
733
+ movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
734
+
735
+ movdqa xmm4, xmm6
736
+ movdqa xmm0, xmm2
737
+ paddd xmm6, xmm1 ; xmm6=data2L
738
+ paddd xmm2, xmm7 ; xmm2=data2H
739
+ psubd xmm4, xmm1 ; xmm4=data5L
740
+ psubd xmm0, xmm7 ; xmm0=data5H
741
+
742
+ movdqa xmm5, [rel PD_DESCALE_P2] ; xmm5=[rel PD_DESCALE_P2]
743
+
744
+ paddd xmm6, xmm5
745
+ paddd xmm2, xmm5
746
+ psrad xmm6, DESCALE_P2
747
+ psrad xmm2, DESCALE_P2
748
+ paddd xmm4, xmm5
749
+ paddd xmm0, xmm5
750
+ psrad xmm4, DESCALE_P2
751
+ psrad xmm0, DESCALE_P2
752
+
753
+ packssdw xmm6, xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72)
754
+ packssdw xmm4, xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75)
755
+
756
+ movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L
757
+ movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H
758
+ movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L
759
+ movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H
760
+
761
+ movdqa xmm2, xmm3
762
+ movdqa xmm0, xmm1
763
+ paddd xmm3, xmm7 ; xmm3=data3L
764
+ paddd xmm1, xmm5 ; xmm1=data3H
765
+ psubd xmm2, xmm7 ; xmm2=data4L
766
+ psubd xmm0, xmm5 ; xmm0=data4H
767
+
768
+ movdqa xmm7, [rel PD_DESCALE_P2] ; xmm7=[rel PD_DESCALE_P2]
769
+
770
+ paddd xmm3, xmm7
771
+ paddd xmm1, xmm7
772
+ psrad xmm3, DESCALE_P2
773
+ psrad xmm1, DESCALE_P2
774
+ paddd xmm2, xmm7
775
+ paddd xmm0, xmm7
776
+ psrad xmm2, DESCALE_P2
777
+ psrad xmm0, DESCALE_P2
778
+
779
+ movdqa xmm5, [rel PB_CENTERJSAMP] ; xmm5=[rel PB_CENTERJSAMP]
780
+
781
+ packssdw xmm3, xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73)
782
+ packssdw xmm2, xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74)
783
+
784
+ movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
785
+ movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
786
+
787
+ packsswb xmm6, xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
788
+ packsswb xmm3, xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
789
+
790
+ paddb xmm7, xmm5
791
+ paddb xmm1, xmm5
792
+ paddb xmm6, xmm5
793
+ paddb xmm3, xmm5
794
+
795
+ movdqa xmm0, xmm7 ; transpose coefficients(phase 1)
796
+ punpcklbw xmm7, xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
797
+ punpckhbw xmm0, xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
798
+ movdqa xmm2, xmm6 ; transpose coefficients(phase 1)
799
+ punpcklbw xmm6, xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
800
+ punpckhbw xmm2, xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
801
+
802
+ movdqa xmm4, xmm7 ; transpose coefficients(phase 2)
803
+ punpcklwd xmm7, xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
804
+ punpckhwd xmm4, xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
805
+ movdqa xmm5, xmm2 ; transpose coefficients(phase 2)
806
+ punpcklwd xmm2, xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
807
+ punpckhwd xmm5, xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
808
+
809
+ movdqa xmm1, xmm7 ; transpose coefficients(phase 3)
810
+ punpckldq xmm7, xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
811
+ punpckhdq xmm1, xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
812
+ movdqa xmm3, xmm4 ; transpose coefficients(phase 3)
813
+ punpckldq xmm4, xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
814
+ punpckhdq xmm3, xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
815
+
816
+ pshufd xmm6, xmm7, 0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
817
+ pshufd xmm0, xmm1, 0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
818
+ pshufd xmm2, xmm4, 0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
819
+ pshufd xmm5, xmm3, 0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
820
+
821
+ mov rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
822
+ mov rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
823
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
824
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
825
+ mov rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
826
+ mov rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
827
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
828
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
829
+
830
+ mov rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
831
+ mov rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
832
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
833
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
834
+ mov rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
835
+ mov rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
836
+ movq XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
837
+ movq XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
838
+
839
+ uncollect_args 4
840
+ mov rsp, rbp ; rsp <- aligned rbp
841
+ pop rsp ; rsp <- original rbp
842
+ pop rbp
843
+ ret
844
+
845
+ ; For some reason, the OS X linker does not honor the request to align the
846
+ ; segment unless we do this.
847
+ align 32