image_pack 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +18 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +140 -0
  5. data/THIRD_PARTY_NOTICES.md +8 -0
  6. data/ext/image_pack/extconf.rb +515 -0
  7. data/ext/image_pack/image_pack.c +1618 -0
  8. data/ext/image_pack/vendor/.vendored +1 -0
  9. data/ext/image_pack/vendor/mozjpeg/BUILDING.txt +902 -0
  10. data/ext/image_pack/vendor/mozjpeg/CMakeLists.txt +1593 -0
  11. data/ext/image_pack/vendor/mozjpeg/LICENSE.md +132 -0
  12. data/ext/image_pack/vendor/mozjpeg/README-mozilla.txt +194 -0
  13. data/ext/image_pack/vendor/mozjpeg/README-turbo.txt +346 -0
  14. data/ext/image_pack/vendor/mozjpeg/README.ijg +258 -0
  15. data/ext/image_pack/vendor/mozjpeg/README.md +29 -0
  16. data/ext/image_pack/vendor/mozjpeg/cderror.h +128 -0
  17. data/ext/image_pack/vendor/mozjpeg/cdjpeg.c +156 -0
  18. data/ext/image_pack/vendor/mozjpeg/cdjpeg.h +171 -0
  19. data/ext/image_pack/vendor/mozjpeg/cjpeg.c +961 -0
  20. data/ext/image_pack/vendor/mozjpeg/cmyk.h +60 -0
  21. data/ext/image_pack/vendor/mozjpeg/coderules.txt +78 -0
  22. data/ext/image_pack/vendor/mozjpeg/croptest.in +95 -0
  23. data/ext/image_pack/vendor/mozjpeg/djpeg.c +855 -0
  24. data/ext/image_pack/vendor/mozjpeg/example.txt +464 -0
  25. data/ext/image_pack/vendor/mozjpeg/jaricom.c +157 -0
  26. data/ext/image_pack/vendor/mozjpeg/jcapimin.c +307 -0
  27. data/ext/image_pack/vendor/mozjpeg/jcapistd.c +168 -0
  28. data/ext/image_pack/vendor/mozjpeg/jcarith.c +972 -0
  29. data/ext/image_pack/vendor/mozjpeg/jccoefct.c +609 -0
  30. data/ext/image_pack/vendor/mozjpeg/jccolext.c +144 -0
  31. data/ext/image_pack/vendor/mozjpeg/jccolor.c +721 -0
  32. data/ext/image_pack/vendor/mozjpeg/jcdctmgr.c +1776 -0
  33. data/ext/image_pack/vendor/mozjpeg/jcext.c +219 -0
  34. data/ext/image_pack/vendor/mozjpeg/jchuff.c +1146 -0
  35. data/ext/image_pack/vendor/mozjpeg/jchuff.h +57 -0
  36. data/ext/image_pack/vendor/mozjpeg/jcicc.c +105 -0
  37. data/ext/image_pack/vendor/mozjpeg/jcinit.c +82 -0
  38. data/ext/image_pack/vendor/mozjpeg/jcmainct.c +162 -0
  39. data/ext/image_pack/vendor/mozjpeg/jcmarker.c +844 -0
  40. data/ext/image_pack/vendor/mozjpeg/jcmaster.c +958 -0
  41. data/ext/image_pack/vendor/mozjpeg/jcmaster.h +56 -0
  42. data/ext/image_pack/vendor/mozjpeg/jcomapi.c +109 -0
  43. data/ext/image_pack/vendor/mozjpeg/jconfig.h.in +37 -0
  44. data/ext/image_pack/vendor/mozjpeg/jconfig.txt +93 -0
  45. data/ext/image_pack/vendor/mozjpeg/jconfigint.h.in +44 -0
  46. data/ext/image_pack/vendor/mozjpeg/jcparam.c +991 -0
  47. data/ext/image_pack/vendor/mozjpeg/jcphuff.c +1123 -0
  48. data/ext/image_pack/vendor/mozjpeg/jcprepct.c +351 -0
  49. data/ext/image_pack/vendor/mozjpeg/jcsample.c +522 -0
  50. data/ext/image_pack/vendor/mozjpeg/jcstest.c +126 -0
  51. data/ext/image_pack/vendor/mozjpeg/jctrans.c +408 -0
  52. data/ext/image_pack/vendor/mozjpeg/jdapimin.c +407 -0
  53. data/ext/image_pack/vendor/mozjpeg/jdapistd.c +691 -0
  54. data/ext/image_pack/vendor/mozjpeg/jdarith.c +782 -0
  55. data/ext/image_pack/vendor/mozjpeg/jdatadst-tj.c +198 -0
  56. data/ext/image_pack/vendor/mozjpeg/jdatadst.c +299 -0
  57. data/ext/image_pack/vendor/mozjpeg/jdatasrc-tj.c +194 -0
  58. data/ext/image_pack/vendor/mozjpeg/jdatasrc.c +295 -0
  59. data/ext/image_pack/vendor/mozjpeg/jdcoefct.c +881 -0
  60. data/ext/image_pack/vendor/mozjpeg/jdcoefct.h +83 -0
  61. data/ext/image_pack/vendor/mozjpeg/jdcol565.c +384 -0
  62. data/ext/image_pack/vendor/mozjpeg/jdcolext.c +141 -0
  63. data/ext/image_pack/vendor/mozjpeg/jdcolor.c +881 -0
  64. data/ext/image_pack/vendor/mozjpeg/jdct.h +208 -0
  65. data/ext/image_pack/vendor/mozjpeg/jddctmgr.c +367 -0
  66. data/ext/image_pack/vendor/mozjpeg/jdhuff.c +834 -0
  67. data/ext/image_pack/vendor/mozjpeg/jdhuff.h +247 -0
  68. data/ext/image_pack/vendor/mozjpeg/jdicc.c +167 -0
  69. data/ext/image_pack/vendor/mozjpeg/jdinput.c +408 -0
  70. data/ext/image_pack/vendor/mozjpeg/jdmainct.c +460 -0
  71. data/ext/image_pack/vendor/mozjpeg/jdmainct.h +71 -0
  72. data/ext/image_pack/vendor/mozjpeg/jdmarker.c +1374 -0
  73. data/ext/image_pack/vendor/mozjpeg/jdmaster.c +727 -0
  74. data/ext/image_pack/vendor/mozjpeg/jdmaster.h +33 -0
  75. data/ext/image_pack/vendor/mozjpeg/jdmerge.c +587 -0
  76. data/ext/image_pack/vendor/mozjpeg/jdmerge.h +47 -0
  77. data/ext/image_pack/vendor/mozjpeg/jdmrg565.c +354 -0
  78. data/ext/image_pack/vendor/mozjpeg/jdmrgext.c +184 -0
  79. data/ext/image_pack/vendor/mozjpeg/jdphuff.c +679 -0
  80. data/ext/image_pack/vendor/mozjpeg/jdpostct.c +294 -0
  81. data/ext/image_pack/vendor/mozjpeg/jdsample.c +524 -0
  82. data/ext/image_pack/vendor/mozjpeg/jdsample.h +50 -0
  83. data/ext/image_pack/vendor/mozjpeg/jdtrans.c +156 -0
  84. data/ext/image_pack/vendor/mozjpeg/jerror.c +251 -0
  85. data/ext/image_pack/vendor/mozjpeg/jerror.h +335 -0
  86. data/ext/image_pack/vendor/mozjpeg/jfdctflt.c +169 -0
  87. data/ext/image_pack/vendor/mozjpeg/jfdctfst.c +227 -0
  88. data/ext/image_pack/vendor/mozjpeg/jfdctint.c +288 -0
  89. data/ext/image_pack/vendor/mozjpeg/jidctflt.c +240 -0
  90. data/ext/image_pack/vendor/mozjpeg/jidctfst.c +371 -0
  91. data/ext/image_pack/vendor/mozjpeg/jidctint.c +2627 -0
  92. data/ext/image_pack/vendor/mozjpeg/jidctred.c +409 -0
  93. data/ext/image_pack/vendor/mozjpeg/jinclude.h +147 -0
  94. data/ext/image_pack/vendor/mozjpeg/jmemmgr.c +1180 -0
  95. data/ext/image_pack/vendor/mozjpeg/jmemnobs.c +110 -0
  96. data/ext/image_pack/vendor/mozjpeg/jmemsys.h +178 -0
  97. data/ext/image_pack/vendor/mozjpeg/jmorecfg.h +382 -0
  98. data/ext/image_pack/vendor/mozjpeg/jpeg_nbits_table.h +4098 -0
  99. data/ext/image_pack/vendor/mozjpeg/jpegcomp.h +32 -0
  100. data/ext/image_pack/vendor/mozjpeg/jpegint.h +453 -0
  101. data/ext/image_pack/vendor/mozjpeg/jpeglib.h +1211 -0
  102. data/ext/image_pack/vendor/mozjpeg/jpegtran.c +827 -0
  103. data/ext/image_pack/vendor/mozjpeg/jpegyuv.c +172 -0
  104. data/ext/image_pack/vendor/mozjpeg/jquant1.c +856 -0
  105. data/ext/image_pack/vendor/mozjpeg/jquant2.c +1286 -0
  106. data/ext/image_pack/vendor/mozjpeg/jsimd.h +123 -0
  107. data/ext/image_pack/vendor/mozjpeg/jsimd_none.c +431 -0
  108. data/ext/image_pack/vendor/mozjpeg/jsimddct.h +70 -0
  109. data/ext/image_pack/vendor/mozjpeg/jstdhuff.c +144 -0
  110. data/ext/image_pack/vendor/mozjpeg/jutils.c +133 -0
  111. data/ext/image_pack/vendor/mozjpeg/jversion.h.in +56 -0
  112. data/ext/image_pack/vendor/mozjpeg/libjpeg.map.in +11 -0
  113. data/ext/image_pack/vendor/mozjpeg/libjpeg.txt +3150 -0
  114. data/ext/image_pack/vendor/mozjpeg/rdbmp.c +690 -0
  115. data/ext/image_pack/vendor/mozjpeg/rdcolmap.c +253 -0
  116. data/ext/image_pack/vendor/mozjpeg/rdgif.c +720 -0
  117. data/ext/image_pack/vendor/mozjpeg/rdjpeg.c +160 -0
  118. data/ext/image_pack/vendor/mozjpeg/rdjpgcom.c +494 -0
  119. data/ext/image_pack/vendor/mozjpeg/rdpng.c +194 -0
  120. data/ext/image_pack/vendor/mozjpeg/rdppm.c +781 -0
  121. data/ext/image_pack/vendor/mozjpeg/rdswitch.c +642 -0
  122. data/ext/image_pack/vendor/mozjpeg/rdtarga.c +508 -0
  123. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jccolext-neon.c +148 -0
  124. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jchuff-neon.c +334 -0
  125. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd.c +976 -0
  126. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd_neon.S +1200 -0
  127. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jccolext-neon.c +316 -0
  128. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jchuff-neon.c +411 -0
  129. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd.c +1053 -0
  130. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd_neon.S +2254 -0
  131. data/ext/image_pack/vendor/mozjpeg/simd/arm/align.h +28 -0
  132. data/ext/image_pack/vendor/mozjpeg/simd/arm/jccolor-neon.c +160 -0
  133. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgray-neon.c +120 -0
  134. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgryext-neon.c +106 -0
  135. data/ext/image_pack/vendor/mozjpeg/simd/arm/jchuff.h +131 -0
  136. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcphuff-neon.c +623 -0
  137. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcsample-neon.c +192 -0
  138. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolext-neon.c +374 -0
  139. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolor-neon.c +141 -0
  140. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmerge-neon.c +144 -0
  141. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmrgext-neon.c +723 -0
  142. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdsample-neon.c +569 -0
  143. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctfst-neon.c +214 -0
  144. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctint-neon.c +376 -0
  145. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctfst-neon.c +472 -0
  146. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctint-neon.c +801 -0
  147. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctred-neon.c +486 -0
  148. data/ext/image_pack/vendor/mozjpeg/simd/arm/jquanti-neon.c +193 -0
  149. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h +26 -0
  150. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h.in +37 -0
  151. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-avx2.asm +578 -0
  152. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-mmx.asm +476 -0
  153. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-sse2.asm +503 -0
  154. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-avx2.asm +121 -0
  155. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-mmx.asm +121 -0
  156. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-sse2.asm +120 -0
  157. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-avx2.asm +113 -0
  158. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-mmx.asm +113 -0
  159. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-sse2.asm +112 -0
  160. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-avx2.asm +457 -0
  161. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-mmx.asm +355 -0
  162. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-sse2.asm +382 -0
  163. data/ext/image_pack/vendor/mozjpeg/simd/i386/jchuff-sse2.asm +761 -0
  164. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcphuff-sse2.asm +662 -0
  165. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-avx2.asm +388 -0
  166. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-mmx.asm +324 -0
  167. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-sse2.asm +351 -0
  168. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-avx2.asm +515 -0
  169. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-mmx.asm +404 -0
  170. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-sse2.asm +458 -0
  171. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-avx2.asm +118 -0
  172. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-mmx.asm +117 -0
  173. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-sse2.asm +117 -0
  174. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-avx2.asm +136 -0
  175. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-mmx.asm +123 -0
  176. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-sse2.asm +135 -0
  177. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-avx2.asm +575 -0
  178. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-mmx.asm +460 -0
  179. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-sse2.asm +517 -0
  180. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-avx2.asm +760 -0
  181. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-mmx.asm +731 -0
  182. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-sse2.asm +724 -0
  183. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-3dn.asm +318 -0
  184. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-sse.asm +369 -0
  185. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-mmx.asm +395 -0
  186. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-sse2.asm +403 -0
  187. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-avx2.asm +331 -0
  188. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-mmx.asm +620 -0
  189. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-sse2.asm +633 -0
  190. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-3dn.asm +451 -0
  191. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse.asm +571 -0
  192. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse2.asm +497 -0
  193. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-mmx.asm +499 -0
  194. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-sse2.asm +501 -0
  195. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-avx2.asm +453 -0
  196. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-mmx.asm +851 -0
  197. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-sse2.asm +858 -0
  198. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-mmx.asm +704 -0
  199. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-sse2.asm +592 -0
  200. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-3dn.asm +230 -0
  201. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-mmx.asm +276 -0
  202. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-sse.asm +208 -0
  203. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquantf-sse2.asm +168 -0
  204. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-avx2.asm +188 -0
  205. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-sse2.asm +201 -0
  206. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimd.c +1312 -0
  207. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimdcpu.asm +135 -0
  208. data/ext/image_pack/vendor/mozjpeg/simd/jsimd.h +1258 -0
  209. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd.c +1143 -0
  210. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2.S +4543 -0
  211. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2_asm.h +292 -0
  212. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolext-mmi.c +455 -0
  213. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolor-mmi.c +148 -0
  214. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgray-mmi.c +132 -0
  215. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgryext-mmi.c +374 -0
  216. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample-mmi.c +98 -0
  217. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample.h +28 -0
  218. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolext-mmi.c +415 -0
  219. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolor-mmi.c +139 -0
  220. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmerge-mmi.c +149 -0
  221. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmrgext-mmi.c +615 -0
  222. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdsample-mmi.c +304 -0
  223. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctfst-mmi.c +255 -0
  224. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctint-mmi.c +398 -0
  225. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctfst-mmi.c +395 -0
  226. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctint-mmi.c +571 -0
  227. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jquanti-mmi.c +124 -0
  228. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd.c +866 -0
  229. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd_mmi.h +69 -0
  230. data/ext/image_pack/vendor/mozjpeg/simd/mips64/loongson-mmintrin.h +1334 -0
  231. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jcolsamp.inc +135 -0
  232. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jdct.inc +31 -0
  233. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc +93 -0
  234. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc.h +133 -0
  235. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdext.inc +520 -0
  236. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolext-altivec.c +269 -0
  237. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolor-altivec.c +116 -0
  238. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgray-altivec.c +111 -0
  239. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgryext-altivec.c +228 -0
  240. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample-altivec.c +159 -0
  241. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample.h +28 -0
  242. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolext-altivec.c +276 -0
  243. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolor-altivec.c +106 -0
  244. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmerge-altivec.c +130 -0
  245. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmrgext-altivec.c +329 -0
  246. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdsample-altivec.c +400 -0
  247. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctfst-altivec.c +154 -0
  248. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctint-altivec.c +258 -0
  249. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctfst-altivec.c +255 -0
  250. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctint-altivec.c +357 -0
  251. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jquanti-altivec.c +250 -0
  252. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd.c +884 -0
  253. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd_altivec.h +98 -0
  254. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-avx2.asm +559 -0
  255. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-sse2.asm +484 -0
  256. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-avx2.asm +121 -0
  257. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-sse2.asm +120 -0
  258. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-avx2.asm +113 -0
  259. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-sse2.asm +112 -0
  260. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-avx2.asm +438 -0
  261. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-sse2.asm +363 -0
  262. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jchuff-sse2.asm +583 -0
  263. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcphuff-sse2.asm +639 -0
  264. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-avx2.asm +367 -0
  265. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-sse2.asm +330 -0
  266. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-avx2.asm +496 -0
  267. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-sse2.asm +439 -0
  268. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-avx2.asm +118 -0
  269. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-sse2.asm +117 -0
  270. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-avx2.asm +136 -0
  271. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-sse2.asm +135 -0
  272. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-avx2.asm +596 -0
  273. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-sse2.asm +538 -0
  274. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-avx2.asm +696 -0
  275. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-sse2.asm +665 -0
  276. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctflt-sse.asm +355 -0
  277. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctfst-sse2.asm +389 -0
  278. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-avx2.asm +320 -0
  279. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-sse2.asm +619 -0
  280. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctflt-sse2.asm +482 -0
  281. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctfst-sse2.asm +491 -0
  282. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-avx2.asm +418 -0
  283. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-sse2.asm +847 -0
  284. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctred-sse2.asm +574 -0
  285. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquantf-sse2.asm +155 -0
  286. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-avx2.asm +163 -0
  287. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-sse2.asm +188 -0
  288. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimd.c +1110 -0
  289. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimdcpu.asm +86 -0
  290. data/ext/image_pack/vendor/mozjpeg/strtest.c +170 -0
  291. data/ext/image_pack/vendor/mozjpeg/structure.txt +900 -0
  292. data/ext/image_pack/vendor/mozjpeg/tjbench.c +1044 -0
  293. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.in +256 -0
  294. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.java.in +215 -0
  295. data/ext/image_pack/vendor/mozjpeg/tjexample.c +406 -0
  296. data/ext/image_pack/vendor/mozjpeg/tjexampletest.in +149 -0
  297. data/ext/image_pack/vendor/mozjpeg/tjexampletest.java.in +151 -0
  298. data/ext/image_pack/vendor/mozjpeg/tjunittest.c +961 -0
  299. data/ext/image_pack/vendor/mozjpeg/tjutil.c +70 -0
  300. data/ext/image_pack/vendor/mozjpeg/tjutil.h +53 -0
  301. data/ext/image_pack/vendor/mozjpeg/transupp.c +2373 -0
  302. data/ext/image_pack/vendor/mozjpeg/transupp.h +243 -0
  303. data/ext/image_pack/vendor/mozjpeg/turbojpeg-jni.c +1259 -0
  304. data/ext/image_pack/vendor/mozjpeg/turbojpeg.c +2320 -0
  305. data/ext/image_pack/vendor/mozjpeg/turbojpeg.h +1784 -0
  306. data/ext/image_pack/vendor/mozjpeg/usage.txt +679 -0
  307. data/ext/image_pack/vendor/mozjpeg/wizard.txt +220 -0
  308. data/ext/image_pack/vendor/mozjpeg/wrbmp.c +552 -0
  309. data/ext/image_pack/vendor/mozjpeg/wrgif.c +580 -0
  310. data/ext/image_pack/vendor/mozjpeg/wrjpgcom.c +577 -0
  311. data/ext/image_pack/vendor/mozjpeg/wrppm.c +366 -0
  312. data/ext/image_pack/vendor/mozjpeg/wrtarga.c +258 -0
  313. data/ext/image_pack/vendor/mozjpeg/yuvjpeg.c +268 -0
  314. data/lib/image_pack/backend.rb +8 -0
  315. data/lib/image_pack/configuration.rb +23 -0
  316. data/lib/image_pack/errors.rb +13 -0
  317. data/lib/image_pack/version.rb +5 -0
  318. data/lib/image_pack.rb +208 -0
  319. metadata +433 -0
@@ -0,0 +1,1200 @@
1
+ /*
2
+ * Armv7 Neon optimizations for libjpeg-turbo
3
+ *
4
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
5
+ * All Rights Reserved.
6
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
8
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
9
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
10
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
11
+ *
12
+ * This software is provided 'as-is', without any express or implied
13
+ * warranty. In no event will the authors be held liable for any damages
14
+ * arising from the use of this software.
15
+ *
16
+ * Permission is granted to anyone to use this software for any purpose,
17
+ * including commercial applications, and to alter it and redistribute it
18
+ * freely, subject to the following restrictions:
19
+ *
20
+ * 1. The origin of this software must not be misrepresented; you must not
21
+ * claim that you wrote the original software. If you use this software
22
+ * in a product, an acknowledgment in the product documentation would be
23
+ * appreciated but is not required.
24
+ * 2. Altered source versions must be plainly marked as such, and must not be
25
+ * misrepresented as being the original software.
26
+ * 3. This notice may not be removed or altered from any source distribution.
27
+ */
28
+
29
+ #if defined(__linux__) && defined(__ELF__)
30
+ .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
31
+ #endif
32
+
33
+ .text
34
+ .fpu neon
35
+ .arch armv7a
36
+ .object_arch armv4
37
+ .arm
38
+ .syntax unified
39
+
40
+
41
+ /*****************************************************************************/
42
+
43
+ /* Supplementary macro for setting function attributes */
44
+ .macro asm_function fname
45
+ #ifdef __APPLE__
46
+ .private_extern _\fname
47
+ .globl _\fname
48
+ _\fname:
49
+ #else
50
+ .global \fname
51
+ #ifdef __ELF__
52
+ .hidden \fname
53
+ .type \fname, %function
54
+ #endif
55
+ \fname:
56
+ #endif
57
+ .endm
58
+
59
+
60
+ #define CENTERJSAMPLE 128
61
+
62
+ /*****************************************************************************/
63
+
64
+ /*
65
+ * Perform dequantization and inverse DCT on one block of coefficients.
66
+ *
67
+ * GLOBAL(void)
68
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
69
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
70
+ */
71
+
72
+ #define FIX_0_298631336 (2446)
73
+ #define FIX_0_390180644 (3196)
74
+ #define FIX_0_541196100 (4433)
75
+ #define FIX_0_765366865 (6270)
76
+ #define FIX_0_899976223 (7373)
77
+ #define FIX_1_175875602 (9633)
78
+ #define FIX_1_501321110 (12299)
79
+ #define FIX_1_847759065 (15137)
80
+ #define FIX_1_961570560 (16069)
81
+ #define FIX_2_053119869 (16819)
82
+ #define FIX_2_562915447 (20995)
83
+ #define FIX_3_072711026 (25172)
84
+
85
+ #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
86
+ #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
87
+ #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
88
+ #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
89
+ #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
90
+ #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
91
+ #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
92
+ #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
93
+
94
+ /*
95
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
96
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
97
+ */
98
+ #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
99
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
100
+ JLONG q1, q2, q3, q4, q5, q6, q7; \
101
+ JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
102
+ \
103
+ /* 1-D iDCT input data */ \
104
+ row0 = xrow0; \
105
+ row1 = xrow1; \
106
+ row2 = xrow2; \
107
+ row3 = xrow3; \
108
+ row4 = xrow4; \
109
+ row5 = xrow5; \
110
+ row6 = xrow6; \
111
+ row7 = xrow7; \
112
+ \
113
+ q5 = row7 + row3; \
114
+ q4 = row5 + row1; \
115
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
116
+ MULTIPLY(q4, FIX_1_175875602); \
117
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
118
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
119
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
120
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
121
+ q4 = q6; \
122
+ q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
123
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
124
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
125
+ /* now we can use q1 (reloadable constants have been used up) */ \
126
+ q1 = q3 + q2; \
127
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
128
+ MULTIPLY(row1, -FIX_0_899976223); \
129
+ q5 = q7; \
130
+ q1 = q1 + q6; \
131
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
132
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
133
+ \
134
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
135
+ tmp11_plus_tmp2 = q1; \
136
+ row1 = 0; \
137
+ \
138
+ q1 = q1 - q6; \
139
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
140
+ MULTIPLY(row3, -FIX_2_562915447); \
141
+ q1 = q1 - q6; \
142
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
143
+ MULTIPLY(row6, FIX_0_541196100); \
144
+ q3 = q3 - q2; \
145
+ \
146
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
147
+ tmp11_minus_tmp2 = q1; \
148
+ \
149
+ q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
150
+ q2 = q1 + q6; \
151
+ q1 = q1 - q6; \
152
+ \
153
+ /* pick up the results */ \
154
+ tmp0 = q4; \
155
+ tmp1 = q5; \
156
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
157
+ tmp3 = q7; \
158
+ tmp10 = q2; \
159
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
160
+ tmp12 = q3; \
161
+ tmp13 = q1; \
162
+ }
163
+
164
+ #define XFIX_0_899976223 d0[0]
165
+ #define XFIX_0_541196100 d0[1]
166
+ #define XFIX_2_562915447 d0[2]
167
+ #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
168
+ #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
169
+ #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
170
+ #define XFIX_0_541196100_PLUS_0_765366865 d1[2]
171
+ #define XFIX_1_175875602 d1[3]
172
+ #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
173
+ #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
174
+ #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
175
+ #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
176
+
177
+ .balign 16
178
+ jsimd_idct_islow_neon_consts:
179
+ .short FIX_0_899976223 /* d0[0] */
180
+ .short FIX_0_541196100 /* d0[1] */
181
+ .short FIX_2_562915447 /* d0[2] */
182
+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
183
+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
184
+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
185
+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
186
+ .short FIX_1_175875602 /* d1[3] */
187
+ /* reloadable constants */
188
+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
189
+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
190
+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
191
+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
192
+
193
+ asm_function jsimd_idct_islow_neon
194
+
195
+ DCT_TABLE .req r0
196
+ COEF_BLOCK .req r1
197
+ OUTPUT_BUF .req r2
198
+ OUTPUT_COL .req r3
199
+ TMP1 .req r0
200
+ TMP2 .req r1
201
+ TMP3 .req r2
202
+ TMP4 .req ip
203
+
204
+ ROW0L .req d16
205
+ ROW0R .req d17
206
+ ROW1L .req d18
207
+ ROW1R .req d19
208
+ ROW2L .req d20
209
+ ROW2R .req d21
210
+ ROW3L .req d22
211
+ ROW3R .req d23
212
+ ROW4L .req d24
213
+ ROW4R .req d25
214
+ ROW5L .req d26
215
+ ROW5R .req d27
216
+ ROW6L .req d28
217
+ ROW6R .req d29
218
+ ROW7L .req d30
219
+ ROW7R .req d31
220
+
221
+ /* Load and dequantize coefficients into Neon registers
222
+ * with the following allocation:
223
+ * 0 1 2 3 | 4 5 6 7
224
+ * ---------+--------
225
+ * 0 | d16 | d17 ( q8 )
226
+ * 1 | d18 | d19 ( q9 )
227
+ * 2 | d20 | d21 ( q10 )
228
+ * 3 | d22 | d23 ( q11 )
229
+ * 4 | d24 | d25 ( q12 )
230
+ * 5 | d26 | d27 ( q13 )
231
+ * 6 | d28 | d29 ( q14 )
232
+ * 7 | d30 | d31 ( q15 )
233
+ */
234
+ adr ip, jsimd_idct_islow_neon_consts
235
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
236
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
237
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
238
+ vmul.s16 q8, q8, q0
239
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
240
+ vmul.s16 q9, q9, q1
241
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
242
+ vmul.s16 q10, q10, q2
243
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
244
+ vmul.s16 q11, q11, q3
245
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
246
+ vmul.s16 q12, q12, q0
247
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
248
+ vmul.s16 q14, q14, q2
249
+ vmul.s16 q13, q13, q1
250
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
251
+ add ip, ip, #16
252
+ vmul.s16 q15, q15, q3
253
+ vpush {d8 - d15} /* save Neon registers */
254
+ /* 1-D IDCT, pass 1, left 4x8 half */
255
+ vadd.s16 d4, ROW7L, ROW3L
256
+ vadd.s16 d5, ROW5L, ROW1L
257
+ vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
258
+ vmlal.s16 q6, d5, XFIX_1_175875602
259
+ vmull.s16 q7, d4, XFIX_1_175875602
260
+ /* Check for the zero coefficients in the right 4x8 half */
261
+ push {r4, r5}
262
+ vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
263
+ vsubl.s16 q3, ROW0L, ROW4L
264
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
265
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
266
+ vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
267
+ orr r0, r4, r5
268
+ vmov q4, q6
269
+ vmlsl.s16 q6, ROW5L, XFIX_2_562915447
270
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
271
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
272
+ vshl.s32 q3, q3, #13
273
+ orr r0, r0, r4
274
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
275
+ orr r0, r0, r5
276
+ vadd.s32 q1, q3, q2
277
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
278
+ vmov q5, q7
279
+ vadd.s32 q1, q1, q6
280
+ orr r0, r0, r4
281
+ vmlsl.s16 q7, ROW7L, XFIX_0_899976223
282
+ orr r0, r0, r5
283
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
284
+ vrshrn.s32 ROW1L, q1, #11
285
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
286
+ vsub.s32 q1, q1, q6
287
+ vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
288
+ orr r0, r0, r4
289
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
290
+ orr r0, r0, r5
291
+ vsub.s32 q1, q1, q6
292
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
293
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
294
+ vmlal.s16 q6, ROW6L, XFIX_0_541196100
295
+ vsub.s32 q3, q3, q2
296
+ orr r0, r0, r4
297
+ vrshrn.s32 ROW6L, q1, #11
298
+ orr r0, r0, r5
299
+ vadd.s32 q1, q3, q5
300
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
301
+ vsub.s32 q3, q3, q5
302
+ vaddl.s16 q5, ROW0L, ROW4L
303
+ orr r0, r0, r4
304
+ vrshrn.s32 ROW2L, q1, #11
305
+ orr r0, r0, r5
306
+ vrshrn.s32 ROW5L, q3, #11
307
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
308
+ vshl.s32 q5, q5, #13
309
+ vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
310
+ orr r0, r0, r4
311
+ vadd.s32 q2, q5, q6
312
+ orrs r0, r0, r5
313
+ vsub.s32 q1, q5, q6
314
+ vadd.s32 q6, q2, q7
315
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
316
+ vsub.s32 q2, q2, q7
317
+ vadd.s32 q5, q1, q4
318
+ orr r0, r4, r5
319
+ vsub.s32 q3, q1, q4
320
+ pop {r4, r5}
321
+ vrshrn.s32 ROW7L, q2, #11
322
+ vrshrn.s32 ROW3L, q5, #11
323
+ vrshrn.s32 ROW0L, q6, #11
324
+ vrshrn.s32 ROW4L, q3, #11
325
+
326
+ beq 3f /* Go to do some special handling for the sparse
327
+ right 4x8 half */
328
+
329
+ /* 1-D IDCT, pass 1, right 4x8 half */
330
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
331
+ vadd.s16 d10, ROW7R, ROW3R
332
+ vadd.s16 d8, ROW5R, ROW1R
333
+ /* Transpose left 4x8 half */
334
+ vtrn.16 ROW6L, ROW7L
335
+ vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
336
+ vmlal.s16 q6, d8, XFIX_1_175875602
337
+ vtrn.16 ROW2L, ROW3L
338
+ vmull.s16 q7, d10, XFIX_1_175875602
339
+ vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
340
+ vtrn.16 ROW0L, ROW1L
341
+ vsubl.s16 q3, ROW0R, ROW4R
342
+ vmull.s16 q2, ROW2R, XFIX_0_541196100
343
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
344
+ vtrn.16 ROW4L, ROW5L
345
+ vmov q4, q6
346
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
347
+ vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
348
+ vtrn.32 ROW1L, ROW3L
349
+ vshl.s32 q3, q3, #13
350
+ vmlsl.s16 q4, ROW1R, XFIX_0_899976223
351
+ vtrn.32 ROW4L, ROW6L
352
+ vadd.s32 q1, q3, q2
353
+ vmov q5, q7
354
+ vadd.s32 q1, q1, q6
355
+ vtrn.32 ROW0L, ROW2L
356
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
357
+ vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
358
+ vrshrn.s32 ROW1R, q1, #11
359
+ vtrn.32 ROW5L, ROW7L
360
+ vsub.s32 q1, q1, q6
361
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
362
+ vmlsl.s16 q5, ROW3R, XFIX_2_562915447
363
+ vsub.s32 q1, q1, q6
364
+ vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
365
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
366
+ vsub.s32 q3, q3, q2
367
+ vrshrn.s32 ROW6R, q1, #11
368
+ vadd.s32 q1, q3, q5
369
+ vsub.s32 q3, q3, q5
370
+ vaddl.s16 q5, ROW0R, ROW4R
371
+ vrshrn.s32 ROW2R, q1, #11
372
+ vrshrn.s32 ROW5R, q3, #11
373
+ vshl.s32 q5, q5, #13
374
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
375
+ vadd.s32 q2, q5, q6
376
+ vsub.s32 q1, q5, q6
377
+ vadd.s32 q6, q2, q7
378
+ vsub.s32 q2, q2, q7
379
+ vadd.s32 q5, q1, q4
380
+ vsub.s32 q3, q1, q4
381
+ vrshrn.s32 ROW7R, q2, #11
382
+ vrshrn.s32 ROW3R, q5, #11
383
+ vrshrn.s32 ROW0R, q6, #11
384
+ vrshrn.s32 ROW4R, q3, #11
385
+ /* Transpose right 4x8 half */
386
+ vtrn.16 ROW6R, ROW7R
387
+ vtrn.16 ROW2R, ROW3R
388
+ vtrn.16 ROW0R, ROW1R
389
+ vtrn.16 ROW4R, ROW5R
390
+ vtrn.32 ROW1R, ROW3R
391
+ vtrn.32 ROW4R, ROW6R
392
+ vtrn.32 ROW0R, ROW2R
393
+ vtrn.32 ROW5R, ROW7R
394
+
395
+ 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
396
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
397
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
398
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602
399
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
400
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
401
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
402
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602
403
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
404
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
405
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
406
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
407
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
408
+ vmov q4, q6
409
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
410
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
411
+ vshl.s32 q3, q3, #13
412
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
413
+ vadd.s32 q1, q3, q2
414
+ vmov q5, q7
415
+ vadd.s32 q1, q1, q6
416
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
417
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
418
+ vshrn.s32 ROW1L, q1, #16
419
+ vsub.s32 q1, q1, q6
420
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
421
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
422
+ vsub.s32 q1, q1, q6
423
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
424
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
425
+ vsub.s32 q3, q3, q2
426
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
427
+ vadd.s32 q1, q3, q5
428
+ vsub.s32 q3, q3, q5
429
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
430
+ vshrn.s32 ROW2L, q1, #16
431
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
432
+ vshl.s32 q5, q5, #13
433
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
434
+ vadd.s32 q2, q5, q6
435
+ vsub.s32 q1, q5, q6
436
+ vadd.s32 q6, q2, q7
437
+ vsub.s32 q2, q2, q7
438
+ vadd.s32 q5, q1, q4
439
+ vsub.s32 q3, q1, q4
440
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
441
+ vshrn.s32 ROW3L, q5, #16
442
+ vshrn.s32 ROW0L, q6, #16
443
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
444
+ /* 1-D IDCT, pass 2, right 4x8 half */
445
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
446
+ vmull.s16 q6, ROW5R, XFIX_1_175875602
447
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
448
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
449
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
450
+ vmull.s16 q7, ROW7R, XFIX_1_175875602
451
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
452
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
453
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
454
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
455
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
456
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
457
+ vmov q4, q6
458
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
459
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
460
+ vshl.s32 q3, q3, #13
461
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
462
+ vadd.s32 q1, q3, q2
463
+ vmov q5, q7
464
+ vadd.s32 q1, q1, q6
465
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
466
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
467
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
468
+ vsub.s32 q1, q1, q6
469
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
470
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
471
+ vsub.s32 q1, q1, q6
472
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
473
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
474
+ vsub.s32 q3, q3, q2
475
+ vshrn.s32 ROW6R, q1, #16
476
+ vadd.s32 q1, q3, q5
477
+ vsub.s32 q3, q3, q5
478
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
479
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
480
+ vshrn.s32 ROW5R, q3, #16
481
+ vshl.s32 q5, q5, #13
482
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
483
+ vadd.s32 q2, q5, q6
484
+ vsub.s32 q1, q5, q6
485
+ vadd.s32 q6, q2, q7
486
+ vsub.s32 q2, q2, q7
487
+ vadd.s32 q5, q1, q4
488
+ vsub.s32 q3, q1, q4
489
+ vshrn.s32 ROW7R, q2, #16
490
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
491
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
492
+ vshrn.s32 ROW4R, q3, #16
493
+
494
+ 2: /* Descale to 8-bit and range limit */
495
+ vqrshrn.s16 d16, q8, #2
496
+ vqrshrn.s16 d17, q9, #2
497
+ vqrshrn.s16 d18, q10, #2
498
+ vqrshrn.s16 d19, q11, #2
499
+ vpop {d8 - d15} /* restore Neon registers */
500
+ vqrshrn.s16 d20, q12, #2
501
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
502
+ vtrn.16 q8, q9
503
+ vqrshrn.s16 d21, q13, #2
504
+ vqrshrn.s16 d22, q14, #2
505
+ vmov.u8 q0, #(CENTERJSAMPLE)
506
+ vqrshrn.s16 d23, q15, #2
507
+ vtrn.8 d16, d17
508
+ vtrn.8 d18, d19
509
+ vadd.u8 q8, q8, q0
510
+ vadd.u8 q9, q9, q0
511
+ vtrn.16 q10, q11
512
+ /* Store results to the output buffer */
513
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
514
+ add TMP1, TMP1, OUTPUT_COL
515
+ add TMP2, TMP2, OUTPUT_COL
516
+ vst1.8 {d16}, [TMP1]
517
+ vtrn.8 d20, d21
518
+ vst1.8 {d17}, [TMP2]
519
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
520
+ add TMP1, TMP1, OUTPUT_COL
521
+ add TMP2, TMP2, OUTPUT_COL
522
+ vst1.8 {d18}, [TMP1]
523
+ vadd.u8 q10, q10, q0
524
+ vst1.8 {d19}, [TMP2]
525
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
526
+ add TMP1, TMP1, OUTPUT_COL
527
+ add TMP2, TMP2, OUTPUT_COL
528
+ add TMP3, TMP3, OUTPUT_COL
529
+ add TMP4, TMP4, OUTPUT_COL
530
+ vtrn.8 d22, d23
531
+ vst1.8 {d20}, [TMP1]
532
+ vadd.u8 q11, q11, q0
533
+ vst1.8 {d21}, [TMP2]
534
+ vst1.8 {d22}, [TMP3]
535
+ vst1.8 {d23}, [TMP4]
536
+ bx lr
537
+
538
+ 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
539
+
540
+ /* Transpose left 4x8 half */
541
+ vtrn.16 ROW6L, ROW7L
542
+ vtrn.16 ROW2L, ROW3L
543
+ vtrn.16 ROW0L, ROW1L
544
+ vtrn.16 ROW4L, ROW5L
545
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
546
+ vtrn.32 ROW1L, ROW3L
547
+ vtrn.32 ROW4L, ROW6L
548
+ vtrn.32 ROW0L, ROW2L
549
+ vtrn.32 ROW5L, ROW7L
550
+
551
+ cmp r0, #0
552
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
553
+ pass */
554
+
555
+ /* Only row 0 is non-zero for the right 4x8 half */
556
+ vdup.s16 ROW1R, ROW0R[1]
557
+ vdup.s16 ROW2R, ROW0R[2]
558
+ vdup.s16 ROW3R, ROW0R[3]
559
+ vdup.s16 ROW4R, ROW0R[0]
560
+ vdup.s16 ROW5R, ROW0R[1]
561
+ vdup.s16 ROW6R, ROW0R[2]
562
+ vdup.s16 ROW7R, ROW0R[3]
563
+ vdup.s16 ROW0R, ROW0R[0]
564
+ b 1b /* Go to 'normal' second pass */
565
+
566
+ 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
567
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
568
+ vmull.s16 q6, ROW1L, XFIX_1_175875602
569
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
570
+ vmull.s16 q7, ROW3L, XFIX_1_175875602
571
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
572
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
573
+ vshll.s16 q3, ROW0L, #13
574
+ vmov q4, q6
575
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
576
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
577
+ vadd.s32 q1, q3, q2
578
+ vmov q5, q7
579
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
580
+ vadd.s32 q1, q1, q6
581
+ vadd.s32 q6, q6, q6
582
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
583
+ vshrn.s32 ROW1L, q1, #16
584
+ vsub.s32 q1, q1, q6
585
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
586
+ vsub.s32 q3, q3, q2
587
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
588
+ vadd.s32 q1, q3, q5
589
+ vsub.s32 q3, q3, q5
590
+ vshll.s16 q5, ROW0L, #13
591
+ vshrn.s32 ROW2L, q1, #16
592
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
593
+ vadd.s32 q2, q5, q6
594
+ vsub.s32 q1, q5, q6
595
+ vadd.s32 q6, q2, q7
596
+ vsub.s32 q2, q2, q7
597
+ vadd.s32 q5, q1, q4
598
+ vsub.s32 q3, q1, q4
599
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
600
+ vshrn.s32 ROW3L, q5, #16
601
+ vshrn.s32 ROW0L, q6, #16
602
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
603
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
604
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
605
+ vmull.s16 q6, ROW5L, XFIX_1_175875602
606
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
607
+ vmull.s16 q7, ROW7L, XFIX_1_175875602
608
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
609
+ vmull.s16 q2, ROW6L, XFIX_0_541196100
610
+ vshll.s16 q3, ROW4L, #13
611
+ vmov q4, q6
612
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
613
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223
614
+ vadd.s32 q1, q3, q2
615
+ vmov q5, q7
616
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
617
+ vadd.s32 q1, q1, q6
618
+ vadd.s32 q6, q6, q6
619
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447
620
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
621
+ vsub.s32 q1, q1, q6
622
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
623
+ vsub.s32 q3, q3, q2
624
+ vshrn.s32 ROW6R, q1, #16
625
+ vadd.s32 q1, q3, q5
626
+ vsub.s32 q3, q3, q5
627
+ vshll.s16 q5, ROW4L, #13
628
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
629
+ vshrn.s32 ROW5R, q3, #16
630
+ vadd.s32 q2, q5, q6
631
+ vsub.s32 q1, q5, q6
632
+ vadd.s32 q6, q2, q7
633
+ vsub.s32 q2, q2, q7
634
+ vadd.s32 q5, q1, q4
635
+ vsub.s32 q3, q1, q4
636
+ vshrn.s32 ROW7R, q2, #16
637
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
638
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
639
+ vshrn.s32 ROW4R, q3, #16
640
+ b 2b /* Go to epilogue */
641
+
642
+ .unreq DCT_TABLE
643
+ .unreq COEF_BLOCK
644
+ .unreq OUTPUT_BUF
645
+ .unreq OUTPUT_COL
646
+ .unreq TMP1
647
+ .unreq TMP2
648
+ .unreq TMP3
649
+ .unreq TMP4
650
+
651
+ .unreq ROW0L
652
+ .unreq ROW0R
653
+ .unreq ROW1L
654
+ .unreq ROW1R
655
+ .unreq ROW2L
656
+ .unreq ROW2R
657
+ .unreq ROW3L
658
+ .unreq ROW3R
659
+ .unreq ROW4L
660
+ .unreq ROW4R
661
+ .unreq ROW5L
662
+ .unreq ROW5R
663
+ .unreq ROW6L
664
+ .unreq ROW6R
665
+ .unreq ROW7L
666
+ .unreq ROW7R
667
+
668
+
669
+ /*****************************************************************************/
670
+
671
/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c
 *
 * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in Arm Neon case some extra additions are required because VQDMULH
 * instruction can't handle the constants larger than 1. So the expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
 */

/* VQDMULH operand constants, kept as lanes of d0 (loaded from the consts
 * table below).  Each value is the fractional part of the AAN constant,
 * scaled so that vqdmulh computes x * (constant - 1) (or - 2 for the
 * 2.613 constant); the integer part is re-added with plain vadd. */
#define XFIX_1_082392200 d0[0]
#define XFIX_1_414213562 d0[1]
#define XFIX_1_847759065 d0[2]
#define XFIX_2_613125930 d0[3]

.balign 16
jsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

/* In (AAPCS core-register arguments):
 *   r0 = dct_table   (dequantization multipliers, 8x8 int16)
 *   r1 = coef_block  (input DCT coefficients, 8x8 int16)
 *   r2 = output_buf  (array of 8 output row pointers)
 *   r3 = output_col  (byte offset added to each row pointer)
 * Clobbers: r0-r3, ip, q0-q3, q8-q15; d8-d13 are saved/restored because
 * AAPCS requires d8-d15 to be preserved across calls.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req r0
    COEF_BLOCK      .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_COL      .req r3
    TMP1            .req r0
    TMP2            .req r1
    TMP3            .req r2
    TMP4            .req ip

    /* Load and dequantize coefficients into Neon registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17     ( q8  )
     *   1  | d18    | d19     ( q9  )
     *   2  | d20    | d21     ( q10 )
     *   3  | d22    | d23     ( q11 )
     *   4  | d24    | d25     ( q12 )
     *   5  | d26    | d27     ( q13 )
     *   6  | d28    | d29     ( q14 )
     *   7  | d30    | d31     ( q15 )
     * Loads of coefficients and quant-table rows are interleaved with the
     * vmul dequantization to hide load latency.
     */
    adr             ip, jsimd_idct_ifast_neon_consts
    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    vmul.s16        q8, q8, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q9, q9, q1
    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    vmul.s16        q10, q10, q2
    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    vmul.s16        q11, q11, q3
    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    vmul.s16        q12, q12, q0
    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    vmul.s16        q14, q14, q2
    vmul.s16        q13, q13, q1
    vld1.16         {d0}, [ip, :64]     /* load constants */
    vmul.s16        q15, q15, q3
    vpush           {d8 - d13}          /* save Neon registers */
    /* 1-D IDCT, pass 1 (over columns) */
    vsub.s16        q2, q10, q14
    vadd.s16        q14, q10, q14
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vadd.s16        q10, q10, q2
    /* Transpose (8x8 16-bit), interleaved with the tail of pass 1 */
    vtrn.16         q8, q9
    vsub.s16        q11, q12, q1
    vtrn.16         q14, q15
    vadd.s16        q12, q12, q1
    vtrn.16         q10, q11
    vtrn.16         q12, q13
    vtrn.32         q9, q11
    vtrn.32         q12, q14
    vtrn.32         q8, q10
    vtrn.32         q13, q15
    vswp            d28, d21
    vswp            d26, d19
    /* 1-D IDCT, pass 2 (over rows); remaining vswp of the transpose are
     * folded into the first few instructions of this pass */
    vsub.s16        q2, q10, q14
    vswp            d30, d23
    vadd.s16        q14, q10, q14
    vswp            d24, d17
    vsub.s16        q1, q11, q13
    vadd.s16        q13, q11, q13
    vsub.s16        q5, q9, q15
    vadd.s16        q15, q9, q15
    vqdmulh.s16     q4, q2, XFIX_1_414213562
    vqdmulh.s16     q6, q1, XFIX_2_613125930
    vadd.s16        q3, q1, q1
    vsub.s16        q1, q5, q1
    vadd.s16        q10, q2, q4
    vqdmulh.s16     q4, q1, XFIX_1_847759065
    vsub.s16        q2, q15, q13
    vadd.s16        q3, q3, q6
    vqdmulh.s16     q6, q2, XFIX_1_414213562
    vadd.s16        q1, q1, q4
    vqdmulh.s16     q4, q5, XFIX_1_082392200
    vsub.s16        q10, q10, q14
    vadd.s16        q2, q2, q6
    vsub.s16        q6, q8, q12
    vadd.s16        q12, q8, q12
    vadd.s16        q9, q5, q4
    vadd.s16        q5, q6, q10
    vsub.s16        q10, q6, q10
    vadd.s16        q6, q15, q13
    vadd.s16        q8, q12, q14
    vsub.s16        q3, q6, q3
    vsub.s16        q12, q12, q14
    vsub.s16        q3, q3, q1
    vsub.s16        q1, q9, q1
    vadd.s16        q2, q3, q2
    vsub.s16        q15, q8, q6
    vadd.s16        q1, q1, q2
    vadd.s16        q8, q8, q6
    vadd.s16        q14, q5, q3
    vsub.s16        q9, q5, q3
    vsub.s16        q13, q10, q2
    vpop            {d8 - d13}          /* restore Neon registers */
    vadd.s16        q10, q10, q2
    vsub.s16        q11, q12, q1
    vadd.s16        q12, q12, q1
    /* Descale to 8-bit and range limit (saturating narrow shift, then
     * signed->unsigned recentering by +0x80) */
    vmov.u8         q0, #0x80
    vqshrn.s16      d16, q8, #5
    vqshrn.s16      d17, q9, #5
    vqshrn.s16      d18, q10, #5
    vqshrn.s16      d19, q11, #5
    vqshrn.s16      d20, q12, #5
    vqshrn.s16      d21, q13, #5
    vqshrn.s16      d22, q14, #5
    vqshrn.s16      d23, q15, #5
    vadd.u8         q8, q8, q0
    vadd.u8         q9, q9, q0
    vadd.u8         q10, q10, q0
    vadd.u8         q11, q11, q0
    /* Transpose the final 8-bit samples */
    vtrn.16         q8, q9
    vtrn.16         q10, q11
    vtrn.32         q8, q10
    vtrn.32         q9, q11
    vtrn.8          d16, d17
    vtrn.8          d18, d19
    /* Store results to the output buffer: one 8-byte row per row pointer,
     * each offset by OUTPUT_COL */
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d16}, [TMP1]
    vst1.8          {d17}, [TMP2]
    ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    vst1.8          {d18}, [TMP1]
    vtrn.8          d20, d21
    vst1.8          {d19}, [TMP2]
    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL
    vst1.8          {d20}, [TMP1]
    vtrn.8          d22, d23
    vst1.8          {d21}, [TMP2]
    vst1.8          {d22}, [TMP3]
    vst1.8          {d23}, [TMP4]
    bx              lr

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
891
+
892
+
893
+ /*****************************************************************************/
894
+
895
+ /*
896
+ * jsimd_extrgb_ycc_convert_neon
897
+ * jsimd_extbgr_ycc_convert_neon
898
+ * jsimd_extrgbx_ycc_convert_neon
899
+ * jsimd_extbgrx_ycc_convert_neon
900
+ * jsimd_extxbgr_ycc_convert_neon
901
+ * jsimd_extxrgb_ycc_convert_neon
902
+ *
903
+ * Colorspace conversion RGB -> YCbCr
904
+ */
905
+
906
/* Store \size converted pixels: d20 = Y, d21 = Cb, d22 = Cr; the Y/U/V
 * destination pointers are post-incremented.  Partial sizes store single
 * lanes, resuming at the lane where the previous partial store stopped
 * (size 4 -> lanes 0-3, size 2 -> lanes 4-5, size 1 -> lane 6), so a
 * 4+2+1 sequence flushes 7 trailing pixels in order. */
.macro do_store size
  .if \size == 8
    vst1.8          {d20}, [Y]!
    vst1.8          {d21}, [U]!
    vst1.8          {d22}, [V]!
  .elseif \size == 4
    vst1.8          {d20[0]}, [Y]!
    vst1.8          {d20[1]}, [Y]!
    vst1.8          {d20[2]}, [Y]!
    vst1.8          {d20[3]}, [Y]!
    vst1.8          {d21[0]}, [U]!
    vst1.8          {d21[1]}, [U]!
    vst1.8          {d21[2]}, [U]!
    vst1.8          {d21[3]}, [U]!
    vst1.8          {d22[0]}, [V]!
    vst1.8          {d22[1]}, [V]!
    vst1.8          {d22[2]}, [V]!
    vst1.8          {d22[3]}, [V]!
  .elseif \size == 2
    vst1.8          {d20[4]}, [Y]!
    vst1.8          {d20[5]}, [Y]!
    vst1.8          {d21[4]}, [U]!
    vst1.8          {d21[5]}, [U]!
    vst1.8          {d22[4]}, [V]!
    vst1.8          {d22[5]}, [V]!
  .elseif \size == 1
    vst1.8          {d20[6]}, [Y]!
    vst1.8          {d21[6]}, [U]!
    vst1.8          {d22[6]}, [V]!
  .else
    .error unsupported macroblock size
  .endif
.endm
939
+
940
/* Load \size source pixels into d10/d11/d12 (and d13 for 32 bpp),
 * de-interleaving the colour channels (vld3 for 24-bit RGB, vld4 for
 * 32-bit RGBX); the RGB pointer is post-incremented.  Partial sizes load
 * single lanes at the same fixed lane positions used by do_store
 * (4 -> lanes 0-3, 2 -> lanes 4-5, 1 -> lane 6).  Full 8-pixel loads
 * also prefetch 128 bytes ahead. */
.macro do_load bpp, size
  .if \bpp == 24
    .if \size == 8
      vld3.8        {d10, d11, d12}, [RGB]!
      pld           [RGB, #128]
    .elseif \size == 4
      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
    .elseif \size == 2
      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
    .elseif \size == 1
      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      vld4.8        {d10, d11, d12, d13}, [RGB]!
      pld           [RGB, #128]
    .elseif \size == 4
      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
    .elseif \size == 2
      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
    .elseif \size == 1
      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
979
+
980
/* Instantiate one jsimd_\colorid\()_ycc_convert_neon function that
 * converts OUTPUT_WIDTH pixels per row, for NUM_ROWS rows, from the given
 * RGB ordering to YCbCr.
 *   \colorid                 name fragment for the generated symbols
 *   \bpp                     source bytes per pixel (24 or 32)
 *   \r_offs/\g_offs/\b_offs  which of d10..d13 holds R/G/B after do_load
 *                            (spliced as d1<offs> in the stage macros)
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/* Stage 1: widen R/G/B to 16-bit and form the fixed-point dot products:
 * q7/q8  = Y  accumulators (low/high 4 pixels),
 * q9/q13 = Cb accumulators, q14/q15 = Cr accumulators.
 * vrev64.32 of q1 seeds each Cb/Cr accumulator with the rounding+128
 * bias from the consts table. */
.macro do_rgb_to_yuv_stage1
    vmovl.u8        q2, d1\r_offs   /* r = { d4, d5 } */
    vmovl.u8        q3, d1\g_offs   /* g = { d6, d7 } */
    vmovl.u8        q4, d1\b_offs   /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vrev64.32       q9, q1
    vrev64.32       q13, q1
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

/* Stage 2: descale the 32-bit accumulators (rounding narrow for Y, plain
 * narrow for Cb/Cr whose bias already includes the rounding term) and
 * narrow again to the 8-bit d20/d21/d22 results. */
.macro do_rgb_to_yuv_stage2
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vshrn.u32       d23, q13, #16
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    vmovn.u16       d20, q10        /* d20 = y */
    vmovn.u16       d21, q11        /* d21 = u */
    vmovn.u16       d22, q12        /* d22 = v */
.endm

/* Unpipelined conversion of one group of up to 8 pixels. */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* Software-pipelined loop kernel: stage 2 + store of the previous group is
 * interleaved with the load + stage 1 of the next group to hide latency.
 * Same instruction stream as stage2 / do_load / stage1, just reordered. */
.macro do_rgb_to_yuv_stage2_store_load_stage1
    vrshrn.u32      d20, q7, #16
    vrshrn.u32      d21, q8, #16
    vshrn.u32       d22, q9, #16
    vrev64.32       q9, q1
    vshrn.u32       d23, q13, #16
    vrev64.32       q13, q1
    vshrn.u32       d24, q14, #16
    vshrn.u32       d25, q15, #16
    do_load         \bpp, 8
    vmovn.u16       d20, q10        /* d20 = y */
    vmovl.u8        q2, d1\r_offs   /* r = { d4, d5 } */
    vmovn.u16       d21, q11        /* d21 = u */
    vmovl.u8        q3, d1\g_offs   /* g = { d6, d7 } */
    vmovn.u16       d22, q12        /* d22 = v */
    vmovl.u8        q4, d1\b_offs   /* b = { d8, d9 } */
    vmull.u16       q7, d4, d0[0]
    vmlal.u16       q7, d6, d0[1]
    vmlal.u16       q7, d8, d0[2]
    vst1.8          {d20}, [Y]!
    vmull.u16       q8, d5, d0[0]
    vmlal.u16       q8, d7, d0[1]
    vmlal.u16       q8, d9, d0[2]
    vmlsl.u16       q9, d4, d0[3]
    vmlsl.u16       q9, d6, d1[0]
    vmlal.u16       q9, d8, d1[1]
    vst1.8          {d21}, [U]!
    vmlsl.u16       q13, d5, d0[3]
    vmlsl.u16       q13, d7, d1[0]
    vmlal.u16       q13, d9, d1[1]
    vrev64.32       q14, q1
    vrev64.32       q15, q1
    vmlal.u16       q14, d4, d1[1]
    vmlsl.u16       q14, d6, d1[2]
    vmlsl.u16       q14, d8, d1[3]
    vst1.8          {d22}, [V]!
    vmlal.u16       q15, d5, d1[1]
    vmlsl.u16       q15, d7, d1[2]
    vmlsl.u16       q15, d9, d1[3]
.endm

/* Q0.16 fixed-point BT.601-style coefficients (rows 1-2) and the
 * rounding + 128-centering biases for Cb/Cr (rows 3-4). */
.balign 16
jsimd_\colorid\()_ycc_neon_consts:
  .short          19595, 38470, 7471, 11059
  .short          21709, 32768, 27439, 5329
  .short          32767, 128, 32767, 128
  .short          32767, 128, 32767, 128

/* In (AAPCS): r0 = output_width, r1 = input_buf (array of row pointers),
 * r2 = output_buf (array of 3 plane pointer arrays: Y, Cb, Cr),
 * r3 = output_row (starting row index); 5th arg num_rows on the stack. */
asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH    .req r0
    INPUT_BUF       .req r1
    OUTPUT_BUF      .req r2
    OUTPUT_ROW      .req r3
    NUM_ROWS        .req r4

    OUTPUT_BUF0     .req r5
    OUTPUT_BUF1     .req r6
    OUTPUT_BUF2     .req OUTPUT_BUF

    RGB             .req r7
    Y               .req r8
    U               .req r9
    V               .req r10
    N               .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr             ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16         {d0, d1, d2, d3}, [ip, :128]

    /* Save Arm registers and handle input arguments */
    push            {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr             NUM_ROWS, [sp, #(4 * 8)]  /* 5th arg, above the 8 pushed regs */
    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
    .unreq          OUTPUT_BUF

    /* Save Neon registers (d8-d15 are callee-saved under AAPCS) */
    vpush           {d8 - d15}

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    blt             9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #4

    /* Inner loop over pixels, software-pipelined in groups of 8 */
    subs            N, N, #8
    blt             3f
    do_load         \bpp, 8
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    blt             2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1
    subs            N, N, #8
    bge             1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    beq             8f
3:  /* Handle the trailing 1..7 pixels: gather them into lanes ... */
    tst             N, #4
    beq             3f
    do_load         \bpp, 4
3:
    tst             N, #2
    beq             4f
    do_load         \bpp, 2
4:
    tst             N, #1
    beq             5f
    do_load         \bpp, 1
5:  /* ... convert once, then store the same lane groups */
    do_rgb_to_yuv
    tst             N, #4
    beq             6f
    do_store        4
6:
    tst             N, #2
    beq             7f
    do_store        2
7:
    tst             N, #1
    beq             8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    bgt             0b
9:
    /* Restore all registers and return */
    vpop            {d8 - d15}
    pop             {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

    .purgem         do_rgb_to_yuv
    .purgem         do_rgb_to_yuv_stage1
    .purgem         do_rgb_to_yuv_stage2
    .purgem         do_rgb_to_yuv_stage2_store_load_stage1

.endm
1190
+
1191
/* Instantiate the six RGB-ordering variants; the R/G/B columns give the
 * byte offset of each channel within a source pixel. */
/*--------------------------------- id ----- bpp R  G  B */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

/* The helper macros are shared by all variants; discard them now that
 * every instantiation has been expanded. */
.purgem do_load
.purgem do_store