image_pack 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +18 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +140 -0
  5. data/THIRD_PARTY_NOTICES.md +8 -0
  6. data/ext/image_pack/extconf.rb +515 -0
  7. data/ext/image_pack/image_pack.c +1618 -0
  8. data/ext/image_pack/vendor/.vendored +1 -0
  9. data/ext/image_pack/vendor/mozjpeg/BUILDING.txt +902 -0
  10. data/ext/image_pack/vendor/mozjpeg/CMakeLists.txt +1593 -0
  11. data/ext/image_pack/vendor/mozjpeg/LICENSE.md +132 -0
  12. data/ext/image_pack/vendor/mozjpeg/README-mozilla.txt +194 -0
  13. data/ext/image_pack/vendor/mozjpeg/README-turbo.txt +346 -0
  14. data/ext/image_pack/vendor/mozjpeg/README.ijg +258 -0
  15. data/ext/image_pack/vendor/mozjpeg/README.md +29 -0
  16. data/ext/image_pack/vendor/mozjpeg/cderror.h +128 -0
  17. data/ext/image_pack/vendor/mozjpeg/cdjpeg.c +156 -0
  18. data/ext/image_pack/vendor/mozjpeg/cdjpeg.h +171 -0
  19. data/ext/image_pack/vendor/mozjpeg/cjpeg.c +961 -0
  20. data/ext/image_pack/vendor/mozjpeg/cmyk.h +60 -0
  21. data/ext/image_pack/vendor/mozjpeg/coderules.txt +78 -0
  22. data/ext/image_pack/vendor/mozjpeg/croptest.in +95 -0
  23. data/ext/image_pack/vendor/mozjpeg/djpeg.c +855 -0
  24. data/ext/image_pack/vendor/mozjpeg/example.txt +464 -0
  25. data/ext/image_pack/vendor/mozjpeg/jaricom.c +157 -0
  26. data/ext/image_pack/vendor/mozjpeg/jcapimin.c +307 -0
  27. data/ext/image_pack/vendor/mozjpeg/jcapistd.c +168 -0
  28. data/ext/image_pack/vendor/mozjpeg/jcarith.c +972 -0
  29. data/ext/image_pack/vendor/mozjpeg/jccoefct.c +609 -0
  30. data/ext/image_pack/vendor/mozjpeg/jccolext.c +144 -0
  31. data/ext/image_pack/vendor/mozjpeg/jccolor.c +721 -0
  32. data/ext/image_pack/vendor/mozjpeg/jcdctmgr.c +1776 -0
  33. data/ext/image_pack/vendor/mozjpeg/jcext.c +219 -0
  34. data/ext/image_pack/vendor/mozjpeg/jchuff.c +1146 -0
  35. data/ext/image_pack/vendor/mozjpeg/jchuff.h +57 -0
  36. data/ext/image_pack/vendor/mozjpeg/jcicc.c +105 -0
  37. data/ext/image_pack/vendor/mozjpeg/jcinit.c +82 -0
  38. data/ext/image_pack/vendor/mozjpeg/jcmainct.c +162 -0
  39. data/ext/image_pack/vendor/mozjpeg/jcmarker.c +844 -0
  40. data/ext/image_pack/vendor/mozjpeg/jcmaster.c +958 -0
  41. data/ext/image_pack/vendor/mozjpeg/jcmaster.h +56 -0
  42. data/ext/image_pack/vendor/mozjpeg/jcomapi.c +109 -0
  43. data/ext/image_pack/vendor/mozjpeg/jconfig.h.in +37 -0
  44. data/ext/image_pack/vendor/mozjpeg/jconfig.txt +93 -0
  45. data/ext/image_pack/vendor/mozjpeg/jconfigint.h.in +44 -0
  46. data/ext/image_pack/vendor/mozjpeg/jcparam.c +991 -0
  47. data/ext/image_pack/vendor/mozjpeg/jcphuff.c +1123 -0
  48. data/ext/image_pack/vendor/mozjpeg/jcprepct.c +351 -0
  49. data/ext/image_pack/vendor/mozjpeg/jcsample.c +522 -0
  50. data/ext/image_pack/vendor/mozjpeg/jcstest.c +126 -0
  51. data/ext/image_pack/vendor/mozjpeg/jctrans.c +408 -0
  52. data/ext/image_pack/vendor/mozjpeg/jdapimin.c +407 -0
  53. data/ext/image_pack/vendor/mozjpeg/jdapistd.c +691 -0
  54. data/ext/image_pack/vendor/mozjpeg/jdarith.c +782 -0
  55. data/ext/image_pack/vendor/mozjpeg/jdatadst-tj.c +198 -0
  56. data/ext/image_pack/vendor/mozjpeg/jdatadst.c +299 -0
  57. data/ext/image_pack/vendor/mozjpeg/jdatasrc-tj.c +194 -0
  58. data/ext/image_pack/vendor/mozjpeg/jdatasrc.c +295 -0
  59. data/ext/image_pack/vendor/mozjpeg/jdcoefct.c +881 -0
  60. data/ext/image_pack/vendor/mozjpeg/jdcoefct.h +83 -0
  61. data/ext/image_pack/vendor/mozjpeg/jdcol565.c +384 -0
  62. data/ext/image_pack/vendor/mozjpeg/jdcolext.c +141 -0
  63. data/ext/image_pack/vendor/mozjpeg/jdcolor.c +881 -0
  64. data/ext/image_pack/vendor/mozjpeg/jdct.h +208 -0
  65. data/ext/image_pack/vendor/mozjpeg/jddctmgr.c +367 -0
  66. data/ext/image_pack/vendor/mozjpeg/jdhuff.c +834 -0
  67. data/ext/image_pack/vendor/mozjpeg/jdhuff.h +247 -0
  68. data/ext/image_pack/vendor/mozjpeg/jdicc.c +167 -0
  69. data/ext/image_pack/vendor/mozjpeg/jdinput.c +408 -0
  70. data/ext/image_pack/vendor/mozjpeg/jdmainct.c +460 -0
  71. data/ext/image_pack/vendor/mozjpeg/jdmainct.h +71 -0
  72. data/ext/image_pack/vendor/mozjpeg/jdmarker.c +1374 -0
  73. data/ext/image_pack/vendor/mozjpeg/jdmaster.c +727 -0
  74. data/ext/image_pack/vendor/mozjpeg/jdmaster.h +33 -0
  75. data/ext/image_pack/vendor/mozjpeg/jdmerge.c +587 -0
  76. data/ext/image_pack/vendor/mozjpeg/jdmerge.h +47 -0
  77. data/ext/image_pack/vendor/mozjpeg/jdmrg565.c +354 -0
  78. data/ext/image_pack/vendor/mozjpeg/jdmrgext.c +184 -0
  79. data/ext/image_pack/vendor/mozjpeg/jdphuff.c +679 -0
  80. data/ext/image_pack/vendor/mozjpeg/jdpostct.c +294 -0
  81. data/ext/image_pack/vendor/mozjpeg/jdsample.c +524 -0
  82. data/ext/image_pack/vendor/mozjpeg/jdsample.h +50 -0
  83. data/ext/image_pack/vendor/mozjpeg/jdtrans.c +156 -0
  84. data/ext/image_pack/vendor/mozjpeg/jerror.c +251 -0
  85. data/ext/image_pack/vendor/mozjpeg/jerror.h +335 -0
  86. data/ext/image_pack/vendor/mozjpeg/jfdctflt.c +169 -0
  87. data/ext/image_pack/vendor/mozjpeg/jfdctfst.c +227 -0
  88. data/ext/image_pack/vendor/mozjpeg/jfdctint.c +288 -0
  89. data/ext/image_pack/vendor/mozjpeg/jidctflt.c +240 -0
  90. data/ext/image_pack/vendor/mozjpeg/jidctfst.c +371 -0
  91. data/ext/image_pack/vendor/mozjpeg/jidctint.c +2627 -0
  92. data/ext/image_pack/vendor/mozjpeg/jidctred.c +409 -0
  93. data/ext/image_pack/vendor/mozjpeg/jinclude.h +147 -0
  94. data/ext/image_pack/vendor/mozjpeg/jmemmgr.c +1180 -0
  95. data/ext/image_pack/vendor/mozjpeg/jmemnobs.c +110 -0
  96. data/ext/image_pack/vendor/mozjpeg/jmemsys.h +178 -0
  97. data/ext/image_pack/vendor/mozjpeg/jmorecfg.h +382 -0
  98. data/ext/image_pack/vendor/mozjpeg/jpeg_nbits_table.h +4098 -0
  99. data/ext/image_pack/vendor/mozjpeg/jpegcomp.h +32 -0
  100. data/ext/image_pack/vendor/mozjpeg/jpegint.h +453 -0
  101. data/ext/image_pack/vendor/mozjpeg/jpeglib.h +1211 -0
  102. data/ext/image_pack/vendor/mozjpeg/jpegtran.c +827 -0
  103. data/ext/image_pack/vendor/mozjpeg/jpegyuv.c +172 -0
  104. data/ext/image_pack/vendor/mozjpeg/jquant1.c +856 -0
  105. data/ext/image_pack/vendor/mozjpeg/jquant2.c +1286 -0
  106. data/ext/image_pack/vendor/mozjpeg/jsimd.h +123 -0
  107. data/ext/image_pack/vendor/mozjpeg/jsimd_none.c +431 -0
  108. data/ext/image_pack/vendor/mozjpeg/jsimddct.h +70 -0
  109. data/ext/image_pack/vendor/mozjpeg/jstdhuff.c +144 -0
  110. data/ext/image_pack/vendor/mozjpeg/jutils.c +133 -0
  111. data/ext/image_pack/vendor/mozjpeg/jversion.h.in +56 -0
  112. data/ext/image_pack/vendor/mozjpeg/libjpeg.map.in +11 -0
  113. data/ext/image_pack/vendor/mozjpeg/libjpeg.txt +3150 -0
  114. data/ext/image_pack/vendor/mozjpeg/rdbmp.c +690 -0
  115. data/ext/image_pack/vendor/mozjpeg/rdcolmap.c +253 -0
  116. data/ext/image_pack/vendor/mozjpeg/rdgif.c +720 -0
  117. data/ext/image_pack/vendor/mozjpeg/rdjpeg.c +160 -0
  118. data/ext/image_pack/vendor/mozjpeg/rdjpgcom.c +494 -0
  119. data/ext/image_pack/vendor/mozjpeg/rdpng.c +194 -0
  120. data/ext/image_pack/vendor/mozjpeg/rdppm.c +781 -0
  121. data/ext/image_pack/vendor/mozjpeg/rdswitch.c +642 -0
  122. data/ext/image_pack/vendor/mozjpeg/rdtarga.c +508 -0
  123. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jccolext-neon.c +148 -0
  124. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jchuff-neon.c +334 -0
  125. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd.c +976 -0
  126. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd_neon.S +1200 -0
  127. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jccolext-neon.c +316 -0
  128. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jchuff-neon.c +411 -0
  129. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd.c +1053 -0
  130. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd_neon.S +2254 -0
  131. data/ext/image_pack/vendor/mozjpeg/simd/arm/align.h +28 -0
  132. data/ext/image_pack/vendor/mozjpeg/simd/arm/jccolor-neon.c +160 -0
  133. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgray-neon.c +120 -0
  134. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgryext-neon.c +106 -0
  135. data/ext/image_pack/vendor/mozjpeg/simd/arm/jchuff.h +131 -0
  136. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcphuff-neon.c +623 -0
  137. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcsample-neon.c +192 -0
  138. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolext-neon.c +374 -0
  139. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolor-neon.c +141 -0
  140. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmerge-neon.c +144 -0
  141. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmrgext-neon.c +723 -0
  142. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdsample-neon.c +569 -0
  143. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctfst-neon.c +214 -0
  144. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctint-neon.c +376 -0
  145. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctfst-neon.c +472 -0
  146. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctint-neon.c +801 -0
  147. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctred-neon.c +486 -0
  148. data/ext/image_pack/vendor/mozjpeg/simd/arm/jquanti-neon.c +193 -0
  149. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h +26 -0
  150. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h.in +37 -0
  151. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-avx2.asm +578 -0
  152. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-mmx.asm +476 -0
  153. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-sse2.asm +503 -0
  154. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-avx2.asm +121 -0
  155. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-mmx.asm +121 -0
  156. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-sse2.asm +120 -0
  157. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-avx2.asm +113 -0
  158. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-mmx.asm +113 -0
  159. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-sse2.asm +112 -0
  160. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-avx2.asm +457 -0
  161. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-mmx.asm +355 -0
  162. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-sse2.asm +382 -0
  163. data/ext/image_pack/vendor/mozjpeg/simd/i386/jchuff-sse2.asm +761 -0
  164. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcphuff-sse2.asm +662 -0
  165. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-avx2.asm +388 -0
  166. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-mmx.asm +324 -0
  167. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-sse2.asm +351 -0
  168. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-avx2.asm +515 -0
  169. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-mmx.asm +404 -0
  170. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-sse2.asm +458 -0
  171. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-avx2.asm +118 -0
  172. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-mmx.asm +117 -0
  173. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-sse2.asm +117 -0
  174. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-avx2.asm +136 -0
  175. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-mmx.asm +123 -0
  176. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-sse2.asm +135 -0
  177. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-avx2.asm +575 -0
  178. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-mmx.asm +460 -0
  179. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-sse2.asm +517 -0
  180. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-avx2.asm +760 -0
  181. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-mmx.asm +731 -0
  182. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-sse2.asm +724 -0
  183. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-3dn.asm +318 -0
  184. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-sse.asm +369 -0
  185. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-mmx.asm +395 -0
  186. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-sse2.asm +403 -0
  187. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-avx2.asm +331 -0
  188. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-mmx.asm +620 -0
  189. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-sse2.asm +633 -0
  190. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-3dn.asm +451 -0
  191. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse.asm +571 -0
  192. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse2.asm +497 -0
  193. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-mmx.asm +499 -0
  194. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-sse2.asm +501 -0
  195. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-avx2.asm +453 -0
  196. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-mmx.asm +851 -0
  197. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-sse2.asm +858 -0
  198. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-mmx.asm +704 -0
  199. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-sse2.asm +592 -0
  200. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-3dn.asm +230 -0
  201. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-mmx.asm +276 -0
  202. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-sse.asm +208 -0
  203. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquantf-sse2.asm +168 -0
  204. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-avx2.asm +188 -0
  205. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-sse2.asm +201 -0
  206. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimd.c +1312 -0
  207. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimdcpu.asm +135 -0
  208. data/ext/image_pack/vendor/mozjpeg/simd/jsimd.h +1258 -0
  209. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd.c +1143 -0
  210. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2.S +4543 -0
  211. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2_asm.h +292 -0
  212. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolext-mmi.c +455 -0
  213. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolor-mmi.c +148 -0
  214. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgray-mmi.c +132 -0
  215. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgryext-mmi.c +374 -0
  216. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample-mmi.c +98 -0
  217. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample.h +28 -0
  218. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolext-mmi.c +415 -0
  219. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolor-mmi.c +139 -0
  220. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmerge-mmi.c +149 -0
  221. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmrgext-mmi.c +615 -0
  222. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdsample-mmi.c +304 -0
  223. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctfst-mmi.c +255 -0
  224. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctint-mmi.c +398 -0
  225. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctfst-mmi.c +395 -0
  226. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctint-mmi.c +571 -0
  227. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jquanti-mmi.c +124 -0
  228. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd.c +866 -0
  229. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd_mmi.h +69 -0
  230. data/ext/image_pack/vendor/mozjpeg/simd/mips64/loongson-mmintrin.h +1334 -0
  231. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jcolsamp.inc +135 -0
  232. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jdct.inc +31 -0
  233. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc +93 -0
  234. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc.h +133 -0
  235. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdext.inc +520 -0
  236. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolext-altivec.c +269 -0
  237. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolor-altivec.c +116 -0
  238. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgray-altivec.c +111 -0
  239. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgryext-altivec.c +228 -0
  240. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample-altivec.c +159 -0
  241. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample.h +28 -0
  242. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolext-altivec.c +276 -0
  243. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolor-altivec.c +106 -0
  244. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmerge-altivec.c +130 -0
  245. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmrgext-altivec.c +329 -0
  246. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdsample-altivec.c +400 -0
  247. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctfst-altivec.c +154 -0
  248. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctint-altivec.c +258 -0
  249. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctfst-altivec.c +255 -0
  250. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctint-altivec.c +357 -0
  251. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jquanti-altivec.c +250 -0
  252. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd.c +884 -0
  253. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd_altivec.h +98 -0
  254. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-avx2.asm +559 -0
  255. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-sse2.asm +484 -0
  256. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-avx2.asm +121 -0
  257. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-sse2.asm +120 -0
  258. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-avx2.asm +113 -0
  259. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-sse2.asm +112 -0
  260. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-avx2.asm +438 -0
  261. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-sse2.asm +363 -0
  262. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jchuff-sse2.asm +583 -0
  263. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcphuff-sse2.asm +639 -0
  264. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-avx2.asm +367 -0
  265. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-sse2.asm +330 -0
  266. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-avx2.asm +496 -0
  267. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-sse2.asm +439 -0
  268. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-avx2.asm +118 -0
  269. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-sse2.asm +117 -0
  270. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-avx2.asm +136 -0
  271. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-sse2.asm +135 -0
  272. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-avx2.asm +596 -0
  273. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-sse2.asm +538 -0
  274. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-avx2.asm +696 -0
  275. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-sse2.asm +665 -0
  276. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctflt-sse.asm +355 -0
  277. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctfst-sse2.asm +389 -0
  278. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-avx2.asm +320 -0
  279. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-sse2.asm +619 -0
  280. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctflt-sse2.asm +482 -0
  281. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctfst-sse2.asm +491 -0
  282. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-avx2.asm +418 -0
  283. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-sse2.asm +847 -0
  284. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctred-sse2.asm +574 -0
  285. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquantf-sse2.asm +155 -0
  286. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-avx2.asm +163 -0
  287. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-sse2.asm +188 -0
  288. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimd.c +1110 -0
  289. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimdcpu.asm +86 -0
  290. data/ext/image_pack/vendor/mozjpeg/strtest.c +170 -0
  291. data/ext/image_pack/vendor/mozjpeg/structure.txt +900 -0
  292. data/ext/image_pack/vendor/mozjpeg/tjbench.c +1044 -0
  293. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.in +256 -0
  294. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.java.in +215 -0
  295. data/ext/image_pack/vendor/mozjpeg/tjexample.c +406 -0
  296. data/ext/image_pack/vendor/mozjpeg/tjexampletest.in +149 -0
  297. data/ext/image_pack/vendor/mozjpeg/tjexampletest.java.in +151 -0
  298. data/ext/image_pack/vendor/mozjpeg/tjunittest.c +961 -0
  299. data/ext/image_pack/vendor/mozjpeg/tjutil.c +70 -0
  300. data/ext/image_pack/vendor/mozjpeg/tjutil.h +53 -0
  301. data/ext/image_pack/vendor/mozjpeg/transupp.c +2373 -0
  302. data/ext/image_pack/vendor/mozjpeg/transupp.h +243 -0
  303. data/ext/image_pack/vendor/mozjpeg/turbojpeg-jni.c +1259 -0
  304. data/ext/image_pack/vendor/mozjpeg/turbojpeg.c +2320 -0
  305. data/ext/image_pack/vendor/mozjpeg/turbojpeg.h +1784 -0
  306. data/ext/image_pack/vendor/mozjpeg/usage.txt +679 -0
  307. data/ext/image_pack/vendor/mozjpeg/wizard.txt +220 -0
  308. data/ext/image_pack/vendor/mozjpeg/wrbmp.c +552 -0
  309. data/ext/image_pack/vendor/mozjpeg/wrgif.c +580 -0
  310. data/ext/image_pack/vendor/mozjpeg/wrjpgcom.c +577 -0
  311. data/ext/image_pack/vendor/mozjpeg/wrppm.c +366 -0
  312. data/ext/image_pack/vendor/mozjpeg/wrtarga.c +258 -0
  313. data/ext/image_pack/vendor/mozjpeg/yuvjpeg.c +268 -0
  314. data/lib/image_pack/backend.rb +8 -0
  315. data/lib/image_pack/configuration.rb +23 -0
  316. data/lib/image_pack/errors.rb +13 -0
  317. data/lib/image_pack/version.rb +5 -0
  318. data/lib/image_pack.rb +208 -0
  319. metadata +433 -0
@@ -0,0 +1,4543 @@
1
+ /*
2
+ * MIPS DSPr2 optimizations for libjpeg-turbo
3
+ *
4
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
5
+ * All Rights Reserved.
6
+ * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
7
+ * Darko Laus <darko.laus@imgtec.com>
8
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
9
+ *
10
+ * This software is provided 'as-is', without any express or implied
11
+ * warranty. In no event will the authors be held liable for any damages
12
+ * arising from the use of this software.
13
+ *
14
+ * Permission is granted to anyone to use this software for any purpose,
15
+ * including commercial applications, and to alter it and redistribute it
16
+ * freely, subject to the following restrictions:
17
+ *
18
+ * 1. The origin of this software must not be misrepresented; you must not
19
+ * claim that you wrote the original software. If you use this software
20
+ * in a product, an acknowledgment in the product documentation would be
21
+ * appreciated but is not required.
22
+ * 2. Altered source versions must be plainly marked as such, and must not be
23
+ * misrepresented as being the original software.
24
+ * 3. This notice may not be removed or altered from any source distribution.
25
+ */
26
+
27
+ #include "jsimd_dspr2_asm.h"
28
+
29
+
30
+ /*****************************************************************************/
31
+ LEAF_DSPR2(jsimd_c_null_convert_dspr2)
32
+ /*
33
+ * a0 = cinfo->image_width
34
+ * a1 = input_buf
35
+ * a2 = output_buf
36
+ * a3 = output_row
37
+ * 16(sp) = num_rows
38
+ * 20(sp) = cinfo->num_components
39
+ *
40
+ * Null conversion for compression
41
+ */
42
+ SAVE_REGS_ON_STACK 8, s0, s1
43
+
44
+ lw t9, 24(sp) /* t9 = num_rows */
45
+ lw s0, 28(sp) /* s0 = cinfo->num_components */
46
+ andi t0, a0, 3 /* t0 = cinfo->image_width & 3 */
47
+ beqz t0, 4f /* no residual */
48
+ nop
49
+ 0:
50
+ addiu t9, t9, -1
51
+ bltz t9, 7f
52
+ li t1, 0
53
+ 1:
54
+ sll t3, t1, 2
55
+ lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */
56
+ lw t2, 0(a1) /* t2 = inptr = *input_buf */
57
+ sll t4, a3, 2
58
+ lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */
59
+ addu t2, t2, t1
60
+ addu s1, t5, a0
61
+ addu t6, t5, t0
62
+ 2:
63
+ lbu t3, 0(t2)
64
+ addiu t5, t5, 1
65
+ sb t3, -1(t5)
66
+ bne t6, t5, 2b
67
+ addu t2, t2, s0
68
+ 3:
69
+ lbu t3, 0(t2)
70
+ addu t4, t2, s0
71
+ addu t7, t4, s0
72
+ addu t8, t7, s0
73
+ addu t2, t8, s0
74
+ lbu t4, 0(t4)
75
+ lbu t7, 0(t7)
76
+ lbu t8, 0(t8)
77
+ addiu t5, t5, 4
78
+ sb t3, -4(t5)
79
+ sb t4, -3(t5)
80
+ sb t7, -2(t5)
81
+ bne s1, t5, 3b
82
+ sb t8, -1(t5)
83
+ addiu t1, t1, 1
84
+ bne t1, s0, 1b
85
+ nop
86
+ addiu a1, a1, 4
87
+ bgez t9, 0b
88
+ addiu a3, a3, 1
89
+ b 7f
90
+ nop
91
+ 4:
92
+ addiu t9, t9, -1
93
+ bltz t9, 7f
94
+ li t1, 0
95
+ 5:
96
+ sll t3, t1, 2
97
+ lwx t5, t3(a2) /* t5 = outptr = output_buf[ci] */
98
+ lw t2, 0(a1) /* t2 = inptr = *input_buf */
99
+ sll t4, a3, 2
100
+ lwx t5, t4(t5) /* t5 = outptr = output_buf[ci][output_row] */
101
+ addu t2, t2, t1
102
+ addu s1, t5, a0
103
+ addu t6, t5, t0
104
+ 6:
105
+ lbu t3, 0(t2)
106
+ addu t4, t2, s0
107
+ addu t7, t4, s0
108
+ addu t8, t7, s0
109
+ addu t2, t8, s0
110
+ lbu t4, 0(t4)
111
+ lbu t7, 0(t7)
112
+ lbu t8, 0(t8)
113
+ addiu t5, t5, 4
114
+ sb t3, -4(t5)
115
+ sb t4, -3(t5)
116
+ sb t7, -2(t5)
117
+ bne s1, t5, 6b
118
+ sb t8, -1(t5)
119
+ addiu t1, t1, 1
120
+ bne t1, s0, 5b
121
+ nop
122
+ addiu a1, a1, 4
123
+ bgez t9, 4b
124
+ addiu a3, a3, 1
125
+ 7:
126
+ RESTORE_REGS_FROM_STACK 8, s0, s1
127
+
128
+ j ra
129
+ nop
130
+
131
+ END(jsimd_c_null_convert_dspr2)
132
+
133
+
134
+ /*****************************************************************************/
135
+ /*
136
+ * jsimd_extrgb_ycc_convert_dspr2
137
+ * jsimd_extbgr_ycc_convert_dspr2
138
+ * jsimd_extrgbx_ycc_convert_dspr2
139
+ * jsimd_extbgrx_ycc_convert_dspr2
140
+ * jsimd_extxbgr_ycc_convert_dspr2
141
+ * jsimd_extxrgb_ycc_convert_dspr2
142
+ *
143
+ * Colorspace conversion RGB -> YCbCr
144
+ */
145
+
146
+ .macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
147
+ r_offs, g_offs, b_offs
148
+
149
+ .macro DO_RGB_TO_YCC r, g, b, inptr
150
+ lbu \r, \r_offs(\inptr)
151
+ lbu \g, \g_offs(\inptr)
152
+ lbu \b, \b_offs(\inptr)
153
+ addiu \inptr, \pixel_size
154
+ .endm
155
+
156
+ LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
157
+ /*
158
+ * a0 = cinfo->image_width
159
+ * a1 = input_buf
160
+ * a2 = output_buf
161
+ * a3 = output_row
162
+ * 16(sp) = num_rows
163
+ */
164
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
165
+
166
+ lw t7, 48(sp) /* t7 = num_rows */
167
+ li s0, 0x4c8b /* FIX(0.29900) */
168
+ li s1, 0x9646 /* FIX(0.58700) */
169
+ li s2, 0x1d2f /* FIX(0.11400) */
170
+ li s3, 0xffffd4cd /* -FIX(0.16874) */
171
+ li s4, 0xffffab33 /* -FIX(0.33126) */
172
+ li s5, 0x8000 /* FIX(0.50000) */
173
+ li s6, 0xffff94d1 /* -FIX(0.41869) */
174
+ li s7, 0xffffeb2f /* -FIX(0.08131) */
175
+ li t8, 0x807fff /* CBCR_OFFSET + ONE_HALF-1 */
176
+
177
+ 0:
178
+ addiu t7, -1 /* --num_rows */
179
+ lw t6, 0(a1) /* t6 = input_buf[0] */
180
+ lw t0, 0(a2)
181
+ lw t1, 4(a2)
182
+ lw t2, 8(a2)
183
+ sll t3, a3, 2
184
+ lwx t0, t3(t0) /* t0 = output_buf[0][output_row] */
185
+ lwx t1, t3(t1) /* t1 = output_buf[1][output_row] */
186
+ lwx t2, t3(t2) /* t2 = output_buf[2][output_row] */
187
+
188
+ addu t9, t2, a0 /* t9 = end address */
189
+ addiu a3, 1
190
+
191
+ 1:
192
+ DO_RGB_TO_YCC t3, t4, t5, t6
193
+
194
+ mtlo s5, $ac0
195
+ mtlo t8, $ac1
196
+ mtlo t8, $ac2
197
+ maddu $ac0, s2, t5
198
+ maddu $ac1, s5, t5
199
+ maddu $ac2, s5, t3
200
+ maddu $ac0, s0, t3
201
+ maddu $ac1, s3, t3
202
+ maddu $ac2, s6, t4
203
+ maddu $ac0, s1, t4
204
+ maddu $ac1, s4, t4
205
+ maddu $ac2, s7, t5
206
+ extr.w t3, $ac0, 16
207
+ extr.w t4, $ac1, 16
208
+ extr.w t5, $ac2, 16
209
+ sb t3, 0(t0)
210
+ sb t4, 0(t1)
211
+ sb t5, 0(t2)
212
+ addiu t0, 1
213
+ addiu t2, 1
214
+ bne t2, t9, 1b
215
+ addiu t1, 1
216
+ bgtz t7, 0b
217
+ addiu a1, 4
218
+
219
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
220
+
221
+ j ra
222
+ nop
223
+ END(jsimd_\colorid\()_ycc_convert_dspr2)
224
+
225
+ .purgem DO_RGB_TO_YCC
226
+
227
+ .endm
228
+
229
+ /*-------------------------------------id -- pix R G B */
230
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
231
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
232
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
233
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
234
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
235
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
236
+
237
+
238
+ /*****************************************************************************/
239
+ /*
240
+ * jsimd_ycc_extrgb_convert_dspr2
241
+ * jsimd_ycc_extbgr_convert_dspr2
242
+ * jsimd_ycc_extrgbx_convert_dspr2
243
+ * jsimd_ycc_extbgrx_convert_dspr2
244
+ * jsimd_ycc_extxbgr_convert_dspr2
245
+ * jsimd_ycc_extxrgb_convert_dspr2
246
+ *
247
+ * Colorspace conversion YCbCr -> RGB
248
+ */
249
+
250
+ .macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
251
+ r_offs, g_offs, b_offs, a_offs
252
+
253
+ .macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
254
+ sb \scratch0, \r_offs(\outptr)
255
+ sb \scratch1, \g_offs(\outptr)
256
+ sb \scratch2, \b_offs(\outptr)
257
+ .if (\pixel_size == 4)
258
+ li t0, 0xFF
259
+ sb t0, \a_offs(\outptr)
260
+ .endif
261
+ addiu \outptr, \pixel_size
262
+ .endm
263
+
264
+ LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
265
+ /*
266
+ * a0 = cinfo->image_width
267
+ * a1 = input_buf
268
+ * a2 = input_row
269
+ * a3 = output_buf
270
+ * 16(sp) = num_rows
271
+ */
272
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
273
+
274
+ lw s1, 48(sp)
275
+ li t3, 0x8000
276
+ li t4, 0x166e9 /* FIX(1.40200) */
277
+ li t5, 0x1c5a2 /* FIX(1.77200) */
278
+ li t6, 0xffff492e /* -FIX(0.71414) */
279
+ li t7, 0xffffa7e6 /* -FIX(0.34414) */
280
+ repl.ph t8, 128
281
+
282
+ 0:
283
+ lw s0, 0(a3)
284
+ lw t0, 0(a1)
285
+ lw t1, 4(a1)
286
+ lw t2, 8(a1)
287
+ sll s5, a2, 2
288
+ addiu s1, -1
289
+ lwx s2, s5(t0)
290
+ lwx s3, s5(t1)
291
+ lwx s4, s5(t2)
292
+ addu t9, s2, a0
293
+ addiu a2, 1
294
+
295
+ 1:
296
+ lbu s7, 0(s4) /* cr */
297
+ lbu s6, 0(s3) /* cb */
298
+ lbu s5, 0(s2) /* y */
299
+ addiu s2, 1
300
+ addiu s4, 1
301
+ addiu s7, -128
302
+ addiu s6, -128
303
+ mul t2, t7, s6
304
+ mul t0, t6, s7 /* Crgtab[cr] */
305
+ sll s7, 15
306
+ mulq_rs.w t1, t4, s7 /* Crrtab[cr] */
307
+ sll s6, 15
308
+ addu t2, t3 /* Cbgtab[cb] */
309
+ addu t2, t0
310
+
311
+ mulq_rs.w t0, t5, s6 /* Cbbtab[cb] */
312
+ sra t2, 16
313
+ addu t1, s5
314
+ addu t2, s5 /* add y */
315
+ ins t2, t1, 16, 16
316
+ subu.ph t2, t2, t8
317
+ addu t0, s5
318
+ shll_s.ph t2, t2, 8
319
+ subu t0, 128
320
+ shra.ph t2, t2, 8
321
+ shll_s.w t0, t0, 24
322
+ addu.ph t2, t2, t8 /* clip & store */
323
+ sra t0, t0, 24
324
+ sra t1, t2, 16
325
+ addiu t0, 128
326
+
327
+ STORE_YCC_TO_RGB t1, t2, t0, s0
328
+
329
+ bne s2, t9, 1b
330
+ addiu s3, 1
331
+ bgtz s1, 0b
332
+ addiu a3, 4
333
+
334
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
335
+
336
+ j ra
337
+ nop
338
+ END(jsimd_ycc_\colorid\()_convert_dspr2)
339
+
340
+ .purgem STORE_YCC_TO_RGB
341
+
342
+ .endm
343
+
344
+ /*-------------------------------------id -- pix R G B A */
345
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
346
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
347
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
348
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
349
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
350
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
351
+
352
+
353
+ /*****************************************************************************/
354
+ /*
355
+ * jsimd_extrgb_gray_convert_dspr2
356
+ * jsimd_extbgr_gray_convert_dspr2
357
+ * jsimd_extrgbx_gray_convert_dspr2
358
+ * jsimd_extbgrx_gray_convert_dspr2
359
+ * jsimd_extxbgr_gray_convert_dspr2
360
+ * jsimd_extxrgb_gray_convert_dspr2
361
+ *
362
+ * Colorspace conversion RGB -> GRAY
363
+ */
364
+
365
+ .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
366
+ r_offs, g_offs, b_offs
367
+
368
+ .macro DO_RGB_TO_GRAY r, g, b, inptr
369
+ lbu \r, \r_offs(\inptr)
370
+ lbu \g, \g_offs(\inptr)
371
+ lbu \b, \b_offs(\inptr)
372
+ addiu \inptr, \pixel_size
373
+ .endm
374
+
375
+ LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
376
+ /*
377
+ * a0 = cinfo->image_width
378
+ * a1 = input_buf
379
+ * a2 = output_buf
380
+ * a3 = output_row
381
+ * 16(sp) = num_rows
382
+ */
383
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
384
+
385
+ li s0, 0x4c8b /* s0 = FIX(0.29900) */
386
+ li s1, 0x9646 /* s1 = FIX(0.58700) */
387
+ li s2, 0x1d2f /* s2 = FIX(0.11400) */
388
+ li s7, 0x8000 /* s7 = FIX(0.50000) */
389
+ lw s6, 48(sp)
390
+ andi t7, a0, 3
391
+
392
+ 0:
393
+ addiu s6, -1 /* s6 = num_rows */
394
+ lw t0, 0(a1)
395
+ lw t1, 0(a2)
396
+ sll t3, a3, 2
397
+ lwx t1, t3(t1)
398
+ addiu a3, 1
399
+ addu t9, t1, a0
400
+ subu t8, t9, t7
401
+ beq t1, t8, 2f
402
+ nop
403
+
404
+ 1:
405
+ DO_RGB_TO_GRAY t3, t4, t5, t0
406
+ DO_RGB_TO_GRAY s3, s4, s5, t0
407
+
408
+ mtlo s7, $ac0
409
+ maddu $ac0, s2, t5
410
+ maddu $ac0, s1, t4
411
+ maddu $ac0, s0, t3
412
+ mtlo s7, $ac1
413
+ maddu $ac1, s2, s5
414
+ maddu $ac1, s1, s4
415
+ maddu $ac1, s0, s3
416
+ extr.w t6, $ac0, 16
417
+
418
+ DO_RGB_TO_GRAY t3, t4, t5, t0
419
+ DO_RGB_TO_GRAY s3, s4, s5, t0
420
+
421
+ mtlo s7, $ac0
422
+ maddu $ac0, s2, t5
423
+ maddu $ac0, s1, t4
424
+ extr.w t2, $ac1, 16
425
+ maddu $ac0, s0, t3
426
+ mtlo s7, $ac1
427
+ maddu $ac1, s2, s5
428
+ maddu $ac1, s1, s4
429
+ maddu $ac1, s0, s3
430
+ extr.w t5, $ac0, 16
431
+ sb t6, 0(t1)
432
+ sb t2, 1(t1)
433
+ extr.w t3, $ac1, 16
434
+ addiu t1, 4
435
+ sb t5, -2(t1)
436
+ sb t3, -1(t1)
437
+ bne t1, t8, 1b
438
+ nop
439
+
440
+ 2:
441
+ beqz t7, 4f
442
+ nop
443
+
444
+ 3:
445
+ DO_RGB_TO_GRAY t3, t4, t5, t0
446
+
447
+ mtlo s7, $ac0
448
+ maddu $ac0, s2, t5
449
+ maddu $ac0, s1, t4
450
+ maddu $ac0, s0, t3
451
+ extr.w t6, $ac0, 16
452
+ sb t6, 0(t1)
453
+ addiu t1, 1
454
+ bne t1, t9, 3b
455
+ nop
456
+
457
+ 4:
458
+ bgtz s6, 0b
459
+ addiu a1, 4
460
+
461
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
462
+
463
+ j ra
464
+ nop
465
+ END(jsimd_\colorid\()_gray_convert_dspr2)
466
+
467
+ .purgem DO_RGB_TO_GRAY
468
+
469
+ .endm
470
+
471
+ /*-------------------------------------id -- pix R G B */
472
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
473
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
474
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
475
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
476
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
477
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
478
+
479
+
480
+ /*****************************************************************************/
481
+ /*
482
+ * jsimd_h2v2_merged_upsample_dspr2
483
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
484
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
485
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
486
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
487
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
488
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
489
+ *
490
+ * Merged h2v2 upsample routines
491
+ */
492
+ .macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
493
+ r1_offs, g1_offs, \
494
+ b1_offs, a1_offs, \
495
+ r2_offs, g2_offs, \
496
+ b2_offs, a2_offs
497
+
498
+ .macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
499
+ scratch5 outptr
500
+ sb \scratch0, \r1_offs(\outptr)
501
+ sb \scratch1, \g1_offs(\outptr)
502
+ sb \scratch2, \b1_offs(\outptr)
503
+ sb \scratch3, \r2_offs(\outptr)
504
+ sb \scratch4, \g2_offs(\outptr)
505
+ sb \scratch5, \b2_offs(\outptr)
506
+ .if (\pixel_size == 8)
507
+ li \scratch0, 0xFF
508
+ sb \scratch0, \a1_offs(\outptr)
509
+ sb \scratch0, \a2_offs(\outptr)
510
+ .endif
511
+ addiu \outptr, \pixel_size
512
+ .endm
513
+
514
+ .macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
515
+ sb \scratch0, \r1_offs(\outptr)
516
+ sb \scratch1, \g1_offs(\outptr)
517
+ sb \scratch2, \b1_offs(\outptr)
518
+
519
+ .if (\pixel_size == 8)
520
+ li t0, 0xFF
521
+ sb t0, \a1_offs(\outptr)
522
+ .endif
523
+ .endm
524
+
525
+ LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
526
+ /*
527
+ * a0 = cinfo->output_width
528
+ * a1 = input_buf
529
+ * a2 = in_row_group_ctr
530
+ * a3 = output_buf
531
+ * 16(sp) = cinfo->sample_range_limit
532
+ */
533
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
534
+
535
+ lw t9, 56(sp) /* cinfo->sample_range_limit */
536
+ lw v0, 0(a1)
537
+ lw v1, 4(a1)
538
+ lw t0, 8(a1)
539
+ sll t1, a2, 3
540
+ addiu t2, t1, 4
541
+ sll t3, a2, 2
542
+ lw t4, 0(a3) /* t4 = output_buf[0] */
543
+ lwx t1, t1(v0) /* t1 = input_buf[0][in_row_group_ctr*2] */
544
+ lwx t2, t2(v0) /* t2 = input_buf[0][in_row_group_ctr*2 + 1] */
545
+ lwx t5, t3(v1) /* t5 = input_buf[1][in_row_group_ctr] */
546
+ lwx t6, t3(t0) /* t6 = input_buf[2][in_row_group_ctr] */
547
+ lw t7, 4(a3) /* t7 = output_buf[1] */
548
+ li s1, 0xe6ea
549
+ addiu t8, s1, 0x7fff /* t8 = 0x166e9 [FIX(1.40200)] */
550
+ addiu s0, t8, 0x5eb9 /* s0 = 0x1c5a2 [FIX(1.77200)] */
551
+ addiu s1, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
552
+ xori s2, s1, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */
553
+ srl t3, a0, 1
554
+ blez t3, 2f
555
+ addu t0, t5, t3 /* t0 = end address */
556
+ 1:
557
+ lbu t3, 0(t5)
558
+ lbu s3, 0(t6)
559
+ addiu t5, t5, 1
560
+ addiu t3, t3, -128 /* (cb - 128) */
561
+ addiu s3, s3, -128 /* (cr - 128) */
562
+ mult $ac1, s1, t3
563
+ madd $ac1, s2, s3
564
+ sll s3, s3, 15
565
+ sll t3, t3, 15
566
+ mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
567
+ extr_r.w s5, $ac1, 16
568
+ mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
569
+ lbu v0, 0(t1)
570
+ addiu t6, t6, 1
571
+ addiu t1, t1, 2
572
+ addu t3, v0, s4 /* y+cred */
573
+ addu s3, v0, s5 /* y+cgreen */
574
+ addu v1, v0, s6 /* y+cblue */
575
+ addu t3, t9, t3 /* y+cred */
576
+ addu s3, t9, s3 /* y+cgreen */
577
+ addu v1, t9, v1 /* y+cblue */
578
+ lbu AT, 0(t3)
579
+ lbu s7, 0(s3)
580
+ lbu ra, 0(v1)
581
+ lbu v0, -1(t1)
582
+ addu t3, v0, s4 /* y+cred */
583
+ addu s3, v0, s5 /* y+cgreen */
584
+ addu v1, v0, s6 /* y+cblue */
585
+ addu t3, t9, t3 /* y+cred */
586
+ addu s3, t9, s3 /* y+cgreen */
587
+ addu v1, t9, v1 /* y+cblue */
588
+ lbu t3, 0(t3)
589
+ lbu s3, 0(s3)
590
+ lbu v1, 0(v1)
591
+ lbu v0, 0(t2)
592
+
593
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
594
+
595
+ addu t3, v0, s4 /* y+cred */
596
+ addu s3, v0, s5 /* y+cgreen */
597
+ addu v1, v0, s6 /* y+cblue */
598
+ addu t3, t9, t3 /* y+cred */
599
+ addu s3, t9, s3 /* y+cgreen */
600
+ addu v1, t9, v1 /* y+cblue */
601
+ lbu AT, 0(t3)
602
+ lbu s7, 0(s3)
603
+ lbu ra, 0(v1)
604
+ lbu v0, 1(t2)
605
+ addiu t2, t2, 2
606
+ addu t3, v0, s4 /* y+cred */
607
+ addu s3, v0, s5 /* y+cgreen */
608
+ addu v1, v0, s6 /* y+cblue */
609
+ addu t3, t9, t3 /* y+cred */
610
+ addu s3, t9, s3 /* y+cgreen */
611
+ addu v1, t9, v1 /* y+cblue */
612
+ lbu t3, 0(t3)
613
+ lbu s3, 0(s3)
614
+ lbu v1, 0(v1)
615
+
616
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
617
+
618
+ bne t0, t5, 1b
619
+ nop
620
+ 2:
621
+ andi t0, a0, 1
622
+ beqz t0, 4f
623
+ lbu t3, 0(t5)
624
+ lbu s3, 0(t6)
625
+ addiu t3, t3, -128 /* (cb - 128) */
626
+ addiu s3, s3, -128 /* (cr - 128) */
627
+ mult $ac1, s1, t3
628
+ madd $ac1, s2, s3
629
+ sll s3, s3, 15
630
+ sll t3, t3, 15
631
+ lbu v0, 0(t1)
632
+ extr_r.w s5, $ac1, 16
633
+ mulq_rs.w s4, t8, s3 /* s4 = (C1 * cr + ONE_HALF)>> SCALEBITS */
634
+ mulq_rs.w s6, s0, t3 /* s6 = (C2 * cb + ONE_HALF)>> SCALEBITS */
635
+ addu t3, v0, s4 /* y+cred */
636
+ addu s3, v0, s5 /* y+cgreen */
637
+ addu v1, v0, s6 /* y+cblue */
638
+ addu t3, t9, t3 /* y+cred */
639
+ addu s3, t9, s3 /* y+cgreen */
640
+ addu v1, t9, v1 /* y+cblue */
641
+ lbu t3, 0(t3)
642
+ lbu s3, 0(s3)
643
+ lbu v1, 0(v1)
644
+ lbu v0, 0(t2)
645
+
646
+ STORE_H2V2_1_PIXEL t3, s3, v1, t4
647
+
648
+ addu t3, v0, s4 /* y+cred */
649
+ addu s3, v0, s5 /* y+cgreen */
650
+ addu v1, v0, s6 /* y+cblue */
651
+ addu t3, t9, t3 /* y+cred */
652
+ addu s3, t9, s3 /* y+cgreen */
653
+ addu v1, t9, v1 /* y+cblue */
654
+ lbu t3, 0(t3)
655
+ lbu s3, 0(s3)
656
+ lbu v1, 0(v1)
657
+
658
+ STORE_H2V2_1_PIXEL t3, s3, v1, t7
659
+ 4:
660
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
661
+
662
+ j ra
663
+ nop
664
+
665
+ END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
666
+
667
+ .purgem STORE_H2V2_1_PIXEL
668
+ .purgem STORE_H2V2_2_PIXELS
669
+ .endm
670
+
671
+ /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
672
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
673
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
674
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
675
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
676
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
677
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
678
+
679
+
680
+ /*****************************************************************************/
681
+ /*
682
+ * jsimd_h2v1_merged_upsample_dspr2
683
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
684
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
685
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
686
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
687
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
688
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
689
+ *
690
+ * Merged h2v1 upsample routines
691
+ */
692
+
693
+ .macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
694
+ r1_offs, g1_offs, \
695
+ b1_offs, a1_offs, \
696
+ r2_offs, g2_offs, \
697
+ b2_offs, a2_offs
698
+
699
+ .macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
700
+ scratch5 outptr
701
+ sb \scratch0, \r1_offs(\outptr)
702
+ sb \scratch1, \g1_offs(\outptr)
703
+ sb \scratch2, \b1_offs(\outptr)
704
+ sb \scratch3, \r2_offs(\outptr)
705
+ sb \scratch4, \g2_offs(\outptr)
706
+ sb \scratch5, \b2_offs(\outptr)
707
+ .if (\pixel_size == 8)
708
+ li t0, 0xFF
709
+ sb t0, \a1_offs(\outptr)
710
+ sb t0, \a2_offs(\outptr)
711
+ .endif
712
+ addiu \outptr, \pixel_size
713
+ .endm
714
+
715
+ .macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
716
+ sb \scratch0, \r1_offs(\outptr)
717
+ sb \scratch1, \g1_offs(\outptr)
718
+ sb \scratch2, \b1_offs(\outptr)
719
+ .if (\pixel_size == 8)
720
+ li t0, 0xFF
721
+ sb t0, \a1_offs(\outptr)
722
+ .endif
723
+ .endm
724
+
725
+ LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
726
+ /*
727
+ * a0 = cinfo->output_width
728
+ * a1 = input_buf
729
+ * a2 = in_row_group_ctr
730
+ * a3 = output_buf
731
+ * 16(sp) = range_limit
732
+ */
733
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
734
+
735
+ li t0, 0xe6ea
736
+ lw t1, 0(a1) /* t1 = input_buf[0] */
737
+ lw t2, 4(a1) /* t2 = input_buf[1] */
738
+ lw t3, 8(a1) /* t3 = input_buf[2] */
739
+ lw t8, 56(sp) /* t8 = range_limit */
740
+ addiu s1, t0, 0x7fff /* s1 = 0x166e9 [FIX(1.40200)] */
741
+ addiu s2, s1, 0x5eb9 /* s2 = 0x1c5a2 [FIX(1.77200)] */
742
+ addiu s0, t0, 0x9916 /* s0 = 0x8000 */
743
+ addiu s4, zero, 0xa7e6 /* s4 = 0xffffa7e6 [-FIX(0.34414)] */
744
+ xori s3, s4, 0xeec8 /* s3 = 0xffff492e [-FIX(0.71414)] */
745
+ srl t0, a0, 1
746
+ sll t4, a2, 2
747
+ lwx s5, t4(t1) /* s5 = inptr0 */
748
+ lwx s6, t4(t2) /* s6 = inptr1 */
749
+ lwx s7, t4(t3) /* s7 = inptr2 */
750
+ lw t7, 0(a3) /* t7 = outptr */
751
+ blez t0, 2f
752
+ addu t9, s6, t0 /* t9 = end address */
753
+ 1:
754
+ lbu t2, 0(s6) /* t2 = cb */
755
+ lbu t0, 0(s7) /* t0 = cr */
756
+ lbu t1, 0(s5) /* t1 = y */
757
+ addiu t2, t2, -128 /* t2 = cb - 128 */
758
+ addiu t0, t0, -128 /* t0 = cr - 128 */
759
+ mult $ac1, s4, t2
760
+ madd $ac1, s3, t0
761
+ sll t0, t0, 15
762
+ sll t2, t2, 15
763
+ mulq_rs.w t0, s1, t0 /* t0 = (C1*cr + ONE_HALF)>> SCALEBITS */
764
+ extr_r.w t5, $ac1, 16
765
+ mulq_rs.w t6, s2, t2 /* t6 = (C2*cb + ONE_HALF)>> SCALEBITS */
766
+ addiu s7, s7, 1
767
+ addiu s6, s6, 1
768
+ addu t2, t1, t0 /* t2 = y + cred */
769
+ addu t3, t1, t5 /* t3 = y + cgreen */
770
+ addu t4, t1, t6 /* t4 = y + cblue */
771
+ addu t2, t8, t2
772
+ addu t3, t8, t3
773
+ addu t4, t8, t4
774
+ lbu t1, 1(s5)
775
+ lbu v0, 0(t2)
776
+ lbu v1, 0(t3)
777
+ lbu ra, 0(t4)
778
+ addu t2, t1, t0
779
+ addu t3, t1, t5
780
+ addu t4, t1, t6
781
+ addu t2, t8, t2
782
+ addu t3, t8, t3
783
+ addu t4, t8, t4
784
+ lbu t2, 0(t2)
785
+ lbu t3, 0(t3)
786
+ lbu t4, 0(t4)
787
+
788
+ STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
789
+
790
+ bne t9, s6, 1b
791
+ addiu s5, s5, 2
792
+ 2:
793
+ andi t0, a0, 1
794
+ beqz t0, 4f
795
+ nop
796
+ 3:
797
+ lbu t2, 0(s6)
798
+ lbu t0, 0(s7)
799
+ lbu t1, 0(s5)
800
+ addiu t2, t2, -128 /* (cb - 128) */
801
+ addiu t0, t0, -128 /* (cr - 128) */
802
+ mul t3, s4, t2
803
+ mul t4, s3, t0
804
+ sll t0, t0, 15
805
+ sll t2, t2, 15
806
+ mulq_rs.w t0, s1, t0 /* (C1*cr + ONE_HALF)>> SCALEBITS */
807
+ mulq_rs.w t6, s2, t2 /* (C2*cb + ONE_HALF)>> SCALEBITS */
808
+ addu t3, t3, s0
809
+ addu t3, t4, t3
810
+ sra t5, t3, 16 /* (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS */
811
+ addu t2, t1, t0 /* y + cred */
812
+ addu t3, t1, t5 /* y + cgreen */
813
+ addu t4, t1, t6 /* y + cblue */
814
+ addu t2, t8, t2
815
+ addu t3, t8, t3
816
+ addu t4, t8, t4
817
+ lbu t2, 0(t2)
818
+ lbu t3, 0(t3)
819
+ lbu t4, 0(t4)
820
+
821
+ STORE_H2V1_1_PIXEL t2, t3, t4, t7
822
+ 4:
823
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
824
+
825
+ j ra
826
+ nop
827
+
828
+ END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
829
+
830
+ .purgem STORE_H2V1_1_PIXEL
831
+ .purgem STORE_H2V1_2_PIXELS
832
+ .endm
833
+
834
+ /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
835
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
836
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
837
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
838
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
839
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
840
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
841
+
842
+
843
+ /*****************************************************************************/
844
+ /*
845
+ * jsimd_h2v2_fancy_upsample_dspr2
846
+ *
847
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
848
+ */
849
+ LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
850
+ /*
851
+ * a0 = cinfo->max_v_samp_factor
852
+ * a1 = downsampled_width
853
+ * a2 = input_data
854
+ * a3 = output_data_ptr
855
+ */
856
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
857
+
858
+ li s4, 0
859
+ lw s2, 0(a3) /* s2 = *output_data_ptr */
860
+ 0:
861
+ li t9, 2
862
+ lw s1, -4(a2) /* s1 = inptr1 */
863
+
864
+ 1:
865
+ lw s0, 0(a2) /* s0 = inptr0 */
866
+ lwx s3, s4(s2)
867
+ addiu s5, a1, -2 /* s5 = downsampled_width - 2 */
868
+ srl t4, s5, 1
869
+ sll t4, t4, 1
870
+ lbu t0, 0(s0)
871
+ lbu t1, 1(s0)
872
+ lbu t2, 0(s1)
873
+ lbu t3, 1(s1)
874
+ addiu s0, 2
875
+ addiu s1, 2
876
+ addu t8, s0, t4 /* t8 = end address */
877
+ andi s5, s5, 1 /* s5 = residual */
878
+ sll t4, t0, 1
879
+ sll t6, t1, 1
880
+ addu t0, t0, t4 /* t0 = (*inptr0++) * 3 */
881
+ addu t1, t1, t6 /* t1 = (*inptr0++) * 3 */
882
+ addu t7, t0, t2 /* t7 = thiscolsum */
883
+ addu t6, t1, t3 /* t5 = nextcolsum */
884
+ sll t0, t7, 2 /* t0 = thiscolsum * 4 */
885
+ subu t1, t0, t7 /* t1 = thiscolsum * 3 */
886
+ shra_r.w t0, t0, 4
887
+ addiu t1, 7
888
+ addu t1, t1, t6
889
+ srl t1, t1, 4
890
+ sb t0, 0(s3)
891
+ sb t1, 1(s3)
892
+ beq t8, s0, 22f /* skip to final iteration if width == 3 */
893
+ addiu s3, 2
894
+ 2:
895
+ lh t0, 0(s0) /* t0 = A3|A2 */
896
+ lh t2, 0(s1) /* t2 = B3|B2 */
897
+ addiu s0, 2
898
+ addiu s1, 2
899
+ preceu.ph.qbr t0, t0 /* t0 = 0|A3|0|A2 */
900
+ preceu.ph.qbr t2, t2 /* t2 = 0|B3|0|B2 */
901
+ shll.ph t1, t0, 1
902
+ sll t3, t6, 1
903
+ addu.ph t0, t1, t0 /* t0 = A3*3|A2*3 */
904
+ addu t3, t3, t6 /* t3 = this * 3 */
905
+ addu.ph t0, t0, t2 /* t0 = next2|next1 */
906
+ addu t1, t3, t7
907
+ andi t7, t0, 0xFFFF /* t7 = next1 */
908
+ sll t2, t7, 1
909
+ addu t2, t7, t2 /* t2 = next1*3 */
910
+ addu t4, t2, t6
911
+ srl t6, t0, 16 /* t6 = next2 */
912
+ shra_r.w t1, t1, 4 /* t1 = (this*3 + last + 8) >> 4 */
913
+ addu t0, t3, t7
914
+ addiu t0, 7
915
+ srl t0, t0, 4 /* t0 = (this*3 + next1 + 7) >> 4 */
916
+ shra_r.w t4, t4, 4 /* t3 = (next1*3 + this + 8) >> 4 */
917
+ addu t2, t2, t6
918
+ addiu t2, 7
919
+ srl t2, t2, 4 /* t2 = (next1*3 + next2 + 7) >> 4 */
920
+ sb t1, 0(s3)
921
+ sb t0, 1(s3)
922
+ sb t4, 2(s3)
923
+ sb t2, 3(s3)
924
+ bne t8, s0, 2b
925
+ addiu s3, 4
926
+ 22:
927
+ beqz s5, 4f
928
+ addu t8, s0, s5
929
+ 3:
930
+ lbu t0, 0(s0)
931
+ lbu t2, 0(s1)
932
+ addiu s0, 1
933
+ addiu s1, 1
934
+ sll t3, t6, 1
935
+ sll t1, t0, 1
936
+ addu t1, t0, t1 /* t1 = inptr0 * 3 */
937
+ addu t3, t3, t6 /* t3 = thiscolsum * 3 */
938
+ addu t5, t1, t2
939
+ addu t1, t3, t7
940
+ shra_r.w t1, t1, 4
941
+ addu t0, t3, t5
942
+ addiu t0, 7
943
+ srl t0, t0, 4
944
+ sb t1, 0(s3)
945
+ sb t0, 1(s3)
946
+ addiu s3, 2
947
+ move t7, t6
948
+ bne t8, s0, 3b
949
+ move t6, t5
950
+ 4:
951
+ sll t0, t6, 2 /* t0 = thiscolsum * 4 */
952
+ subu t1, t0, t6 /* t1 = thiscolsum * 3 */
953
+ addu t1, t1, t7
954
+ addiu s4, 4
955
+ shra_r.w t1, t1, 4
956
+ addiu t0, 7
957
+ srl t0, t0, 4
958
+ sb t1, 0(s3)
959
+ sb t0, 1(s3)
960
+ addiu t9, -1
961
+ addiu s3, 2
962
+ bnez t9, 1b
963
+ lw s1, 4(a2)
964
+ srl t0, s4, 2
965
+ subu t0, a0, t0
966
+ bgtz t0, 0b
967
+ addiu a2, 4
968
+
969
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
970
+
971
+ j ra
972
+ nop
973
+ END(jsimd_h2v2_fancy_upsample_dspr2)
974
+
975
+
976
+ /*****************************************************************************/
977
+ LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
978
+ /*
979
+ * a0 = cinfo->max_v_samp_factor
980
+ * a1 = downsampled_width
981
+ * a2 = input_data
982
+ * a3 = output_data_ptr
983
+ */
984
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
985
+
986
+ .set at
987
+
988
+ beqz a0, 3f
989
+ sll t0, a0, 2
990
+ lw s1, 0(a3)
991
+ li s3, 0x10001
992
+ addu s0, s1, t0
993
+ 0:
994
+ addiu t8, a1, -2
995
+ srl t9, t8, 2
996
+ lw t7, 0(a2)
997
+ lw s2, 0(s1)
998
+ lbu t0, 0(t7)
999
+ lbu t1, 1(t7) /* t1 = inptr[1] */
1000
+ sll t2, t0, 1
1001
+ addu t2, t2, t0 /* t2 = invalue*3 */
1002
+ addu t2, t2, t1
1003
+ shra_r.w t2, t2, 2
1004
+ sb t0, 0(s2)
1005
+ sb t2, 1(s2)
1006
+ beqz t9, 11f
1007
+ addiu s2, 2
1008
+ 1:
1009
+ ulw t0, 0(t7) /* t0 = |P3|P2|P1|P0| */
1010
+ ulw t1, 1(t7)
1011
+ ulh t2, 4(t7) /* t2 = |0|0|P5|P4| */
1012
+ preceu.ph.qbl t3, t0 /* t3 = |0|P3|0|P2| */
1013
+ preceu.ph.qbr t0, t0 /* t0 = |0|P1|0|P0| */
1014
+ preceu.ph.qbr t2, t2 /* t2 = |0|P5|0|P4| */
1015
+ preceu.ph.qbl t4, t1 /* t4 = |0|P4|0|P3| */
1016
+ preceu.ph.qbr t1, t1 /* t1 = |0|P2|0|P1| */
1017
+ shll.ph t5, t4, 1
1018
+ shll.ph t6, t1, 1
1019
+ addu.ph t5, t5, t4 /* t5 = |P4*3|P3*3| */
1020
+ addu.ph t6, t6, t1 /* t6 = |P2*3|P1*3| */
1021
+ addu.ph t4, t3, s3
1022
+ addu.ph t0, t0, s3
1023
+ addu.ph t4, t4, t5
1024
+ addu.ph t0, t0, t6
1025
+ shrl.ph t4, t4, 2 /* t4 = |0|P3|0|P2| */
1026
+ shrl.ph t0, t0, 2 /* t0 = |0|P1|0|P0| */
1027
+ addu.ph t2, t2, t5
1028
+ addu.ph t3, t3, t6
1029
+ shra_r.ph t2, t2, 2 /* t2 = |0|P5|0|P4| */
1030
+ shra_r.ph t3, t3, 2 /* t3 = |0|P3|0|P2| */
1031
+ shll.ph t2, t2, 8
1032
+ shll.ph t3, t3, 8
1033
+ or t2, t4, t2
1034
+ or t3, t3, t0
1035
+ addiu t9, -1
1036
+ usw t3, 0(s2)
1037
+ usw t2, 4(s2)
1038
+ addiu s2, 8
1039
+ bgtz t9, 1b
1040
+ addiu t7, 4
1041
+ 11:
1042
+ andi t8, 3
1043
+ beqz t8, 22f
1044
+ addiu t7, 1
1045
+
1046
+ 2:
1047
+ lbu t0, 0(t7)
1048
+ addiu t7, 1
1049
+ sll t1, t0, 1
1050
+ addu t2, t0, t1 /* t2 = invalue */
1051
+ lbu t3, -2(t7)
1052
+ lbu t4, 0(t7)
1053
+ addiu t3, 1
1054
+ addiu t4, 2
1055
+ addu t3, t3, t2
1056
+ addu t4, t4, t2
1057
+ srl t3, 2
1058
+ srl t4, 2
1059
+ sb t3, 0(s2)
1060
+ sb t4, 1(s2)
1061
+ addiu t8, -1
1062
+ bgtz t8, 2b
1063
+ addiu s2, 2
1064
+
1065
+ 22:
1066
+ lbu t0, 0(t7)
1067
+ lbu t2, -1(t7)
1068
+ sll t1, t0, 1
1069
+ addu t1, t1, t0 /* t1 = invalue * 3 */
1070
+ addu t1, t1, t2
1071
+ addiu t1, 1
1072
+ srl t1, t1, 2
1073
+ sb t1, 0(s2)
1074
+ sb t0, 1(s2)
1075
+ addiu s1, 4
1076
+ bne s1, s0, 0b
1077
+ addiu a2, 4
1078
+ 3:
1079
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1080
+
1081
+ j ra
1082
+ nop
1083
+ END(jsimd_h2v1_fancy_upsample_dspr2)
1084
+
1085
+
1086
+ /*****************************************************************************/
1087
+ LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
1088
+ /*
1089
+ * a0 = cinfo->image_width
1090
+ * a1 = cinfo->max_v_samp_factor
1091
+ * a2 = compptr->v_samp_factor
1092
+ * a3 = compptr->width_in_blocks
1093
+ * 16(sp) = input_data
1094
+ * 20(sp) = output_data
1095
+ */
1096
+ .set at
1097
+
1098
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
1099
+
1100
+ beqz a2, 7f
1101
+ lw s1, 44(sp) /* s1 = output_data */
1102
+ lw s0, 40(sp) /* s0 = input_data */
1103
+ srl s2, a0, 2
1104
+ andi t9, a0, 2
1105
+ srl t7, t9, 1
1106
+ addu s2, t7, s2
1107
+ sll t0, a3, 3 /* t0 = width_in_blocks*DCT */
1108
+ srl t7, t0, 1
1109
+ subu s2, t7, s2
1110
+ 0:
1111
+ andi t6, a0, 1 /* t6 = temp_index */
1112
+ addiu t6, -1
1113
+ lw t4, 0(s1) /* t4 = outptr */
1114
+ lw t5, 0(s0) /* t5 = inptr0 */
1115
+ li s3, 0 /* s3 = bias */
1116
+ srl t7, a0, 1 /* t7 = image_width1 */
1117
+ srl s4, t7, 2
1118
+ andi t8, t7, 3
1119
+ 1:
1120
+ ulhu t0, 0(t5)
1121
+ ulhu t1, 2(t5)
1122
+ ulhu t2, 4(t5)
1123
+ ulhu t3, 6(t5)
1124
+ raddu.w.qb t0, t0
1125
+ raddu.w.qb t1, t1
1126
+ raddu.w.qb t2, t2
1127
+ raddu.w.qb t3, t3
1128
+ shra.ph t0, t0, 1
1129
+ shra_r.ph t1, t1, 1
1130
+ shra.ph t2, t2, 1
1131
+ shra_r.ph t3, t3, 1
1132
+ sb t0, 0(t4)
1133
+ sb t1, 1(t4)
1134
+ sb t2, 2(t4)
1135
+ sb t3, 3(t4)
1136
+ addiu s4, -1
1137
+ addiu t4, 4
1138
+ bgtz s4, 1b
1139
+ addiu t5, 8
1140
+ beqz t8, 3f
1141
+ addu s4, t4, t8
1142
+ 2:
1143
+ ulhu t0, 0(t5)
1144
+ raddu.w.qb t0, t0
1145
+ addqh.w t0, t0, s3
1146
+ xori s3, s3, 1
1147
+ sb t0, 0(t4)
1148
+ addiu t4, 1
1149
+ bne t4, s4, 2b
1150
+ addiu t5, 2
1151
+ 3:
1152
+ lbux t1, t6(t5)
1153
+ sll t1, 1
1154
+ addqh.w t2, t1, s3 /* t2 = pixval1 */
1155
+ xori s3, s3, 1
1156
+ addqh.w t3, t1, s3 /* t3 = pixval2 */
1157
+ blez s2, 5f
1158
+ append t3, t2, 8
1159
+ addu t5, t4, s2 /* t5 = loop_end2 */
1160
+ 4:
1161
+ ush t3, 0(t4)
1162
+ addiu s2, -1
1163
+ bgtz s2, 4b
1164
+ addiu t4, 2
1165
+ 5:
1166
+ beqz t9, 6f
1167
+ nop
1168
+ sb t2, 0(t4)
1169
+ 6:
1170
+ addiu s1, 4
1171
+ addiu a2, -1
1172
+ bnez a2, 0b
1173
+ addiu s0, 4
1174
+ 7:
1175
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
1176
+
1177
+ j ra
1178
+ nop
1179
+ END(jsimd_h2v1_downsample_dspr2)
1180
+
1181
+
1182
+ /*****************************************************************************/
1183
+ LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
1184
+ /*
1185
+ * a0 = cinfo->image_width
1186
+ * a1 = cinfo->max_v_samp_factor
1187
+ * a2 = compptr->v_samp_factor
1188
+ * a3 = compptr->width_in_blocks
1189
+ * 16(sp) = input_data
1190
+ * 20(sp) = output_data
1191
+ */
1192
+ .set at
1193
+
1194
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1195
+
1196
+ beqz a2, 8f
1197
+ lw s1, 52(sp) /* s1 = output_data */
1198
+ lw s0, 48(sp) /* s0 = input_data */
1199
+
1200
+ andi t6, a0, 1 /* t6 = temp_index */
1201
+ addiu t6, -1
1202
+ srl t7, a0, 1 /* t7 = image_width1 */
1203
+ srl s4, t7, 2
1204
+ andi t8, t7, 3
1205
+ andi t9, a0, 2
1206
+ srl s2, a0, 2
1207
+ srl t7, t9, 1
1208
+ addu s2, t7, s2
1209
+ sll t0, a3, 3 /* s2 = width_in_blocks*DCT */
1210
+ srl t7, t0, 1
1211
+ subu s2, t7, s2
1212
+ 0:
1213
+ lw t4, 0(s1) /* t4 = outptr */
1214
+ lw t5, 0(s0) /* t5 = inptr0 */
1215
+ lw s7, 4(s0) /* s7 = inptr1 */
1216
+ li s6, 1 /* s6 = bias */
1217
+ 2:
1218
+ ulw t0, 0(t5) /* t0 = |P3|P2|P1|P0| */
1219
+ ulw t1, 0(s7) /* t1 = |Q3|Q2|Q1|Q0| */
1220
+ ulw t2, 4(t5)
1221
+ ulw t3, 4(s7)
1222
+ precrq.ph.w t7, t0, t1 /* t2 = |P3|P2|Q3|Q2| */
1223
+ ins t0, t1, 16, 16 /* t0 = |Q1|Q0|P1|P0| */
1224
+ raddu.w.qb t1, t7
1225
+ raddu.w.qb t0, t0
1226
+ shra_r.w t1, t1, 2
1227
+ addiu t0, 1
1228
+ srl t0, 2
1229
+ precrq.ph.w t7, t2, t3
1230
+ ins t2, t3, 16, 16
1231
+ raddu.w.qb t7, t7
1232
+ raddu.w.qb t2, t2
1233
+ shra_r.w t7, t7, 2
1234
+ addiu t2, 1
1235
+ srl t2, 2
1236
+ sb t0, 0(t4)
1237
+ sb t1, 1(t4)
1238
+ sb t2, 2(t4)
1239
+ sb t7, 3(t4)
1240
+ addiu t4, 4
1241
+ addiu t5, 8
1242
+ addiu s4, s4, -1
1243
+ bgtz s4, 2b
1244
+ addiu s7, 8
1245
+ beqz t8, 4f
1246
+ addu t8, t4, t8
1247
+ 3:
1248
+ ulhu t0, 0(t5)
1249
+ ulhu t1, 0(s7)
1250
+ ins t0, t1, 16, 16
1251
+ raddu.w.qb t0, t0
1252
+ addu t0, t0, s6
1253
+ srl t0, 2
1254
+ xori s6, s6, 3
1255
+ sb t0, 0(t4)
1256
+ addiu t5, 2
1257
+ addiu t4, 1
1258
+ bne t8, t4, 3b
1259
+ addiu s7, 2
1260
+ 4:
1261
+ lbux t1, t6(t5)
1262
+ sll t1, 1
1263
+ lbux t0, t6(s7)
1264
+ sll t0, 1
1265
+ addu t1, t1, t0
1266
+ addu t3, t1, s6
1267
+ srl t0, t3, 2 /* t2 = pixval1 */
1268
+ xori s6, s6, 3
1269
+ addu t2, t1, s6
1270
+ srl t1, t2, 2 /* t3 = pixval2 */
1271
+ blez s2, 6f
1272
+ append t1, t0, 8
1273
+ 5:
1274
+ ush t1, 0(t4)
1275
+ addiu s2, -1
1276
+ bgtz s2, 5b
1277
+ addiu t4, 2
1278
+ 6:
1279
+ beqz t9, 7f
1280
+ nop
1281
+ sb t0, 0(t4)
1282
+ 7:
1283
+ addiu s1, 4
1284
+ addiu a2, -1
1285
+ bnez a2, 0b
1286
+ addiu s0, 8
1287
+ 8:
1288
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1289
+
1290
+ j ra
1291
+ nop
1292
+ END(jsimd_h2v2_downsample_dspr2)
1293
+
1294
+
1295
+ /*****************************************************************************/
1296
+ LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
1297
+ /*
1298
+ * a0 = input_data
1299
+ * a1 = output_data
1300
+ * a2 = compptr->v_samp_factor
1301
+ * a3 = cinfo->max_v_samp_factor
1302
+ * 16(sp) = cinfo->smoothing_factor
1303
+ * 20(sp) = compptr->width_in_blocks
1304
+ * 24(sp) = cinfo->image_width
1305
+ */
1306
+ .set at
1307
+
1308
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1309
+
1310
+ lw s7, 52(sp) /* compptr->width_in_blocks */
1311
+ lw s0, 56(sp) /* cinfo->image_width */
1312
+ lw s6, 48(sp) /* cinfo->smoothing_factor */
1313
+ sll s7, 3 /* output_cols = width_in_blocks * DCTSIZE */
1314
+ sll v0, s7, 1
1315
+ subu v0, v0, s0
1316
+ blez v0, 2f
1317
+ move v1, zero
1318
+ addiu t0, a3, 2 /* t0 = cinfo->max_v_samp_factor + 2 */
1319
+ 0:
1320
+ addiu t1, a0, -4
1321
+ sll t2, v1, 2
1322
+ lwx t1, t2(t1)
1323
+ move t3, v0
1324
+ addu t1, t1, s0
1325
+ lbu t2, -1(t1)
1326
+ 1:
1327
+ addiu t3, t3, -1
1328
+ sb t2, 0(t1)
1329
+ bgtz t3, 1b
1330
+ addiu t1, t1, 1
1331
+ addiu v1, v1, 1
1332
+ bne v1, t0, 0b
1333
+ nop
1334
+ 2:
1335
+ li v0, 80
1336
+ mul v0, s6, v0
1337
+ li v1, 16384
1338
+ move t4, zero
1339
+ move t5, zero
1340
+ subu t6, v1, v0 /* t6 = 16384 - tmp_smoot_f * 80 */
1341
+ sll t7, s6, 4 /* t7 = tmp_smoot_f * 16 */
1342
+ 3:
1343
+ /* Special case for first column: pretend column -1 is same as column 0 */
1344
+ sll v0, t4, 2
1345
+ lwx t8, v0(a1) /* outptr = output_data[outrow] */
1346
+ sll v1, t5, 2
1347
+ addiu t9, v1, 4
1348
+ addiu s0, v1, -4
1349
+ addiu s1, v1, 8
1350
+ lwx s2, v1(a0) /* inptr0 = input_data[inrow] */
1351
+ lwx t9, t9(a0) /* inptr1 = input_data[inrow+1] */
1352
+ lwx s0, s0(a0) /* above_ptr = input_data[inrow-1] */
1353
+ lwx s1, s1(a0) /* below_ptr = input_data[inrow+2] */
1354
+ lh v0, 0(s2)
1355
+ lh v1, 0(t9)
1356
+ lh t0, 0(s0)
1357
+ lh t1, 0(s1)
1358
+ ins v0, v1, 16, 16
1359
+ ins t0, t1, 16, 16
1360
+ raddu.w.qb t2, v0
1361
+ raddu.w.qb s3, t0
1362
+ lbu v0, 0(s2)
1363
+ lbu v1, 2(s2)
1364
+ lbu t0, 0(t9)
1365
+ lbu t1, 2(t9)
1366
+ addu v0, v0, v1
1367
+ mult $ac1, t2, t6
1368
+ addu t0, t0, t1
1369
+ lbu t2, 2(s0)
1370
+ addu t0, t0, v0
1371
+ lbu t3, 2(s1)
1372
+ addu s3, t0, s3
1373
+ lbu v0, 0(s0)
1374
+ lbu t0, 0(s1)
1375
+ sll s3, s3, 1
1376
+ addu v0, v0, t2
1377
+ addu t0, t0, t3
1378
+ addu t0, t0, v0
1379
+ addu s3, t0, s3
1380
+ madd $ac1, s3, t7
1381
+ extr_r.w v0, $ac1, 16
1382
+ addiu t8, t8, 1
1383
+ addiu s2, s2, 2
1384
+ addiu t9, t9, 2
1385
+ addiu s0, s0, 2
1386
+ addiu s1, s1, 2
1387
+ sb v0, -1(t8)
1388
+ addiu s4, s7, -2
1389
+ and s4, s4, 3
1390
+ addu s5, s4, t8 /* end address */
1391
+ 4:
1392
+ lh v0, 0(s2)
1393
+ lh v1, 0(t9)
1394
+ lh t0, 0(s0)
1395
+ lh t1, 0(s1)
1396
+ ins v0, v1, 16, 16
1397
+ ins t0, t1, 16, 16
1398
+ raddu.w.qb t2, v0
1399
+ raddu.w.qb s3, t0
1400
+ lbu v0, -1(s2)
1401
+ lbu v1, 2(s2)
1402
+ lbu t0, -1(t9)
1403
+ lbu t1, 2(t9)
1404
+ addu v0, v0, v1
1405
+ mult $ac1, t2, t6
1406
+ addu t0, t0, t1
1407
+ lbu t2, 2(s0)
1408
+ addu t0, t0, v0
1409
+ lbu t3, 2(s1)
1410
+ addu s3, t0, s3
1411
+ lbu v0, -1(s0)
1412
+ lbu t0, -1(s1)
1413
+ sll s3, s3, 1
1414
+ addu v0, v0, t2
1415
+ addu t0, t0, t3
1416
+ addu t0, t0, v0
1417
+ addu s3, t0, s3
1418
+ madd $ac1, s3, t7
1419
+ extr_r.w t2, $ac1, 16
1420
+ addiu t8, t8, 1
1421
+ addiu s2, s2, 2
1422
+ addiu t9, t9, 2
1423
+ addiu s0, s0, 2
1424
+ sb t2, -1(t8)
1425
+ bne s5, t8, 4b
1426
+ addiu s1, s1, 2
1427
+ addiu s5, s7, -2
1428
+ subu s5, s5, s4
1429
+ addu s5, s5, t8 /* end address */
1430
+ 5:
1431
+ lh v0, 0(s2)
1432
+ lh v1, 0(t9)
1433
+ lh t0, 0(s0)
1434
+ lh t1, 0(s1)
1435
+ ins v0, v1, 16, 16
1436
+ ins t0, t1, 16, 16
1437
+ raddu.w.qb t2, v0
1438
+ raddu.w.qb s3, t0
1439
+ lbu v0, -1(s2)
1440
+ lbu v1, 2(s2)
1441
+ lbu t0, -1(t9)
1442
+ lbu t1, 2(t9)
1443
+ addu v0, v0, v1
1444
+ mult $ac1, t2, t6
1445
+ addu t0, t0, t1
1446
+ lbu t2, 2(s0)
1447
+ addu t0, t0, v0
1448
+ lbu t3, 2(s1)
1449
+ addu s3, t0, s3
1450
+ lbu v0, -1(s0)
1451
+ lbu t0, -1(s1)
1452
+ sll s3, s3, 1
1453
+ addu v0, v0, t2
1454
+ addu t0, t0, t3
1455
+ lh v1, 2(t9)
1456
+ addu t0, t0, v0
1457
+ lh v0, 2(s2)
1458
+ addu s3, t0, s3
1459
+ lh t0, 2(s0)
1460
+ lh t1, 2(s1)
1461
+ madd $ac1, s3, t7
1462
+ extr_r.w t2, $ac1, 16
1463
+ ins t0, t1, 16, 16
1464
+ ins v0, v1, 16, 16
1465
+ raddu.w.qb s3, t0
1466
+ lbu v1, 4(s2)
1467
+ lbu t0, 1(t9)
1468
+ lbu t1, 4(t9)
1469
+ sb t2, 0(t8)
1470
+ raddu.w.qb t3, v0
1471
+ lbu v0, 1(s2)
1472
+ addu t0, t0, t1
1473
+ mult $ac1, t3, t6
1474
+ addu v0, v0, v1
1475
+ lbu t2, 4(s0)
1476
+ addu t0, t0, v0
1477
+ lbu v0, 1(s0)
1478
+ addu s3, t0, s3
1479
+ lbu t0, 1(s1)
1480
+ lbu t3, 4(s1)
1481
+ addu v0, v0, t2
1482
+ sll s3, s3, 1
1483
+ addu t0, t0, t3
1484
+ lh v1, 4(t9)
1485
+ addu t0, t0, v0
1486
+ lh v0, 4(s2)
1487
+ addu s3, t0, s3
1488
+ lh t0, 4(s0)
1489
+ lh t1, 4(s1)
1490
+ madd $ac1, s3, t7
1491
+ extr_r.w t2, $ac1, 16
1492
+ ins t0, t1, 16, 16
1493
+ ins v0, v1, 16, 16
1494
+ raddu.w.qb s3, t0
1495
+ lbu v1, 6(s2)
1496
+ lbu t0, 3(t9)
1497
+ lbu t1, 6(t9)
1498
+ sb t2, 1(t8)
1499
+ raddu.w.qb t3, v0
1500
+ lbu v0, 3(s2)
1501
+ addu t0, t0, t1
1502
+ mult $ac1, t3, t6
1503
+ addu v0, v0, v1
1504
+ lbu t2, 6(s0)
1505
+ addu t0, t0, v0
1506
+ lbu v0, 3(s0)
1507
+ addu s3, t0, s3
1508
+ lbu t0, 3(s1)
1509
+ lbu t3, 6(s1)
1510
+ addu v0, v0, t2
1511
+ sll s3, s3, 1
1512
+ addu t0, t0, t3
1513
+ lh v1, 6(t9)
1514
+ addu t0, t0, v0
1515
+ lh v0, 6(s2)
1516
+ addu s3, t0, s3
1517
+ lh t0, 6(s0)
1518
+ lh t1, 6(s1)
1519
+ madd $ac1, s3, t7
1520
+ extr_r.w t3, $ac1, 16
1521
+ ins t0, t1, 16, 16
1522
+ ins v0, v1, 16, 16
1523
+ raddu.w.qb s3, t0
1524
+ lbu v1, 8(s2)
1525
+ lbu t0, 5(t9)
1526
+ lbu t1, 8(t9)
1527
+ sb t3, 2(t8)
1528
+ raddu.w.qb t2, v0
1529
+ lbu v0, 5(s2)
1530
+ addu t0, t0, t1
1531
+ mult $ac1, t2, t6
1532
+ addu v0, v0, v1
1533
+ lbu t2, 8(s0)
1534
+ addu t0, t0, v0
1535
+ lbu v0, 5(s0)
1536
+ addu s3, t0, s3
1537
+ lbu t0, 5(s1)
1538
+ lbu t3, 8(s1)
1539
+ addu v0, v0, t2
1540
+ sll s3, s3, 1
1541
+ addu t0, t0, t3
1542
+ addiu t8, t8, 4
1543
+ addu t0, t0, v0
1544
+ addiu s2, s2, 8
1545
+ addu s3, t0, s3
1546
+ addiu t9, t9, 8
1547
+ madd $ac1, s3, t7
1548
+ extr_r.w t1, $ac1, 16
1549
+ addiu s0, s0, 8
1550
+ addiu s1, s1, 8
1551
+ bne s5, t8, 5b
1552
+ sb t1, -1(t8)
1553
+ /* Special case for last column */
1554
+ lh v0, 0(s2)
1555
+ lh v1, 0(t9)
1556
+ lh t0, 0(s0)
1557
+ lh t1, 0(s1)
1558
+ ins v0, v1, 16, 16
1559
+ ins t0, t1, 16, 16
1560
+ raddu.w.qb t2, v0
1561
+ raddu.w.qb s3, t0
1562
+ lbu v0, -1(s2)
1563
+ lbu v1, 1(s2)
1564
+ lbu t0, -1(t9)
1565
+ lbu t1, 1(t9)
1566
+ addu v0, v0, v1
1567
+ mult $ac1, t2, t6
1568
+ addu t0, t0, t1
1569
+ lbu t2, 1(s0)
1570
+ addu t0, t0, v0
1571
+ lbu t3, 1(s1)
1572
+ addu s3, t0, s3
1573
+ lbu v0, -1(s0)
1574
+ lbu t0, -1(s1)
1575
+ sll s3, s3, 1
1576
+ addu v0, v0, t2
1577
+ addu t0, t0, t3
1578
+ addu t0, t0, v0
1579
+ addu s3, t0, s3
1580
+ madd $ac1, s3, t7
1581
+ extr_r.w t0, $ac1, 16
1582
+ addiu t5, t5, 2
1583
+ sb t0, 0(t8)
1584
+ addiu t4, t4, 1
1585
+ bne t4, a2, 3b
1586
+ addiu t5, t5, 2
1587
+
1588
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1589
+
1590
+ j ra
1591
+ nop
1592
+
1593
+ END(jsimd_h2v2_smooth_downsample_dspr2)
1594
+
1595
+
1596
+ /*****************************************************************************/
1597
+ LEAF_DSPR2(jsimd_int_upsample_dspr2)
1598
+ /*
1599
+ * a0 = upsample->h_expand[compptr->component_index]
1600
+ * a1 = upsample->v_expand[compptr->component_index]
1601
+ * a2 = input_data
1602
+ * a3 = output_data_ptr
1603
+ * 16(sp) = cinfo->output_width
1604
+ * 20(sp) = cinfo->max_v_samp_factor
1605
+ */
1606
+ /*
+ * Replicating ("int") upsampler: each input pixel is written
+ * h_expand times across the output row, then the finished row is
+ * duplicated into the next v_expand - 1 output rows (16 bytes per
+ * iteration with a bytewise tail for output_width & 0xF).
+ * The caller's 16/20(sp) arguments are read below at 32/36(sp)
+ * because SAVE_REGS_ON_STACK 16 lowered sp by 16 bytes.
+ * Note: branches below execute their delay-slot instruction.
+ */
+ .set at
1607
+
1608
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
1609
+
1610
+ lw s0, 0(a3) /* s0 = output_data */
1611
+ lw s1, 32(sp) /* s1 = cinfo->output_width */
1612
+ lw s2, 36(sp) /* s2 = cinfo->max_v_samp_factor */
1613
+ li t6, 0 /* t6 = inrow */
1614
+ beqz s2, 10f
1615
+ li s3, 0 /* s3 = outrow */
1616
+ 0:
1617
+ addu t0, a2, t6
1618
+ addu t7, s0, s3
1619
+ lw t3, 0(t0) /* t3 = inptr */
1620
+ lw t8, 0(t7) /* t8 = outptr */
1621
+ beqz s1, 4f
1622
+ addu t5, t8, s1 /* t5 = outend */
1623
+ 1:
1624
+ lb t2, 0(t3) /* t2 = invalue = *inptr++ */
1625
+ addiu t3, 1
1626
+ beqz a0, 3f
1627
+ move t0, a0 /* t0 = h_expand */
1628
+ 2:
1629
+ /* inner loop: store the same sample h_expand times */
+ sb t2, 0(t8)
1630
+ addiu t0, -1
1631
+ bgtz t0, 2b
1632
+ addiu t8, 1
1633
+ 3:
1634
+ bgt t5, t8, 1b
1635
+ nop
1636
+ 4:
1637
+ /* duplicate the just-built row into v_expand - 1 following rows */
+ addiu t9, a1, -1 /* t9 = v_expand - 1 */
1638
+ blez t9, 9f
1639
+ nop
1640
+ 5:
1641
+ lw t3, 0(s0)
1642
+ lw t4, 4(s0)
1643
+ subu t0, s1, 0xF
1644
+ blez t0, 7f
1645
+ addu t5, t3, s1 /* t5 = end address */
1646
+ andi t7, s1, 0xF /* t7 = residual */
1647
+ subu t8, t5, t7
1648
+ 6:
1649
+ /* unaligned 16-byte row copy (ulw/usw tolerate any alignment) */
+ ulw t0, 0(t3)
1650
+ ulw t1, 4(t3)
1651
+ ulw t2, 8(t3)
1652
+ usw t0, 0(t4)
1653
+ ulw t0, 12(t3)
1654
+ usw t1, 4(t4)
1655
+ usw t2, 8(t4)
1656
+ usw t0, 12(t4)
1657
+ addiu t3, 16
1658
+ bne t3, t8, 6b
1659
+ addiu t4, 16
1660
+ beqz t7, 8f
1661
+ nop
1662
+ 7:
1663
+ /* bytewise copy of the residual tail */
+ lbu t0, 0(t3)
1664
+ sb t0, 0(t4)
1665
+ addiu t3, 1
1666
+ bne t3, t5, 7b
1667
+ addiu t4, 1
1668
+ 8:
1669
+ addiu t9, -1
1670
+ bgtz t9, 5b
1671
+ addiu s0, 8
1672
+ 9:
1673
+ addu s3, s3, a1
1674
+ bne s3, s2, 0b
1675
+ addiu t6, 1
1676
+ 10:
1677
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1678
+
1679
+ j ra
1680
+ nop
1681
+ END(jsimd_int_upsample_dspr2)
1682
+
1683
+
1684
+ /*****************************************************************************/
1685
+ LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
1686
+ /*
1687
+ * a0 = cinfo->max_v_samp_factor
1688
+ * a1 = cinfo->output_width
1689
+ * a2 = input_data
1690
+ * a3 = output_data_ptr
1691
+ */
1692
+ /*
+ * Horizontal 2x upsampler (h2v1): each input byte is emitted twice.
+ * The main loop expands 8 input bytes to 16 output bytes per
+ * iteration using srl/ins byte shuffles; the residual
+ * (output_width & 0xF) is handled two output bytes at a time.
+ * One output row per input row; row-pointer arrays advance by 4
+ * bytes (one 32-bit pointer) per outer iteration.
+ */
+ lw t7, 0(a3) /* t7 = output_data */
1693
+ andi t8, a1, 0xf /* t8 = residual */
1694
+ sll t0, a0, 2
1695
+ blez a0, 4f
1696
+ addu t9, t7, t0 /* t9 = output_data end address */
1697
+ 0:
1698
+ lw t5, 0(t7) /* t5 = outptr */
1699
+ lw t6, 0(a2) /* t6 = inptr */
1700
+ addu t3, t5, a1 /* t3 = outptr + output_width (end address) */
1701
+ subu t3, t8 /* t3 = end address - residual */
1702
+ beq t5, t3, 2f
1703
+ move t4, t8
1704
+ 1:
1705
+ ulw t0, 0(t6) /* t0 = |P3|P2|P1|P0| */
1706
+ ulw t2, 4(t6) /* t2 = |P7|P6|P5|P4| */
1707
+ srl t1, t0, 16 /* t1 = |X|X|P3|P2| */
1708
+ ins t0, t0, 16, 16 /* t0 = |P1|P0|P1|P0| */
1709
+ ins t1, t1, 16, 16 /* t1 = |P3|P2|P3|P2| */
1710
+ ins t0, t0, 8, 16 /* t0 = |P1|P1|P0|P0| */
1711
+ ins t1, t1, 8, 16 /* t1 = |P3|P3|P2|P2| */
1712
+ usw t0, 0(t5)
1713
+ usw t1, 4(t5)
1714
+ srl t0, t2, 16 /* t0 = |X|X|P7|P6| */
1715
+ ins t2, t2, 16, 16 /* t2 = |P5|P4|P5|P4| */
1716
+ ins t0, t0, 16, 16 /* t0 = |P7|P6|P7|P6| */
1717
+ ins t2, t2, 8, 16 /* t2 = |P5|P5|P4|P4| */
1718
+ ins t0, t0, 8, 16 /* t0 = |P7|P7|P6|P6| */
1719
+ usw t2, 8(t5)
1720
+ usw t0, 12(t5)
1721
+ addiu t5, 16
1722
+ bne t5, t3, 1b
1723
+ addiu t6, 8
1724
+ beqz t8, 3f
1725
+ move t4, t8
1726
+ 2:
1727
+ /* residual: duplicate one input byte into two output bytes */
+ lbu t1, 0(t6)
1728
+ sb t1, 0(t5)
1729
+ sb t1, 1(t5)
1730
+ addiu t4, -2
1731
+ addiu t6, 1
1732
+ bgtz t4, 2b
1733
+ addiu t5, 2
1734
+ 3:
1735
+ addiu t7, 4
1736
+ bne t9, t7, 0b
1737
+ addiu a2, 4
1738
+ 4:
1739
+ j ra
1740
+ nop
1741
+ END(jsimd_h2v1_upsample_dspr2)
1742
+
1743
+
1744
+ /*****************************************************************************/
1745
+ LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
1746
+ /*
1747
+ * a0 = cinfo->max_v_samp_factor
1748
+ * a1 = cinfo->output_width
1749
+ * a2 = input_data
1750
+ * a3 = output_data_ptr
1751
+ */
1752
+ /*
+ * 2x2 upsampler (h2v2): one input row produces a pair of output
+ * rows.  First the input row is horizontally doubled into
+ * outptr[0] (same srl/ins shuffle as h2v1), then outptr[0] is
+ * copied verbatim into outptr[1] (16-byte unrolled copy plus a
+ * bytewise tail).  a0 is decremented by 2 per outer iteration
+ * because two output rows are produced per pass.
+ */
+ lw t7, 0(a3)
1753
+ blez a0, 7f
1754
+ andi t9, a1, 0xf /* t9 = residual */
1755
+ 0:
1756
+ lw t6, 0(a2) /* t6 = inptr */
1757
+ lw t5, 0(t7) /* t5 = outptr */
1758
+ addu t8, t5, a1 /* t8 = outptr end address */
1759
+ subu t8, t9 /* t8 = end address - residual */
1760
+ beq t5, t8, 2f
1761
+ move t4, t9
1762
+ 1:
1763
+ /* horizontally double 8 input bytes into 16 output bytes */
+ ulw t0, 0(t6)
1764
+ srl t1, t0, 16
1765
+ ins t0, t0, 16, 16
1766
+ ins t0, t0, 8, 16
1767
+ ins t1, t1, 16, 16
1768
+ ins t1, t1, 8, 16
1769
+ ulw t2, 4(t6)
1770
+ usw t0, 0(t5)
1771
+ usw t1, 4(t5)
1772
+ srl t3, t2, 16
1773
+ ins t2, t2, 16, 16
1774
+ ins t2, t2, 8, 16
1775
+ ins t3, t3, 16, 16
1776
+ ins t3, t3, 8, 16
1777
+ usw t2, 8(t5)
1778
+ usw t3, 12(t5)
1779
+ addiu t5, 16
1780
+ bne t5, t8, 1b
1781
+ addiu t6, 8
1782
+ beqz t9, 3f
1783
+ move t4, t9
1784
+ 2:
1785
+ /* residual: one input byte becomes two output bytes */
+ lbu t0, 0(t6)
1786
+ sb t0, 0(t5)
1787
+ sb t0, 1(t5)
1788
+ addiu t4, -2
1789
+ addiu t6, 1
1790
+ bgtz t4, 2b
1791
+ addiu t5, 2
1792
+ 3:
1793
+ /* copy the completed row outptr[0] into outptr[1] */
+ lw t6, 0(t7) /* t6 = outptr[0] */
1794
+ lw t5, 4(t7) /* t5 = outptr[1] */
1795
+ addu t4, t6, a1 /* t4 = new end address */
1796
+ beq a1, t9, 5f
1797
+ subu t8, t4, t9
1798
+ 4:
1799
+ ulw t0, 0(t6)
1800
+ ulw t1, 4(t6)
1801
+ ulw t2, 8(t6)
1802
+ usw t0, 0(t5)
1803
+ ulw t0, 12(t6)
1804
+ usw t1, 4(t5)
1805
+ usw t2, 8(t5)
1806
+ usw t0, 12(t5)
1807
+ addiu t6, 16
1808
+ bne t6, t8, 4b
1809
+ addiu t5, 16
1810
+ beqz t9, 6f
1811
+ nop
1812
+ 5:
1813
+ lbu t0, 0(t6)
1814
+ sb t0, 0(t5)
1815
+ addiu t6, 1
1816
+ bne t6, t4, 5b
1817
+ addiu t5, 1
1818
+ 6:
1819
+ addiu t7, 8
1820
+ addiu a0, -2
1821
+ bgtz a0, 0b
1822
+ addiu a2, 4
1823
+ 7:
1824
+ j ra
1825
+ nop
1826
+ END(jsimd_h2v2_upsample_dspr2)
1827
+
1828
+
1829
+ /*****************************************************************************/
1830
+ LEAF_DSPR2(jsimd_idct_islow_dspr2)
1831
+ /*
1832
+ * a0 = coef_block
1833
+ * a1 = compptr->dcttable
1834
+ * a2 = output
1835
+ * a3 = range_limit
1836
+ */
1837
+ /*
+ * Accurate ("islow") 8x8 inverse DCT, fixed-point with
+ * CONST_BITS = 13 (see the FIX_* constants below).
+ * Pass 1 (label 1:) processes one column per iteration,
+ * dequantizing as it goes, and stores 32-bit intermediates into a
+ * 256-byte stack workspace (v0 = wsptr), descaling with
+ * shra_r.w ..., 11 (rounded shift).  Columns whose seven AC
+ * coefficients are all zero take the shortcut that replicates the
+ * scaled DC term down the column.
+ * Pass 2 (label 4:) processes one row of the workspace per
+ * iteration, descales by 18, masks to 0x3ff, and maps each value
+ * through the range_limit table (lbux) before storing 8 output
+ * bytes.  All-zero-AC rows take a DC-replication shortcut via
+ * replv.qb.  Branch delay slots are used throughout.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1838
+
1839
+ addiu sp, sp, -256
1840
+ move v0, sp
1841
+ addiu v1, zero, 8 /* v1 = DCTSIZE = 8 */
1842
+ 1:
1843
+ /* pass 1: gather the 7 AC terms of this column and test for 0 */
+ lh s4, 32(a0) /* s4 = inptr[16] */
1844
+ lh s5, 64(a0) /* s5 = inptr[32] */
1845
+ lh s6, 96(a0) /* s6 = inptr[48] */
1846
+ lh t1, 112(a0) /* t1 = inptr[56] */
1847
+ lh t7, 16(a0) /* t7 = inptr[8] */
1848
+ lh t5, 80(a0) /* t5 = inptr[40] */
1849
+ lh t3, 48(a0) /* t3 = inptr[24] */
1850
+ or s4, s4, t1
1851
+ or s4, s4, t3
1852
+ or s4, s4, t5
1853
+ or s4, s4, t7
1854
+ or s4, s4, s5
1855
+ or s4, s4, s6
1856
+ bnez s4, 2f
1857
+ addiu v1, v1, -1
1858
+ /* all ACs zero: replicate scaled DC down the whole column */
+ lh s5, 0(a1) /* quantptr[DCTSIZE*0] */
1859
+ lh s6, 0(a0) /* inptr[DCTSIZE*0] */
1860
+ mul s5, s5, s6 /* DEQUANTIZE(inptr[0], quantptr[0]) */
1861
+ sll s5, s5, 2
1862
+ sw s5, 0(v0)
1863
+ sw s5, 32(v0)
1864
+ sw s5, 64(v0)
1865
+ sw s5, 96(v0)
1866
+ sw s5, 128(v0)
1867
+ sw s5, 160(v0)
1868
+ sw s5, 192(v0)
1869
+ b 3f
1870
+ sw s5, 224(v0)
1871
+ 2:
1872
+ /* general column: odd part first (rows 1,3,5,7) */
+ lh t0, 112(a1)
1873
+ lh t2, 48(a1)
1874
+ lh t4, 80(a1)
1875
+ lh t6, 16(a1)
1876
+ mul t0, t0, t1 /* DEQUANTIZE(inptr[DCTSIZE*7],
1877
+ quantptr[DCTSIZE*7]) */
1878
+ mul t1, t2, t3 /* DEQUANTIZE(inptr[DCTSIZE*3],
1879
+ quantptr[DCTSIZE*3]) */
1880
+ mul t2, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*5],
1881
+ quantptr[DCTSIZE*5]) */
1882
+ mul t3, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*1],
1883
+ quantptr[DCTSIZE*1]) */
1884
+ lh t4, 32(a1)
1885
+ lh t5, 32(a0)
1886
+ lh t6, 96(a1)
1887
+ lh t7, 96(a0)
1888
+ addu s0, t0, t1 /* z3 = tmp0 + tmp2 */
1889
+ addu s1, t1, t2 /* z2 = tmp1 + tmp2 */
1890
+ addu s2, t2, t3 /* z4 = tmp1 + tmp3 */
1891
+ addu s3, s0, s2 /* z3 + z4 */
1892
+ addiu t9, zero, 9633 /* FIX_1_175875602 */
1893
+ mul s3, s3, t9 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
1894
+ addu t8, t0, t3 /* z1 = tmp0 + tmp3 */
1895
+ addiu t9, zero, 2446 /* FIX_0_298631336 */
1896
+ mul t0, t0, t9 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
1897
+ addiu t9, zero, 16819 /* FIX_2_053119869 */
1898
+ mul t2, t2, t9 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
1899
+ addiu t9, zero, 25172 /* FIX_3_072711026 */
1900
+ mul t1, t1, t9 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
1901
+ addiu t9, zero, 12299 /* FIX_1_501321110 */
1902
+ mul t3, t3, t9 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
1903
+ addiu t9, zero, 16069 /* FIX_1_961570560 */
1904
+ mul s0, s0, t9 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
1905
+ addiu t9, zero, 3196 /* FIX_0_390180644 */
1906
+ mul s2, s2, t9 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
1907
+ addiu t9, zero, 7373 /* FIX_0_899976223 */
1908
+ mul t8, t8, t9 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
1909
+ addiu t9, zero, 20995 /* FIX_2_562915447 */
1910
+ mul s1, s1, t9 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
1911
+ subu s0, s3, s0 /* z3 += z5 */
1912
+ addu t0, t0, s0 /* tmp0 += z3 */
1913
+ addu t1, t1, s0 /* tmp2 += z3 */
1914
+ subu s2, s3, s2 /* z4 += z5 */
1915
+ addu t2, t2, s2 /* tmp1 += z4 */
1916
+ addu t3, t3, s2 /* tmp3 += z4 */
1917
+ subu t0, t0, t8 /* tmp0 += z1 */
1918
+ subu t1, t1, s1 /* tmp2 += z2 */
1919
+ subu t2, t2, s1 /* tmp1 += z2 */
1920
+ subu t3, t3, t8 /* tmp3 += z1 */
1921
+ /* even part (rows 0,2,4,6) */
+ mul s0, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*2],
1922
+ quantptr[DCTSIZE*2]) */
1923
+ addiu t9, zero, 6270 /* FIX_0_765366865 */
1924
+ mul s1, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*6],
1925
+ quantptr[DCTSIZE*6]) */
1926
+ lh t4, 0(a1)
1927
+ lh t5, 0(a0)
1928
+ lh t6, 64(a1)
1929
+ lh t7, 64(a0)
1930
+ mul s2, t9, s0 /* MULTIPLY(z2, FIX_0_765366865) */
1931
+ mul t5, t4, t5 /* DEQUANTIZE(inptr[DCTSIZE*0],
1932
+ quantptr[DCTSIZE*0]) */
1933
+ mul t6, t6, t7 /* DEQUANTIZE(inptr[DCTSIZE*4],
1934
+ quantptr[DCTSIZE*4]) */
1935
+ addiu t9, zero, 4433 /* FIX_0_541196100 */
1936
+ addu s3, s0, s1 /* z2 + z3 */
1937
+ mul s3, s3, t9 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
1938
+ addiu t9, zero, 15137 /* FIX_1_847759065 */
1939
+ mul t8, s1, t9 /* MULTIPLY(z3, FIX_1_847759065) */
1940
+ addu t4, t5, t6
1941
+ subu t5, t5, t6
1942
+ sll t4, t4, 13 /* tmp0 = (z2 + z3) << CONST_BITS */
1943
+ sll t5, t5, 13 /* tmp1 = (z2 - z3) << CONST_BITS */
1944
+ addu t7, s3, s2 /* tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865) */
1945
+ subu t6, s3, t8 /* tmp2 =
1946
+ z1 + MULTIPLY(z3, -FIX_1_847759065) */
1947
+ /* butterfly: combine even and odd parts, descale by 11 */
+ addu s0, t4, t7
1948
+ subu s1, t4, t7
1949
+ addu s2, t5, t6
1950
+ subu s3, t5, t6
1951
+ addu t4, s0, t3
1952
+ subu s0, s0, t3
1953
+ addu t3, s2, t1
1954
+ subu s2, s2, t1
1955
+ addu t1, s3, t2
1956
+ subu s3, s3, t2
1957
+ addu t2, s1, t0
1958
+ subu s1, s1, t0
1959
+ shra_r.w t4, t4, 11
1960
+ shra_r.w t3, t3, 11
1961
+ shra_r.w t1, t1, 11
1962
+ shra_r.w t2, t2, 11
1963
+ shra_r.w s1, s1, 11
1964
+ shra_r.w s3, s3, 11
1965
+ shra_r.w s2, s2, 11
1966
+ shra_r.w s0, s0, 11
1967
+ sw t4, 0(v0)
1968
+ sw t3, 32(v0)
1969
+ sw t1, 64(v0)
1970
+ sw t2, 96(v0)
1971
+ sw s1, 128(v0)
1972
+ sw s3, 160(v0)
1973
+ sw s2, 192(v0)
1974
+ sw s0, 224(v0)
1975
+ 3:
1976
+ addiu a1, a1, 2
1977
+ addiu a0, a0, 2
1978
+ bgtz v1, 1b
1979
+ addiu v0, v0, 4
1980
+ /* pass 2: rows of the workspace -> range-limited output bytes */
+ move v0, sp
1981
+ addiu v1, zero, 8
1982
+ 4:
1983
+ lw t0, 8(v0) /* z2 = (JLONG)wsptr[2] */
1984
+ lw t1, 24(v0) /* z3 = (JLONG)wsptr[6] */
1985
+ lw t2, 0(v0) /* (JLONG)wsptr[0] */
1986
+ lw t3, 16(v0) /* (JLONG)wsptr[4] */
1987
+ lw s4, 4(v0) /* (JLONG)wsptr[1] */
1988
+ lw s5, 12(v0) /* (JLONG)wsptr[3] */
1989
+ lw s6, 20(v0) /* (JLONG)wsptr[5] */
1990
+ lw s7, 28(v0) /* (JLONG)wsptr[7] */
1991
+ or s4, s4, t0
1992
+ or s4, s4, t1
1993
+ or s4, s4, t3
1994
+ or s4, s4, s7
1995
+ or s4, s4, s5
1996
+ or s4, s4, s6
1997
+ bnez s4, 5f
1998
+ addiu v1, v1, -1
1999
+ /* all ACs zero: one range-limited DC byte replicated 8 times */
+ shra_r.w s5, t2, 5
2000
+ andi s5, s5, 0x3ff
2001
+ lbux s5, s5(a3)
2002
+ lw s1, 0(a2)
2003
+ replv.qb s5, s5
2004
+ usw s5, 0(s1)
2005
+ usw s5, 4(s1)
2006
+ b 6f
2007
+ nop
2008
+ 5:
2009
+ addu t4, t0, t1 /* z2 + z3 */
2010
+ addiu t8, zero, 4433 /* FIX_0_541196100 */
2011
+ mul t5, t4, t8 /* z1 = MULTIPLY(z2 + z3, FIX_0_541196100) */
2012
+ addiu t8, zero, 15137 /* FIX_1_847759065 */
2013
+ mul t1, t1, t8 /* MULTIPLY(z3, FIX_1_847759065) */
2014
+ addiu t8, zero, 6270 /* FIX_0_765366865 */
2015
+ mul t0, t0, t8 /* MULTIPLY(z2, FIX_0_765366865) */
2016
+ addu t4, t2, t3 /* (JLONG)wsptr[0] + (JLONG)wsptr[4] */
2017
+ subu t2, t2, t3 /* (JLONG)wsptr[0] - (JLONG)wsptr[4] */
2018
+ sll t4, t4, 13 /* tmp0 =
2019
+ (wsptr[0] + wsptr[4]) << CONST_BITS */
2020
+ sll t2, t2, 13 /* tmp1 =
2021
+ (wsptr[0] - wsptr[4]) << CONST_BITS */
2022
+ subu t1, t5, t1 /* tmp2 =
2023
+ z1 + MULTIPLY(z3, -FIX_1_847759065) */
2024
+ subu t3, t2, t1 /* tmp12 = tmp1 - tmp2 */
2025
+ addu t2, t2, t1 /* tmp11 = tmp1 + tmp2 */
2026
+ addu t5, t5, t0 /* tmp3 =
2027
+ z1 + MULTIPLY(z2, FIX_0_765366865) */
2028
+ subu t1, t4, t5 /* tmp13 = tmp0 - tmp3 */
2029
+ addu t0, t4, t5 /* tmp10 = tmp0 + tmp3 */
2030
+ lw t4, 28(v0) /* tmp0 = (JLONG)wsptr[7] */
2031
+ lw t6, 12(v0) /* tmp2 = (JLONG)wsptr[3] */
2032
+ lw t5, 20(v0) /* tmp1 = (JLONG)wsptr[5] */
2033
+ lw t7, 4(v0) /* tmp3 = (JLONG)wsptr[1] */
2034
+ addu s0, t4, t6 /* z3 = tmp0 + tmp2 */
2035
+ addiu t8, zero, 9633 /* FIX_1_175875602 */
2036
+ addu s1, t5, t7 /* z4 = tmp1 + tmp3 */
2037
+ addu s2, s0, s1 /* z3 + z4 */
2038
+ mul s2, s2, t8 /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
2039
+ addu s3, t4, t7 /* z1 = tmp0 + tmp3 */
2040
+ addu t9, t5, t6 /* z2 = tmp1 + tmp2 */
2041
+ addiu t8, zero, 16069 /* FIX_1_961570560 */
2042
+ mul s0, s0, t8 /* -z3 = MULTIPLY(z3, FIX_1_961570560) */
2043
+ addiu t8, zero, 3196 /* FIX_0_390180644 */
2044
+ mul s1, s1, t8 /* -z4 = MULTIPLY(z4, FIX_0_390180644) */
2045
+ addiu t8, zero, 2446 /* FIX_0_298631336 */
2046
+ mul t4, t4, t8 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
2047
+ addiu t8, zero, 7373 /* FIX_0_899976223 */
2048
+ mul s3, s3, t8 /* -z1 = MULTIPLY(z1, FIX_0_899976223) */
2049
+ addiu t8, zero, 16819 /* FIX_2_053119869 */
2050
+ mul t5, t5, t8 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
2051
+ addiu t8, zero, 20995 /* FIX_2_562915447 */
2052
+ mul t9, t9, t8 /* -z2 = MULTIPLY(z2, FIX_2_562915447) */
2053
+ addiu t8, zero, 25172 /* FIX_3_072711026 */
2054
+ mul t6, t6, t8 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
2055
+ addiu t8, zero, 12299 /* FIX_1_501321110 */
2056
+ mul t7, t7, t8 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
2057
+ subu s0, s2, s0 /* z3 += z5 */
2058
+ subu s1, s2, s1 /* z4 += z5 */
2059
+ addu t4, t4, s0
2060
+ subu t4, t4, s3 /* tmp0 */
2061
+ addu t5, t5, s1
2062
+ subu t5, t5, t9 /* tmp1 */
2063
+ addu t6, t6, s0
2064
+ subu t6, t6, t9 /* tmp2 */
2065
+ addu t7, t7, s1
2066
+ subu t7, t7, s3 /* tmp3 */
2067
+ /* final butterfly, descale by 18, range-limit, store 8 bytes */
+ addu s0, t0, t7
2068
+ subu t0, t0, t7
2069
+ addu t7, t2, t6
2070
+ subu t2, t2, t6
2071
+ addu t6, t3, t5
2072
+ subu t3, t3, t5
2073
+ addu t5, t1, t4
2074
+ subu t1, t1, t4
2075
+ shra_r.w s0, s0, 18
2076
+ shra_r.w t7, t7, 18
2077
+ shra_r.w t6, t6, 18
2078
+ shra_r.w t5, t5, 18
2079
+ shra_r.w t1, t1, 18
2080
+ shra_r.w t3, t3, 18
2081
+ shra_r.w t2, t2, 18
2082
+ shra_r.w t0, t0, 18
2083
+ andi s0, s0, 0x3ff
2084
+ andi t7, t7, 0x3ff
2085
+ andi t6, t6, 0x3ff
2086
+ andi t5, t5, 0x3ff
2087
+ andi t1, t1, 0x3ff
2088
+ andi t3, t3, 0x3ff
2089
+ andi t2, t2, 0x3ff
2090
+ andi t0, t0, 0x3ff
2091
+ lw s1, 0(a2)
2092
+ lbux s0, s0(a3)
2093
+ lbux t7, t7(a3)
2094
+ lbux t6, t6(a3)
2095
+ lbux t5, t5(a3)
2096
+ lbux t1, t1(a3)
2097
+ lbux t3, t3(a3)
2098
+ lbux t2, t2(a3)
2099
+ lbux t0, t0(a3)
2100
+ sb s0, 0(s1)
2101
+ sb t7, 1(s1)
2102
+ sb t6, 2(s1)
2103
+ sb t5, 3(s1)
2104
+ sb t1, 4(s1)
2105
+ sb t3, 5(s1)
2106
+ sb t2, 6(s1)
2107
+ sb t0, 7(s1)
2108
+ 6:
2109
+ addiu v0, v0, 32
2110
+ bgtz v1, 4b
2111
+ addiu a2, a2, 4
2112
+ addiu sp, sp, 256
2113
+
2114
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2115
+
2116
+ j ra
2117
+ nop
2118
+
2119
+ END(jsimd_idct_islow_dspr2)
2120
+
2121
+
2122
+ /*****************************************************************************/
2123
+ LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
2124
+ /*
2125
+ * a0 = inptr
2126
+ * a1 = quantptr
2127
+ * a2 = wsptr
2128
+ * a3 = mips_idct_ifast_coefs
2129
+ */
2130
+ /*
+ * Fast ("ifast") IDCT, column pass.  Processes two columns per
+ * iteration by packing pairs of 16-bit values into one register
+ * and using DSPr2 paired-halfword ops (addq.ph/subq.ph/mulq_s.ph).
+ * Dequantization uses muleq_s.w.phl/.phr (per-half Q15 multiply).
+ * Fixed-point multipliers are fetched from the
+ * mips_idct_ifast_coefs table at offsets 0/4/8/12 (AT = table
+ * base).  Columns whose ACs are all zero take the DC-replication
+ * shortcut.  Results go to the 16-bit workspace at a2.
+ */
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2131
+
2132
+ addiu t9, a0, 16 /* end address */
2133
+ or AT, a3, zero
2134
+
2135
+ 0:
2136
+ lw s0, 0(a1) /* quantptr[DCTSIZE*0] */
2137
+ lw t0, 0(a0) /* inptr[DCTSIZE*0] */
2138
+ lw t1, 16(a0) /* inptr[DCTSIZE*1] */
2139
+ muleq_s.w.phl v0, t0, s0 /* tmp0 ... */
2140
+ lw t2, 32(a0) /* inptr[DCTSIZE*2] */
2141
+ lw t3, 48(a0) /* inptr[DCTSIZE*3] */
2142
+ lw t4, 64(a0) /* inptr[DCTSIZE*4] */
2143
+ lw t5, 80(a0) /* inptr[DCTSIZE*5] */
2144
+ muleq_s.w.phr t0, t0, s0 /* ... tmp0 ... */
2145
+ lw t6, 96(a0) /* inptr[DCTSIZE*6] */
2146
+ lw t7, 112(a0) /* inptr[DCTSIZE*7] */
2147
+ or s4, t1, t2
2148
+ or s5, t3, t4
2149
+ bnez s4, 1f
2150
+ ins t0, v0, 16, 16 /* ... tmp0 */
2151
+ bnez s5, 1f
2152
+ or s6, t5, t6
2153
+ or s6, s6, t7
2154
+ bnez s6, 1f
2155
+ /* all ACs zero: replicate the dequantized DC pair */
+ sw t0, 0(a2) /* wsptr[DCTSIZE*0] */
2156
+ sw t0, 16(a2) /* wsptr[DCTSIZE*1] */
2157
+ sw t0, 32(a2) /* wsptr[DCTSIZE*2] */
2158
+ sw t0, 48(a2) /* wsptr[DCTSIZE*3] */
2159
+ sw t0, 64(a2) /* wsptr[DCTSIZE*4] */
2160
+ sw t0, 80(a2) /* wsptr[DCTSIZE*5] */
2161
+ sw t0, 96(a2) /* wsptr[DCTSIZE*6] */
2162
+ sw t0, 112(a2) /* wsptr[DCTSIZE*7] */
2163
+ addiu a0, a0, 4
2164
+ b 2f
2165
+ addiu a1, a1, 4
2166
+
2167
+ 1:
2168
+ /* even part */
+ lw s1, 32(a1) /* quantptr[DCTSIZE*2] */
2169
+ lw s2, 64(a1) /* quantptr[DCTSIZE*4] */
2170
+ muleq_s.w.phl v0, t2, s1 /* tmp1 ... */
2171
+ muleq_s.w.phr t2, t2, s1 /* ... tmp1 ... */
2172
+ lw s0, 16(a1) /* quantptr[DCTSIZE*1] */
2173
+ lw s1, 48(a1) /* quantptr[DCTSIZE*3] */
2174
+ lw s3, 96(a1) /* quantptr[DCTSIZE*6] */
2175
+ muleq_s.w.phl v1, t4, s2 /* tmp2 ... */
2176
+ muleq_s.w.phr t4, t4, s2 /* ... tmp2 ... */
2177
+ lw s2, 80(a1) /* quantptr[DCTSIZE*5] */
2178
+ lw t8, 4(AT) /* FIX(1.414213562) */
2179
+ ins t2, v0, 16, 16 /* ... tmp1 */
2180
+ muleq_s.w.phl v0, t6, s3 /* tmp3 ... */
2181
+ muleq_s.w.phr t6, t6, s3 /* ... tmp3 ... */
2182
+ ins t4, v1, 16, 16 /* ... tmp2 */
2183
+ addq.ph s4, t0, t4 /* tmp10 */
2184
+ subq.ph s5, t0, t4 /* tmp11 */
2185
+ ins t6, v0, 16, 16 /* ... tmp3 */
2186
+ subq.ph s6, t2, t6 /* tmp12 ... */
2187
+ addq.ph s7, t2, t6 /* tmp13 */
2188
+ mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
2189
+ addq.ph t0, s4, s7 /* tmp0 */
2190
+ subq.ph t6, s4, s7 /* tmp3 */
2191
+ /* odd part */
+ muleq_s.w.phl v0, t1, s0 /* tmp4 ... */
2192
+ muleq_s.w.phr t1, t1, s0 /* ... tmp4 ... */
2193
+ shll_s.ph s6, s6, 1 /* x2 */
2194
+ lw s3, 112(a1) /* quantptr[DCTSIZE*7] */
2195
+ subq.ph s6, s6, s7 /* ... tmp12 */
2196
+ muleq_s.w.phl v1, t7, s3 /* tmp7 ... */
2197
+ muleq_s.w.phr t7, t7, s3 /* ... tmp7 ... */
2198
+ ins t1, v0, 16, 16 /* ... tmp4 */
2199
+ addq.ph t2, s5, s6 /* tmp1 */
2200
+ subq.ph t4, s5, s6 /* tmp2 */
2201
+ muleq_s.w.phl v0, t5, s2 /* tmp6 ... */
2202
+ muleq_s.w.phr t5, t5, s2 /* ... tmp6 ... */
2203
+ ins t7, v1, 16, 16 /* ... tmp7 */
2204
+ addq.ph s5, t1, t7 /* z11 */
2205
+ subq.ph s6, t1, t7 /* z12 */
2206
+ muleq_s.w.phl v1, t3, s1 /* tmp5 ... */
2207
+ muleq_s.w.phr t3, t3, s1 /* ... tmp5 ... */
2208
+ ins t5, v0, 16, 16 /* ... tmp6 */
2209
+ ins t3, v1, 16, 16 /* ... tmp5 */
2210
+ addq.ph s7, t5, t3 /* z13 */
2211
+ subq.ph v0, t5, t3 /* z10 */
2212
+ addq.ph t7, s5, s7 /* tmp7 */
2213
+ subq.ph s5, s5, s7 /* tmp11 ... */
2214
+ addq.ph v1, v0, s6 /* z5 ... */
2215
+ mulq_s.ph s5, s5, t8 /* ... tmp11 */
2216
+ lw t8, 8(AT) /* FIX(1.847759065) */
2217
+ lw s4, 0(AT) /* FIX(1.082392200) */
2218
+ addq.ph s0, t0, t7
2219
+ subq.ph s1, t0, t7
2220
+ mulq_s.ph v1, v1, t8 /* ... z5 */
2221
+ shll_s.ph s5, s5, 1 /* x2 */
2222
+ lw t8, 12(AT) /* FIX(-2.613125930) */
2223
+ sw s0, 0(a2) /* wsptr[DCTSIZE*0] */
2224
+ shll_s.ph v0, v0, 1 /* x4 */
2225
+ mulq_s.ph v0, v0, t8 /* tmp12 ... */
2226
+ mulq_s.ph s4, s6, s4 /* tmp10 ... */
2227
+ shll_s.ph v1, v1, 1 /* x2 */
2228
+ addiu a0, a0, 4
2229
+ addiu a1, a1, 4
2230
+ sw s1, 112(a2) /* wsptr[DCTSIZE*7] */
2231
+ shll_s.ph s6, v0, 1 /* x4 */
2232
+ shll_s.ph s4, s4, 1 /* x2 */
2233
+ addq.ph s6, s6, v1 /* ... tmp12 */
2234
+ subq.ph t5, s6, t7 /* tmp6 */
2235
+ subq.ph s4, s4, v1 /* ... tmp10 */
2236
+ subq.ph t3, s5, t5 /* tmp5 */
2237
+ addq.ph s2, t2, t5
2238
+ addq.ph t1, s4, t3 /* tmp4 */
2239
+ subq.ph s3, t2, t5
2240
+ sw s2, 16(a2) /* wsptr[DCTSIZE*1] */
2241
+ sw s3, 96(a2) /* wsptr[DCTSIZE*6] */
2242
+ addq.ph v0, t4, t3
2243
+ subq.ph v1, t4, t3
2244
+ sw v0, 32(a2) /* wsptr[DCTSIZE*2] */
2245
+ sw v1, 80(a2) /* wsptr[DCTSIZE*5] */
2246
+ addq.ph v0, t6, t1
2247
+ subq.ph v1, t6, t1
2248
+ sw v0, 64(a2) /* wsptr[DCTSIZE*4] */
2249
+ sw v1, 48(a2) /* wsptr[DCTSIZE*3] */
2250
+
2251
+ 2:
2252
+ bne a0, t9, 0b
2253
+ addiu a2, a2, 4
2254
+
2255
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2256
+
2257
+ j ra
2258
+ nop
2259
+
2260
+ END(jsimd_idct_ifast_cols_dspr2)
2261
+
2262
+
2263
+ /*****************************************************************************/
2264
+ LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
2265
+ /*
2266
+ * a0 = wsptr
2267
+ * a1 = output_buf
2268
+ * a2 = output_col
2269
+ * a3 = mips_idct_ifast_coefs
2270
+ */
2271
+ /*
+ * Fast IDCT row pass: consumes the 16-bit workspace produced by
+ * the column pass, two rows per iteration (interleaved in packed
+ * halfword registers via precrq.ph.w/ins).  Results are packed to
+ * bytes with precrq.qb.ph and biased by s8 = 0x80808080 to map
+ * signed samples to unsigned, then stored to
+ * output_buf[row] + output_col.  a3 is clobbered inside the loop,
+ * so the coefs pointer is reloaded from its stack save slot
+ * (36(sp)) each iteration.  Zero-AC row pairs take a shortcut.
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2272
+
2273
+ addiu t9, a0, 128 /* end address */
2274
+ lui s8, 0x8080
2275
+ ori s8, s8, 0x8080
2276
+
2277
+ 0:
2278
+ lw AT, 36(sp) /* restore $a3 (mips_idct_ifast_coefs) */
2279
+ lw t0, 0(a0) /* wsptr[DCTSIZE*0+0/1] b a */
2280
+ lw s0, 16(a0) /* wsptr[DCTSIZE*1+0/1] B A */
2281
+ lw t2, 4(a0) /* wsptr[DCTSIZE*0+2/3] d c */
2282
+ lw s2, 20(a0) /* wsptr[DCTSIZE*1+2/3] D C */
2283
+ lw t4, 8(a0) /* wsptr[DCTSIZE*0+4/5] f e */
2284
+ lw s4, 24(a0) /* wsptr[DCTSIZE*1+4/5] F E */
2285
+ lw t6, 12(a0) /* wsptr[DCTSIZE*0+6/7] h g */
2286
+ lw s6, 28(a0) /* wsptr[DCTSIZE*1+6/7] H G */
2287
+ precrq.ph.w t1, s0, t0 /* B b */
2288
+ ins t0, s0, 16, 16 /* A a */
2289
+ bnez t1, 1f
2290
+ or s0, t2, s2
2291
+ bnez s0, 1f
2292
+ or s0, t4, s4
2293
+ bnez s0, 1f
2294
+ or s0, t6, s6
2295
+ bnez s0, 1f
2296
+ /* all ACs zero: replicate the two DC samples across both rows */
+ shll_s.ph s0, t0, 2 /* A a */
2297
+ lw a3, 0(a1)
2298
+ lw AT, 4(a1)
2299
+ precrq.ph.w t0, s0, s0 /* A A */
2300
+ ins s0, s0, 16, 16 /* a a */
2301
+ addu a3, a3, a2
2302
+ addu AT, AT, a2
2303
+ precrq.qb.ph t0, t0, t0 /* A A A A */
2304
+ precrq.qb.ph s0, s0, s0 /* a a a a */
2305
+ addu.qb s0, s0, s8
2306
+ addu.qb t0, t0, s8
2307
+ sw s0, 0(a3)
2308
+ sw s0, 4(a3)
2309
+ sw t0, 0(AT)
2310
+ sw t0, 4(AT)
2311
+ addiu a0, a0, 32
2312
+ bne a0, t9, 0b
2313
+ addiu a1, a1, 8
2314
+ b 2f
2315
+ nop
2316
+
2317
+ 1:
2318
+ /* interleave the two rows into packed halfword pairs */
+ precrq.ph.w t3, s2, t2
2319
+ ins t2, s2, 16, 16
2320
+ precrq.ph.w t5, s4, t4
2321
+ ins t4, s4, 16, 16
2322
+ precrq.ph.w t7, s6, t6
2323
+ ins t6, s6, 16, 16
2324
+ lw t8, 4(AT) /* FIX(1.414213562) */
2325
+ addq.ph s4, t0, t4 /* tmp10 */
2326
+ subq.ph s5, t0, t4 /* tmp11 */
2327
+ subq.ph s6, t2, t6 /* tmp12 ... */
2328
+ addq.ph s7, t2, t6 /* tmp13 */
2329
+ mulq_s.ph s6, s6, t8 /* ... tmp12 ... */
2330
+ addq.ph t0, s4, s7 /* tmp0 */
2331
+ subq.ph t6, s4, s7 /* tmp3 */
2332
+ shll_s.ph s6, s6, 1 /* x2 */
2333
+ subq.ph s6, s6, s7 /* ... tmp12 */
2334
+ addq.ph t2, s5, s6 /* tmp1 */
2335
+ subq.ph t4, s5, s6 /* tmp2 */
2336
+ addq.ph s5, t1, t7 /* z11 */
2337
+ subq.ph s6, t1, t7 /* z12 */
2338
+ addq.ph s7, t5, t3 /* z13 */
2339
+ subq.ph v0, t5, t3 /* z10 */
2340
+ addq.ph t7, s5, s7 /* tmp7 */
2341
+ subq.ph s5, s5, s7 /* tmp11 ... */
2342
+ addq.ph v1, v0, s6 /* z5 ... */
2343
+ mulq_s.ph s5, s5, t8 /* ... tmp11 */
2344
+ lw t8, 8(AT) /* FIX(1.847759065) */
2345
+ lw s4, 0(AT) /* FIX(1.082392200) */
2346
+ addq.ph s0, t0, t7 /* tmp0 + tmp7 */
2347
+ subq.ph s7, t0, t7 /* tmp0 - tmp7 */
2348
+ mulq_s.ph v1, v1, t8 /* ... z5 */
2349
+ lw a3, 0(a1)
2350
+ lw t8, 12(AT) /* FIX(-2.613125930) */
2351
+ shll_s.ph s5, s5, 1 /* x2 */
2352
+ addu a3, a3, a2
2353
+ shll_s.ph v0, v0, 1 /* x4 */
2354
+ mulq_s.ph v0, v0, t8 /* tmp12 ... */
2355
+ mulq_s.ph s4, s6, s4 /* tmp10 ... */
2356
+ shll_s.ph v1, v1, 1 /* x2 */
2357
+ addiu a0, a0, 32
2358
+ addiu a1, a1, 8
2359
+ shll_s.ph s6, v0, 1 /* x4 */
2360
+ shll_s.ph s4, s4, 1 /* x2 */
2361
+ addq.ph s6, s6, v1 /* ... tmp12 */
2362
+ shll_s.ph s0, s0, 2
2363
+ subq.ph t5, s6, t7 /* tmp6 */
2364
+ subq.ph s4, s4, v1 /* ... tmp10 */
2365
+ subq.ph t3, s5, t5 /* tmp5 */
2366
+ shll_s.ph s7, s7, 2
2367
+ addq.ph t1, s4, t3 /* tmp4 */
2368
+ addq.ph s1, t2, t5 /* tmp1 + tmp6 */
2369
+ subq.ph s6, t2, t5 /* tmp1 - tmp6 */
2370
+ addq.ph s2, t4, t3 /* tmp2 + tmp5 */
2371
+ subq.ph s5, t4, t3 /* tmp2 - tmp5 */
2372
+ addq.ph s4, t6, t1 /* tmp3 + tmp4 */
2373
+ subq.ph s3, t6, t1 /* tmp3 - tmp4 */
2374
+ shll_s.ph s1, s1, 2
2375
+ shll_s.ph s2, s2, 2
2376
+ shll_s.ph s3, s3, 2
2377
+ shll_s.ph s4, s4, 2
2378
+ shll_s.ph s5, s5, 2
2379
+ shll_s.ph s6, s6, 2
2380
+ /* de-interleave rows, pack to bytes, bias to unsigned, store */
+ precrq.ph.w t0, s1, s0 /* B A */
2381
+ ins s0, s1, 16, 16 /* b a */
2382
+ precrq.ph.w t2, s3, s2 /* D C */
2383
+ ins s2, s3, 16, 16 /* d c */
2384
+ precrq.ph.w t4, s5, s4 /* F E */
2385
+ ins s4, s5, 16, 16 /* f e */
2386
+ precrq.ph.w t6, s7, s6 /* H G */
2387
+ ins s6, s7, 16, 16 /* h g */
2388
+ precrq.qb.ph t0, t2, t0 /* D C B A */
2389
+ precrq.qb.ph s0, s2, s0 /* d c b a */
2390
+ precrq.qb.ph t4, t6, t4 /* H G F E */
2391
+ precrq.qb.ph s4, s6, s4 /* h g f e */
2392
+ addu.qb s0, s0, s8
2393
+ addu.qb s4, s4, s8
2394
+ sw s0, 0(a3) /* outptr[0/1/2/3] d c b a */
2395
+ sw s4, 4(a3) /* outptr[4/5/6/7] h g f e */
2396
+ lw a3, -4(a1)
2397
+ addu.qb t0, t0, s8
2398
+ addu a3, a3, a2
2399
+ addu.qb t4, t4, s8
2400
+ sw t0, 0(a3) /* outptr[0/1/2/3] D C B A */
2401
+ bne a0, t9, 0b
2402
+ sw t4, 4(a3) /* outptr[4/5/6/7] H G F E */
2403
+
2404
+ 2:
2405
+
2406
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2407
+
2408
+ j ra
2409
+ nop
2410
+
2411
+ END(jsimd_idct_ifast_rows_dspr2)
2412
+
2413
+
2414
+ /*****************************************************************************/
2415
+ LEAF_DSPR2(jsimd_fdct_islow_dspr2)
2416
+ /*
2417
+ * a0 = data
2418
+ */
2419
+ /*
+ * Accurate ("islow") forward DCT, in place on the 8x8 block at a0.
+ * Pass 1 (label 1:) transforms the rows: t0-t9 hold packed pairs
+ * of 16-bit constants so dpa.w.ph can do two multiply-accumulates
+ * per instruction into the DSP accumulators; results are descaled
+ * with extr_r.w ..., 11 (rounded).  Pass 2 (label 2:) transforms
+ * the columns with mult/madd/msub scalar accumulators, descaling
+ * by 15; the DC/Nyquist terms are descaled by 2.
+ * NOTE(review): a1 is reused as a scalar constant (10704) in
+ * pass 2 after serving as the row pointer in pass 1.
+ */
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2420
+
2421
+ /* packed constant pairs (hi|lo halfwords) for dpa.w.ph */
+ lui t0, 6437
2422
+ ori t0, 2260
2423
+ lui t1, 9633
2424
+ ori t1, 11363
2425
+ lui t2, 0xd39e
2426
+ ori t2, 0xe6dc
2427
+ lui t3, 0xf72d
2428
+ ori t3, 9633
2429
+ lui t4, 2261
2430
+ ori t4, 9633
2431
+ lui t5, 0xd39e
2432
+ ori t5, 6437
2433
+ lui t6, 9633
2434
+ ori t6, 0xd39d
2435
+ lui t7, 0xe6dc
2436
+ ori t7, 2260
2437
+ lui t8, 4433
2438
+ ori t8, 10703
2439
+ lui t9, 0xd630
2440
+ ori t9, 4433
2441
+ li s8, 8
2442
+ move a1, a0
2443
+ 1:
2444
+ /* pass 1: one row per iteration (s8 counts 8 rows) */
+ lw s0, 0(a1) /* tmp0 = 1|0 */
2445
+ lw s1, 4(a1) /* tmp1 = 3|2 */
2446
+ lw s2, 8(a1) /* tmp2 = 5|4 */
2447
+ lw s3, 12(a1) /* tmp3 = 7|6 */
2448
+ packrl.ph s1, s1, s1 /* tmp1 = 2|3 */
2449
+ packrl.ph s3, s3, s3 /* tmp3 = 6|7 */
2450
+ subq.ph s7, s1, s2 /* tmp7 = 2-5|3-4 = t5|t4 */
2451
+ subq.ph s5, s0, s3 /* tmp5 = 1-6|0-7 = t6|t7 */
2452
+ mult $0, $0 /* ac0 = 0 */
2453
+ dpa.w.ph $ac0, s7, t0 /* ac0 += t5* 6437 + t4* 2260 */
2454
+ dpa.w.ph $ac0, s5, t1 /* ac0 += t6* 9633 + t7* 11363 */
2455
+ mult $ac1, $0, $0 /* ac1 = 0 */
2456
+ dpa.w.ph $ac1, s7, t2 /* ac1 += t5*-11362 + t4* -6436 */
2457
+ dpa.w.ph $ac1, s5, t3 /* ac1 += t6* -2259 + t7* 9633 */
2458
+ mult $ac2, $0, $0 /* ac2 = 0 */
2459
+ dpa.w.ph $ac2, s7, t4 /* ac2 += t5* 2261 + t4* 9633 */
2460
+ dpa.w.ph $ac2, s5, t5 /* ac2 += t6*-11362 + t7* 6437 */
2461
+ mult $ac3, $0, $0 /* ac3 = 0 */
2462
+ dpa.w.ph $ac3, s7, t6 /* ac3 += t5* 9633 + t4*-11363 */
2463
+ dpa.w.ph $ac3, s5, t7 /* ac3 += t6* -6436 + t7* 2260 */
2464
+ addq.ph s6, s1, s2 /* tmp6 = 2+5|3+4 = t2|t3 */
2465
+ addq.ph s4, s0, s3 /* tmp4 = 1+6|0+7 = t1|t0 */
2466
+ extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
2467
+ extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
2468
+ extr_r.w s2, $ac2, 11 /* tmp2 = (ac2 + 1024) >> 11 */
2469
+ extr_r.w s3, $ac3, 11 /* tmp3 = (ac3 + 1024) >> 11 */
2470
+ addq.ph s5, s4, s6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
2471
+ subq.ph s7, s4, s6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
2472
+ sh s0, 2(a1)
2473
+ sh s1, 6(a1)
2474
+ sh s2, 10(a1)
2475
+ sh s3, 14(a1)
2476
+ mult $0, $0 /* ac0 = 0 */
2477
+ dpa.w.ph $ac0, s7, t8 /* ac0 += t12* 4433 + t13* 10703 */
2478
+ mult $ac1, $0, $0 /* ac1 = 0 */
2479
+ dpa.w.ph $ac1, s7, t9 /* ac1 += t12*-10704 + t13* 4433 */
2480
+ sra s4, s5, 16 /* tmp4 = t11 */
2481
+ addiu a1, a1, 16
2482
+ addiu s8, s8, -1
2483
+ extr_r.w s0, $ac0, 11 /* tmp0 = (ac0 + 1024) >> 11 */
2484
+ extr_r.w s1, $ac1, 11 /* tmp1 = (ac1 + 1024) >> 11 */
2485
+ addu s2, s5, s4 /* tmp2 = t10 + t11 */
2486
+ subu s3, s5, s4 /* tmp3 = t10 - t11 */
2487
+ sll s2, s2, 2 /* tmp2 = (t10 + t11) << 2 */
2488
+ sll s3, s3, 2 /* tmp3 = (t10 - t11) << 2 */
2489
+ sh s2, -16(a1)
2490
+ sh s3, -8(a1)
2491
+ sh s0, -12(a1)
2492
+ bgtz s8, 1b
2493
+ sh s1, -4(a1)
2494
+ /* scalar constants for the column pass */
+ li t0, 2260
2495
+ li t1, 11363
2496
+ li t2, 9633
2497
+ li t3, 6436
2498
+ li t4, 6437
2499
+ li t5, 2261
2500
+ li t6, 11362
2501
+ li t7, 2259
2502
+ li t8, 4433
2503
+ li t9, 10703
2504
+ li a1, 10704
2505
+ li s8, 8
2506
+
2507
+ 2:
2508
+ /* pass 2: one column per iteration (stride 16 bytes = 8 shorts) */
+ lh a2, 0(a0) /* 0 */
2509
+ lh a3, 16(a0) /* 8 */
2510
+ lh v0, 32(a0) /* 16 */
2511
+ lh v1, 48(a0) /* 24 */
2512
+ lh s4, 64(a0) /* 32 */
2513
+ lh s5, 80(a0) /* 40 */
2514
+ lh s6, 96(a0) /* 48 */
2515
+ lh s7, 112(a0) /* 56 */
2516
+ addu s2, v0, s5 /* tmp2 = 16 + 40 */
2517
+ subu s5, v0, s5 /* tmp5 = 16 - 40 */
2518
+ addu s3, v1, s4 /* tmp3 = 24 + 32 */
2519
+ subu s4, v1, s4 /* tmp4 = 24 - 32 */
2520
+ addu s0, a2, s7 /* tmp0 = 0 + 56 */
2521
+ subu s7, a2, s7 /* tmp7 = 0 - 56 */
2522
+ addu s1, a3, s6 /* tmp1 = 8 + 48 */
2523
+ subu s6, a3, s6 /* tmp6 = 8 - 48 */
2524
+ addu a2, s0, s3 /* tmp10 = tmp0 + tmp3 */
2525
+ subu v1, s0, s3 /* tmp13 = tmp0 - tmp3 */
2526
+ addu a3, s1, s2 /* tmp11 = tmp1 + tmp2 */
2527
+ subu v0, s1, s2 /* tmp12 = tmp1 - tmp2 */
2528
+ mult s7, t1 /* ac0 = tmp7 * c1 */
2529
+ madd s4, t0 /* ac0 += tmp4 * c0 */
2530
+ madd s5, t4 /* ac0 += tmp5 * c4 */
2531
+ madd s6, t2 /* ac0 += tmp6 * c2 */
2532
+ mult $ac1, s7, t2 /* ac1 = tmp7 * c2 */
2533
+ msub $ac1, s4, t3 /* ac1 -= tmp4 * c3 */
2534
+ msub $ac1, s5, t6 /* ac1 -= tmp5 * c6 */
2535
+ msub $ac1, s6, t7 /* ac1 -= tmp6 * c7 */
2536
+ mult $ac2, s7, t4 /* ac2 = tmp7 * c4 */
2537
+ madd $ac2, s4, t2 /* ac2 += tmp4 * c2 */
2538
+ madd $ac2, s5, t5 /* ac2 += tmp5 * c5 */
2539
+ msub $ac2, s6, t6 /* ac2 -= tmp6 * c6 */
2540
+ mult $ac3, s7, t0 /* ac3 = tmp7 * c0 */
2541
+ msub $ac3, s4, t1 /* ac3 -= tmp4 * c1 */
2542
+ madd $ac3, s5, t2 /* ac3 += tmp5 * c2 */
2543
+ msub $ac3, s6, t3 /* ac3 -= tmp6 * c3 */
2544
+ extr_r.w s0, $ac0, 15 /* tmp0 = (ac0 + 16384) >> 15 */
2545
+ extr_r.w s1, $ac1, 15 /* tmp1 = (ac1 + 16384) >> 15 */
2546
+ extr_r.w s2, $ac2, 15 /* tmp2 = (ac2 + 16384) >> 15 */
2547
+ extr_r.w s3, $ac3, 15 /* tmp3 = (ac3 + 16384) >> 15 */
2548
+ addiu s8, s8, -1
2549
+ addu s4, a2, a3 /* tmp4 = tmp10 + tmp11 */
2550
+ subu s5, a2, a3 /* tmp5 = tmp10 - tmp11 */
2551
+ sh s0, 16(a0)
2552
+ sh s1, 48(a0)
2553
+ sh s2, 80(a0)
2554
+ sh s3, 112(a0)
2555
+ mult v0, t8 /* ac0 = tmp12 * c8 */
2556
+ madd v1, t9 /* ac0 += tmp13 * c9 */
2557
+ mult $ac1, v1, t8 /* ac1 = tmp13 * c8 */
2558
+ msub $ac1, v0, a1 /* ac1 -= tmp12 * c10 */
2559
+ addiu a0, a0, 2
2560
+ extr_r.w s6, $ac0, 15 /* tmp6 = (ac0 + 16384) >> 15 */
2561
+ extr_r.w s7, $ac1, 15 /* tmp7 = (ac1 + 16384) >> 15 */
2562
+ shra_r.w s4, s4, 2 /* tmp4 = (tmp4 + 2) >> 2 */
2563
+ shra_r.w s5, s5, 2 /* tmp5 = (tmp5 + 2) >> 2 */
2564
+ sh s4, -2(a0)
2565
+ sh s5, 62(a0)
2566
+ sh s6, 30(a0)
2567
+ bgtz s8, 2b
2568
+ sh s7, 94(a0)
2569
+
2570
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2571
+
2572
+ jr ra
2573
+ nop
2574
+
2575
+ END(jsimd_fdct_islow_dspr2)
2576
+
2577
+
2578
+ /**************************************************************************/
2579
+ LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
2580
+ /*
2581
+ * a0 = data
2582
+ */
2583
+ /* NOTE(review): ifast (fast integer) 8x8 forward DCT, MIPS DSPr2.
+ Label 0 transforms the eight rows in place (v0 steps by 16 bytes until
+ a0+128); label 1 then transforms the eight columns (v0 steps by 2 until
+ a0+16). Each li constant packs the same 8.8 fixed-point factor into both
+ halfwords so dpa.w.ph forms a two-element dot product; extr.w ..., 8
+ drops the 8 fraction bits again. Branches use MIPS delay slots — the
+ instruction after each bne executes on both paths. */
+ .set at
2584
+
2585
+ SAVE_REGS_ON_STACK 8, s0, s1
2586
+
2587
+ li a1, 0x014e014e /* FIX_1_306562965 (334 << 16) |
2588
+ (334 & 0xffff) */
2589
+ li a2, 0x008b008b /* FIX_0_541196100 (139 << 16) |
2590
+ (139 & 0xffff) */
2591
+ li a3, 0x00620062 /* FIX_0_382683433 (98 << 16) |
2592
+ (98 & 0xffff) */
2593
+ li s1, 0x00b500b5 /* FIX_0_707106781 (181 << 16) |
2594
+ (181 & 0xffff) */
2595
+
2596
+ move v0, a0
2597
+ addiu v1, v0, 128 /* end address */
2598
+
2599
+ /* Pass 1: rows. Two samples per word; packrl.ph swaps halves so the
+ butterflies pair element k with element 7-k. */
+ 0:
2600
+ lw t0, 0(v0) /* tmp0 = 1|0 */
2601
+ lw t1, 4(v0) /* tmp1 = 3|2 */
2602
+ lw t2, 8(v0) /* tmp2 = 5|4 */
2603
+ lw t3, 12(v0) /* tmp3 = 7|6 */
2604
+ packrl.ph t1, t1, t1 /* tmp1 = 2|3 */
2605
+ packrl.ph t3, t3, t3 /* tmp3 = 6|7 */
2606
+ subq.ph t7, t1, t2 /* tmp7 = 2-5|3-4 = t5|t4 */
2607
+ subq.ph t5, t0, t3 /* tmp5 = 1-6|0-7 = t6|t7 */
2608
+ addq.ph t6, t1, t2 /* tmp6 = 2+5|3+4 = t2|t3 */
2609
+ addq.ph t4, t0, t3 /* tmp4 = 1+6|0+7 = t1|t0 */
2610
+ addq.ph t8, t4, t6 /* tmp5 = t1+t2|t0+t3 = t11|t10 */
2611
+ subq.ph t9, t4, t6 /* tmp7 = t1-t2|t0-t3 = t12|t13 */
2612
+ sra t4, t8, 16 /* tmp4 = t11 */
2613
+ mult $0, $0 /* ac0 = 0 */
2614
+ dpa.w.ph $ac0, t9, s1
2615
+ mult $ac1, $0, $0 /* ac1 = 0 */
2616
+ dpa.w.ph $ac1, t7, a3 /* ac1 += t4*98 + t5*98 */
2617
+ dpsx.w.ph $ac1, t5, a3 /* ac1 += t6*98 + t7*98 */
2618
+ mult $ac2, $0, $0 /* ac2 = 0 */
2619
+ dpa.w.ph $ac2, t7, a2 /* ac2 += t4*139 + t5*139 */
2620
+ mult $ac3, $0, $0 /* ac3 = 0 */
2621
+ dpa.w.ph $ac3, t5, a1 /* ac3 += t6*334 + t7*334 */
2622
+ precrq.ph.w t0, t5, t7 /* t0 = t5|t6 */
2623
+ addq.ph t2, t8, t4 /* tmp2 = t10 + t11 */
2624
+ subq.ph t3, t8, t4 /* tmp3 = t10 - t11 */
2625
+ extr.w t4, $ac0, 8
2626
+ mult $0, $0 /* ac0 = 0 */
2627
+ dpa.w.ph $ac0, t0, s1 /* ac0 += t5*181 + t6*181 */
2628
+ extr.w t0, $ac1, 8 /* t0 = z5 */
2629
+ extr.w t1, $ac2, 8 /* t1 = MULTIPLY(tmp10, 139) */
2630
+ extr.w t7, $ac3, 8 /* t2 = MULTIPLY(tmp12, 334) */
2631
+ extr.w t8, $ac0, 8 /* t8 = z3 = MULTIPLY(tmp11, 181) */
2632
+ add t6, t1, t0 /* t6 = z2 */
2633
+ add t7, t7, t0 /* t7 = z4 */
2634
+ subq.ph t0, t5, t8 /* t0 = z13 = tmp7 - z3 */
2635
+ addq.ph t8, t5, t8 /* t9 = z11 = tmp7 + z3 */
2636
+ addq.ph t1, t0, t6 /* t1 = z13 + z2 */
2637
+ subq.ph t6, t0, t6 /* t6 = z13 - z2 */
2638
+ addq.ph t0, t8, t7 /* t0 = z11 + z4 */
2639
+ subq.ph t7, t8, t7 /* t7 = z11 - z4 */
2640
+ addq.ph t5, t4, t9
2641
+ subq.ph t4, t9, t4
2642
+ sh t2, 0(v0)
2643
+ sh t5, 4(v0)
2644
+ sh t3, 8(v0)
2645
+ sh t4, 12(v0)
2646
+ sh t1, 10(v0)
2647
+ sh t6, 6(v0)
2648
+ sh t0, 2(v0)
2649
+ sh t7, 14(v0)
2650
+ addiu v0, 16
2651
+ bne v1, v0, 0b
2652
+ nop
2653
+ move v0, a0
2654
+ addiu v1, v0, 16
2655
+
2656
+ /* Pass 2: columns, one column per iteration (stride 16 bytes = 8 shorts). */
+ 1:
2657
+ lh t0, 0(v0) /* 0 */
2658
+ lh t1, 16(v0) /* 8 */
2659
+ lh t2, 32(v0) /* 16 */
2660
+ lh t3, 48(v0) /* 24 */
2661
+ lh t4, 64(v0) /* 32 */
2662
+ lh t5, 80(v0) /* 40 */
2663
+ lh t6, 96(v0) /* 48 */
2664
+ lh t7, 112(v0) /* 56 */
2665
+ add t8, t0, t7 /* t8 = tmp0 */
2666
+ sub t7, t0, t7 /* t7 = tmp7 */
2667
+ add t0, t1, t6 /* t0 = tmp1 */
2668
+ sub t1, t1, t6 /* t1 = tmp6 */
2669
+ add t6, t2, t5 /* t6 = tmp2 */
2670
+ sub t5, t2, t5 /* t5 = tmp5 */
2671
+ add t2, t3, t4 /* t2 = tmp3 */
2672
+ sub t3, t3, t4 /* t3 = tmp4 */
2673
+ add t4, t8, t2 /* t4 = tmp10 = tmp0 + tmp3 */
2674
+ sub t8, t8, t2 /* t8 = tmp13 = tmp0 - tmp3 */
2675
+ sub s0, t0, t6 /* s0 = tmp12 = tmp1 - tmp2 */
2676
+ ins t8, s0, 16, 16 /* t8 = tmp12|tmp13 */
2677
+ add t2, t0, t6 /* t2 = tmp11 = tmp1 + tmp2 */
2678
+ mult $0, $0 /* ac0 = 0 */
2679
+ dpa.w.ph $ac0, t8, s1 /* ac0 += t12*181 + t13*181 */
2680
+ add s0, t4, t2 /* t8 = tmp10+tmp11 */
2681
+ sub t4, t4, t2 /* t4 = tmp10-tmp11 */
2682
+ sh s0, 0(v0)
2683
+ sh t4, 64(v0)
2684
+ extr.w t2, $ac0, 8 /* z1 = MULTIPLY(tmp12+tmp13,
2685
+ FIX_0_707106781) */
2686
+ addq.ph t4, t8, t2 /* t9 = tmp13 + z1 */
2687
+ subq.ph t8, t8, t2 /* t2 = tmp13 - z1 */
2688
+ sh t4, 32(v0)
2689
+ sh t8, 96(v0)
2690
+ add t3, t3, t5 /* t3 = tmp10 = tmp4 + tmp5 */
2691
+ add t0, t5, t1 /* t0 = tmp11 = tmp5 + tmp6 */
2692
+ add t1, t1, t7 /* t1 = tmp12 = tmp6 + tmp7 */
2693
+ andi t4, a1, 0xffff
2694
+ mul s0, t1, t4
2695
+ sra s0, s0, 8 /* s0 = z4 =
2696
+ MULTIPLY(tmp12, FIX_1_306562965) */
2697
+ ins t1, t3, 16, 16 /* t1 = tmp10|tmp12 */
2698
+ mult $0, $0 /* ac0 = 0 */
2699
+ mulsa.w.ph $ac0, t1, a3 /* ac0 += t10*98 - t12*98 */
2700
+ extr.w t8, $ac0, 8 /* z5 = MULTIPLY(tmp10-tmp12,
2701
+ FIX_0_382683433) */
2702
+ add t2, t7, t8 /* t2 = tmp7 + z5 */
2703
+ sub t7, t7, t8 /* t7 = tmp7 - z5 */
2704
+ andi t4, a2, 0xffff
2705
+ mul t8, t3, t4
2706
+ sra t8, t8, 8 /* t8 = z2 =
2707
+ MULTIPLY(tmp10, FIX_0_541196100) */
2708
+ andi t4, s1, 0xffff
2709
+ mul t6, t0, t4
2710
+ sra t6, t6, 8 /* t6 = z3 =
2711
+ MULTIPLY(tmp11, FIX_0_707106781) */
2712
+ add t0, t6, t8 /* t0 = z3 + z2 */
2713
+ sub t1, t6, t8 /* t1 = z3 - z2 */
2714
+ add t3, t6, s0 /* t3 = z3 + z4 */
2715
+ sub t4, t6, s0 /* t4 = z3 - z4 */
2716
+ sub t5, t2, t1 /* t5 = dataptr[5] */
2717
+ sub t6, t7, t0 /* t6 = dataptr[3] */
2718
+ add t3, t2, t3 /* t3 = dataptr[1] */
2719
+ add t4, t7, t4 /* t4 = dataptr[7] */
2720
+ sh t5, 80(v0)
2721
+ sh t6, 48(v0)
2722
+ sh t3, 16(v0)
2723
+ sh t4, 112(v0)
2724
+ addiu v0, 2
2725
+ bne v0, v1, 1b
2726
+ nop
2727
+
2728
+ RESTORE_REGS_FROM_STACK 8, s0, s1
2729
+
2730
+ j ra
2731
+ nop
2732
+ END(jsimd_fdct_ifast_dspr2)
2733
+
2734
+
2735
+ /*****************************************************************************/
2736
+ LEAF_DSPR2(jsimd_quantize_dspr2)
2737
+ /*
2738
+ * a0 = coef_block
2739
+ * a1 = divisors
2740
+ * a2 = workspace
2741
+ */
2742
+ /* NOTE(review): quantizes 64 DCT coefficients, two per iteration. The
+ sra 15 / sll 1 / addiu 1 sequence builds +1/-1 from the sign bit, i.e.
+ the multiply-by-sign absolute-value trick; the andi 0xffff masks treat
+ divisor-table halfwords as unsigned. The divisor table is read at
+ offsets 0, 128, 384 — presumably the reciprocal/correction/shift rows
+ of the libjpeg-turbo divisor layout; confirm against jquant_int.
+ The loop is software-pipelined: loads for the next pair are issued
+ before the current pair is stored, and the a0 increment sits in the
+ bne delay slot; the final pair is handled by the unrolled epilogue. */
+ .set at
2743
+
2744
+ SAVE_REGS_ON_STACK 16, s0, s1, s2
2745
+
2746
+ addiu v0, a2, 124 /* v0 = workspace_end */
2747
+ lh t0, 0(a2)
2748
+ lh t1, 0(a1)
2749
+ lh t2, 128(a1)
2750
+ sra t3, t0, 15
2751
+ sll t3, t3, 1
2752
+ addiu t3, t3, 1
2753
+ mul t0, t0, t3
2754
+ lh t4, 384(a1)
2755
+ lh t5, 130(a1)
2756
+ lh t6, 2(a2)
2757
+ lh t7, 2(a1)
2758
+ lh t8, 386(a1)
2759
+
2760
+ 1:
2761
+ andi t1, 0xffff
2762
+ add t9, t0, t2
2763
+ andi t9, 0xffff
2764
+ mul v1, t9, t1
2765
+ sra s0, t6, 15
2766
+ sll s0, s0, 1
2767
+ addiu s0, s0, 1
2768
+ addiu t9, t4, 16
2769
+ srav v1, v1, t9
2770
+ mul v1, v1, t3
2771
+ mul t6, t6, s0
2772
+ andi t7, 0xffff
2773
+ addiu a2, a2, 4
2774
+ addiu a1, a1, 4
2775
+ add s1, t6, t5
2776
+ andi s1, 0xffff
2777
+ sh v1, 0(a0)
2778
+
2779
+ mul s2, s1, t7
2780
+ addiu s1, t8, 16
2781
+ srav s2, s2, s1
2782
+ mul s2, s2, s0
2783
+ lh t0, 0(a2)
2784
+ lh t1, 0(a1)
2785
+ sra t3, t0, 15
2786
+ sll t3, t3, 1
2787
+ addiu t3, t3, 1
2788
+ mul t0, t0, t3
2789
+ lh t2, 128(a1)
2790
+ lh t4, 384(a1)
2791
+ lh t5, 130(a1)
2792
+ lh t8, 386(a1)
2793
+ lh t6, 2(a2)
2794
+ lh t7, 2(a1)
2795
+ sh s2, 2(a0)
2796
+ lh t0, 0(a2)
2797
+ sra t3, t0, 15
2798
+ sll t3, t3, 1
2799
+ addiu t3, t3, 1
2800
+ mul t0, t0, t3
2801
+ bne a2, v0, 1b
2802
+ addiu a0, a0, 4
2803
+
2804
+ /* Epilogue: quantize the last coefficient pair (loads already issued
+ inside the loop body above). */
+ andi t1, 0xffff
2805
+ add t9, t0, t2
2806
+ andi t9, 0xffff
2807
+ mul v1, t9, t1
2808
+ sra s0, t6, 15
2809
+ sll s0, s0, 1
2810
+ addiu s0, s0, 1
2811
+ addiu t9, t4, 16
2812
+ srav v1, v1, t9
2813
+ mul v1, v1, t3
2814
+ mul t6, t6, s0
2815
+ andi t7, 0xffff
2816
+ sh v1, 0(a0)
2817
+ add s1, t6, t5
2818
+ andi s1, 0xffff
2819
+ mul s2, s1, t7
2820
+ addiu s1, t8, 16
2821
+ addiu a2, a2, 4
2822
+ addiu a1, a1, 4
2823
+ srav s2, s2, s1
2824
+ mul s2, s2, s0
2825
+ sh s2, 2(a0)
2826
+
2827
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2828
+
2829
+ j ra
2830
+ nop
2831
+
2832
+ END(jsimd_quantize_dspr2)
2833
+
2834
+
2835
+ #ifndef __mips_soft_float
2836
+
2837
+ /*****************************************************************************/
2838
+ LEAF_DSPR2(jsimd_quantize_float_dspr2)
2739
+ /*
2840
+ * a0 = coef_block
2841
+ * a1 = divisors
2842
+ * a2 = workspace
2843
+ */
2844
+ /* NOTE(review): float quantization, eight coefficients per iteration
+ (t0 counts 63 down by 8; bgez keeps the loop to 8 passes = 64 coeffs).
+ Rounding trick: madd.s adds coef*divisor to 16384.5 (f0), trunc.w.s
+ truncates toward zero, then addiu -16384 recovers the rounded signed
+ result. FP loads for the next group are interleaved with the integer
+ stores of the current group; the a0 bump sits in the bgez delay slot. */
+ .set at
2845
+
2846
+ li t1, 0x46800100 /* integer representation 16384.5 */
2847
+ mtc1 t1, f0
2848
+ li t0, 63
2849
+ 0:
2850
+ lwc1 f2, 0(a2)
2851
+ lwc1 f10, 0(a1)
2852
+ lwc1 f4, 4(a2)
2853
+ lwc1 f12, 4(a1)
2854
+ lwc1 f6, 8(a2)
2855
+ lwc1 f14, 8(a1)
2856
+ lwc1 f8, 12(a2)
2857
+ lwc1 f16, 12(a1)
2858
+ madd.s f2, f0, f2, f10
2859
+ madd.s f4, f0, f4, f12
2860
+ madd.s f6, f0, f6, f14
2861
+ madd.s f8, f0, f8, f16
2862
+ lwc1 f10, 16(a1)
2863
+ lwc1 f12, 20(a1)
2864
+ trunc.w.s f2, f2
2865
+ trunc.w.s f4, f4
2866
+ trunc.w.s f6, f6
2867
+ trunc.w.s f8, f8
2868
+ lwc1 f14, 24(a1)
2869
+ lwc1 f16, 28(a1)
2870
+ mfc1 t1, f2
2871
+ mfc1 t2, f4
2872
+ mfc1 t3, f6
2873
+ mfc1 t4, f8
2874
+ lwc1 f2, 16(a2)
2875
+ lwc1 f4, 20(a2)
2876
+ lwc1 f6, 24(a2)
2877
+ lwc1 f8, 28(a2)
2878
+ madd.s f2, f0, f2, f10
2879
+ madd.s f4, f0, f4, f12
2880
+ madd.s f6, f0, f6, f14
2881
+ madd.s f8, f0, f8, f16
2882
+ addiu t1, t1, -16384
2883
+ addiu t2, t2, -16384
2884
+ addiu t3, t3, -16384
2885
+ addiu t4, t4, -16384
2886
+ trunc.w.s f2, f2
2887
+ trunc.w.s f4, f4
2888
+ trunc.w.s f6, f6
2889
+ trunc.w.s f8, f8
2890
+ sh t1, 0(a0)
2891
+ sh t2, 2(a0)
2892
+ sh t3, 4(a0)
2893
+ sh t4, 6(a0)
2894
+ mfc1 t1, f2
2895
+ mfc1 t2, f4
2896
+ mfc1 t3, f6
2897
+ mfc1 t4, f8
2898
+ addiu t0, t0, -8
2899
+ addiu a2, a2, 32
2900
+ addiu a1, a1, 32
2901
+ addiu t1, t1, -16384
2902
+ addiu t2, t2, -16384
2903
+ addiu t3, t3, -16384
2904
+ addiu t4, t4, -16384
2905
+ sh t1, 8(a0)
2906
+ sh t2, 10(a0)
2907
+ sh t3, 12(a0)
2908
+ sh t4, 14(a0)
2909
+ bgez t0, 0b
2910
+ addiu a0, a0, 16
2911
+
2912
+ j ra
2913
+ nop
2914
+
2915
+ END(jsimd_quantize_float_dspr2)
2916
+
2917
+ #endif
2918
+
2919
+
2920
+ /*****************************************************************************/
2921
+ LEAF_DSPR2(jsimd_idct_2x2_dspr2)
2922
+ /*
2923
+ * a0 = compptr->dct_table
2924
+ * a1 = coef_block
2925
+ * a2 = output_buf
2926
+ * a3 = output_col
2927
+ */
2928
+ /* NOTE(review): reduced-size 2x2 inverse DCT. Fully unrolled: for each
+ input column it dequantizes (coef * quant halfword products), forms the
+ odd-term dot products with the packed constants in s0/s1 via dpa.w.ph,
+ and stores sum/difference pairs into a 40-byte scratch area on the
+ stack (v0 = sp after the addiu sp,-40). The second stage combines the
+ scratch words with s2..s5 (29692/-10426/6967/-5906) in $ac0/$ac1, then
+ clamps with shll_s.w/sra 24, biases by +128, and writes two pixels to
+ each of output_buf[0] and output_buf[1] at output_col. Scratch layout:
+ words 0..4 = even parts, 20..36 = odd parts (see sw offsets). */
+ .set at
2929
+
2930
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2931
+
2932
+ addiu sp, sp, -40
2933
+ move v0, sp
2934
+ addiu s2, zero, 29692
2935
+ addiu s3, zero, -10426
2936
+ addiu s4, zero, 6967
2937
+ addiu s5, zero, -5906
2938
+ lh t0, 0(a1) /* t0 = inptr[DCTSIZE*0] */
2939
+ lh t5, 0(a0) /* t5 = quantptr[DCTSIZE*0] */
2940
+ lh t1, 48(a1) /* t1 = inptr[DCTSIZE*3] */
2941
+ lh t6, 48(a0) /* t6 = quantptr[DCTSIZE*3] */
2942
+ mul t4, t5, t0
2943
+ lh t0, 16(a1) /* t0 = inptr[DCTSIZE*1] */
2944
+ lh t5, 16(a0) /* t5 = quantptr[DCTSIZE*1] */
2945
+ mul t6, t6, t1
2946
+ mul t5, t5, t0
2947
+ lh t2, 80(a1) /* t2 = inptr[DCTSIZE*5] */
2948
+ lh t7, 80(a0) /* t7 = quantptr[DCTSIZE*5] */
2949
+ lh t3, 112(a1) /* t3 = inptr[DCTSIZE*7] */
2950
+ lh t8, 112(a0) /* t8 = quantptr[DCTSIZE*7] */
2951
+ mul t7, t7, t2
2952
+ mult zero, zero
2953
+ mul t8, t8, t3
2954
+ li s0, 0x73FCD746 /* s0 = (29692 << 16) | (-10426 & 0xffff) */
2955
+ li s1, 0x1B37E8EE /* s1 = (6967 << 16) | (-5906 & 0xffff) */
2956
+ ins t6, t5, 16, 16 /* t6 = t5|t6 */
2957
+ sll t4, t4, 15
2958
+ dpa.w.ph $ac0, t6, s0
2959
+ lh t1, 2(a1)
2960
+ lh t6, 2(a0)
2961
+ ins t8, t7, 16, 16 /* t8 = t7|t8 */
2962
+ dpa.w.ph $ac0, t8, s1
2963
+ mflo t0, $ac0
2964
+ mul t5, t6, t1
2965
+ lh t1, 18(a1)
2966
+ lh t6, 18(a0)
2967
+ lh t2, 50(a1)
2968
+ lh t7, 50(a0)
2969
+ mul t6, t6, t1
2970
+ subu t8, t4, t0
2971
+ mul t7, t7, t2
2972
+ addu t0, t4, t0
2973
+ shra_r.w t0, t0, 13
2974
+ lh t1, 82(a1)
2975
+ lh t2, 82(a0)
2976
+ lh t3, 114(a1)
2977
+ lh t4, 114(a0)
2978
+ shra_r.w t8, t8, 13
2979
+ mul t1, t1, t2
2980
+ mul t3, t3, t4
2981
+ sw t0, 0(v0)
2982
+ sw t8, 20(v0)
2983
+ sll t4, t5, 15
2984
+ ins t7, t6, 16, 16
2985
+ mult zero, zero
2986
+ dpa.w.ph $ac0, t7, s0
2987
+ ins t3, t1, 16, 16
2988
+ lh t1, 6(a1)
2989
+ lh t6, 6(a0)
2990
+ dpa.w.ph $ac0, t3, s1
2991
+ mflo t0, $ac0
2992
+ mul t5, t6, t1
2993
+ lh t1, 22(a1)
2994
+ lh t6, 22(a0)
2995
+ lh t2, 54(a1)
2996
+ lh t7, 54(a0)
2997
+ mul t6, t6, t1
2998
+ subu t8, t4, t0
2999
+ mul t7, t7, t2
3000
+ addu t0, t4, t0
3001
+ shra_r.w t0, t0, 13
3002
+ lh t1, 86(a1)
3003
+ lh t2, 86(a0)
3004
+ lh t3, 118(a1)
3005
+ lh t4, 118(a0)
3006
+ shra_r.w t8, t8, 13
3007
+ mul t1, t1, t2
3008
+ mul t3, t3, t4
3009
+ sw t0, 4(v0)
3010
+ sw t8, 24(v0)
3011
+ sll t4, t5, 15
3012
+ ins t7, t6, 16, 16
3013
+ mult zero, zero
3014
+ dpa.w.ph $ac0, t7, s0
3015
+ ins t3, t1, 16, 16
3016
+ lh t1, 10(a1)
3017
+ lh t6, 10(a0)
3018
+ dpa.w.ph $ac0, t3, s1
3019
+ mflo t0, $ac0
3020
+ mul t5, t6, t1
3021
+ lh t1, 26(a1)
3022
+ lh t6, 26(a0)
3023
+ lh t2, 58(a1)
3024
+ lh t7, 58(a0)
3025
+ mul t6, t6, t1
3026
+ subu t8, t4, t0
3027
+ mul t7, t7, t2
3028
+ addu t0, t4, t0
3029
+ shra_r.w t0, t0, 13
3030
+ lh t1, 90(a1)
3031
+ lh t2, 90(a0)
3032
+ lh t3, 122(a1)
3033
+ lh t4, 122(a0)
3034
+ shra_r.w t8, t8, 13
3035
+ mul t1, t1, t2
3036
+ mul t3, t3, t4
3037
+ sw t0, 8(v0)
3038
+ sw t8, 28(v0)
3039
+ sll t4, t5, 15
3040
+ ins t7, t6, 16, 16
3041
+ mult zero, zero
3042
+ dpa.w.ph $ac0, t7, s0
3043
+ ins t3, t1, 16, 16
3044
+ lh t1, 14(a1)
3045
+ lh t6, 14(a0)
3046
+ dpa.w.ph $ac0, t3, s1
3047
+ mflo t0, $ac0
3048
+ mul t5, t6, t1
3049
+ lh t1, 30(a1)
3050
+ lh t6, 30(a0)
3051
+ lh t2, 62(a1)
3052
+ lh t7, 62(a0)
3053
+ mul t6, t6, t1
3054
+ subu t8, t4, t0
3055
+ mul t7, t7, t2
3056
+ addu t0, t4, t0
3057
+ shra_r.w t0, t0, 13
3058
+ lh t1, 94(a1)
3059
+ lh t2, 94(a0)
3060
+ lh t3, 126(a1)
3061
+ lh t4, 126(a0)
3062
+ shra_r.w t8, t8, 13
3063
+ mul t1, t1, t2
3064
+ mul t3, t3, t4
3065
+ sw t0, 12(v0)
3066
+ sw t8, 32(v0)
3067
+ sll t4, t5, 15
3068
+ ins t7, t6, 16, 16
3069
+ mult zero, zero
3070
+ dpa.w.ph $ac0, t7, s0
3071
+ ins t3, t1, 16, 16
3072
+ dpa.w.ph $ac0, t3, s1
3073
+ mflo t0, $ac0
3074
+ /* Stage 2: combine scratch words into the two output rows. */
+ lw t9, 0(a2)
3075
+ lw t3, 0(v0)
3076
+ lw t7, 4(v0)
3077
+ lw t1, 8(v0)
3078
+ addu t9, t9, a3
3079
+ sll t3, t3, 15
3080
+ subu t8, t4, t0
3081
+ addu t0, t4, t0
3082
+ shra_r.w t0, t0, 13
3083
+ shra_r.w t8, t8, 13
3084
+ sw t0, 16(v0)
3085
+ sw t8, 36(v0)
3086
+ lw t5, 12(v0)
3087
+ lw t6, 16(v0)
3088
+ mult t7, s2
3089
+ madd t1, s3
3090
+ madd t5, s4
3091
+ madd t6, s5
3092
+ lw t5, 24(v0)
3093
+ lw t7, 28(v0)
3094
+ mflo t0, $ac0
3095
+ lw t8, 32(v0)
3096
+ lw t2, 36(v0)
3097
+ mult $ac1, t5, s2
3098
+ madd $ac1, t7, s3
3099
+ madd $ac1, t8, s4
3100
+ madd $ac1, t2, s5
3101
+ addu t1, t3, t0
3102
+ subu t6, t3, t0
3103
+ shra_r.w t1, t1, 20
3104
+ shra_r.w t6, t6, 20
3105
+ mflo t4, $ac1
3106
+ shll_s.w t1, t1, 24
3107
+ shll_s.w t6, t6, 24
3108
+ sra t1, t1, 24
3109
+ sra t6, t6, 24
3110
+ addiu t1, t1, 128
3111
+ addiu t6, t6, 128
3112
+ lw t0, 20(v0)
3113
+ sb t1, 0(t9)
3114
+ sb t6, 1(t9)
3115
+ sll t0, t0, 15
3116
+ lw t9, 4(a2)
3117
+ addu t1, t0, t4
3118
+ subu t6, t0, t4
3119
+ addu t9, t9, a3
3120
+ shra_r.w t1, t1, 20
3121
+ shra_r.w t6, t6, 20
3122
+ shll_s.w t1, t1, 24
3123
+ shll_s.w t6, t6, 24
3124
+ sra t1, t1, 24
3125
+ sra t6, t6, 24
3126
+ addiu t1, t1, 128
3127
+ addiu t6, t6, 128
3128
+ sb t1, 0(t9)
3129
+ sb t6, 1(t9)
3130
+ addiu sp, sp, 40
3131
+
3132
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
3133
+
3134
+ j ra
3135
+ nop
3136
+
3137
+ END(jsimd_idct_2x2_dspr2)
3138
+
3139
+
3140
+ /*****************************************************************************/
3141
+ LEAF_DSPR2(jsimd_idct_4x4_dspr2)
3142
+ /*
3143
+ * a0 = compptr->dct_table
3144
+ * a1 = coef_block
3145
+ * a2 = output_buf
3146
+ * a3 = output_col
3147
+ * 16(sp) = workspace[DCTSIZE*4] (buffers data between passes)
3148
+ */
3149
+ /* NOTE(review): reduced-size 4x4 inverse DCT. Pass 1 (labels 0 and 1,
+ 4 + 3 iterations) dequantizes and transforms columns into the
+ caller-provided workspace (pointer loaded from 48(sp), i.e. past the
+ 32-byte register save area). Packed constants s0..s3 feed the odd-term
+ dot products in $ac0/$ac1. Pass 2 is unrolled four times, one output
+ row each, descaling with shra_r.w 19, saturating via shll_s.w/sra 24,
+ adding the +128 sample bias, and storing 4 pixels per row at
+ output_buf[ctr] + output_col. Note pass 1 covers only 7 of 8 input
+ columns — presumably column 4 is unused by the 4x4 scaling kernel;
+ confirm against the C jpeg_idct_4x4 reference. */
+ .set at
3150
+
3151
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3152
+
3153
+ lw v1, 48(sp)
3154
+ move t0, a1
3155
+ move t1, v1
3156
+ li t9, 4
3157
+ li s0, 0x2e75f93e
3158
+ li s1, 0x21f9ba79
3159
+ li s2, 0xecc2efb0
3160
+ li s3, 0x52031ccd
3161
+
3162
+ 0:
3163
+ lh s6, 32(t0) /* inptr[DCTSIZE*2] */
3164
+ lh t6, 32(a0) /* quantptr[DCTSIZE*2] */
3165
+ lh s7, 96(t0) /* inptr[DCTSIZE*6] */
3166
+ lh t7, 96(a0) /* quantptr[DCTSIZE*6] */
3167
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
3168
+ quantptr[DCTSIZE*2]) */
3169
+ lh s4, 0(t0) /* inptr[DCTSIZE*0] */
3170
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
3171
+ quantptr[DCTSIZE*6]) */
3172
+ lh s5, 0(a0) /* quantptr[0] */
3173
+ li s6, 15137
3174
+ li s7, 6270
3175
+ mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
3176
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
3177
+ quantptr[DCTSIZE*2]) */
3178
+ lh t5, 112(t0) /* inptr[DCTSIZE*7] */
3179
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
3180
+ quantptr[DCTSIZE*6]) */
3181
+ lh s4, 112(a0) /* quantptr[DCTSIZE*7] */
3182
+ lh v0, 80(t0) /* inptr[DCTSIZE*5] */
3183
+ lh s5, 80(a0) /* quantptr[DCTSIZE*5] */
3184
+ lh s6, 48(a0) /* quantptr[DCTSIZE*3] */
3185
+ sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
3186
+ lh s7, 16(a0) /* quantptr[DCTSIZE*1] */
3187
+ lh t8, 16(t0) /* inptr[DCTSIZE*1] */
3188
+ subu t6, t6, t7 /* tmp2 =
3189
+ MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
3190
+ lh t7, 48(t0) /* inptr[DCTSIZE*3] */
3191
+ mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
3192
+ quantptr[DCTSIZE*7]) */
3193
+ mul v0, s5, v0 /* z2 = (inptr[DCTSIZE*5] *
3194
+ quantptr[DCTSIZE*5]) */
3195
+ mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
3196
+ quantptr[DCTSIZE*3]) */
3197
+ mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
3198
+ quantptr[DCTSIZE*1]) */
3199
+ addu t3, t2, t6 /* tmp10 = tmp0 + z2 */
3200
+ subu t4, t2, t6 /* tmp10 = tmp0 - z2 */
3201
+ mult $ac0, zero, zero
3202
+ mult $ac1, zero, zero
3203
+ ins t5, v0, 16, 16
3204
+ ins t7, t8, 16, 16
3205
+ addiu t9, t9, -1
3206
+ dpa.w.ph $ac0, t5, s0
3207
+ dpa.w.ph $ac0, t7, s1
3208
+ dpa.w.ph $ac1, t5, s2
3209
+ dpa.w.ph $ac1, t7, s3
3210
+ mflo s4, $ac0
3211
+ mflo s5, $ac1
3212
+ addiu a0, a0, 2
3213
+ addiu t1, t1, 4
3214
+ addiu t0, t0, 2
3215
+ addu t6, t4, s4
3216
+ subu t5, t4, s4
3217
+ addu s6, t3, s5
3218
+ subu s7, t3, s5
3219
+ shra_r.w t6, t6, 12 /* DESCALE(tmp12 + temp1, 12) */
3220
+ shra_r.w t5, t5, 12 /* DESCALE(tmp12 - temp1, 12) */
3221
+ shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
3222
+ shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
3223
+ sw t6, 28(t1)
3224
+ sw t5, 60(t1)
3225
+ sw s6, -4(t1)
3226
+ bgtz t9, 0b
3227
+ sw s7, 92(t1)
3228
+ /* second loop three pass */
3229
+ li t9, 3
3230
+ 1:
3231
+ lh s6, 34(t0) /* inptr[DCTSIZE*2] */
3232
+ lh t6, 34(a0) /* quantptr[DCTSIZE*2] */
3233
+ lh s7, 98(t0) /* inptr[DCTSIZE*6] */
3234
+ lh t7, 98(a0) /* quantptr[DCTSIZE*6] */
3235
+ mul t6, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
3236
+ quantptr[DCTSIZE*2]) */
3237
+ lh s4, 2(t0) /* inptr[DCTSIZE*0] */
3238
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
3239
+ quantptr[DCTSIZE*6]) */
3240
+ lh s5, 2(a0) /* quantptr[DCTSIZE*0] */
3241
+ li s6, 15137
3242
+ li s7, 6270
3243
+ mul t2, s4, s5 /* tmp0 = (inptr[0] * quantptr[0]) */
3244
+ mul v0, s6, t6 /* z2 = (inptr[DCTSIZE*2] *
3245
+ quantptr[DCTSIZE*2]) */
3246
+ lh t5, 114(t0) /* inptr[DCTSIZE*7] */
3247
+ mul t7, s7, t7 /* z3 = (inptr[DCTSIZE*6] *
3248
+ quantptr[DCTSIZE*6]) */
3249
+ lh s4, 114(a0) /* quantptr[DCTSIZE*7] */
3250
+ lh s5, 82(a0) /* quantptr[DCTSIZE*5] */
3251
+ lh t6, 82(t0) /* inptr[DCTSIZE*5] */
3252
+ sll t2, t2, 14 /* tmp0 <<= (CONST_BITS+1) */
3253
+ lh s6, 50(a0) /* quantptr[DCTSIZE*3] */
3254
+ lh t8, 18(t0) /* inptr[DCTSIZE*1] */
3255
+ subu v0, v0, t7 /* tmp2 =
3256
+ MULTIPLY(z2, t5) - MULTIPLY(z3, t6) */
3257
+ lh t7, 50(t0) /* inptr[DCTSIZE*3] */
3258
+ lh s7, 18(a0) /* quantptr[DCTSIZE*1] */
3259
+ mul t5, s4, t5 /* z1 = (inptr[DCTSIZE*7] *
3260
+ quantptr[DCTSIZE*7]) */
3261
+ mul t6, s5, t6 /* z2 = (inptr[DCTSIZE*5] *
3262
+ quantptr[DCTSIZE*5]) */
3263
+ mul t7, s6, t7 /* z3 = (inptr[DCTSIZE*3] *
3264
+ quantptr[DCTSIZE*3]) */
3265
+ mul t8, s7, t8 /* z4 = (inptr[DCTSIZE*1] *
3266
+ quantptr[DCTSIZE*1]) */
3267
+ addu t3, t2, v0 /* tmp10 = tmp0 + z2 */
3268
+ subu t4, t2, v0 /* tmp10 = tmp0 - z2 */
3269
+ mult $ac0, zero, zero
3270
+ mult $ac1, zero, zero
3271
+ ins t5, t6, 16, 16
3272
+ ins t7, t8, 16, 16
3273
+ dpa.w.ph $ac0, t5, s0
3274
+ dpa.w.ph $ac0, t7, s1
3275
+ dpa.w.ph $ac1, t5, s2
3276
+ dpa.w.ph $ac1, t7, s3
3277
+ mflo t5, $ac0
3278
+ mflo t6, $ac1
3279
+ addiu t9, t9, -1
3280
+ addiu t0, t0, 2
3281
+ addiu a0, a0, 2
3282
+ addiu t1, t1, 4
3283
+ addu s5, t4, t5
3284
+ subu s4, t4, t5
3285
+ addu s6, t3, t6
3286
+ subu s7, t3, t6
3287
+ shra_r.w s5, s5, 12 /* DESCALE(tmp12 + temp1, 12) */
3288
+ shra_r.w s4, s4, 12 /* DESCALE(tmp12 - temp1, 12) */
3289
+ shra_r.w s6, s6, 12 /* DESCALE(tmp10 + temp2, 12) */
3290
+ shra_r.w s7, s7, 12 /* DESCALE(tmp10 - temp2, 12) */
3291
+ sw s5, 32(t1)
3292
+ sw s4, 64(t1)
3293
+ sw s6, 0(t1)
3294
+ bgtz t9, 1b
3295
+ sw s7, 96(t1)
3296
+ /* Pass 2, unrolled: one workspace row (32 bytes) -> one 4-pixel
+ output row per section below. */
+ move t1, v1
3297
+ li s4, 15137
3298
+ lw s6, 8(t1) /* wsptr[2] */
3299
+ li s5, 6270
3300
+ lw s7, 24(t1) /* wsptr[6] */
3301
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
3302
+ FIX_1_847759065) */
3303
+ lw t2, 0(t1) /* wsptr[0] */
3304
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
3305
+ -FIX_0_765366865) */
3306
+ lh t5, 28(t1) /* wsptr[7] */
3307
+ lh t6, 20(t1) /* wsptr[5] */
3308
+ lh t7, 12(t1) /* wsptr[3] */
3309
+ lh t8, 4(t1) /* wsptr[1] */
3310
+ ins t5, t6, 16, 16
3311
+ ins t7, t8, 16, 16
3312
+ mult $ac0, zero, zero
3313
+ dpa.w.ph $ac0, t5, s0
3314
+ dpa.w.ph $ac0, t7, s1
3315
+ mult $ac1, zero, zero
3316
+ dpa.w.ph $ac1, t5, s2
3317
+ dpa.w.ph $ac1, t7, s3
3318
+ sll t2, t2, 14 /* tmp0 =
3319
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
3320
+ mflo s6, $ac0
3321
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
3322
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
3323
+ subu s4, s4, s5
3324
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
3325
+ mflo s7, $ac1
3326
+ subu t4, t2, s4 /* tmp10 = tmp0 - z2 */
3327
+ addu t7, t4, s6
3328
+ subu t8, t4, s6
3329
+ addu t5, t3, s7
3330
+ subu t6, t3, s7
3331
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
3332
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
3333
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
3334
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
3335
+ sll s4, t9, 2
3336
+ lw v0, 0(a2) /* output_buf[ctr] */
3337
+ shll_s.w t5, t5, 24
3338
+ shll_s.w t6, t6, 24
3339
+ shll_s.w t7, t7, 24
3340
+ shll_s.w t8, t8, 24
3341
+ sra t5, t5, 24
3342
+ sra t6, t6, 24
3343
+ sra t7, t7, 24
3344
+ sra t8, t8, 24
3345
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
3346
+ addiu t5, t5, 128
3347
+ addiu t6, t6, 128
3348
+ addiu t7, t7, 128
3349
+ addiu t8, t8, 128
3350
+ sb t5, 0(v0)
3351
+ sb t7, 1(v0)
3352
+ sb t8, 2(v0)
3353
+ sb t6, 3(v0)
3354
+ /* 2 */
3355
+ li s4, 15137
3356
+ lw s6, 40(t1) /* wsptr[2] */
3357
+ li s5, 6270
3358
+ lw s7, 56(t1) /* wsptr[6] */
3359
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
3360
+ FIX_1_847759065) */
3361
+ lw t2, 32(t1) /* wsptr[0] */
3362
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
3363
+ -FIX_0_765366865) */
3364
+ lh t5, 60(t1) /* wsptr[7] */
3365
+ lh t6, 52(t1) /* wsptr[5] */
3366
+ lh t7, 44(t1) /* wsptr[3] */
3367
+ lh t8, 36(t1) /* wsptr[1] */
3368
+ ins t5, t6, 16, 16
3369
+ ins t7, t8, 16, 16
3370
+ mult $ac0, zero, zero
3371
+ dpa.w.ph $ac0, t5, s0
3372
+ dpa.w.ph $ac0, t7, s1
3373
+ mult $ac1, zero, zero
3374
+ dpa.w.ph $ac1, t5, s2
3375
+ dpa.w.ph $ac1, t7, s3
3376
+ sll t2, t2, 14 /* tmp0 =
3377
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
3378
+ mflo s6, $ac0
3379
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
3380
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
3381
+ subu s4, s4, s5
3382
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
3383
+ mflo s7, $ac1
3384
+ subu t4, t2, s4 /* tmp10 = tmp0 - z2 */
3385
+ addu t7, t4, s6
3386
+ subu t8, t4, s6
3387
+ addu t5, t3, s7
3388
+ subu t6, t3, s7
3389
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2,
3390
+ CONST_BITS-PASS1_BITS+1) */
3391
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2,
3392
+ CONST_BITS-PASS1_BITS+1) */
3393
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1,
3394
+ CONST_BITS-PASS1_BITS+1) */
3395
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1,
3396
+ CONST_BITS-PASS1_BITS+1) */
3397
+ sll s4, t9, 2
3398
+ lw v0, 4(a2) /* output_buf[ctr] */
3399
+ shll_s.w t5, t5, 24
3400
+ shll_s.w t6, t6, 24
3401
+ shll_s.w t7, t7, 24
3402
+ shll_s.w t8, t8, 24
3403
+ sra t5, t5, 24
3404
+ sra t6, t6, 24
3405
+ sra t7, t7, 24
3406
+ sra t8, t8, 24
3407
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
3408
+ addiu t5, t5, 128
3409
+ addiu t6, t6, 128
3410
+ addiu t7, t7, 128
3411
+ addiu t8, t8, 128
3412
+ sb t5, 0(v0)
3413
+ sb t7, 1(v0)
3414
+ sb t8, 2(v0)
3415
+ sb t6, 3(v0)
3416
+ /* 3 */
3417
+ li s4, 15137
3418
+ lw s6, 72(t1) /* wsptr[2] */
3419
+ li s5, 6270
3420
+ lw s7, 88(t1) /* wsptr[6] */
3421
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
3422
+ FIX_1_847759065) */
3423
+ lw t2, 64(t1) /* wsptr[0] */
3424
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
3425
+ -FIX_0_765366865) */
3426
+ lh t5, 92(t1) /* wsptr[7] */
3427
+ lh t6, 84(t1) /* wsptr[5] */
3428
+ lh t7, 76(t1) /* wsptr[3] */
3429
+ lh t8, 68(t1) /* wsptr[1] */
3430
+ ins t5, t6, 16, 16
3431
+ ins t7, t8, 16, 16
3432
+ mult $ac0, zero, zero
3433
+ dpa.w.ph $ac0, t5, s0
3434
+ dpa.w.ph $ac0, t7, s1
3435
+ mult $ac1, zero, zero
3436
+ dpa.w.ph $ac1, t5, s2
3437
+ dpa.w.ph $ac1, t7, s3
3438
+ sll t2, t2, 14 /* tmp0 =
3439
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
3440
+ mflo s6, $ac0
3441
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
3442
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
3443
+ subu s4, s4, s5
3444
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2 */
3445
+ mflo s7, $ac1
3446
+ subu t4, t2, s4 /* tmp10 = tmp0 - z2 */
3447
+ addu t7, t4, s6
3448
+ subu t8, t4, s6
3449
+ addu t5, t3, s7
3450
+ subu t6, t3, s7
3451
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
3452
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
3453
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
3454
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
3455
+ sll s4, t9, 2
3456
+ lw v0, 8(a2) /* output_buf[ctr] */
3457
+ shll_s.w t5, t5, 24
3458
+ shll_s.w t6, t6, 24
3459
+ shll_s.w t7, t7, 24
3460
+ shll_s.w t8, t8, 24
3461
+ sra t5, t5, 24
3462
+ sra t6, t6, 24
3463
+ sra t7, t7, 24
3464
+ sra t8, t8, 24
3465
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
3466
+ addiu t5, t5, 128
3467
+ addiu t6, t6, 128
3468
+ addiu t7, t7, 128
3469
+ addiu t8, t8, 128
3470
+ sb t5, 0(v0)
3471
+ sb t7, 1(v0)
3472
+ sb t8, 2(v0)
3473
+ sb t6, 3(v0)
3474
+ li s4, 15137
3475
+ lw s6, 104(t1) /* wsptr[2] */
3476
+ li s5, 6270
3477
+ lw s7, 120(t1) /* wsptr[6] */
3478
+ mul s4, s4, s6 /* MULTIPLY((JLONG)wsptr[2],
3479
+ FIX_1_847759065) */
3480
+ lw t2, 96(t1) /* wsptr[0] */
3481
+ mul s5, s5, s7 /* MULTIPLY((JLONG)wsptr[6],
3482
+ -FIX_0_765366865) */
3483
+ lh t5, 124(t1) /* wsptr[7] */
3484
+ lh t6, 116(t1) /* wsptr[5] */
3485
+ lh t7, 108(t1) /* wsptr[3] */
3486
+ lh t8, 100(t1) /* wsptr[1] */
3487
+ ins t5, t6, 16, 16
3488
+ ins t7, t8, 16, 16
3489
+ mult $ac0, zero, zero
3490
+ dpa.w.ph $ac0, t5, s0
3491
+ dpa.w.ph $ac0, t7, s1
3492
+ mult $ac1, zero, zero
3493
+ dpa.w.ph $ac1, t5, s2
3494
+ dpa.w.ph $ac1, t7, s3
3495
+ sll t2, t2, 14 /* tmp0 =
3496
+ ((JLONG)wsptr[0]) << (CONST_BITS+1) */
3497
+ mflo s6, $ac0
3498
+ /* MULTIPLY(wsptr[2], FIX_1_847759065) +
3499
+ MULTIPLY(wsptr[6], -FIX_0_765366865) */
3500
+ subu s4, s4, s5
3501
+ addu t3, t2, s4 /* tmp10 = tmp0 + z2; */
3502
+ mflo s7, $ac1
3503
+ subu t4, t2, s4 /* tmp10 = tmp0 - z2; */
3504
+ addu t7, t4, s6
3505
+ subu t8, t4, s6
3506
+ addu t5, t3, s7
3507
+ subu t6, t3, s7
3508
+ shra_r.w t5, t5, 19 /* DESCALE(tmp10 + temp2, 19) */
3509
+ shra_r.w t6, t6, 19 /* DESCALE(tmp10 - temp2, 19) */
3510
+ shra_r.w t7, t7, 19 /* DESCALE(tmp12 + temp1, 19) */
3511
+ shra_r.w t8, t8, 19 /* DESCALE(tmp12 - temp1, 19) */
3512
+ sll s4, t9, 2
3513
+ lw v0, 12(a2) /* output_buf[ctr] */
3514
+ shll_s.w t5, t5, 24
3515
+ shll_s.w t6, t6, 24
3516
+ shll_s.w t7, t7, 24
3517
+ shll_s.w t8, t8, 24
3518
+ sra t5, t5, 24
3519
+ sra t6, t6, 24
3520
+ sra t7, t7, 24
3521
+ sra t8, t8, 24
3522
+ addu v0, v0, a3 /* outptr = output_buf[ctr] + output_col */
3523
+ addiu t5, t5, 128
3524
+ addiu t6, t6, 128
3525
+ addiu t7, t7, 128
3526
+ addiu t8, t8, 128
3527
+ sb t5, 0(v0)
3528
+ sb t7, 1(v0)
3529
+ sb t8, 2(v0)
3530
+ sb t6, 3(v0)
3531
+
3532
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3533
+
3534
+ j ra
3535
+ nop
3536
+ END(jsimd_idct_4x4_dspr2)
3537
+
3538
+
3539
+ /*****************************************************************************/
3540
+ LEAF_DSPR2(jsimd_idct_6x6_dspr2)
3541
+ /*
3542
+ * a0 = compptr->dct_table
3543
+ * a1 = coef_block
3544
+ * a2 = output_buf
3545
+ * a3 = output_col
3546
+ */
3547
+ .set at
3548
+
3549
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3550
+
3551
+ addiu sp, sp, -144
3552
+ move v0, sp
3553
+ addiu v1, v0, 24
3554
+ addiu t9, zero, 5793
3555
+ addiu s0, zero, 10033
3556
+ addiu s1, zero, 2998
3557
+
3558
+ 1:
3559
+ lh s2, 0(a0) /* q0 = quantptr[ 0] */
3560
+ lh s3, 32(a0) /* q1 = quantptr[16] */
3561
+ lh s4, 64(a0) /* q2 = quantptr[32] */
3562
+ lh t2, 64(a1) /* tmp2 = inptr[32] */
3563
+ lh t1, 32(a1) /* tmp1 = inptr[16] */
3564
+ lh t0, 0(a1) /* tmp0 = inptr[ 0] */
3565
+ mul t2, t2, s4 /* tmp2 = tmp2 * q2 */
3566
+ mul t1, t1, s3 /* tmp1 = tmp1 * q1 */
3567
+ mul t0, t0, s2 /* tmp0 = tmp0 * q0 */
3568
+ lh t6, 16(a1) /* z1 = inptr[ 8] */
3569
+ lh t8, 80(a1) /* z3 = inptr[40] */
3570
+ lh t7, 48(a1) /* z2 = inptr[24] */
3571
+ lh s2, 16(a0) /* q0 = quantptr[ 8] */
3572
+ lh s4, 80(a0) /* q2 = quantptr[40] */
3573
+ lh s3, 48(a0) /* q1 = quantptr[24] */
3574
+ mul t2, t2, t9 /* tmp2 = tmp2 * 5793 */
3575
+ mul t1, t1, s0 /* tmp1 = tmp1 * 10033 */
3576
+ sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
3577
+ mul t6, t6, s2 /* z1 = z1 * q0 */
3578
+ mul t8, t8, s4 /* z3 = z3 * q2 */
3579
+ mul t7, t7, s3 /* z2 = z2 * q1 */
3580
+ addu t3, t0, t2 /* tmp10 = tmp0 + tmp2 */
3581
+ sll t2, t2, 1 /* tmp2 = tmp2 << 2 */
3582
+ subu t4, t0, t2 /* tmp11 = tmp0 - tmp2; */
3583
+ subu t5, t3, t1 /* tmp12 = tmp10 - tmp1 */
3584
+ addu t3, t3, t1 /* tmp10 = tmp10 + tmp1 */
3585
+ addu t1, t6, t8 /* tmp1 = z1 + z3 */
3586
+ mul t1, t1, s1 /* tmp1 = tmp1 * 2998 */
3587
+ shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
3588
+ subu t2, t6, t8 /* tmp2 = z1 - z3 */
3589
+ subu t2, t2, t7 /* tmp2 = tmp2 - z2 */
3590
+ sll t2, t2, 2 /* tmp2 = tmp2 << 2 */
3591
+ addu t0, t6, t7 /* tmp0 = z1 + z2 */
3592
+ sll t0, t0, 13 /* tmp0 = tmp0 << 13 */
3593
+ subu s2, t8, t7 /* q0 = z3 - z2 */
3594
+ sll s2, s2, 13 /* q0 = q0 << 13 */
3595
+ addu t0, t0, t1 /* tmp0 = tmp0 + tmp1 */
3596
+ addu t1, s2, t1 /* tmp1 = q0 + tmp1 */
3597
+ addu s2, t4, t2 /* q0 = tmp11 + tmp2 */
3598
+ subu s3, t4, t2 /* q1 = tmp11 - tmp2 */
3599
+ addu t6, t3, t0 /* z1 = tmp10 + tmp0 */
3600
+ subu t7, t3, t0 /* z2 = tmp10 - tmp0 */
3601
+ addu t4, t5, t1 /* tmp11 = tmp12 + tmp1 */
3602
+ subu t5, t5, t1 /* tmp12 = tmp12 - tmp1 */
3603
+ shra_r.w t6, t6, 11 /* z1 = (z1 + 1024) >> 11 */
3604
+ shra_r.w t7, t7, 11 /* z2 = (z2 + 1024) >> 11 */
3605
+ shra_r.w t4, t4, 11 /* tmp11 = (tmp11 + 1024) >> 11 */
3606
+ shra_r.w t5, t5, 11 /* tmp12 = (tmp12 + 1024) >> 11 */
3607
+ sw s2, 24(v0)
3608
+ sw s3, 96(v0)
3609
+ sw t6, 0(v0)
3610
+ sw t7, 120(v0)
3611
+ sw t4, 48(v0)
3612
+ sw t5, 72(v0)
3613
+ addiu v0, v0, 4
3614
+ addiu a1, a1, 2
3615
+ bne v0, v1, 1b
3616
+ addiu a0, a0, 2
3617
+
3618
+ /* Pass 2: process 6 rows from work array, store into output array. */
3619
+ move v0, sp
3620
+ addiu v1, v0, 144
3621
+
3622
+ 2:
3623
+ lw t0, 0(v0)
3624
+ lw t2, 16(v0)
3625
+ lw s5, 0(a2)
3626
+ addiu t0, t0, 16
3627
+ sll t0, t0, 13
3628
+ mul t3, t2, t9
3629
+ lw t6, 4(v0)
3630
+ lw t8, 20(v0)
3631
+ lw t7, 12(v0)
3632
+ addu s5, s5, a3
3633
+ addu s6, t6, t8
3634
+ mul s6, s6, s1
3635
+ addu t1, t0, t3
3636
+ subu t4, t0, t3
3637
+ subu t4, t4, t3
3638
+ lw t3, 8(v0)
3639
+ mul t0, t3, s0
3640
+ addu s7, t6, t7
3641
+ sll s7, s7, 13
3642
+ addu s7, s6, s7
3643
+ subu t2, t8, t7
3644
+ sll t2, t2, 13
3645
+ addu t2, s6, t2
3646
+ subu s6, t6, t7
3647
+ subu s6, s6, t8
3648
+ sll s6, s6, 13
3649
+ addu t3, t1, t0
3650
+ subu t5, t1, t0
3651
+ addu t6, t3, s7
3652
+ subu t3, t3, s7
3653
+ addu t7, t4, s6
3654
+ subu t4, t4, s6
3655
+ addu t8, t5, t2
3656
+ subu t5, t5, t2
3657
+ shll_s.w t6, t6, 6
3658
+ shll_s.w t3, t3, 6
3659
+ shll_s.w t7, t7, 6
3660
+ shll_s.w t4, t4, 6
3661
+ shll_s.w t8, t8, 6
3662
+ shll_s.w t5, t5, 6
3663
+ sra t6, t6, 24
3664
+ addiu t6, t6, 128
3665
+ sra t3, t3, 24
3666
+ addiu t3, t3, 128
3667
+ sb t6, 0(s5)
3668
+ sra t7, t7, 24
3669
+ addiu t7, t7, 128
3670
+ sb t3, 5(s5)
3671
+ sra t4, t4, 24
3672
+ addiu t4, t4, 128
3673
+ sb t7, 1(s5)
3674
+ sra t8, t8, 24
3675
+ addiu t8, t8, 128
3676
+ sb t4, 4(s5)
3677
+ addiu v0, v0, 24
3678
+ sra t5, t5, 24
3679
+ addiu t5, t5, 128
3680
+ sb t8, 2(s5)
3681
+ addiu a2, a2, 4
3682
+ bne v0, v1, 2b
3683
+ sb t5, 3(s5)
3684
+
3685
+ addiu sp, sp, 144
3686
+
3687
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3688
+
3689
+ j ra
3690
+ nop
3691
+
3692
+ END(jsimd_idct_6x6_dspr2)
3693
+
3694
+
3695
+ /*****************************************************************************/
3696
LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
/*
 * Column pass (pass 1) of the 12x12 scaled inverse DCT.
 *
 * a0 = compptr->dct_table  (16-bit quantization multipliers, lh at the same
 *                           offsets as the coefficients)
 * a1 = coef_block          (16-bit DCT coefficients)
 * a2 = workspace           (receives 32-bit intermediate results)
 *
 * Loops over 8 columns (a3 counts down from 8).  Each iteration dequantizes
 * one column (a0/a1 advance by 2 bytes = one halfword) and stores 12
 * descaled 32-bit results into the workspace at a 32-byte row stride
 * (a2 advances by 4 bytes per iteration -> workspace is 12 rows of 8 ints).
 * Fixed-point constants are FIX(x) = x * 2^13; results are descaled by
 * an arithmetic shift right of 11.
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li        a3, 8                 /* column counter */

1:
    /* odd part */
    lh        t0, 48(a1)
    lh        t1, 48(a0)
    lh        t2, 16(a1)
    lh        t3, 16(a0)
    lh        t4, 80(a1)
    lh        t5, 80(a0)
    lh        t6, 112(a1)
    lh        t7, 112(a0)
    mul       t0, t0, t1            /* z2 = DEQUANTIZE(inptr[DCTSIZE*3]) */
    mul       t1, t2, t3            /* z1 = DEQUANTIZE(inptr[DCTSIZE*1]) */
    mul       t2, t4, t5            /* z3 = DEQUANTIZE(inptr[DCTSIZE*5]) */
    mul       t3, t6, t7            /* z4 = DEQUANTIZE(inptr[DCTSIZE*7]) */
    li        t4, 10703             /* FIX(1.306562965) */
    li        t5, 4433              /* FIX_0_541196100 */
    li        t6, 7053              /* FIX(0.860918669) */
    mul       t4, t0, t4            /* tmp11 */
    mul       t5, t0, t5            /* -tmp14 */
    addu      t7, t1, t2            /* tmp10 */
    addu      t8, t7, t3            /* tmp10 + z4 */
    mul       t6, t6, t8            /* tmp15 */
    li        t8, 2139              /* FIX(0.261052384) */
    mul       t8, t7, t8            /* MULTIPLY(tmp10, FIX(0.261052384)) */
    li        t7, 2295              /* FIX(0.280143716) */
    mul       t7, t1, t7            /* MULTIPLY(z1, FIX(0.280143716)) */
    addu      t9, t2, t3            /* z3 + z4 */
    li        s0, 8565              /* FIX(1.045510580) */
    mul       t9, t9, s0            /* -tmp13 */
    li        s0, 12112             /* FIX(1.478575242) */
    mul       s0, t2, s0            /* MULTIPLY(z3, FIX(1.478575242)) */
    li        s1, 12998             /* FIX(1.586706681) */
    mul       s1, t3, s1            /* MULTIPLY(z4, FIX(1.586706681)) */
    li        s2, 5540              /* FIX(0.676326758) */
    mul       s2, t1, s2            /* MULTIPLY(z1, FIX(0.676326758)) */
    li        s3, 16244             /* FIX(1.982889723) */
    mul       s3, t3, s3            /* MULTIPLY(z4, FIX(1.982889723)) */
    subu      t1, t1, t3            /* z1 -= z4 */
    subu      t0, t0, t2            /* z2 -= z3 */
    addu      t2, t0, t1            /* z1 + z2 */
    li        t3, 4433              /* FIX_0_541196100 */
    mul       t2, t2, t3            /* z3 */
    li        t3, 6270              /* FIX_0_765366865 */
    mul       t1, t1, t3            /* MULTIPLY(z1, FIX_0_765366865) */
    li        t3, 15137             /* FIX_1_847759065 (was mislabeled FIX_0_765366865) */
    mul       t0, t0, t3            /* MULTIPLY(z2, FIX_1_847759065) */
    addu      t8, t6, t8            /* tmp12 */
    addu      t3, t8, t4            /* tmp12 + tmp11 */
    addu      t3, t3, t7            /* tmp10 */
    subu      t8, t8, t9            /* tmp12 + tmp13 */
    addu      s0, t5, s0
    subu      t8, t8, s0            /* tmp12 */
    subu      t9, t6, t9
    subu      s1, s1, t4
    addu      t9, t9, s1            /* tmp13 */
    subu      t6, t6, t5
    subu      t6, t6, s2
    subu      t6, t6, s3            /* tmp15 */
    /* even part start */
    lh        t4, 64(a1)
    lh        t5, 64(a0)
    lh        t7, 32(a1)
    lh        s0, 32(a0)
    lh        s1, 0(a1)
    lh        s2, 0(a0)
    lh        s3, 96(a1)
    lh        v0, 96(a0)
    mul       t4, t4, t5            /* DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    mul       t5, t7, s0            /* DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) */
    mul       t7, s1, s2            /* DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) */
    mul       s0, s3, v0            /* DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    /* odd part end */
    addu      t1, t2, t1            /* tmp11 */
    subu      t0, t2, t0            /* tmp14 */
    /* update counter and pointers (scheduled into the latency of the
       multiplies above) */
    addiu     a3, a3, -1
    addiu     a0, a0, 2
    addiu     a1, a1, 2
    /* even part rest */
    li        s1, 10033             /* FIX(1.224744871), same constant as pass 2 */
    li        s2, 11190             /* FIX(1.366025404), same constant as pass 2 */
    mul       t4, t4, s1            /* z4 */
    mul       s1, t5, s2            /* z4 */
    sll       t5, t5, 13            /* z1 */
    sll       t7, t7, 13
    addiu     t7, t7, 1024          /* z3 (1024 = rounding bias for the >> 11 below) */
    sll       s0, s0, 13            /* z2 */
    addu      s2, t7, t4            /* tmp10 */
    subu      t4, t7, t4            /* tmp11 */
    subu      s3, t5, s0            /* tmp12 */
    addu      t2, t7, s3            /* tmp21 */
    subu      s3, t7, s3            /* tmp24 */
    addu      t7, s1, s0            /* tmp12 */
    addu      v0, s2, t7            /* tmp20 */
    subu      s2, s2, t7            /* tmp25 */
    subu      s1, s1, t5            /* z4 - z1 */
    subu      s1, s1, s0            /* tmp12 */
    addu      s0, t4, s1            /* tmp22 */
    subu      t4, t4, s1            /* tmp23 */
    /* final output stage: butterflies, then descale by >> 11 and store one
       value into each of the 12 workspace rows (32-byte stride) */
    addu      t5, v0, t3
    subu      v0, v0, t3
    addu      t3, t2, t1
    subu      t2, t2, t1
    addu      t1, s0, t8
    subu      s0, s0, t8
    addu      t8, t4, t9
    subu      t4, t4, t9
    addu      t9, s3, t0
    subu      s3, s3, t0
    addu      t0, s2, t6
    subu      s2, s2, t6
    sra       t5, t5, 11
    sra       t3, t3, 11
    sra       t1, t1, 11
    sra       t8, t8, 11
    sra       t9, t9, 11
    sra       t0, t0, 11
    sra       s2, s2, 11
    sra       s3, s3, 11
    sra       t4, t4, 11
    sra       s0, s0, 11
    sra       t2, t2, 11
    sra       v0, v0, 11
    sw        t5, 0(a2)
    sw        t3, 32(a2)
    sw        t1, 64(a2)
    sw        t8, 96(a2)
    sw        t9, 128(a2)
    sw        t0, 160(a2)
    sw        s2, 192(a2)
    sw        s3, 224(a2)
    sw        t4, 256(a2)
    sw        s0, 288(a2)
    sw        t2, 320(a2)
    sw        v0, 352(a2)
    bgtz      a3, 1b
    addiu     a2, a2, 4             /* branch delay slot: advance workspace ptr */

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j         ra
    nop

END(jsimd_idct_12x12_pass1_dspr2)
3855
+
3856
+
3857
+ /*****************************************************************************/
3858
LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
/*
 * Row pass (pass 2) of the 12x12 scaled inverse DCT.
 *
 * a0 = workspace  (32-bit intermediates produced by pass 1; 8 ints per row)
 * a1 = output     (array of row pointers; one pointer loaded per iteration)
 *
 * Loops over 12 rows (a3 counts down from 12).  Each iteration reads 8 ints
 * from the workspace row (a0 advances by 32 bytes), runs the 1-D IDCT,
 * clamps via a saturating left shift (shll_s.w), and stores 12 pixel bytes
 * to the row pointer loaded from *a1 (a1 advances by 4 per iteration).
 * The sll 4 + shll_s.w 2 + srl 24 sequence descales and saturates the
 * result to 8 bits; addiu +0x80 re-centers samples around 128.
 */
    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    li        a3, 12                /* row counter */

1:
    /* Odd part */
    lw        t0, 12(a0)
    lw        t1, 4(a0)
    lw        t2, 20(a0)
    lw        t3, 28(a0)
    li        t4, 10703             /* FIX(1.306562965) */
    li        t5, 4433              /* FIX_0_541196100 */
    mul       t4, t0, t4            /* tmp11 */
    mul       t5, t0, t5            /* -tmp14 */
    addu      t6, t1, t2            /* tmp10 */
    li        t7, 2139              /* FIX(0.261052384) */
    mul       t7, t6, t7            /* MULTIPLY(tmp10, FIX(0.261052384)) */
    addu      t6, t6, t3            /* tmp10 + z4 */
    li        t8, 7053              /* FIX(0.860918669) */
    mul       t6, t6, t8            /* tmp15 */
    li        t8, 2295              /* FIX(0.280143716) */
    mul       t8, t1, t8            /* MULTIPLY(z1, FIX(0.280143716)) */
    addu      t9, t2, t3            /* z3 + z4 */
    li        s0, 8565              /* FIX(1.045510580) */
    mul       t9, t9, s0            /* -tmp13 */
    li        s0, 12112             /* FIX(1.478575242) */
    mul       s0, t2, s0            /* MULTIPLY(z3, FIX(1.478575242)) */
    li        s1, 12998             /* FIX(1.586706681) */
    mul       s1, t3, s1            /* MULTIPLY(z4, FIX(1.586706681)) */
    li        s2, 5540              /* FIX(0.676326758) */
    mul       s2, t1, s2            /* MULTIPLY(z1, FIX(0.676326758)) */
    li        s3, 16244             /* FIX(1.982889723) */
    mul       s3, t3, s3            /* MULTIPLY(z4, FIX(1.982889723)) */
    subu      t1, t1, t3            /* z1 -= z4 */
    subu      t0, t0, t2            /* z2 -= z3 */
    addu      t2, t1, t0            /* z1 + z2 */
    li        t3, 4433              /* FIX_0_541196100 */
    mul       t2, t2, t3            /* z3 */
    li        t3, 6270              /* FIX_0_765366865 */
    mul       t1, t1, t3            /* MULTIPLY(z1, FIX_0_765366865) */
    li        t3, 15137             /* FIX_1_847759065 */
    mul       t0, t0, t3            /* MULTIPLY(z2, FIX_1_847759065) */
    addu      t3, t6, t7            /* tmp12 */
    addu      t7, t3, t4
    addu      t7, t7, t8            /* tmp10 */
    subu      t3, t3, t9
    subu      t3, t3, t5
    subu      t3, t3, s0            /* tmp12 */
    subu      t9, t6, t9
    subu      t9, t9, t4
    addu      t9, t9, s1            /* tmp13 */
    subu      t6, t6, t5
    subu      t6, t6, s2
    subu      t6, t6, s3            /* tmp15 */
    addu      t1, t2, t1            /* tmp11 */
    subu      t0, t2, t0            /* tmp14 */
    /* even part */
    lw        t2, 16(a0)            /* z4 */
    lw        t4, 8(a0)             /* z1 */
    lw        t5, 0(a0)             /* z3 */
    lw        t8, 24(a0)            /* z2 */
    li        s0, 10033             /* FIX(1.224744871) */
    li        s1, 11190             /* FIX(1.366025404) */
    mul       t2, t2, s0            /* z4 */
    mul       s0, t4, s1            /* z4 */
    addiu     t5, t5, 0x10          /* rounding bias for the final descale */
    sll       t5, t5, 13            /* z3 */
    sll       t4, t4, 13            /* z1 */
    sll       t8, t8, 13            /* z2 */
    subu      s1, t4, t8            /* tmp12 */
    addu      s2, t5, t2            /* tmp10 */
    subu      t2, t5, t2            /* tmp11 */
    addu      s3, t5, s1            /* tmp21 */
    subu      s1, t5, s1            /* tmp24 */
    addu      t5, s0, t8            /* tmp12 */
    addu      v0, s2, t5            /* tmp20 */
    subu      t5, s2, t5            /* tmp25 */
    subu      t4, s0, t4
    subu      t4, t4, t8            /* tmp12 */
    addu      t8, t2, t4            /* tmp22 */
    subu      t2, t2, t4            /* tmp23 */
    /* increment counter and pointers */
    addiu     a3, a3, -1
    addiu     a0, a0, 32
    /* Final stage: butterflies, descale, saturate to 8 bits, store 12 px */
    addu      t4, v0, t7
    subu      v0, v0, t7
    addu      t7, s3, t1
    subu      s3, s3, t1
    addu      t1, t8, t3
    subu      t8, t8, t3
    addu      t3, t2, t9
    subu      t2, t2, t9
    addu      t9, s1, t0
    subu      s1, s1, t0
    addu      t0, t5, t6
    subu      t5, t5, t6
    sll       t4, t4, 4
    sll       t7, t7, 4
    sll       t1, t1, 4
    sll       t3, t3, 4
    sll       t9, t9, 4
    sll       t0, t0, 4
    sll       t5, t5, 4
    sll       s1, s1, 4
    sll       t2, t2, 4
    sll       t8, t8, 4
    sll       s3, s3, 4
    sll       v0, v0, 4
    shll_s.w  t4, t4, 2             /* saturating shift: clamps to 32-bit range */
    shll_s.w  t7, t7, 2
    shll_s.w  t1, t1, 2
    shll_s.w  t3, t3, 2
    shll_s.w  t9, t9, 2
    shll_s.w  t0, t0, 2
    shll_s.w  t5, t5, 2
    shll_s.w  s1, s1, 2
    shll_s.w  t2, t2, 2
    shll_s.w  t8, t8, 2
    shll_s.w  s3, s3, 2
    shll_s.w  v0, v0, 2
    srl       t4, t4, 24            /* keep the saturated top byte */
    srl       t7, t7, 24
    srl       t1, t1, 24
    srl       t3, t3, 24
    srl       t9, t9, 24
    srl       t0, t0, 24
    srl       t5, t5, 24
    srl       s1, s1, 24
    srl       t2, t2, 24
    srl       t8, t8, 24
    srl       s3, s3, 24
    srl       v0, v0, 24
    lw        t6, 0(a1)             /* t6 = output row pointer */
    addiu     t4, t4, 0x80          /* re-center samples around 128 */
    addiu     t7, t7, 0x80
    addiu     t1, t1, 0x80
    addiu     t3, t3, 0x80
    addiu     t9, t9, 0x80
    addiu     t0, t0, 0x80
    addiu     t5, t5, 0x80
    addiu     s1, s1, 0x80
    addiu     t2, t2, 0x80
    addiu     t8, t8, 0x80
    addiu     s3, s3, 0x80
    addiu     v0, v0, 0x80
    sb        t4, 0(t6)
    sb        t7, 1(t6)
    sb        t1, 2(t6)
    sb        t3, 3(t6)
    sb        t9, 4(t6)
    sb        t0, 5(t6)
    sb        t5, 6(t6)
    sb        s1, 7(t6)
    sb        t2, 8(t6)
    sb        t8, 9(t6)
    sb        s3, 10(t6)
    sb        v0, 11(t6)
    bgtz      a3, 1b
    addiu     a1, a1, 4             /* branch delay slot: next row pointer */

    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    jr        ra
    nop

END(jsimd_idct_12x12_pass2_dspr2)
4030
+
4031
+
4032
+ /*****************************************************************************/
4033
LEAF_DSPR2(jsimd_convsamp_dspr2)
/*
 * Convert an 8x8 block of unsigned 8-bit samples to centered 16-bit values.
 *
 * a0 = sample_data  (array of 8 row pointers; lw t0, 0..28(a0))
 * a1 = start_col    (byte offset added to each row pointer)
 * a2 = workspace    (receives 8 rows x 8 halfwords = 128 bytes)
 *
 * Each row: load 8 bytes (ulw handles unaligned rows), expand each byte to
 * a halfword (preceu.ph.qbr/qbl), add -128 to both halves at once
 * (addu.ph with t7 = 0xff80ff80, i.e. two halfwords of -128), and store
 * 16 bytes (usw).  Fully unrolled and software-pipelined: the stores for
 * row N are interleaved with the loads/expands for row N+1.
 */
    lw        t0, 0(a0)             /* row 0 pointer */
    li        t7, 0xff80ff80        /* {-128, -128} as packed halfwords */
    addu      t0, t0, a1
    ulw       t1, 0(t0)
    ulw       t2, 4(t0)
    preceu.ph.qbr  t3, t1           /* zero-extend low two bytes */
    preceu.ph.qbl  t4, t1           /* zero-extend high two bytes */
    lw        t0, 4(a0)             /* row 1 pointer */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7            /* subtract 128 from each halfword */
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 0(a2)             /* store row 0 */
    usw       t4, 4(a2)
    preceu.ph.qbr  t3, t1
    preceu.ph.qbl  t4, t1
    usw       t5, 8(a2)
    usw       t6, 12(a2)

    lw        t0, 8(a0)             /* row 2 pointer; stores below flush row 1 */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 16(a2)
    usw       t4, 20(a2)
    preceu.ph.qbr  t3, t1
    preceu.ph.qbl  t4, t1
    usw       t5, 24(a2)
    usw       t6, 28(a2)

    lw        t0, 12(a0)            /* row 3 pointer */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 32(a2)
    usw       t4, 36(a2)
    preceu.ph.qbr  t3, t1
    preceu.ph.qbl  t4, t1
    usw       t5, 40(a2)
    usw       t6, 44(a2)

    lw        t0, 16(a0)            /* row 4 pointer */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 48(a2)
    usw       t4, 52(a2)
    preceu.ph.qbr  t3, t1
    preceu.ph.qbl  t4, t1
    usw       t5, 56(a2)
    usw       t6, 60(a2)

    lw        t0, 20(a0)            /* row 5 pointer */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 64(a2)
    usw       t4, 68(a2)
    preceu.ph.qbr  t3, t1
    preceu.ph.qbl  t4, t1
    usw       t5, 72(a2)
    usw       t6, 76(a2)

    lw        t0, 24(a0)            /* row 6 pointer */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 80(a2)
    usw       t4, 84(a2)
    preceu.ph.qbr  t3, t1
    preceu.ph.qbl  t4, t1
    usw       t5, 88(a2)
    usw       t6, 92(a2)

    lw        t0, 28(a0)            /* row 7 pointer */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu      t0, t0, a1
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    ulw       t1, 0(t0)
    ulw       t2, 4(t0)
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 96(a2)
    usw       t4, 100(a2)
    preceu.ph.qbr  t3, t1
    preceu.ph.qbl  t4, t1
    usw       t5, 104(a2)
    usw       t6, 108(a2)
    /* pipeline drain: finish and store row 7 */
    preceu.ph.qbr  t5, t2
    preceu.ph.qbl  t6, t2
    addu.ph   t3, t3, t7
    addu.ph   t4, t4, t7
    addu.ph   t5, t5, t7
    addu.ph   t6, t6, t7
    usw       t3, 112(a2)
    usw       t4, 116(a2)
    usw       t5, 120(a2)
    usw       t6, 124(a2)

    j         ra
    nop

END(jsimd_convsamp_dspr2)
4179
+
4180
+
4181
#ifndef __mips_soft_float

/*****************************************************************************/
LEAF_DSPR2(jsimd_convsamp_float_dspr2)
/*
 * Convert an 8x8 block of unsigned 8-bit samples to centered single-
 * precision floats (hard-float builds only).
 *
 * a0 = sample_data  (array of 8 row pointers; lw t0, 0..28(a0))
 * a1 = start_col    (byte offset added to each row pointer)
 * a2 = workspace    (receives 8 rows x 8 floats = 256 bytes)
 *
 * Fully unrolled: for each row, load 8 bytes (lbu), subtract 128, move to
 * the FPU (mtc1), convert to float (cvt.s.w), and store 32 bytes (swc1).
 * The next row pointer is loaded between the conversions and the stores to
 * hide its latency.
 */
    .set at                         /* allow the assembler to use $at */

    /* row 0 */
    lw        t0, 0(a0)
    addu      t0, t0, a1
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128          /* center samples around zero */
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2                /* int -> float */
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 4(a0)             /* prefetch row 1 pointer */
    swc1      f2, 0(a2)
    swc1      f4, 4(a2)
    swc1      f6, 8(a2)
    addu      t0, t0, a1
    swc1      f8, 12(a2)
    swc1      f10, 16(a2)
    swc1      f12, 20(a2)
    swc1      f14, 24(a2)
    swc1      f16, 28(a2)
    /* row 1 */
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 8(a0)
    swc1      f2, 32(a2)
    swc1      f4, 36(a2)
    swc1      f6, 40(a2)
    addu      t0, t0, a1
    swc1      f8, 44(a2)
    swc1      f10, 48(a2)
    swc1      f12, 52(a2)
    swc1      f14, 56(a2)
    swc1      f16, 60(a2)
    /* row 2 */
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 12(a0)
    swc1      f2, 64(a2)
    swc1      f4, 68(a2)
    swc1      f6, 72(a2)
    addu      t0, t0, a1
    swc1      f8, 76(a2)
    swc1      f10, 80(a2)
    swc1      f12, 84(a2)
    swc1      f14, 88(a2)
    swc1      f16, 92(a2)
    /* row 3 */
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 16(a0)
    swc1      f2, 96(a2)
    swc1      f4, 100(a2)
    swc1      f6, 104(a2)
    addu      t0, t0, a1
    swc1      f8, 108(a2)
    swc1      f10, 112(a2)
    swc1      f12, 116(a2)
    swc1      f14, 120(a2)
    swc1      f16, 124(a2)
    /* row 4 */
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 20(a0)
    swc1      f2, 128(a2)
    swc1      f4, 132(a2)
    swc1      f6, 136(a2)
    addu      t0, t0, a1
    swc1      f8, 140(a2)
    swc1      f10, 144(a2)
    swc1      f12, 148(a2)
    swc1      f14, 152(a2)
    swc1      f16, 156(a2)
    /* row 5 */
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 24(a0)
    swc1      f2, 160(a2)
    swc1      f4, 164(a2)
    swc1      f6, 168(a2)
    addu      t0, t0, a1
    swc1      f8, 172(a2)
    swc1      f10, 176(a2)
    swc1      f12, 180(a2)
    swc1      f14, 184(a2)
    swc1      f16, 188(a2)
    /* row 6 */
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    lw        t0, 28(a0)
    swc1      f2, 192(a2)
    swc1      f4, 196(a2)
    swc1      f6, 200(a2)
    addu      t0, t0, a1
    swc1      f8, 204(a2)
    swc1      f10, 208(a2)
    swc1      f12, 212(a2)
    swc1      f14, 216(a2)
    swc1      f16, 220(a2)
    /* row 7 (last: no further row pointer to load) */
    lbu       t1, 0(t0)
    lbu       t2, 1(t0)
    lbu       t3, 2(t0)
    lbu       t4, 3(t0)
    lbu       t5, 4(t0)
    lbu       t6, 5(t0)
    lbu       t7, 6(t0)
    lbu       t8, 7(t0)
    addiu     t1, t1, -128
    addiu     t2, t2, -128
    addiu     t3, t3, -128
    addiu     t4, t4, -128
    addiu     t5, t5, -128
    addiu     t6, t6, -128
    addiu     t7, t7, -128
    addiu     t8, t8, -128
    mtc1      t1, f2
    mtc1      t2, f4
    mtc1      t3, f6
    mtc1      t4, f8
    mtc1      t5, f10
    mtc1      t6, f12
    mtc1      t7, f14
    mtc1      t8, f16
    cvt.s.w   f2, f2
    cvt.s.w   f4, f4
    cvt.s.w   f6, f6
    cvt.s.w   f8, f8
    cvt.s.w   f10, f10
    cvt.s.w   f12, f12
    cvt.s.w   f14, f14
    cvt.s.w   f16, f16
    swc1      f2, 224(a2)
    swc1      f4, 228(a2)
    swc1      f6, 232(a2)
    swc1      f8, 236(a2)
    swc1      f10, 240(a2)
    swc1      f12, 244(a2)
    swc1      f14, 248(a2)
    swc1      f16, 252(a2)

    j         ra
    nop

END(jsimd_convsamp_float_dspr2)

#endif
4542
+
4543
+ /*****************************************************************************/