epeg 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (504) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/MANIFEST +5 -0
  4. data/TODO +1 -0
  5. data/epeg/.dockerignore +4 -0
  6. data/epeg/.gitignore +5 -0
  7. data/epeg/CMakeLists.txt +30 -0
  8. data/epeg/Dockerfile +23 -0
  9. data/epeg/Epeg.h +90 -0
  10. data/epeg/README.md +42 -0
  11. data/epeg/epeg_main.c +1642 -0
  12. data/epeg/epeg_private.h +85 -0
  13. data/epeg/example/.gitignore +1 -0
  14. data/epeg/example/CMakeLists.txt +20 -0
  15. data/epeg/example/example.jpg +0 -0
  16. data/epeg/example/rotatetest.c +29 -0
  17. data/epeg/example/scaletest.c +48 -0
  18. data/epeg/vendor/libjpeg-turbo-2.0.4/BUILDING.md +828 -0
  19. data/epeg/vendor/libjpeg-turbo-2.0.4/CMakeLists.txt +1420 -0
  20. data/epeg/vendor/libjpeg-turbo-2.0.4/ChangeLog.md +1494 -0
  21. data/epeg/vendor/libjpeg-turbo-2.0.4/LICENSE.md +132 -0
  22. data/epeg/vendor/libjpeg-turbo-2.0.4/README.ijg +277 -0
  23. data/epeg/vendor/libjpeg-turbo-2.0.4/README.md +356 -0
  24. data/epeg/vendor/libjpeg-turbo-2.0.4/cderror.h +137 -0
  25. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.c +145 -0
  26. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.h +157 -0
  27. data/epeg/vendor/libjpeg-turbo-2.0.4/change.log +315 -0
  28. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.1 +354 -0
  29. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.c +695 -0
  30. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/BuildPackages.cmake +182 -0
  31. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/GNUInstallDirs.cmake +416 -0
  32. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/cmake_uninstall.cmake.in +24 -0
  33. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/testclean.cmake +41 -0
  34. data/epeg/vendor/libjpeg-turbo-2.0.4/cmyk.h +61 -0
  35. data/epeg/vendor/libjpeg-turbo-2.0.4/coderules.txt +78 -0
  36. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.1 +296 -0
  37. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.c +822 -0
  38. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/annotated.html +104 -0
  39. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bc_s.png +0 -0
  40. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bdwn.png +0 -0
  41. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/classes.html +106 -0
  42. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/closed.png +0 -0
  43. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen-extra.css +3 -0
  44. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.css +1184 -0
  45. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.png +0 -0
  46. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/dynsections.js +97 -0
  47. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2blank.png +0 -0
  48. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2cl.png +0 -0
  49. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2doc.png +0 -0
  50. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderclosed.png +0 -0
  51. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderopen.png +0 -0
  52. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2lastnode.png +0 -0
  53. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2link.png +0 -0
  54. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mlastnode.png +0 -0
  55. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mnode.png +0 -0
  56. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mo.png +0 -0
  57. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2node.png +0 -0
  58. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2ns.png +0 -0
  59. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2plastnode.png +0 -0
  60. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2pnode.png +0 -0
  61. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2splitbar.png +0 -0
  62. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2vertline.png +0 -0
  63. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions.html +134 -0
  64. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions_vars.html +134 -0
  65. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/group___turbo_j_p_e_g.html +2775 -0
  66. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/index.html +90 -0
  67. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/jquery.js +8 -0
  68. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/modules.html +95 -0
  69. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_f.png +0 -0
  70. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_g.png +0 -0
  71. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_h.png +0 -0
  72. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/open.png +0 -0
  73. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.html +26 -0
  74. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.js +4 -0
  75. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.html +26 -0
  76. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.js +5 -0
  77. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.html +26 -0
  78. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.js +4 -0
  79. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.html +26 -0
  80. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.js +4 -0
  81. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.html +26 -0
  82. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.js +5 -0
  83. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.html +26 -0
  84. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.js +4 -0
  85. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.html +26 -0
  86. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.js +102 -0
  87. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.html +26 -0
  88. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.js +4 -0
  89. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.html +26 -0
  90. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.js +4 -0
  91. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.html +26 -0
  92. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.js +4 -0
  93. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.html +26 -0
  94. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.js +6 -0
  95. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/close.png +0 -0
  96. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.html +26 -0
  97. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.js +8 -0
  98. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.html +26 -0
  99. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.js +37 -0
  100. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.html +26 -0
  101. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.js +31 -0
  102. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.html +26 -0
  103. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.js +4 -0
  104. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/mag_sel.png +0 -0
  105. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/nomatches.html +12 -0
  106. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.css +271 -0
  107. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.js +809 -0
  108. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_l.png +0 -0
  109. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_m.png +0 -0
  110. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_r.png +0 -0
  111. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.html +26 -0
  112. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.js +5 -0
  113. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.html +26 -0
  114. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.js +4 -0
  115. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.html +26 -0
  116. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.js +5 -0
  117. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.html +26 -0
  118. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.js +4 -0
  119. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.html +26 -0
  120. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.js +4 -0
  121. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.html +26 -0
  122. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.js +5 -0
  123. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.html +26 -0
  124. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.js +4 -0
  125. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.html +26 -0
  126. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.js +10 -0
  127. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.html +26 -0
  128. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.js +4 -0
  129. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.html +26 -0
  130. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.js +4 -0
  131. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.html +26 -0
  132. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.js +4 -0
  133. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjregion.html +186 -0
  134. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjscalingfactor.html +148 -0
  135. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjtransform.html +212 -0
  136. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_off.png +0 -0
  137. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_on.png +0 -0
  138. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_a.png +0 -0
  139. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_b.png +0 -0
  140. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_h.png +0 -0
  141. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_s.png +0 -0
  142. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tabs.css +60 -0
  143. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen-extra.css +3 -0
  144. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen.config +16 -0
  145. data/epeg/vendor/libjpeg-turbo-2.0.4/example.txt +464 -0
  146. data/epeg/vendor/libjpeg-turbo-2.0.4/jaricom.c +157 -0
  147. data/epeg/vendor/libjpeg-turbo-2.0.4/java/CMakeLists.txt +88 -0
  148. data/epeg/vendor/libjpeg-turbo-2.0.4/java/MANIFEST.MF +2 -0
  149. data/epeg/vendor/libjpeg-turbo-2.0.4/java/README +52 -0
  150. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJBench.java +1021 -0
  151. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJExample.java +405 -0
  152. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJUnitTest.java +960 -0
  153. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-frame.html +24 -0
  154. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-noframe.html +24 -0
  155. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/constant-values.html +532 -0
  156. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/deprecated-list.html +252 -0
  157. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/help-doc.html +210 -0
  158. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index-all.html +1029 -0
  159. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index.html +71 -0
  160. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJ.html +1356 -0
  161. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html +926 -0
  162. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html +241 -0
  163. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html +1255 -0
  164. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJException.html +340 -0
  165. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html +343 -0
  166. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html +751 -0
  167. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html +421 -0
  168. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html +765 -0
  169. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-frame.html +31 -0
  170. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-summary.html +202 -0
  171. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-tree.html +160 -0
  172. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/overview-tree.html +164 -0
  173. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/package-list +1 -0
  174. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/background.gif +0 -0
  175. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/tab.gif +0 -0
  176. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar.gif +0 -0
  177. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar_end.gif +0 -0
  178. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/script.js +30 -0
  179. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/serialized-form.html +176 -0
  180. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/stylesheet.css +474 -0
  181. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJ.java +584 -0
  182. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCompressor.java +677 -0
  183. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java +76 -0
  184. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJDecompressor.java +931 -0
  185. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJException.java +78 -0
  186. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in +59 -0
  187. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-win.java.in +35 -0
  188. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java +115 -0
  189. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransform.java +227 -0
  190. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransformer.java +163 -0
  191. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/YUVImage.java +445 -0
  192. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJ.h +129 -0
  193. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJCompressor.h +101 -0
  194. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJDecompressor.h +101 -0
  195. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJTransformer.h +29 -0
  196. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapimin.c +295 -0
  197. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapistd.c +162 -0
  198. data/epeg/vendor/libjpeg-turbo-2.0.4/jcarith.c +932 -0
  199. data/epeg/vendor/libjpeg-turbo-2.0.4/jccoefct.c +449 -0
  200. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolext.c +144 -0
  201. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolor.c +710 -0
  202. data/epeg/vendor/libjpeg-turbo-2.0.4/jcdctmgr.c +721 -0
  203. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.c +1096 -0
  204. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.h +42 -0
  205. data/epeg/vendor/libjpeg-turbo-2.0.4/jcicc.c +105 -0
  206. data/epeg/vendor/libjpeg-turbo-2.0.4/jcinit.c +77 -0
  207. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmainct.c +162 -0
  208. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmarker.c +664 -0
  209. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmaster.c +640 -0
  210. data/epeg/vendor/libjpeg-turbo-2.0.4/jcomapi.c +109 -0
  211. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.h.in +73 -0
  212. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.txt +143 -0
  213. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfigint.h.in +31 -0
  214. data/epeg/vendor/libjpeg-turbo-2.0.4/jcparam.c +541 -0
  215. data/epeg/vendor/libjpeg-turbo-2.0.4/jcphuff.c +1105 -0
  216. data/epeg/vendor/libjpeg-turbo-2.0.4/jcprepct.c +351 -0
  217. data/epeg/vendor/libjpeg-turbo-2.0.4/jcsample.c +539 -0
  218. data/epeg/vendor/libjpeg-turbo-2.0.4/jcstest.c +126 -0
  219. data/epeg/vendor/libjpeg-turbo-2.0.4/jctrans.c +400 -0
  220. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapimin.c +407 -0
  221. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapistd.c +639 -0
  222. data/epeg/vendor/libjpeg-turbo-2.0.4/jdarith.c +773 -0
  223. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst-tj.c +203 -0
  224. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst.c +293 -0
  225. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc-tj.c +194 -0
  226. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc.c +295 -0
  227. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.c +692 -0
  228. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.h +82 -0
  229. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcol565.c +384 -0
  230. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolext.c +143 -0
  231. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolor.c +883 -0
  232. data/epeg/vendor/libjpeg-turbo-2.0.4/jdct.h +208 -0
  233. data/epeg/vendor/libjpeg-turbo-2.0.4/jddctmgr.c +352 -0
  234. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.c +831 -0
  235. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.h +238 -0
  236. data/epeg/vendor/libjpeg-turbo-2.0.4/jdicc.c +171 -0
  237. data/epeg/vendor/libjpeg-turbo-2.0.4/jdinput.c +408 -0
  238. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.c +460 -0
  239. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.h +71 -0
  240. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmarker.c +1377 -0
  241. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.c +737 -0
  242. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.h +28 -0
  243. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmerge.c +617 -0
  244. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrg565.c +354 -0
  245. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrgext.c +184 -0
  246. data/epeg/vendor/libjpeg-turbo-2.0.4/jdphuff.c +687 -0
  247. data/epeg/vendor/libjpeg-turbo-2.0.4/jdpostct.c +294 -0
  248. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.c +518 -0
  249. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.h +50 -0
  250. data/epeg/vendor/libjpeg-turbo-2.0.4/jdtrans.c +155 -0
  251. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.c +251 -0
  252. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.h +316 -0
  253. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctflt.c +169 -0
  254. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctfst.c +227 -0
  255. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctint.c +288 -0
  256. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctflt.c +240 -0
  257. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctfst.c +371 -0
  258. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctint.c +2627 -0
  259. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctred.c +409 -0
  260. data/epeg/vendor/libjpeg-turbo-2.0.4/jinclude.h +88 -0
  261. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemmgr.c +1179 -0
  262. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemnobs.c +115 -0
  263. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemsys.h +178 -0
  264. data/epeg/vendor/libjpeg-turbo-2.0.4/jmorecfg.h +421 -0
  265. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeg_nbits_table.h +4098 -0
  266. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegcomp.h +31 -0
  267. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegint.h +368 -0
  268. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeglib.h +1132 -0
  269. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.1 +295 -0
  270. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.c +601 -0
  271. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant1.c +859 -0
  272. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant2.c +1285 -0
  273. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd.h +117 -0
  274. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd_none.c +418 -0
  275. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimddct.h +70 -0
  276. data/epeg/vendor/libjpeg-turbo-2.0.4/jstdhuff.c +143 -0
  277. data/epeg/vendor/libjpeg-turbo-2.0.4/jutils.c +133 -0
  278. data/epeg/vendor/libjpeg-turbo-2.0.4/jversion.h +52 -0
  279. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.map.in +11 -0
  280. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.txt +3144 -0
  281. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/CMakeLists.txt +1 -0
  282. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.c +275 -0
  283. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.h +57 -0
  284. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5cmp.c +59 -0
  285. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5hl.c +125 -0
  286. data/epeg/vendor/libjpeg-turbo-2.0.4/rdbmp.c +689 -0
  287. data/epeg/vendor/libjpeg-turbo-2.0.4/rdcolmap.c +254 -0
  288. data/epeg/vendor/libjpeg-turbo-2.0.4/rdgif.c +39 -0
  289. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.1 +63 -0
  290. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.c +510 -0
  291. data/epeg/vendor/libjpeg-turbo-2.0.4/rdppm.c +766 -0
  292. data/epeg/vendor/libjpeg-turbo-2.0.4/rdrle.c +389 -0
  293. data/epeg/vendor/libjpeg-turbo-2.0.4/rdswitch.c +424 -0
  294. data/epeg/vendor/libjpeg-turbo-2.0.4/rdtarga.c +509 -0
  295. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Distribution.xml.in +24 -0
  296. data/epeg/vendor/libjpeg-turbo-2.0.4/release/License.rtf +20 -0
  297. data/epeg/vendor/libjpeg-turbo-2.0.4/release/ReadMe.txt +5 -0
  298. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Welcome.rtf +17 -0
  299. data/epeg/vendor/libjpeg-turbo-2.0.4/release/deb-control.in +31 -0
  300. data/epeg/vendor/libjpeg-turbo-2.0.4/release/installer.nsi.in +191 -0
  301. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libjpeg.pc.in +10 -0
  302. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libturbojpeg.pc.in +10 -0
  303. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makecygwinpkg.in +66 -0
  304. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makedpkg.in +115 -0
  305. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makemacpkg.in +284 -0
  306. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makerpm.in +30 -0
  307. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makesrpm.in +48 -0
  308. data/epeg/vendor/libjpeg-turbo-2.0.4/release/maketarball.in +51 -0
  309. data/epeg/vendor/libjpeg-turbo-2.0.4/release/rpm.spec.in +221 -0
  310. data/epeg/vendor/libjpeg-turbo-2.0.4/release/uninstall.in +113 -0
  311. data/epeg/vendor/libjpeg-turbo-2.0.4/sharedlib/CMakeLists.txt +99 -0
  312. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/CMakeLists.txt +385 -0
  313. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd.c +721 -0
  314. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd_neon.S +2878 -0
  315. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd.c +798 -0
  316. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd_neon.S +3433 -0
  317. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/gas-preprocessor.in +1 -0
  318. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-avx2.asm +578 -0
  319. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-mmx.asm +476 -0
  320. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-sse2.asm +503 -0
  321. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-avx2.asm +121 -0
  322. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-mmx.asm +121 -0
  323. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-sse2.asm +120 -0
  324. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-avx2.asm +113 -0
  325. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-mmx.asm +113 -0
  326. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-sse2.asm +112 -0
  327. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-avx2.asm +457 -0
  328. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-mmx.asm +355 -0
  329. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-sse2.asm +382 -0
  330. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jchuff-sse2.asm +424 -0
  331. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcphuff-sse2.asm +660 -0
  332. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-avx2.asm +388 -0
  333. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-mmx.asm +324 -0
  334. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-sse2.asm +351 -0
  335. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-avx2.asm +515 -0
  336. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-mmx.asm +404 -0
  337. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-sse2.asm +458 -0
  338. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-avx2.asm +118 -0
  339. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-mmx.asm +117 -0
  340. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-sse2.asm +117 -0
  341. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-avx2.asm +136 -0
  342. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-mmx.asm +123 -0
  343. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-sse2.asm +135 -0
  344. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-avx2.asm +575 -0
  345. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-mmx.asm +460 -0
  346. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-sse2.asm +517 -0
  347. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-avx2.asm +760 -0
  348. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-mmx.asm +731 -0
  349. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-sse2.asm +724 -0
  350. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-3dn.asm +318 -0
  351. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-sse.asm +369 -0
  352. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-mmx.asm +395 -0
  353. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-sse2.asm +403 -0
  354. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-avx2.asm +331 -0
  355. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-mmx.asm +620 -0
  356. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-sse2.asm +633 -0
  357. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-3dn.asm +451 -0
  358. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse.asm +571 -0
  359. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse2.asm +497 -0
  360. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-mmx.asm +499 -0
  361. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-sse2.asm +501 -0
  362. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-avx2.asm +453 -0
  363. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-mmx.asm +851 -0
  364. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-sse2.asm +858 -0
  365. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-mmx.asm +704 -0
  366. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-sse2.asm +592 -0
  367. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-3dn.asm +230 -0
  368. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-mmx.asm +276 -0
  369. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-sse.asm +208 -0
  370. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquantf-sse2.asm +168 -0
  371. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-avx2.asm +188 -0
  372. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-sse2.asm +201 -0
  373. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimd.c +1253 -0
  374. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimdcpu.asm +135 -0
  375. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/jsimd.h +1083 -0
  376. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolext-mmi.c +483 -0
  377. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolor-mmi.c +148 -0
  378. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample-mmi.c +100 -0
  379. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample.h +28 -0
  380. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolext-mmi.c +424 -0
  381. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolor-mmi.c +139 -0
  382. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdsample-mmi.c +245 -0
  383. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jfdctint-mmi.c +398 -0
  384. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jidctint-mmi.c +571 -0
  385. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jquanti-mmi.c +130 -0
  386. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd.c +610 -0
  387. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd_mmi.h +57 -0
  388. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/loongson-mmintrin.h +1324 -0
  389. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd.c +1123 -0
  390. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2.S +4479 -0
  391. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2_asm.h +292 -0
  392. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jcolsamp.inc +135 -0
  393. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jdct.inc +31 -0
  394. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jpeg_nbits_table.inc +4097 -0
  395. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc +93 -0
  396. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc.h +131 -0
  397. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdext.inc +479 -0
  398. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolext-altivec.c +269 -0
  399. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolor-altivec.c +116 -0
  400. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgray-altivec.c +111 -0
  401. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgryext-altivec.c +228 -0
  402. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample-altivec.c +159 -0
  403. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample.h +28 -0
  404. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolext-altivec.c +276 -0
  405. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolor-altivec.c +106 -0
  406. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmerge-altivec.c +130 -0
  407. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmrgext-altivec.c +329 -0
  408. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdsample-altivec.c +400 -0
  409. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctfst-altivec.c +154 -0
  410. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctint-altivec.c +258 -0
  411. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctfst-altivec.c +255 -0
  412. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctint-altivec.c +357 -0
  413. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jquanti-altivec.c +250 -0
  414. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd.c +872 -0
  415. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd_altivec.h +98 -0
  416. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-avx2.asm +558 -0
  417. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-sse2.asm +483 -0
  418. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-avx2.asm +121 -0
  419. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-sse2.asm +120 -0
  420. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-avx2.asm +113 -0
  421. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-sse2.asm +112 -0
  422. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-avx2.asm +437 -0
  423. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-sse2.asm +362 -0
  424. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jchuff-sse2.asm +346 -0
  425. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcphuff-sse2.asm +637 -0
  426. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-avx2.asm +366 -0
  427. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-sse2.asm +329 -0
  428. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-avx2.asm +495 -0
  429. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-sse2.asm +438 -0
  430. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-avx2.asm +118 -0
  431. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-sse2.asm +117 -0
  432. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-avx2.asm +136 -0
  433. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-sse2.asm +135 -0
  434. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-avx2.asm +593 -0
  435. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-sse2.asm +535 -0
  436. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-avx2.asm +695 -0
  437. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-sse2.asm +664 -0
  438. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctflt-sse.asm +355 -0
  439. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctfst-sse2.asm +389 -0
  440. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-avx2.asm +320 -0
  441. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-sse2.asm +619 -0
  442. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctflt-sse2.asm +481 -0
  443. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctfst-sse2.asm +490 -0
  444. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-avx2.asm +417 -0
  445. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-sse2.asm +846 -0
  446. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctred-sse2.asm +573 -0
  447. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquantf-sse2.asm +154 -0
  448. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-avx2.asm +162 -0
  449. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-sse2.asm +187 -0
  450. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimd.c +1076 -0
  451. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimdcpu.asm +86 -0
  452. data/epeg/vendor/libjpeg-turbo-2.0.4/structure.txt +904 -0
  453. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.bmp +0 -0
  454. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.txt +25 -0
  455. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test.scan +5 -0
  456. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc +0 -0
  457. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc.txt +20 -0
  458. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc +0 -0
  459. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc.txt +20 -0
  460. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgari.jpg +0 -0
  461. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgint.jpg +0 -0
  462. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.jpg +0 -0
  463. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.ppm +4 -0
  464. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig12.jpg +0 -0
  465. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_5674_0098.bmp +0 -0
  466. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6434_0018a.bmp +0 -0
  467. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6548_0026a.bmp +0 -0
  468. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbench.c +1031 -0
  469. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.in +256 -0
  470. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.java.in +215 -0
  471. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexample.c +396 -0
  472. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.in +149 -0
  473. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.java.in +151 -0
  474. data/epeg/vendor/libjpeg-turbo-2.0.4/tjunittest.c +931 -0
  475. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.c +70 -0
  476. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.h +47 -0
  477. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.c +1628 -0
  478. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.h +210 -0
  479. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-jni.c +1246 -0
  480. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile +65 -0
  481. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile.jni +101 -0
  482. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.c +2152 -0
  483. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.h +1744 -0
  484. data/epeg/vendor/libjpeg-turbo-2.0.4/usage.txt +635 -0
  485. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jconfig.h.in +34 -0
  486. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62-memsrcdst.def +108 -0
  487. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62.def +106 -0
  488. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7-memsrcdst.def +110 -0
  489. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7.def +108 -0
  490. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg8.def +111 -0
  491. data/epeg/vendor/libjpeg-turbo-2.0.4/wizard.txt +212 -0
  492. data/epeg/vendor/libjpeg-turbo-2.0.4/wrbmp.c +558 -0
  493. data/epeg/vendor/libjpeg-turbo-2.0.4/wrgif.c +413 -0
  494. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.1 +103 -0
  495. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.c +591 -0
  496. data/epeg/vendor/libjpeg-turbo-2.0.4/wrppm.c +365 -0
  497. data/epeg/vendor/libjpeg-turbo-2.0.4/wrrle.c +309 -0
  498. data/epeg/vendor/libjpeg-turbo-2.0.4/wrtarga.c +261 -0
  499. data/epeg.c +131 -0
  500. data/epeg.gemspec +18 -0
  501. data/extconf.rb +80 -0
  502. data/test.jpg +0 -0
  503. data/test.rb +42 -0
  504. metadata +546 -0
@@ -0,0 +1,4479 @@
1
+ /*
2
+ * MIPS DSPr2 optimizations for libjpeg-turbo
3
+ *
4
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
5
+ * All Rights Reserved.
6
+ * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
7
+ * Darko Laus <darko.laus@imgtec.com>
8
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
9
+ *
10
+ * This software is provided 'as-is', without any express or implied
11
+ * warranty. In no event will the authors be held liable for any damages
12
+ * arising from the use of this software.
13
+ *
14
+ * Permission is granted to anyone to use this software for any purpose,
15
+ * including commercial applications, and to alter it and redistribute it
16
+ * freely, subject to the following restrictions:
17
+ *
18
+ * 1. The origin of this software must not be misrepresented; you must not
19
+ * claim that you wrote the original software. If you use this software
20
+ * in a product, an acknowledgment in the product documentation would be
21
+ * appreciated but is not required.
22
+ * 2. Altered source versions must be plainly marked as such, and must not be
23
+ * misrepresented as being the original software.
24
+ * 3. This notice may not be removed or altered from any source distribution.
25
+ */
26
+
27
+ #include "jsimd_dspr2_asm.h"
28
+
29
+
30
+ /*****************************************************************************/
31
+ LEAF_DSPR2(jsimd_c_null_convert_dspr2)
32
+ /*
33
+ * a0 = cinfo->image_width
34
+ * a1 = input_buf
35
+ * a2 = output_buf
36
+ * a3 = output_row
37
+ * 16(sp) = num_rows (loaded from 24(sp) below, after SAVE_REGS_ON_STACK 8)
38
+ * 20(sp) = cinfo->num_components (loaded from 28(sp) below)
39
+ *
40
+ * Null conversion for compression: de-interleaves each input row, copying byte (*input_buf)[i * num_components + ci] to output_buf[ci][output_row][i] without changing sample values.
41
+ */
42
+ SAVE_REGS_ON_STACK 8, s0, s1
43
+
44
+ lw t9, 24(sp) // t9 = num_rows
45
+ lw s0, 28(sp) // s0 = cinfo->num_components
46
+ andi t0, a0, 3 // t0 = cinfo->image_width & 3
47
+ beqz t0, 4f // no residual: use the 4-samples-only path
48
+ nop
49
+ 0: // row loop, width not a multiple of 4
50
+ addiu t9, t9, -1 // --num_rows
51
+ bltz t9, 7f // all rows done
52
+ li t1, 0 // t1 = ci = 0 (written right after the branch; presumably its delay slot)
53
+ 1: // component loop
54
+ sll t3, t1, 2 // t3 = ci * 4 (byte offset of a 32-bit pointer)
55
+ lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
56
+ lw t2, 0(a1) // t2 = inptr = *input_buf
57
+ sll t4, a3, 2 // t4 = output_row * 4
58
+ lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
59
+ addu t2, t2, t1 // inptr += ci (first sample of this component)
60
+ addu s1, t5, a0 // s1 = outptr + image_width (row end)
61
+ addu t6, t5, t0 // t6 = outptr + residual count
62
+ 2: // copy the (width & 3) residual samples one at a time
63
+ lbu t3, 0(t2)
64
+ addiu t5, t5, 1
65
+ sb t3, -1(t5)
66
+ bne t6, t5, 2b
67
+ addu t2, t2, s0 // inptr += num_components (next pixel)
68
+ 3: // copy 4 samples per pass; NOTE(review): entered unconditionally, so if image_width < 4 outptr already equals s1 here and the bne below steps past it - confirm callers exclude that case
69
+ lbu t3, 0(t2)
70
+ addu t4, t2, s0 // t4..t8 = addresses of the next three pixels
71
+ addu t7, t4, s0
72
+ addu t8, t7, s0
73
+ addu t2, t8, s0 // inptr advanced by 4 pixels
74
+ lbu t4, 0(t4)
75
+ lbu t7, 0(t7)
76
+ lbu t8, 0(t8)
77
+ addiu t5, t5, 4
78
+ sb t3, -4(t5)
79
+ sb t4, -3(t5)
80
+ sb t7, -2(t5)
81
+ bne s1, t5, 3b // until outptr reaches the row end
82
+ sb t8, -1(t5)
83
+ addiu t1, t1, 1 // ++ci
84
+ bne t1, s0, 1b // next component
85
+ nop
86
+ addiu a1, a1, 4 // ++input_buf
87
+ bgez t9, 0b // t9 >= 0 here (the bltz above fell through), so this always loops back
88
+ addiu a3, a3, 1 // ++output_row
89
+ b 7f
90
+ nop
91
+ 4: // row loop, width is a multiple of 4
92
+ addiu t9, t9, -1 // --num_rows
93
+ bltz t9, 7f // all rows done
94
+ li t1, 0 // t1 = ci = 0
95
+ 5: // component loop
96
+ sll t3, t1, 2
97
+ lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
98
+ lw t2, 0(a1) // t2 = inptr = *input_buf
99
+ sll t4, a3, 2
100
+ lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
101
+ addu t2, t2, t1 // inptr += ci
102
+ addu s1, t5, a0 // s1 = outptr + image_width (row end)
103
+ addu t6, t5, t0 // t0 == 0 on this path; t6 is not read below
104
+ 6: // copy 4 samples per pass
105
+ lbu t3, 0(t2)
106
+ addu t4, t2, s0
107
+ addu t7, t4, s0
108
+ addu t8, t7, s0
109
+ addu t2, t8, s0 // inptr advanced by 4 pixels
110
+ lbu t4, 0(t4)
111
+ lbu t7, 0(t7)
112
+ lbu t8, 0(t8)
113
+ addiu t5, t5, 4
114
+ sb t3, -4(t5)
115
+ sb t4, -3(t5)
116
+ sb t7, -2(t5)
117
+ bne s1, t5, 6b // until outptr reaches the row end
118
+ sb t8, -1(t5)
119
+ addiu t1, t1, 1 // ++ci
120
+ bne t1, s0, 5b // next component
121
+ nop
122
+ addiu a1, a1, 4 // ++input_buf
123
+ bgez t9, 4b // always taken; the exit test is the bltz at label 4
124
+ addiu a3, a3, 1 // ++output_row
125
+ 7: // common exit
126
+ RESTORE_REGS_FROM_STACK 8, s0, s1
127
+
128
+ j ra
129
+ nop
130
+
131
+ END(jsimd_c_null_convert_dspr2)
132
+
133
+
134
+ /*****************************************************************************/
135
+ /*
136
+ * jsimd_extrgb_ycc_convert_dspr2
137
+ * jsimd_extbgr_ycc_convert_dspr2
138
+ * jsimd_extrgbx_ycc_convert_dspr2
139
+ * jsimd_extbgrx_ycc_convert_dspr2
140
+ * jsimd_extxbgr_ycc_convert_dspr2
141
+ * jsimd_extxrgb_ycc_convert_dspr2
142
+ *
143
+ * Colorspace conversion RGB -> YCbCr
144
+ */
145
+
146
+ .macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
147
+ r_offs, g_offs, b_offs
148
+
149
+ .macro DO_RGB_TO_YCC r, g, b, inptr
150
+ lbu \r, \r_offs(\inptr) // load one pixel's R, G, B bytes from the layout-specific offsets
151
+ lbu \g, \g_offs(\inptr)
152
+ lbu \b, \b_offs(\inptr)
153
+ addiu \inptr, \pixel_size // advance to the next input pixel
154
+ .endm
155
+
156
+ LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
157
+ /*
158
+ * a0 = cinfo->image_width
159
+ * a1 = input_buf
160
+ * a2 = output_buf
161
+ * a3 = output_row
162
+ * 16(sp) = num_rows (loaded from 48(sp) below, after SAVE_REGS_ON_STACK 32)
163
+ */
164
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
165
+
166
+ lw t7, 48(sp) // t7 = num_rows
167
+ li s0, 0x4c8b // FIX(0.29900)
168
+ li s1, 0x9646 // FIX(0.58700)
169
+ li s2, 0x1d2f // FIX(0.11400)
170
+ li s3, 0xffffd4cd // -FIX(0.16874)
171
+ li s4, 0xffffab33 // -FIX(0.33126)
172
+ li s5, 0x8000 // FIX(0.50000)
173
+ li s6, 0xffff94d1 // -FIX(0.41869)
174
+ li s7, 0xffffeb2f // -FIX(0.08131)
175
+ li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
176
+
177
+ 0: // row loop (body runs before num_rows is tested)
178
+ addiu t7, -1 // --num_rows
179
+ lw t6, 0(a1) // t6 = input_buf[0]
180
+ lw t0, 0(a2) // t0 = output_buf[0]
181
+ lw t1, 4(a2) // t1 = output_buf[1]
182
+ lw t2, 8(a2) // t2 = output_buf[2]
183
+ sll t3, a3, 2 // t3 = output_row * 4
184
+ lwx t0, t3(t0) // t0 = output_buf[0][output_row]
185
+ lwx t1, t3(t1) // t1 = output_buf[1][output_row]
186
+ lwx t2, t3(t2) // t2 = output_buf[2][output_row]
187
+
188
+ addu t9, t2, a0 // t9 = end address
189
+ addiu a3, 1 // ++output_row
190
+
191
+ 1: // pixel loop
192
+ DO_RGB_TO_YCC t3, t4, t5, t6 // t3 = r, t4 = g, t5 = b
193
+
194
+ mtlo s5, $ac0 // $ac0 (Y) starts at 0x8000 for rounding
195
+ mtlo t8, $ac1 // $ac1 (Cb) starts at the offset/rounding constant in t8
196
+ mtlo t8, $ac2 // $ac2 (Cr) starts at the same constant
197
+ maddu $ac0, s2, t5 // Y += FIX(0.11400) * b
198
+ maddu $ac1, s5, t5 // Cb += FIX(0.50000) * b
199
+ maddu $ac2, s5, t3 // Cr += FIX(0.50000) * r
200
+ maddu $ac0, s0, t3 // Y += FIX(0.29900) * r
201
+ maddu $ac1, s3, t3 // Cb += -FIX(0.16874) * r
202
+ maddu $ac2, s6, t4 // Cr += -FIX(0.41869) * g
203
+ maddu $ac0, s1, t4 // Y += FIX(0.58700) * g
204
+ maddu $ac1, s4, t4 // Cb += -FIX(0.33126) * g
205
+ maddu $ac2, s7, t5 // Cr += -FIX(0.08131) * b
206
+ extr.w t3, $ac0, 16 // t3 = accumulator >> 16; only its low byte is stored below
207
+ extr.w t4, $ac1, 16
208
+ extr.w t5, $ac2, 16
209
+ sb t3, 0(t0) // store Y
210
+ sb t4, 0(t1) // store Cb
211
+ sb t5, 0(t2) // store Cr
212
+ addiu t0, 1
213
+ addiu t2, 1
214
+ bne t2, t9, 1b // until the third output row pointer reaches the end address
215
+ addiu t1, 1
216
+ bgtz t7, 0b // more rows left
217
+ addiu a1, 4 // ++input_buf
218
+
219
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
220
+
221
+ j ra
222
+ nop
223
+ END(jsimd_\colorid\()_ycc_convert_dspr2)
224
+
225
+ .purgem DO_RGB_TO_YCC
226
+
227
+ .endm
228
+
229
+ /*-------------------------------------id -- pix R G B */
230
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
231
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
232
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
233
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
234
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
235
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
236
+
237
+
238
+ /*****************************************************************************/
239
+ /*
240
+ * jsimd_ycc_extrgb_convert_dspr2
241
+ * jsimd_ycc_extbgr_convert_dspr2
242
+ * jsimd_ycc_extrgbx_convert_dspr2
243
+ * jsimd_ycc_extbgrx_convert_dspr2
244
+ * jsimd_ycc_extxbgr_convert_dspr2
245
+ * jsimd_ycc_extxrgb_convert_dspr2
246
+ *
247
+ * Colorspace conversion YCbCr -> RGB
248
+ */
249
+
250
+ .macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
251
+ r_offs, g_offs, b_offs, a_offs
252
+
253
+ .macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
254
+ sb \scratch0, \r_offs(\outptr) // scratch0/1/2 hold the R/G/B bytes
255
+ sb \scratch1, \g_offs(\outptr)
256
+ sb \scratch2, \b_offs(\outptr)
257
+ .if (\pixel_size == 4)
258
+ li t0, 0xFF // clobbers t0; the caller below passes t0 as scratch2, which is already stored
259
+ sb t0, \a_offs(\outptr) // fourth byte of a 4-byte pixel is set to 0xFF
260
+ .endif
261
+ addiu \outptr, \pixel_size // advance to the next output pixel
262
+ .endm
263
+
264
+ LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
265
+ /*
266
+ * a0 = cinfo->image_width
267
+ * a1 = input_buf
268
+ * a2 = input_row
269
+ * a3 = output_buf
270
+ * 16(sp) = num_rows (loaded from 48(sp) below, after SAVE_REGS_ON_STACK 32)
271
+ */
272
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
273
+
274
+ lw s1, 48(sp) // s1 = num_rows
275
+ li t3, 0x8000 // rounding term added before the >> 16 of the green sum
276
+ li t4, 0x166e9 // FIX(1.40200)
277
+ li t5, 0x1c5a2 // FIX(1.77200)
278
+ li t6, 0xffff492e // -FIX(0.71414)
279
+ li t7, 0xffffa7e6 // -FIX(0.34414)
280
+ repl.ph t8, 128 // t8 = 128 in both halfwords
281
+
282
+ 0: // row loop (body runs before num_rows is tested)
283
+ lw s0, 0(a3) // s0 = outptr = *output_buf
284
+ lw t0, 0(a1) // t0 = input_buf[0]
285
+ lw t1, 4(a1) // t1 = input_buf[1]
286
+ lw t2, 8(a1) // t2 = input_buf[2]
287
+ sll s5, a2, 2 // s5 = input_row * 4
288
+ addiu s1, -1 // --num_rows
289
+ lwx s2, s5(t0) // s2 = input_buf[0][input_row] (y)
290
+ lwx s3, s5(t1) // s3 = input_buf[1][input_row] (cb)
291
+ lwx s4, s5(t2) // s4 = input_buf[2][input_row] (cr)
292
+ addu t9, s2, a0 // t9 = end address of the y row
293
+ addiu a2, 1 // ++input_row
294
+
295
+ 1: // pixel loop
296
+ lbu s7, 0(s4) // cr
297
+ lbu s6, 0(s3) // cb
298
+ lbu s5, 0(s2) // y
299
+ addiu s2, 1
300
+ addiu s4, 1
301
+ addiu s7, -128 // cr - 128
302
+ addiu s6, -128 // cb - 128
303
+ mul t2, t7, s6 // -FIX(0.34414) * (cb - 128)
304
+ mul t0, t6, s7 // Crgtab[cr]
305
+ sll s7, 15 // pre-scale for the fractional multiply below
306
+ mulq_rs.w t1, t4, s7 // Crrtab[cr]
307
+ sll s6, 15
308
+ addu t2, t3 // Cbgtab[cb]
309
+ addu t2, t0 // green offset, still scaled by 2^16
310
+
311
+ mulq_rs.w t0, t5, s6 // Cbbtab[cb]
312
+ sra t2, 16 // green offset
313
+ addu t1, s5 // red = y + Crrtab[cr]
314
+ addu t2, s5 // add y
315
+ ins t2, t1, 16, 16 // t2 = red (upper halfword) | green (lower halfword)
316
+ subu.ph t2, t2, t8 // bias both halfwords by -128 ...
317
+ addu t0, s5 // blue = y + Cbbtab[cb]
318
+ shll_s.ph t2, t2, 8 // ... saturating shift left by 8 ...
319
+ subu t0, 128 // same bias/saturate/unbias sequence for blue, as a 32-bit word
320
+ shra.ph t2, t2, 8 // ... arithmetic shift back ...
321
+ shll_s.w t0, t0, 24
322
+ addu.ph t2, t2, t8 // clip & store
323
+ sra t0, t0, 24
324
+ sra t1, t2, 16 // t1 = clipped red
325
+ addiu t0, 128 // t0 = clipped blue
326
+
327
+ STORE_YCC_TO_RGB t1, t2, t0, s0 // sb writes only the low byte of t2, i.e. green
328
+
329
+ bne s2, t9, 1b // until the y pointer reaches the end address
330
+ addiu s3, 1
331
+ bgtz s1, 0b // more rows left
332
+ addiu a3, 4 // ++output_buf
333
+
334
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
335
+
336
+ j ra
337
+ nop
338
+ END(jsimd_ycc_\colorid\()_convert_dspr2)
339
+
340
+ .purgem STORE_YCC_TO_RGB
341
+
342
+ .endm
343
+
344
+ /*-------------------------------------id -- pix R G B A */
345
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
346
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
347
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
348
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
349
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
350
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
351
+
352
+
353
+ /*****************************************************************************/
354
+ /*
355
+ * jsimd_extrgb_gray_convert_dspr2
356
+ * jsimd_extbgr_gray_convert_dspr2
357
+ * jsimd_extrgbx_gray_convert_dspr2
358
+ * jsimd_extbgrx_gray_convert_dspr2
359
+ * jsimd_extxbgr_gray_convert_dspr2
360
+ * jsimd_extxrgb_gray_convert_dspr2
361
+ *
362
+ * Colorspace conversion RGB -> GRAY
363
+ */
364
+
365
+ .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
366
+ r_offs, g_offs, b_offs
367
+
368
+ .macro DO_RGB_TO_GRAY r, g, b, inptr
369
+ lbu \r, \r_offs(\inptr) // load one pixel's R, G, B bytes from the layout-specific offsets
370
+ lbu \g, \g_offs(\inptr)
371
+ lbu \b, \b_offs(\inptr)
372
+ addiu \inptr, \pixel_size // advance to the next input pixel
373
+ .endm
374
+
375
+ LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
376
+ /*
377
+ * a0 = cinfo->image_width
378
+ * a1 = input_buf
379
+ * a2 = output_buf
380
+ * a3 = output_row
381
+ * 16(sp) = num_rows (loaded from 48(sp) below, after SAVE_REGS_ON_STACK 32)
382
+ */
383
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
384
+
385
+ li s0, 0x4c8b // s0 = FIX(0.29900)
386
+ li s1, 0x9646 // s1 = FIX(0.58700)
387
+ li s2, 0x1d2f // s2 = FIX(0.11400)
388
+ li s7, 0x8000 // s7 = FIX(0.50000)
389
+ lw s6, 48(sp) // s6 = num_rows
390
+ andi t7, a0, 3 // t7 = image_width & 3 (pixels left over after groups of 4)
391
+
392
+ 0: // row loop (body runs before num_rows is tested)
393
+ addiu s6, -1 // --num_rows
394
+ lw t0, 0(a1) // t0 = inptr = *input_buf
395
+ lw t1, 0(a2) // t1 = output_buf[0]
396
+ sll t3, a3, 2 // t3 = output_row * 4
397
+ lwx t1, t3(t1) // t1 = outptr = output_buf[0][output_row]
398
+ addiu a3, 1 // ++output_row
399
+ addu t9, t1, a0 // t9 = end of the output row
400
+ subu t8, t9, t7 // t8 = end of the part handled 4 pixels at a time
401
+ beq t1, t8, 2f // fewer than 4 pixels: go straight to the residual code
402
+ nop
403
+
404
+ 1: // 4 pixels per pass, using $ac0 and $ac1 alternately
405
+ DO_RGB_TO_GRAY t3, t4, t5, t0 // pixel 0
406
+ DO_RGB_TO_GRAY s3, s4, s5, t0 // pixel 1
407
+
408
+ mtlo s7, $ac0 // start each sum at 0x8000 for rounding
409
+ maddu $ac0, s2, t5
410
+ maddu $ac0, s1, t4
411
+ maddu $ac0, s0, t3
412
+ mtlo s7, $ac1
413
+ maddu $ac1, s2, s5
414
+ maddu $ac1, s1, s4
415
+ maddu $ac1, s0, s3
416
+ extr.w t6, $ac0, 16 // t6 = gray value of pixel 0
417
+
418
+ DO_RGB_TO_GRAY t3, t4, t5, t0 // pixel 2
419
+ DO_RGB_TO_GRAY s3, s4, s5, t0 // pixel 3
420
+
421
+ mtlo s7, $ac0
422
+ maddu $ac0, s2, t5
423
+ maddu $ac0, s1, t4
424
+ extr.w t2, $ac1, 16 // t2 = gray value of pixel 1 (read before $ac1 is reloaded)
425
+ maddu $ac0, s0, t3
426
+ mtlo s7, $ac1
427
+ maddu $ac1, s2, s5
428
+ maddu $ac1, s1, s4
429
+ maddu $ac1, s0, s3
430
+ extr.w t5, $ac0, 16 // t5 = gray value of pixel 2
431
+ sb t6, 0(t1)
432
+ sb t2, 1(t1)
433
+ extr.w t3, $ac1, 16 // t3 = gray value of pixel 3
434
+ addiu t1, 4
435
+ sb t5, -2(t1)
436
+ sb t3, -1(t1)
437
+ bne t1, t8, 1b // until outptr reaches the end of the 4-pixel part
438
+ nop
439
+
440
+ 2:
441
+ beqz t7, 4f // no residual pixels
442
+ nop
443
+
444
+ 3: // residual pixels, one per pass
445
+ DO_RGB_TO_GRAY t3, t4, t5, t0
446
+
447
+ mtlo s7, $ac0
448
+ maddu $ac0, s2, t5
449
+ maddu $ac0, s1, t4
450
+ maddu $ac0, s0, t3
451
+ extr.w t6, $ac0, 16
452
+ sb t6, 0(t1)
453
+ addiu t1, 1
454
+ bne t1, t9, 3b // until outptr reaches the end of the row
455
+ nop
456
+
457
+ 4:
458
+ bgtz s6, 0b // more rows left
459
+ addiu a1, 4 // ++input_buf
460
+
461
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
462
+
463
+ j ra
464
+ nop
465
+ END(jsimd_\colorid\()_gray_convert_dspr2)
466
+
467
+ .purgem DO_RGB_TO_GRAY
468
+
469
+ .endm
470
+
471
+ /*-------------------------------------id -- pix R G B */
472
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
473
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
474
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
475
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
476
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
477
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
478
+
479
+
480
+ /*****************************************************************************/
481
+ /*
482
+ * jsimd_h2v2_merged_upsample_dspr2
483
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
484
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
485
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
486
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
487
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
488
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
489
+ *
490
+ * Merged h2v2 upsample routines
491
+ */
492
+ .macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
493
+ r1_offs, g1_offs, \
494
+ b1_offs, a1_offs, \
495
+ r2_offs, g2_offs, \
496
+ b2_offs, a2_offs
497
+
498
+ .macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
499
+ scratch5 outptr
500
+ sb \scratch0, \r1_offs(\outptr) // first pixel: R, G, B
501
+ sb \scratch1, \g1_offs(\outptr)
502
+ sb \scratch2, \b1_offs(\outptr)
503
+ sb \scratch3, \r2_offs(\outptr) // second pixel: R, G, B
504
+ sb \scratch4, \g2_offs(\outptr)
505
+ sb \scratch5, \b2_offs(\outptr)
506
+ .if (\pixel_size == 8)
507
+ li \scratch0, 0xFF // scratch0 was stored above, so it is reused for the 0xFF byte
508
+ sb \scratch0, \a1_offs(\outptr)
509
+ sb \scratch0, \a2_offs(\outptr)
510
+ .endif
511
+ addiu \outptr, \pixel_size // pixel_size here is the byte size of two output pixels
512
+ .endm
513
+
514
+ .macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
515
+ sb \scratch0, \r1_offs(\outptr) // single trailing pixel; outptr is not advanced
516
+ sb \scratch1, \g1_offs(\outptr)
517
+ sb \scratch2, \b1_offs(\outptr)
518
+
519
+ .if (\pixel_size == 8)
520
+ li t0, 0xFF // clobbers t0
521
+ sb t0, \a1_offs(\outptr)
522
+ .endif
523
+ .endm
524
+
525
+ LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
526
+ /*
527
+ * a0 = cinfo->output_width
528
+ * a1 = input_buf
529
+ * a2 = in_row_group_ctr
530
+ * a3 = output_buf
531
+ * 16(sp) = cinfo->sample_range_limit (loaded from 56(sp) below, after SAVE_REGS_ON_STACK 40)
532
+ */
533
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
534
+
535
+ lw t9, 56(sp) // cinfo->sample_range_limit
536
+ lw v0, 0(a1) // v0 = input_buf[0]
537
+ lw v1, 4(a1) // v1 = input_buf[1]
538
+ lw t0, 8(a1) // t0 = input_buf[2]
539
+ sll t1, a2, 3 // t1 = in_row_group_ctr * 2 * 4 (byte offset of row pointer 2*ctr)
540
+ addiu t2, t1, 4 // t2 = byte offset of row pointer 2*ctr + 1
541
+ sll t3, a2, 2 // t3 = in_row_group_ctr * 4
542
+ lw t4, 0(a3) // t4 = output_buf[0]
543
+ lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
544
+ lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
545
+ lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
546
+ lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
547
+ lw t7, 4(a3) // t7 = output_buf[1]
548
+ li s1, 0xe6ea
549
+ addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
550
+ addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
551
+ addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]; NOTE(review): 0xa7e6 is outside the signed 16-bit immediate range, so this value relies on how the assembler encodes it - confirm
552
+ xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
553
+ srl t3, a0, 1 // t3 = output_width / 2 (number of column pairs)
554
+ blez t3, 2f // no full pair: only the odd-column code may run
555
+ addu t0, t5, t3 // t0 = end address
556
+ 1: // one cb/cr sample -> 2x2 output pixels per pass
557
+ lbu t3, 0(t5) // cb
558
+ lbu s3, 0(t6) // cr
559
+ addiu t5, t5, 1
560
+ addiu t3, t3, -128 // (cb - 128)
561
+ addiu s3, s3, -128 // (cr - 128)
562
+ mult $ac1, s1, t3 // $ac1 = -FIX(0.34414) * cb
563
+ madd $ac1, s2, s3 // $ac1 += -FIX(0.71414) * cr
564
+ sll s3, s3, 15 // pre-scale for the fractional multiplies below
565
+ sll t3, t3, 15
566
+ mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
567
+ extr_r.w s5, $ac1, 16 // s5 = green offset, rounded
568
+ mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
569
+ lbu v0, 0(t1) // y of row 0, even column
570
+ addiu t6, t6, 1
571
+ addiu t1, t1, 2
572
+ addu t3, v0, s4 // y+cred
573
+ addu s3, v0, s5 // y+cgreen
574
+ addu v1, v0, s6 // y+cblue
575
+ addu t3, t9, t3 // y+cred
576
+ addu s3, t9, s3 // y+cgreen
577
+ addu v1, t9, v1 // y+cblue
578
+ lbu AT, 0(t3) // range-limit table lookups; AT and ra serve as scratch (ra is in the saved-register list)
579
+ lbu s7, 0(s3)
580
+ lbu ra, 0(v1)
581
+ lbu v0, -1(t1) // y of row 0, odd column
582
+ addu t3, v0, s4 // y+cred
583
+ addu s3, v0, s5 // y+cgreen
584
+ addu v1, v0, s6 // y+cblue
585
+ addu t3, t9, t3 // y+cred
586
+ addu s3, t9, s3 // y+cgreen
587
+ addu v1, t9, v1 // y+cblue
588
+ lbu t3, 0(t3)
589
+ lbu s3, 0(s3)
590
+ lbu v1, 0(v1)
591
+ lbu v0, 0(t2) // y of row 1, even column
592
+
593
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4 // two pixels into output_buf[0]
594
+
595
+ addu t3, v0, s4 // y+cred
596
+ addu s3, v0, s5 // y+cgreen
597
+ addu v1, v0, s6 // y+cblue
598
+ addu t3, t9, t3 // y+cred
599
+ addu s3, t9, s3 // y+cgreen
600
+ addu v1, t9, v1 // y+cblue
601
+ lbu AT, 0(t3)
602
+ lbu s7, 0(s3)
603
+ lbu ra, 0(v1)
604
+ lbu v0, 1(t2) // y of row 1, odd column
605
+ addiu t2, t2, 2
606
+ addu t3, v0, s4 // y+cred
607
+ addu s3, v0, s5 // y+cgreen
608
+ addu v1, v0, s6 // y+cblue
609
+ addu t3, t9, t3 // y+cred
610
+ addu s3, t9, s3 // y+cgreen
611
+ addu v1, t9, v1 // y+cblue
612
+ lbu t3, 0(t3)
613
+ lbu s3, 0(s3)
614
+ lbu v1, 0(v1)
615
+
616
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7 // two pixels into output_buf[1]
617
+
618
+ bne t0, t5, 1b // until the cb pointer reaches the end address
619
+ nop
620
+ 2:
621
+ andi t0, a0, 1 // t0 = 1 if output_width is odd
622
+ beqz t0, 4f // even width: done
623
+ lbu t3, 0(t5) // cb for the last column; written right after the branch, so presumably loaded even when the branch is taken
624
+ lbu s3, 0(t6) // cr for the last column
625
+ addiu t3, t3, -128 // (cb - 128)
626
+ addiu s3, s3, -128 // (cr - 128)
627
+ mult $ac1, s1, t3
628
+ madd $ac1, s2, s3
629
+ sll s3, s3, 15
630
+ sll t3, t3, 15
631
+ lbu v0, 0(t1) // y of row 0, last column
632
+ extr_r.w s5, $ac1, 16 // s5 = green offset, rounded
633
+ mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
634
+ mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
635
+ addu t3, v0, s4 // y+cred
636
+ addu s3, v0, s5 // y+cgreen
637
+ addu v1, v0, s6 // y+cblue
638
+ addu t3, t9, t3 // y+cred
639
+ addu s3, t9, s3 // y+cgreen
640
+ addu v1, t9, v1 // y+cblue
641
+ lbu t3, 0(t3)
642
+ lbu s3, 0(s3)
643
+ lbu v1, 0(v1)
644
+ lbu v0, 0(t2) // y of row 1, last column
645
+
646
+ STORE_H2V2_1_PIXEL t3, s3, v1, t4 // last pixel of output_buf[0]
647
+
648
+ addu t3, v0, s4 // y+cred
649
+ addu s3, v0, s5 // y+cgreen
650
+ addu v1, v0, s6 // y+cblue
651
+ addu t3, t9, t3 // y+cred
652
+ addu s3, t9, s3 // y+cgreen
653
+ addu v1, t9, v1 // y+cblue
654
+ lbu t3, 0(t3)
655
+ lbu s3, 0(s3)
656
+ lbu v1, 0(v1)
657
+
658
+ STORE_H2V2_1_PIXEL t3, s3, v1, t7 // last pixel of output_buf[1]
659
+ 4:
660
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
661
+
662
+ j ra
663
+ nop
664
+
665
+ END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
666
+
667
+ .purgem STORE_H2V2_1_PIXEL
668
+ .purgem STORE_H2V2_2_PIXELS
669
+ .endm
670
+
671
+ /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
672
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
673
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
674
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
675
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
676
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
677
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
678
+
679
+
680
+ /*****************************************************************************/
681
+ /*
682
+ * jsimd_h2v1_merged_upsample_dspr2
683
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
684
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
685
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
686
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
687
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
688
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
689
+ *
690
+ * Merged h2v1 upsample routines
691
+ */
692
+
693
+ .macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
694
+ r1_offs, g1_offs, \
695
+ b1_offs, a1_offs, \
696
+ r2_offs, g2_offs, \
697
+ b2_offs, a2_offs
698
+
699
+ .macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
700
+ scratch5 outptr
701
+ sb \scratch0, \r1_offs(\outptr) // first pixel: R, G, B
702
+ sb \scratch1, \g1_offs(\outptr)
703
+ sb \scratch2, \b1_offs(\outptr)
704
+ sb \scratch3, \r2_offs(\outptr) // second pixel: R, G, B
705
+ sb \scratch4, \g2_offs(\outptr)
706
+ sb \scratch5, \b2_offs(\outptr)
707
+ .if (\pixel_size == 8)
708
+ li t0, 0xFF // clobbers t0; the pair loop below reloads t0 at the top of each pass
709
+ sb t0, \a1_offs(\outptr)
710
+ sb t0, \a2_offs(\outptr)
711
+ .endif
712
+ addiu \outptr, \pixel_size // pixel_size here is the byte size of two output pixels
713
+ .endm
714
+
715
+ .macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
716
+ sb \scratch0, \r1_offs(\outptr) // single trailing pixel; outptr is not advanced
717
+ sb \scratch1, \g1_offs(\outptr)
718
+ sb \scratch2, \b1_offs(\outptr)
719
+ .if (\pixel_size == 8)
720
+ li t0, 0xFF // clobbers t0
721
+ sb t0, \a1_offs(\outptr)
722
+ .endif
723
+ .endm
724
+
725
+ LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
726
+ /*
727
+ * a0 = cinfo->output_width
728
+ * a1 = input_buf
729
+ * a2 = in_row_group_ctr
730
+ * a3 = output_buf
731
+ * 16(sp) = range_limit (loaded from 56(sp) below, after SAVE_REGS_ON_STACK 40)
732
+ */
733
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
734
+
735
+ li t0, 0xe6ea
736
+ lw t1, 0(a1) // t1 = input_buf[0]
737
+ lw t2, 4(a1) // t2 = input_buf[1]
738
+ lw t3, 8(a1) // t3 = input_buf[2]
739
+ lw t8, 56(sp) // t8 = range_limit
740
+ addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
741
+ addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
742
+ addiu s0, t0, 0x9916 // s0 = 0x8000; NOTE(review): 0x9916 is outside the signed 16-bit immediate range, so this value relies on how the assembler encodes it - confirm
743
+ addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]; NOTE(review): same immediate-range caveat as above
744
+ xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
745
+ srl t0, a0, 1 // t0 = output_width / 2 (number of column pairs)
746
+ sll t4, a2, 2 // t4 = in_row_group_ctr * 4
747
+ lwx s5, t4(t1) // s5 = inptr0
748
+ lwx s6, t4(t2) // s6 = inptr1
749
+ lwx s7, t4(t3) // s7 = inptr2
750
+ lw t7, 0(a3) // t7 = outptr
751
+ blez t0, 2f // no full pair: only the odd-column code may run
752
+ addu t9, s6, t0 // t9 = end address
753
+ 1: // one cb/cr sample -> two output pixels per pass
754
+ lbu t2, 0(s6) // t2 = cb
755
+ lbu t0, 0(s7) // t0 = cr
756
+ lbu t1, 0(s5) // t1 = y
757
+ addiu t2, t2, -128 // t2 = cb - 128
758
+ addiu t0, t0, -128 // t0 = cr - 128
759
+ mult $ac1, s4, t2 // $ac1 = -FIX(0.34414) * cb
760
+ madd $ac1, s3, t0 // $ac1 += -FIX(0.71414) * cr
761
+ sll t0, t0, 15 // pre-scale for the fractional multiplies below
762
+ sll t2, t2, 15
763
+ mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
764
+ extr_r.w t5, $ac1, 16 // t5 = green offset, rounded
765
+ mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
766
+ addiu s7, s7, 1
767
+ addiu s6, s6, 1
768
+ addu t2, t1, t0 // t2 = y + cred
769
+ addu t3, t1, t5 // t3 = y + cgreen
770
+ addu t4, t1, t6 // t4 = y + cblue
771
+ addu t2, t8, t2 // addresses into the range_limit table
772
+ addu t3, t8, t3
773
+ addu t4, t8, t4
774
+ lbu t1, 1(s5) // t1 = y of the second pixel
775
+ lbu v0, 0(t2) // v0/v1/ra = range-limited R/G/B of the first pixel (ra is in the saved-register list)
776
+ lbu v1, 0(t3)
777
+ lbu ra, 0(t4)
778
+ addu t2, t1, t0 // y + cred
779
+ addu t3, t1, t5 // y + cgreen
780
+ addu t4, t1, t6 // y + cblue
781
+ addu t2, t8, t2
782
+ addu t3, t8, t3
783
+ addu t4, t8, t4
784
+ lbu t2, 0(t2) // t2/t3/t4 = range-limited R/G/B of the second pixel
785
+ lbu t3, 0(t3)
786
+ lbu t4, 0(t4)
787
+
788
+ STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
789
+
790
+ bne t9, s6, 1b // until the cb pointer reaches the end address
791
+ addiu s5, s5, 2 // inptr0 += 2
792
+ 2:
793
+ andi t0, a0, 1 // t0 = 1 if output_width is odd
794
+ beqz t0, 4f // even width: done
795
+ nop
796
+ 3: // last column of an odd-width row
797
+ lbu t2, 0(s6) // cb
798
+ lbu t0, 0(s7) // cr
799
+ lbu t1, 0(s5) // y
800
+ addiu t2, t2, -128 // (cb - 128)
801
+ addiu t0, t0, -128 // (cr - 128)
802
+ mul t3, s4, t2 // -FIX(0.34414) * cb
803
+ mul t4, s3, t0 // -FIX(0.71414) * cr
804
+ sll t0, t0, 15
805
+ sll t2, t2, 15
806
+ mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
807
+ mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
808
+ addu t3, t3, s0 // add the rounding term held in s0
809
+ addu t3, t4, t3
810
+ sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
811
+ addu t2, t1, t0 // y + cred
812
+ addu t3, t1, t5 // y + cgreen
813
+ addu t4, t1, t6 // y + cblue
814
+ addu t2, t8, t2 // addresses into the range_limit table
815
+ addu t3, t8, t3
816
+ addu t4, t8, t4
817
+ lbu t2, 0(t2)
818
+ lbu t3, 0(t3)
819
+ lbu t4, 0(t4)
820
+
821
+ STORE_H2V1_1_PIXEL t2, t3, t4, t7
822
+ 4:
823
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
824
+
825
+ j ra
826
+ nop
827
+
828
+ END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
829
+
830
+ .purgem STORE_H2V1_1_PIXEL
831
+ .purgem STORE_H2V1_2_PIXELS
832
+ .endm
833
+
834
+ /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
835
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
836
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
837
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
838
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
839
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
840
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
841
+
842
+
843
+ /*****************************************************************************/
844
+ /*
845
+ * jsimd_h2v2_fancy_upsample_dspr2
846
+ *
847
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
848
+ */
849
+ LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
850
+ /*
851
+ * a0 = cinfo->max_v_samp_factor
852
+ * a1 = downsampled_width
853
+ * a2 = input_data
854
+ * a3 = output_data_ptr
855
+ */
856
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
857
+
858
+ li s4, 0 // s4 = byte offset of the current output row pointer
859
+ lw s2, 0(a3) // s2 = *output_data_ptr
860
+ 0: // one input row -> two output rows
861
+ li t9, 2 // t9 = output rows left for this input row
862
+ lw s1, -4(a2) // s1 = inptr1 (first pass: the row pointer before the current one)
863
+
864
+ 1:
865
+ lw s0, 0(a2) // s0 = inptr0
866
+ lwx s3, s4(s2) // s3 = outptr = output row at byte offset s4
867
+ addiu s5, a1, -2 // s5 = downsampled_width - 2
868
+ srl t4, s5, 1
869
+ sll t4, t4, 1 // t4 = (downsampled_width - 2) rounded down to even
870
+ lbu t0, 0(s0)
871
+ lbu t1, 1(s0)
872
+ lbu t2, 0(s1)
873
+ lbu t3, 1(s1)
874
+ addiu s0, 2
875
+ addiu s1, 2
876
+ addu t8, s0, t4 // t8 = end address
877
+ andi s5, s5, 1 // s5 = residual
878
+ sll t4, t0, 1
879
+ sll t6, t1, 1
880
+ addu t0, t0, t4 // t0 = (*inptr0++) * 3
881
+ addu t1, t1, t6 // t1 = (*inptr0++) * 3
882
+ addu t7, t0, t2 // t7 = thiscolsum
883
+ addu t6, t1, t3 // t6 = nextcolsum
884
+ sll t0, t7, 2 // t0 = thiscolsum * 4
885
+ subu t1, t0, t7 // t1 = thiscolsum * 3
886
+ shra_r.w t0, t0, 4 // t0 = (thiscolsum * 4 + 8) >> 4
887
+ addiu t1, 7
888
+ addu t1, t1, t6
889
+ srl t1, t1, 4 // t1 = (thiscolsum * 3 + nextcolsum + 7) >> 4
890
+ sb t0, 0(s3)
891
+ sb t1, 1(s3)
892
+ beq t8, s0, 22f // skip the paired loop when t4 == 0, i.e. downsampled_width is 2 or 3
893
+ addiu s3, 2
894
+ 2: // two input columns -> four output samples per pass; t7 = last colsum, t6 = this colsum
895
+ lh t0, 0(s0) // t0 = A3|A2
896
+ lh t2, 0(s1) // t2 = B3|B2
897
+ addiu s0, 2
898
+ addiu s1, 2
899
+ preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
900
+ preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
901
+ shll.ph t1, t0, 1
902
+ sll t3, t6, 1
903
+ addu.ph t0, t1, t0 // t0 = A3*3|A2*3
904
+ addu t3, t3, t6 // t3 = this * 3
905
+ addu.ph t0, t0, t2 // t0 = next2|next1
906
+ addu t1, t3, t7 // t1 = this*3 + last
907
+ andi t7, t0, 0xFFFF // t7 = next1
908
+ sll t2, t7, 1
909
+ addu t2, t7, t2 // t2 = next1*3
910
+ addu t4, t2, t6 // t4 = next1*3 + this
911
+ srl t6, t0, 16 // t6 = next2
912
+ shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
913
+ addu t0, t3, t7
914
+ addiu t0, 7
915
+ srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
916
+ shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
917
+ addu t2, t2, t6
918
+ addiu t2, 7
919
+ srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
920
+ sb t1, 0(s3)
921
+ sb t0, 1(s3)
922
+ sb t4, 2(s3)
923
+ sb t2, 3(s3)
924
+ bne t8, s0, 2b // until inptr0 reaches the end address
925
+ addiu s3, 4
926
+ 22:
927
+ beqz s5, 4f // no residual column
928
+ addu t8, s0, s5 // t8 = end address of the residual part
929
+ 3: // residual column, one per pass
930
+ lbu t0, 0(s0)
931
+ lbu t2, 0(s1)
932
+ addiu s0, 1
933
+ addiu s1, 1
934
+ sll t3, t6, 1
935
+ sll t1, t0, 1
936
+ addu t1, t0, t1 // t1 = inptr0 * 3
937
+ addu t3, t3, t6 // t3 = thiscolsum * 3
938
+ addu t5, t1, t2 // t5 = nextcolsum
939
+ addu t1, t3, t7 // t1 = thiscolsum * 3 + lastcolsum
940
+ shra_r.w t1, t1, 4
941
+ addu t0, t3, t5
942
+ addiu t0, 7
943
+ srl t0, t0, 4 // t0 = (thiscolsum * 3 + nextcolsum + 7) >> 4
944
+ sb t1, 0(s3)
945
+ sb t0, 1(s3)
946
+ addiu s3, 2
947
+ move t7, t6 // lastcolsum = thiscolsum
948
+ bne t8, s0, 3b
949
+ move t6, t5 // thiscolsum = nextcolsum
950
+ 4: // last column: t6 = thiscolsum, t7 = lastcolsum
951
+ sll t0, t6, 2 // t0 = thiscolsum * 4
952
+ subu t1, t0, t6 // t1 = thiscolsum * 3
953
+ addu t1, t1, t7
954
+ addiu s4, 4 // advance to the next output row pointer
955
+ shra_r.w t1, t1, 4 // t1 = (thiscolsum * 3 + lastcolsum + 8) >> 4
956
+ addiu t0, 7
957
+ srl t0, t0, 4 // t0 = (thiscolsum * 4 + 7) >> 4
958
+ sb t1, 0(s3)
959
+ sb t0, 1(s3)
960
+ addiu t9, -1
961
+ addiu s3, 2
962
+ bnez t9, 1b // second pass for the same input row
963
+ lw s1, 4(a2) // s1 = inptr1 for the second pass: the row pointer after the current one
964
+ srl t0, s4, 2 // t0 = number of output rows produced so far
965
+ subu t0, a0, t0
966
+ bgtz t0, 0b // loop while fewer than max_v_samp_factor rows are done
967
+ addiu a2, 4 // ++input_data
968
+
969
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
970
+
971
+ j ra
972
+ nop
973
+ END(jsimd_h2v2_fancy_upsample_dspr2)
974
+
975
+
976
+ /*****************************************************************************/
977
+ LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
978
+ /* For every row, each input sample yields two outputs: (3*cur + prev + 1) >> 2 and (3*cur + next + 2) >> 2; the first and last output of a row copy the edge sample.
979
+ * a0 = cinfo->max_v_samp_factor
980
+ * a1 = downsampled_width
981
+ * a2 = input_data
982
+ * a3 = output_data_ptr
983
+ */
984
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
985
+
986
+ .set at
987
+
988
+ beqz a0, 3f // no rows to process
989
+ sll t0, a0, 2 // (delay slot) t0 = max_v_samp_factor * 4
990
+ lw s1, 0(a3) // s1 = *output_data_ptr (array of output row pointers)
991
+ li s3, 0x10001 // s3 = 1 in each halfword (the "+1" rounding bias)
992
+ addu s0, s1, t0 // s0 = end of the output row-pointer array
993
+ 0:
994
+ addiu t8, a1, -2 // t8 = downsampled_width - 2 (first and last columns are special-cased)
995
+ srl t9, t8, 2 // t9 = number of 4-sample iterations of loop 1
996
+ lw t7, 0(a2) // t7 = inptr
997
+ lw s2, 0(s1) // s2 = outptr
998
+ lbu t0, 0(t7) // t0 = inptr[0]
999
+ lbu t1, 1(t7) // t1 = inptr[1]
1000
+ sll t2, t0, 1
1001
+ addu t2, t2, t0 // t2 = invalue*3
1002
+ addu t2, t2, t1
1003
+ shra_r.w t2, t2, 2 // t2 = (inptr[0]*3 + inptr[1] + 2) >> 2
1004
+ sb t0, 0(s2) // first output is the first input sample unchanged
1005
+ sb t2, 1(s2)
1006
+ beqz t9, 11f
1007
+ addiu s2, 2 // (delay slot) outptr += 2
1008
+ 1:
1009
+ ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
1010
+ ulw t1, 1(t7)
1011
+ ulh t2, 4(t7) // t2 = |0|0|P5|P4|
1012
+ preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
1013
+ preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
1014
+ preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
1015
+ preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
1016
+ preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
1017
+ shll.ph t5, t4, 1
1018
+ shll.ph t6, t1, 1
1019
+ addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
1020
+ addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
1021
+ addu.ph t4, t3, s3 // t4 = |P3+1|P2+1|
1022
+ addu.ph t0, t0, s3 // t0 = |P1+1|P0+1|
1023
+ addu.ph t4, t4, t5 // t4 = |P4*3+P3+1|P3*3+P2+1|
1024
+ addu.ph t0, t0, t6 // t0 = |P2*3+P1+1|P1*3+P0+1|
1025
+ shrl.ph t4, t4, 2 // t4 = |0|P3|0|P2|
1026
+ shrl.ph t0, t0, 2 // t0 = |0|P1|0|P0|
1027
+ addu.ph t2, t2, t5 // t2 = |P4*3+P5|P3*3+P4|
1028
+ addu.ph t3, t3, t6 // t3 = |P2*3+P3|P1*3+P2|
1029
+ shra_r.ph t2, t2, 2 // t2 = |0|P5|0|P4|
1030
+ shra_r.ph t3, t3, 2 // t3 = |0|P3|0|P2|
1031
+ shll.ph t2, t2, 8 // move the "+2" results into the odd byte lanes
1032
+ shll.ph t3, t3, 8
1033
+ or t2, t4, t2 // interleave "+1" and "+2" results
1034
+ or t3, t3, t0
1035
+ addiu t9, -1
1036
+ usw t3, 0(s2)
1037
+ usw t2, 4(s2)
1038
+ addiu s2, 8 // eight output bytes written
1039
+ bgtz t9, 1b
1040
+ addiu t7, 4 // (delay slot) four input samples consumed
1041
+ 11:
1042
+ andi t8, 3 // t8 = remaining middle samples ((width - 2) & 3)
1043
+ beqz t8, 22f
1044
+ addiu t7, 1 // (delay slot) t7 -> next sample to process
1045
+
1046
+ 2:
1047
+ lbu t0, 0(t7) // t0 = invalue (current sample)
1048
+ addiu t7, 1
1049
+ sll t1, t0, 1
1050
+ addu t2, t0, t1 // t2 = invalue * 3
1051
+ lbu t3, -2(t7) // t3 = previous sample (t7 was already advanced)
1052
+ lbu t4, 0(t7) // t4 = next sample
1053
+ addiu t3, 1
1054
+ addiu t4, 2
1055
+ addu t3, t3, t2
1056
+ addu t4, t4, t2
1057
+ srl t3, 2 // t3 = (invalue*3 + prev + 1) >> 2
1058
+ srl t4, 2 // t4 = (invalue*3 + next + 2) >> 2
1059
+ sb t3, 0(s2)
1060
+ sb t4, 1(s2)
1061
+ addiu t8, -1
1062
+ bgtz t8, 2b
1063
+ addiu s2, 2 // (delay slot) outptr += 2
1064
+
1065
+ 22:
1066
+ lbu t0, 0(t7) // t0 = last input sample of the row
1067
+ lbu t2, -1(t7) // t2 = sample before it
1068
+ sll t1, t0, 1
1069
+ addu t1, t1, t0 // t1 = invalue * 3
1070
+ addu t1, t1, t2
1071
+ addiu t1, 1
1072
+ srl t1, t1, 2 // t1 = (invalue*3 + prev + 1) >> 2
1073
+ sb t1, 0(s2)
1074
+ sb t0, 1(s2) // last output is the last input sample unchanged
1075
+ addiu s1, 4 // next output row pointer
1076
+ bne s1, s0, 0b
1077
+ addiu a2, 4 // (delay slot) next input row pointer
1078
+ 3:
1079
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1080
+
1081
+ j ra
1082
+ nop
1083
+ END(jsimd_h2v1_fancy_upsample_dspr2)
1084
+
1085
+
1086
+ /*****************************************************************************/
1087
+ LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
1088
+ /* For each of v_samp_factor rows, averages horizontal pairs of input samples into one output sample with an alternating 0/1 bias, then pads the output row with a value derived from one edge sample.
1089
+ * a0 = cinfo->image_width
1090
+ * a1 = cinfo->max_v_samp_factor
1091
+ * a2 = compptr->v_samp_factor
1092
+ * a3 = compptr->width_in_blocks
1093
+ * 16(sp) = input_data
1094
+ * 20(sp) = output_data
1095
+ */
1096
+ .set at
1097
+
1098
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4
1099
+
1100
+ beqz a2, 7f // no rows to process
1101
+ lw s1, 44(sp) // s1 = output_data
1102
+ lw s0, 40(sp) // s0 = input_data
1103
+ srl s2, a0, 2 // s2 = image_width / 4
1104
+ andi t9, a0, 2 // t9 = image_width & 2 (non-zero -> one extra padding byte at label 5)
1105
+ srl t7, t9, 1
1106
+ addu s2, t7, s2
1107
+ sll t0, a3, 3 // t0 = width_in_blocks * 8 (DCTSIZE)
1108
+ srl t7, t0, 1
1109
+ subu s2, t7, s2 // s2 = number of 2-byte padding stores done in loop 4
1110
+ 0:
1111
+ andi t6, a0, 1 // t6 = temp_index
1112
+ addiu t6, -1 // t6 = 0 for odd image_width, -1 for even
1113
+ lw t4, 0(s1) // t4 = outptr
1114
+ lw t5, 0(s0) // t5 = inptr0
1115
+ li s3, 0 // s3 = bias
1116
+ srl t7, a0, 1 // t7 = image_width1
1117
+ srl s4, t7, 2 // s4 = number of 4-output iterations of loop 1
1118
+ andi t8, t7, 3 // t8 = leftover outputs handled by loop 2
1119
+ 1:
1120
+ ulhu t0, 0(t5) // each halfword holds one horizontal pair of samples
1121
+ ulhu t1, 2(t5)
1122
+ ulhu t2, 4(t5)
1123
+ ulhu t3, 6(t5)
1124
+ raddu.w.qb t0, t0 // sum of the two samples in the pair
1125
+ raddu.w.qb t1, t1
1126
+ raddu.w.qb t2, t2
1127
+ raddu.w.qb t3, t3
1128
+ shra.ph t0, t0, 1 // (sum + 0) >> 1
1129
+ shra_r.ph t1, t1, 1 // (sum + 1) >> 1
1130
+ shra.ph t2, t2, 1 // (sum + 0) >> 1
1131
+ shra_r.ph t3, t3, 1 // (sum + 1) >> 1
1132
+ sb t0, 0(t4)
1133
+ sb t1, 1(t4)
1134
+ sb t2, 2(t4)
1135
+ sb t3, 3(t4)
1136
+ addiu s4, -1
1137
+ addiu t4, 4
1138
+ bgtz s4, 1b // bottom-tested: the body runs at least once
1139
+ addiu t5, 8 // (delay slot) eight input samples consumed
1140
+ beqz t8, 3f
1141
+ addu s4, t4, t8 // (delay slot) s4 = end address for loop 2
1142
+ 2:
1143
+ ulhu t0, 0(t5)
1144
+ raddu.w.qb t0, t0
1145
+ addqh.w t0, t0, s3 // t0 = (sum + bias) >> 1
1146
+ xori s3, s3, 1 // toggle bias between 0 and 1
1147
+ sb t0, 0(t4)
1148
+ addiu t4, 1
1149
+ bne t4, s4, 2b
1150
+ addiu t5, 2 // (delay slot) two input samples consumed
1151
+ 3:
1152
+ lbux t1, t6(t5) // t1 = byte at inptr + temp_index (presumably the edge sample replicated into the padding)
1153
+ sll t1, 1 // doubled, so the halving below returns the sample (plus bias rounding)
1154
+ addqh.w t2, t1, s3 // t2 = pixval1
1155
+ xori s3, s3, 1
1156
+ addqh.w t3, t1, s3 // t3 = pixval2
1157
+ blez s2, 5f // no halfword padding to write
1158
+ append t3, t2, 8 // (delay slot) t3 = (pixval2 << 8) | pixval1
1159
+ addu t5, t4, s2 // t5 = loop_end2
1160
+ 4:
1161
+ ush t3, 0(t4) // write two padding bytes
1162
+ addiu s2, -1 // NOTE(review): s2 is set once before label 0 and counted down here, so it is <= 0 for every row after the first - confirm intended
1163
+ bgtz s2, 4b
1164
+ addiu t4, 2
1165
+ 5:
1166
+ beqz t9, 6f
1167
+ nop
1168
+ sb t2, 0(t4) // one extra padding byte when (image_width & 2) != 0
1169
+ 6:
1170
+ addiu s1, 4 // next output row pointer
1171
+ addiu a2, -1
1172
+ bnez a2, 0b
1173
+ addiu s0, 4 // (delay slot) next input row pointer
1174
+ 7:
1175
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4
1176
+
1177
+ j ra
1178
+ nop
1179
+ END(jsimd_h2v1_downsample_dspr2)
1180
+
1181
+
1182
+ /*****************************************************************************/
1183
+ LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
1184
+ /* For each of v_samp_factor output rows, averages 2x2 blocks taken from two input rows into one output sample with an alternating 1/2 bias, then pads the output row with a value derived from one edge sample of each input row.
1185
+ * a0 = cinfo->image_width
1186
+ * a1 = cinfo->max_v_samp_factor
1187
+ * a2 = compptr->v_samp_factor
1188
+ * a3 = compptr->width_in_blocks
1189
+ * 16(sp) = input_data
1190
+ * 20(sp) = output_data
1191
+ */
1192
+ .set at
1193
+
1194
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1195
+
1196
+ beqz a2, 8f // no rows to process
1197
+ lw s1, 52(sp) // s1 = output_data
1198
+ lw s0, 48(sp) // s0 = input_data
1199
+
1200
+ andi t6, a0, 1 // t6 = temp_index
1201
+ addiu t6, -1 // t6 = 0 for odd image_width, -1 for even
1202
+ srl t7, a0, 1 // t7 = image_width1
1203
+ srl s4, t7, 2 // s4 = number of 4-output iterations of loop 2
1204
+ andi t8, t7, 3 // t8 = leftover outputs handled by loop 3
1205
+ andi t9, a0, 2 // t9 = image_width & 2 (non-zero -> one extra padding byte at label 6)
1206
+ srl s2, a0, 2
1207
+ srl t7, t9, 1
1208
+ addu s2, t7, s2
1209
+ sll t0, a3, 3 // t0 = width_in_blocks * 8 (DCTSIZE)
1210
+ srl t7, t0, 1
1211
+ subu s2, t7, s2 // s2 = number of 2-byte padding stores done in loop 5
1212
+ 0:
1213
+ lw t4, 0(s1) // t4 = outptr
1214
+ lw t5, 0(s0) // t5 = inptr0
1215
+ lw s7, 4(s0) // s7 = inptr1
1216
+ li s6, 1 // s6 = bias
1217
+ 2:
1218
+ ulw t0, 0(t5) // t0 = |P3|P2|P1|P0|
1219
+ ulw t1, 0(s7) // t1 = |Q3|Q2|Q1|Q0|
1220
+ ulw t2, 4(t5)
1221
+ ulw t3, 4(s7)
1222
+ precrq.ph.w t7, t0, t1 // t7 = |P3|P2|Q3|Q2|
1223
+ ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0|
1224
+ raddu.w.qb t1, t7 // t1 = P2 + P3 + Q2 + Q3
1225
+ raddu.w.qb t0, t0 // t0 = P0 + P1 + Q0 + Q1
1226
+ shra_r.w t1, t1, 2 // (sum + 2) >> 2
1227
+ addiu t0, 1
1228
+ srl t0, 2 // (sum + 1) >> 2
1229
+ precrq.ph.w t7, t2, t3
1230
+ ins t2, t3, 16, 16
1231
+ raddu.w.qb t7, t7
1232
+ raddu.w.qb t2, t2
1233
+ shra_r.w t7, t7, 2 // (sum + 2) >> 2
1234
+ addiu t2, 1
1235
+ srl t2, 2 // (sum + 1) >> 2
1236
+ sb t0, 0(t4)
1237
+ sb t1, 1(t4)
1238
+ sb t2, 2(t4)
1239
+ sb t7, 3(t4)
1240
+ addiu t4, 4
1241
+ addiu t5, 8
1242
+ addiu s4, s4, -1 // NOTE(review): s4 (and t8, s2 below) are set once before label 0 and modified inside the row loop without being reloaded - confirm behaviour for rows after the first
1243
+ bgtz s4, 2b // bottom-tested: the body runs at least once
1244
+ addiu s7, 8 // (delay slot)
1245
+ beqz t8, 4f
1246
+ addu t8, t4, t8 // (delay slot) t8 = end address for loop 3
1247
+ 3:
1248
+ ulhu t0, 0(t5)
1249
+ ulhu t1, 0(s7)
1250
+ ins t0, t1, 16, 16 // t0 = |Q1|Q0|P1|P0|
1251
+ raddu.w.qb t0, t0
1252
+ addu t0, t0, s6
1253
+ srl t0, 2 // t0 = (sum + bias) >> 2
1254
+ xori s6, s6, 3 // toggle bias between 1 and 2
1255
+ sb t0, 0(t4)
1256
+ addiu t5, 2
1257
+ addiu t4, 1
1258
+ bne t8, t4, 3b
1259
+ addiu s7, 2 // (delay slot)
1260
+ 4:
1261
+ lbux t1, t6(t5) // t1 = byte at inptr0 + temp_index
1262
+ sll t1, 1
1263
+ lbux t0, t6(s7) // t0 = byte at inptr1 + temp_index
1264
+ sll t0, 1
1265
+ addu t1, t1, t0 // t1 = 2 * (edge0 + edge1)
1266
+ addu t3, t1, s6
1267
+ srl t0, t3, 2 // t0 = pixval1
1268
+ xori s6, s6, 3
1269
+ addu t2, t1, s6
1270
+ srl t1, t2, 2 // t1 = pixval2
1271
+ blez s2, 6f // no halfword padding to write
1272
+ append t1, t0, 8 // (delay slot) t1 = (pixval2 << 8) | pixval1
1273
+ 5:
1274
+ ush t1, 0(t4) // write two padding bytes
1275
+ addiu s2, -1
1276
+ bgtz s2, 5b
1277
+ addiu t4, 2
1278
+ 6:
1279
+ beqz t9, 7f
1280
+ nop
1281
+ sb t0, 0(t4) // one extra padding byte when (image_width & 2) != 0
1282
+ 7:
1283
+ addiu s1, 4 // next output row pointer
1284
+ addiu a2, -1
1285
+ bnez a2, 0b
1286
+ addiu s0, 8 // (delay slot) skip the two input rows just consumed
1287
+ 8:
1288
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1289
+
1290
+ j ra
1291
+ nop
1292
+ END(jsimd_h2v2_downsample_dspr2)
1293
+
1294
+
1295
+ /*****************************************************************************/
1296
+ LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
1297
+ /* Pads the right edge of the input rows, then writes one output sample per 2x2 input block as a weighted sum of the block and its neighbouring samples, scaled by factors derived from smoothing_factor and rounded down by 16 bits.
1298
+ * a0 = input_data
1299
+ * a1 = output_data
1300
+ * a2 = compptr->v_samp_factor
1301
+ * a3 = cinfo->max_v_samp_factor
1302
+ * 16(sp) = cinfo->smoothing_factor
1303
+ * 20(sp) = compptr->width_in_blocks
1304
+ * 24(sp) = cinfo->image_width
1305
+ */
1306
+ .set at
1307
+
1308
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1309
+
1310
+ lw s7, 52(sp) // compptr->width_in_blocks
1311
+ lw s0, 56(sp) // cinfo->image_width
1312
+ lw s6, 48(sp) // cinfo->smoothing_factor
1313
+ sll s7, 3 // output_cols = width_in_blocks * DCTSIZE
1314
+ sll v0, s7, 1 // v0 = output_cols * 2
1315
+ subu v0, v0, s0 // v0 = output_cols * 2 - image_width (columns to pad)
1316
+ blez v0, 2f // nothing to pad
1317
+ move v1, zero // (delay slot) v1 = row counter for the padding loop
1318
+ addiu t0, a3, 2 // t0 = cinfo->max_v_samp_factor + 2
1319
+ 0:
1320
+ addiu t1, a0, -4 // t1 = &input_data[-1]
1321
+ sll t2, v1, 2
1322
+ lwx t1, t2(t1) // t1 = input_data[v1 - 1]
1323
+ move t3, v0 // t3 = number of bytes to pad
1324
+ addu t1, t1, s0 // t1 = row + image_width
1325
+ lbu t2, -1(t1) // t2 = last sample of the row
1326
+ 1:
1327
+ addiu t3, t3, -1
1328
+ sb t2, 0(t1) // replicate it to the right
1329
+ bgtz t3, 1b
1330
+ addiu t1, t1, 1
1331
+ addiu v1, v1, 1
1332
+ bne v1, t0, 0b // repeat for max_v_samp_factor + 2 rows
1333
+ nop
1334
+ 2:
1335
+ li v0, 80
1336
+ mul v0, s6, v0 // v0 = smoothing_factor * 80
1337
+ li v1, 16384
1338
+ move t4, zero // t4 = outrow
1339
+ move t5, zero // t5 = inrow
1340
+ subu t6, v1, v0 // t6 = 16384 - smoothing_factor * 80
1341
+ sll t7, s6, 4 // t7 = smoothing_factor * 16
1342
+ 3:
1343
+ /* Special case for first column: pretend column -1 is same as column 0 */
1344
+ sll v0, t4, 2
1345
+ lwx t8, v0(a1) // outptr = output_data[outrow]
1346
+ sll v1, t5, 2
1347
+ addiu t9, v1, 4
1348
+ addiu s0, v1, -4
1349
+ addiu s1, v1, 8
1350
+ lwx s2, v1(a0) // inptr0 = input_data[inrow]
1351
+ lwx t9, t9(a0) // inptr1 = input_data[inrow+1]
1352
+ lwx s0, s0(a0) // above_ptr = input_data[inrow-1]
1353
+ lwx s1, s1(a0) // below_ptr = input_data[inrow+2]
1354
+ lh v0, 0(s2)
1355
+ lh v1, 0(t9)
1356
+ lh t0, 0(s0)
1357
+ lh t1, 0(s1)
1358
+ ins v0, v1, 16, 16 // v0 = the 2x2 block (two bytes from each of inptr0/inptr1)
1359
+ ins t0, t1, 16, 16 // t0 = two bytes above and two bytes below the block
1360
+ raddu.w.qb t2, v0 // t2 = sum of the 2x2 block
1361
+ raddu.w.qb s3, t0 // s3 = sum of the samples directly above and below
1362
+ lbu v0, 0(s2)
1363
+ lbu v1, 2(s2)
1364
+ lbu t0, 0(t9)
1365
+ lbu t1, 2(t9)
1366
+ addu v0, v0, v1
1367
+ mult $ac1, t2, t6 // ac1 = block sum * t6
1368
+ addu t0, t0, t1
1369
+ lbu t2, 2(s0)
1370
+ addu t0, t0, v0
1371
+ lbu t3, 2(s1)
1372
+ addu s3, t0, s3
1373
+ lbu v0, 0(s0)
1374
+ lbu t0, 0(s1)
1375
+ sll s3, s3, 1 // edge neighbours are weighted twice
1376
+ addu v0, v0, t2
1377
+ addu t0, t0, t3
1378
+ addu t0, t0, v0
1379
+ addu s3, t0, s3 // s3 = 2 * edge neighbours + corner neighbours
1380
+ madd $ac1, s3, t7 // ac1 += neighbour sum * t7
1381
+ extr_r.w v0, $ac1, 16 // v0 = ac1 >> 16 with rounding
1382
+ addiu t8, t8, 1
1383
+ addiu s2, s2, 2
1384
+ addiu t9, t9, 2
1385
+ addiu s0, s0, 2
1386
+ addiu s1, s1, 2
1387
+ sb v0, -1(t8)
1388
+ addiu s4, s7, -2
1389
+ and s4, s4, 3 // s4 = (output_cols - 2) & 3 = columns handled one at a time by loop 4
1390
+ addu s5, s4, t8 // end address
1391
+ 4:
1392
+ lh v0, 0(s2)
1393
+ lh v1, 0(t9)
1394
+ lh t0, 0(s0)
1395
+ lh t1, 0(s1)
1396
+ ins v0, v1, 16, 16
1397
+ ins t0, t1, 16, 16
1398
+ raddu.w.qb t2, v0
1399
+ raddu.w.qb s3, t0
1400
+ lbu v0, -1(s2)
1401
+ lbu v1, 2(s2)
1402
+ lbu t0, -1(t9)
1403
+ lbu t1, 2(t9)
1404
+ addu v0, v0, v1
1405
+ mult $ac1, t2, t6
1406
+ addu t0, t0, t1
1407
+ lbu t2, 2(s0)
1408
+ addu t0, t0, v0
1409
+ lbu t3, 2(s1)
1410
+ addu s3, t0, s3
1411
+ lbu v0, -1(s0)
1412
+ lbu t0, -1(s1)
1413
+ sll s3, s3, 1
1414
+ addu v0, v0, t2
1415
+ addu t0, t0, t3
1416
+ addu t0, t0, v0
1417
+ addu s3, t0, s3
1418
+ madd $ac1, s3, t7
1419
+ extr_r.w t2, $ac1, 16
1420
+ addiu t8, t8, 1
1421
+ addiu s2, s2, 2
1422
+ addiu t9, t9, 2
1423
+ addiu s0, s0, 2
1424
+ sb t2, -1(t8)
1425
+ bne s5, t8, 4b
1426
+ addiu s1, s1, 2 // (delay slot)
1427
+ addiu s5, s7, -2
1428
+ subu s5, s5, s4 // remaining middle columns (processed four per iteration by loop 5)
1429
+ addu s5, s5, t8 // end address
1430
+ 5:
1431
+ lh v0, 0(s2)
1432
+ lh v1, 0(t9)
1433
+ lh t0, 0(s0)
1434
+ lh t1, 0(s1)
1435
+ ins v0, v1, 16, 16
1436
+ ins t0, t1, 16, 16
1437
+ raddu.w.qb t2, v0
1438
+ raddu.w.qb s3, t0
1439
+ lbu v0, -1(s2)
1440
+ lbu v1, 2(s2)
1441
+ lbu t0, -1(t9)
1442
+ lbu t1, 2(t9)
1443
+ addu v0, v0, v1
1444
+ mult $ac1, t2, t6
1445
+ addu t0, t0, t1
1446
+ lbu t2, 2(s0)
1447
+ addu t0, t0, v0
1448
+ lbu t3, 2(s1)
1449
+ addu s3, t0, s3
1450
+ lbu v0, -1(s0)
1451
+ lbu t0, -1(s1)
1452
+ sll s3, s3, 1
1453
+ addu v0, v0, t2
1454
+ addu t0, t0, t3
1455
+ lh v1, 2(t9) // loads for the second output column start here, interleaved with the first
1456
+ addu t0, t0, v0
1457
+ lh v0, 2(s2)
1458
+ addu s3, t0, s3
1459
+ lh t0, 2(s0)
1460
+ lh t1, 2(s1)
1461
+ madd $ac1, s3, t7
1462
+ extr_r.w t2, $ac1, 16
1463
+ ins t0, t1, 16, 16
1464
+ ins v0, v1, 16, 16
1465
+ raddu.w.qb s3, t0
1466
+ lbu v1, 4(s2)
1467
+ lbu t0, 1(t9)
1468
+ lbu t1, 4(t9)
1469
+ sb t2, 0(t8) // first of four outputs
1470
+ raddu.w.qb t3, v0
1471
+ lbu v0, 1(s2)
1472
+ addu t0, t0, t1
1473
+ mult $ac1, t3, t6
1474
+ addu v0, v0, v1
1475
+ lbu t2, 4(s0)
1476
+ addu t0, t0, v0
1477
+ lbu v0, 1(s0)
1478
+ addu s3, t0, s3
1479
+ lbu t0, 1(s1)
1480
+ lbu t3, 4(s1)
1481
+ addu v0, v0, t2
1482
+ sll s3, s3, 1
1483
+ addu t0, t0, t3
1484
+ lh v1, 4(t9)
1485
+ addu t0, t0, v0
1486
+ lh v0, 4(s2)
1487
+ addu s3, t0, s3
1488
+ lh t0, 4(s0)
1489
+ lh t1, 4(s1)
1490
+ madd $ac1, s3, t7
1491
+ extr_r.w t2, $ac1, 16
1492
+ ins t0, t1, 16, 16
1493
+ ins v0, v1, 16, 16
1494
+ raddu.w.qb s3, t0
1495
+ lbu v1, 6(s2)
1496
+ lbu t0, 3(t9)
1497
+ lbu t1, 6(t9)
1498
+ sb t2, 1(t8) // second of four outputs
1499
+ raddu.w.qb t3, v0
1500
+ lbu v0, 3(s2)
1501
+ addu t0, t0, t1
1502
+ mult $ac1, t3, t6
1503
+ addu v0, v0, v1
1504
+ lbu t2, 6(s0)
1505
+ addu t0, t0, v0
1506
+ lbu v0, 3(s0)
1507
+ addu s3, t0, s3
1508
+ lbu t0, 3(s1)
1509
+ lbu t3, 6(s1)
1510
+ addu v0, v0, t2
1511
+ sll s3, s3, 1
1512
+ addu t0, t0, t3
1513
+ lh v1, 6(t9)
1514
+ addu t0, t0, v0
1515
+ lh v0, 6(s2)
1516
+ addu s3, t0, s3
1517
+ lh t0, 6(s0)
1518
+ lh t1, 6(s1)
1519
+ madd $ac1, s3, t7
1520
+ extr_r.w t3, $ac1, 16
1521
+ ins t0, t1, 16, 16
1522
+ ins v0, v1, 16, 16
1523
+ raddu.w.qb s3, t0
1524
+ lbu v1, 8(s2)
1525
+ lbu t0, 5(t9)
1526
+ lbu t1, 8(t9)
1527
+ sb t3, 2(t8) // third of four outputs
1528
+ raddu.w.qb t2, v0
1529
+ lbu v0, 5(s2)
1530
+ addu t0, t0, t1
1531
+ mult $ac1, t2, t6
1532
+ addu v0, v0, v1
1533
+ lbu t2, 8(s0)
1534
+ addu t0, t0, v0
1535
+ lbu v0, 5(s0)
1536
+ addu s3, t0, s3
1537
+ lbu t0, 5(s1)
1538
+ lbu t3, 8(s1)
1539
+ addu v0, v0, t2
1540
+ sll s3, s3, 1
1541
+ addu t0, t0, t3
1542
+ addiu t8, t8, 4
1543
+ addu t0, t0, v0
1544
+ addiu s2, s2, 8
1545
+ addu s3, t0, s3
1546
+ addiu t9, t9, 8
1547
+ madd $ac1, s3, t7
1548
+ extr_r.w t1, $ac1, 16
1549
+ addiu s0, s0, 8
1550
+ addiu s1, s1, 8
1551
+ bne s5, t8, 5b
1552
+ sb t1, -1(t8) // (delay slot) fourth of four outputs
1553
+ /* Special case for last column */
1554
+ lh v0, 0(s2)
1555
+ lh v1, 0(t9)
1556
+ lh t0, 0(s0)
1557
+ lh t1, 0(s1)
1558
+ ins v0, v1, 16, 16
1559
+ ins t0, t1, 16, 16
1560
+ raddu.w.qb t2, v0
1561
+ raddu.w.qb s3, t0
1562
+ lbu v0, -1(s2)
1563
+ lbu v1, 1(s2) // right neighbour is read at offset 1 (inside the block) instead of 2
1564
+ lbu t0, -1(t9)
1565
+ lbu t1, 1(t9)
1566
+ addu v0, v0, v1
1567
+ mult $ac1, t2, t6
1568
+ addu t0, t0, t1
1569
+ lbu t2, 1(s0)
1570
+ addu t0, t0, v0
1571
+ lbu t3, 1(s1)
1572
+ addu s3, t0, s3
1573
+ lbu v0, -1(s0)
1574
+ lbu t0, -1(s1)
1575
+ sll s3, s3, 1
1576
+ addu v0, v0, t2
1577
+ addu t0, t0, t3
1578
+ addu t0, t0, v0
1579
+ addu s3, t0, s3
1580
+ madd $ac1, s3, t7
1581
+ extr_r.w t0, $ac1, 16
1582
+ addiu t5, t5, 2 // inrow += 2
1583
+ sb t0, 0(t8)
1584
+ addiu t4, t4, 1 // outrow++
1585
+ bne t4, a2, 3b // loop until outrow == v_samp_factor
1586
+ addiu t5, t5, 2 // (delay slot) NOTE(review): t5 was already incremented by 2 above, so inrow advances by 4 per output row - confirm intended
1587
+
1588
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1589
+
1590
+ j ra
1591
+ nop
1592
+
1593
+ END(jsimd_h2v2_smooth_downsample_dspr2)
1594
+
1595
+
1596
+ /*****************************************************************************/
1597
+ LEAF_DSPR2(jsimd_int_upsample_dspr2)
1598
+ /* Replicates every input sample h_expand times along a row, then copies the generated row into the following row(s) v_expand - 1 times.
1599
+ * a0 = upsample->h_expand[compptr->component_index]
1600
+ * a1 = upsample->v_expand[compptr->component_index]
1601
+ * a2 = input_data
1602
+ * a3 = output_data_ptr
1603
+ * 16(sp) = cinfo->output_width
1604
+ * 20(sp) = cinfo->max_v_samp_factor
1605
+ */
1606
+ .set at
1607
+
1608
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
1609
+
1610
+ lw s0, 0(a3) // s0 = output_data
1611
+ lw s1, 32(sp) // s1 = cinfo->output_width
1612
+ lw s2, 36(sp) // s2 = cinfo->max_v_samp_factor
1613
+ li t6, 0 // t6 = inrow
1614
+ beqz s2, 10f // no rows to process
1615
+ li s3, 0 // s3 = outrow
1616
+ 0:
1617
+ addu t0, a2, t6 // NOTE(review): t6 (inrow) and s3 (outrow) are added to the row-pointer arrays as raw byte offsets, not scaled by the 4-byte pointer size - confirm behaviour when this outer loop runs more than once
1618
+ addu t7, s0, s3
1619
+ lw t3, 0(t0) // t3 = inptr
1620
+ lw t8, 0(t7) // t8 = outptr
1621
+ beqz s1, 4f // zero output width: skip the horizontal expansion
1622
+ addu t5, t8, s1 // t5 = outend
1623
+ 1:
1624
+ lb t2, 0(t3) // t2 = invalue = *inptr++
1625
+ addiu t3, 1
1626
+ beqz a0, 3f
1627
+ move t0, a0 // t0 = h_expand
1628
+ 2:
1629
+ sb t2, 0(t8) // store invalue h_expand times
1630
+ addiu t0, -1
1631
+ bgtz t0, 2b
1632
+ addiu t8, 1 // (delay slot) outptr++
1633
+ 3:
1634
+ bgt t5, t8, 1b // while (outptr < outend)
1635
+ nop
1636
+ 4:
1637
+ addiu t9, a1, -1 // t9 = v_expand - 1
1638
+ blez t9, 9f // no vertical replication needed
1639
+ nop
1640
+ 5:
1641
+ lw t3, 0(s0) // t3 = source row pointer
1642
+ lw t4, 4(s0) // t4 = destination row pointer (next entry)
1643
+ subu t0, s1, 0xF // t0 = output_width - 15
1644
+ blez t0, 7f // fewer than 16 bytes: byte loop only
1645
+ addu t5, t3, s1 // t5 = end address
1646
+ andi t7, s1, 0xF // t7 = residual
1647
+ subu t8, t5, t7 // t8 = end address of the 16-byte copy loop
1648
+ 6:
1649
+ ulw t0, 0(t3) // copy 16 bytes per iteration
1650
+ ulw t1, 4(t3)
1651
+ ulw t2, 8(t3)
1652
+ usw t0, 0(t4)
1653
+ ulw t0, 12(t3)
1654
+ usw t1, 4(t4)
1655
+ usw t2, 8(t4)
1656
+ usw t0, 12(t4)
1657
+ addiu t3, 16
1658
+ bne t3, t8, 6b
1659
+ addiu t4, 16
1660
+ beqz t7, 8f // no residual bytes
1661
+ nop
1662
+ 7:
1663
+ lbu t0, 0(t3) // copy the residual bytes one at a time
1664
+ sb t0, 0(t4)
1665
+ addiu t3, 1
1666
+ bne t3, t5, 7b
1667
+ addiu t4, 1
1668
+ 8:
1669
+ addiu t9, -1
1670
+ bgtz t9, 5b
1671
+ addiu s0, 8 // (delay slot) NOTE(review): s0 advances by two row pointers per copied row, and it is also the base used at label 0 - confirm for v_expand > 2 and for later outer iterations
1672
+ 9:
1673
+ addu s3, s3, a1 // outrow += v_expand
1674
+ bne s3, s2, 0b // until outrow == max_v_samp_factor
1675
+ addiu t6, 1 // (delay slot) inrow++
1676
+ 10:
1677
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1678
+
1679
+ j ra
1680
+ nop
1681
+ END(jsimd_int_upsample_dspr2)
1682
+
1683
+
1684
+ /*****************************************************************************/
1685
+ LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
1686
+ /* For each of max_v_samp_factor rows, writes every input sample twice in a row; 16 output bytes per iteration in loop 1, the residual two bytes at a time in loop 2.
1687
+ * a0 = cinfo->max_v_samp_factor
1688
+ * a1 = cinfo->output_width
1689
+ * a2 = input_data
1690
+ * a3 = output_data_ptr
1691
+ */
1692
+ lw t7, 0(a3) // t7 = output_data
1693
+ andi t8, a1, 0xf // t8 = residual
1694
+ sll t0, a0, 2
1695
+ blez a0, 4f // no rows to process
1696
+ addu t9, t7, t0 // t9 = output_data end address
1697
+ 0:
1698
+ lw t5, 0(t7) // t5 = outptr
1699
+ lw t6, 0(a2) // t6 = inptr
1700
+ addu t3, t5, a1 // t3 = outptr + output_width (end address)
1701
+ subu t3, t8 // t3 = end address - residual
1702
+ beq t5, t3, 2f // output_width < 16: residual loop only
1703
+ move t4, t8 // (delay slot) t4 = residual byte counter
1704
+ 1:
1705
+ ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
1706
+ ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
1707
+ srl t1, t0, 16 // t1 = |X|X|P3|P2|
1708
+ ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
1709
+ ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
1710
+ ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
1711
+ ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
1712
+ usw t0, 0(t5)
1713
+ usw t1, 4(t5)
1714
+ srl t0, t2, 16 // t0 = |X|X|P7|P6|
1715
+ ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
1716
+ ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6|
1717
+ ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
1718
+ ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6|
1719
+ usw t2, 8(t5)
1720
+ usw t0, 12(t5)
1721
+ addiu t5, 16 // 16 output bytes written
1722
+ bne t5, t3, 1b
1723
+ addiu t6, 8 // (delay slot) 8 input samples consumed
1724
+ beqz t8, 3f // no residual bytes
1725
+ move t4, t8 // (delay slot) t4 = residual byte counter
1726
+ 2:
1727
+ lbu t1, 0(t6)
1728
+ sb t1, 0(t5) // each input sample is stored twice
1729
+ sb t1, 1(t5)
1730
+ addiu t4, -2 // two residual bytes done
1731
+ addiu t6, 1
1732
+ bgtz t4, 2b
1733
+ addiu t5, 2
1734
+ 3:
1735
+ addiu t7, 4 // next output row pointer
1736
+ bne t9, t7, 0b
1737
+ addiu a2, 4 // (delay slot) next input row pointer
1738
+ 4:
1739
+ j ra
1740
+ nop
1741
+ END(jsimd_h2v1_upsample_dspr2)
1742
+
1743
+
1744
+ /*****************************************************************************/
1745
+ LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
1746
+ /* For each input row, writes every sample twice into one output row (loops 1 and 2), then copies that output row into the next output row (loops 4 and 5).
1747
+ * a0 = cinfo->max_v_samp_factor
1748
+ * a1 = cinfo->output_width
1749
+ * a2 = input_data
1750
+ * a3 = output_data_ptr
1751
+ */
1752
+ lw t7, 0(a3) // t7 = output_data
1753
+ blez a0, 7f // no rows to process
1754
+ andi t9, a1, 0xf // t9 = residual
1755
+ 0:
1756
+ lw t6, 0(a2) // t6 = inptr
1757
+ lw t5, 0(t7) // t5 = outptr
1758
+ addu t8, t5, a1 // t8 = outptr end address
1759
+ subu t8, t9 // t8 = end address - residual
1760
+ beq t5, t8, 2f // output_width < 16: residual loop only
1761
+ move t4, t9 // (delay slot) t4 = residual byte counter
1762
+ 1:
1763
+ ulw t0, 0(t6) // t0 = four input samples
1764
+ srl t1, t0, 16 // t1 = upper two samples
1765
+ ins t0, t0, 16, 16
1766
+ ins t0, t0, 8, 16 // t0 = lower two samples, each doubled
1767
+ ins t1, t1, 16, 16
1768
+ ins t1, t1, 8, 16 // t1 = upper two samples, each doubled
1769
+ ulw t2, 4(t6) // t2 = next four input samples
1770
+ usw t0, 0(t5)
1771
+ usw t1, 4(t5)
1772
+ srl t3, t2, 16
1773
+ ins t2, t2, 16, 16
1774
+ ins t2, t2, 8, 16
1775
+ ins t3, t3, 16, 16
1776
+ ins t3, t3, 8, 16
1777
+ usw t2, 8(t5)
1778
+ usw t3, 12(t5)
1779
+ addiu t5, 16 // 16 output bytes written
1780
+ bne t5, t8, 1b
1781
+ addiu t6, 8 // (delay slot) 8 input samples consumed
1782
+ beqz t9, 3f // no residual bytes
1783
+ move t4, t9 // (delay slot) t4 = residual byte counter
1784
+ 2:
1785
+ lbu t0, 0(t6)
1786
+ sb t0, 0(t5) // each input sample is stored twice
1787
+ sb t0, 1(t5)
1788
+ addiu t4, -2
1789
+ addiu t6, 1
1790
+ bgtz t4, 2b
1791
+ addiu t5, 2
1792
+ 3:
1793
+ lw t6, 0(t7) // t6 = outptr[0]
1794
+ lw t5, 4(t7) // t5 = outptr[1]
1795
+ addu t4, t6, a1 // t4 = new end address
1796
+ beq a1, t9, 5f // output_width < 16: byte copy only
1797
+ subu t8, t4, t9 // (delay slot) t8 = end address of the 16-byte copy loop
1798
+ 4:
1799
+ ulw t0, 0(t6) // copy 16 bytes of outptr[0] to outptr[1]
1800
+ ulw t1, 4(t6)
1801
+ ulw t2, 8(t6)
1802
+ usw t0, 0(t5)
1803
+ ulw t0, 12(t6)
1804
+ usw t1, 4(t5)
1805
+ usw t2, 8(t5)
1806
+ usw t0, 12(t5)
1807
+ addiu t6, 16
1808
+ bne t6, t8, 4b
1809
+ addiu t5, 16
1810
+ beqz t9, 6f // no residual bytes
1811
+ nop
1812
+ 5:
1813
+ lbu t0, 0(t6) // copy the residual bytes one at a time
1814
+ sb t0, 0(t5)
1815
+ addiu t6, 1
1816
+ bne t6, t4, 5b
1817
+ addiu t5, 1
1818
+ 6:
1819
+ addiu t7, 8 // two output rows produced
1820
+ addiu a0, -2
1821
+ bgtz a0, 0b
1822
+ addiu a2, 4 // (delay slot) one input row consumed
1823
+ 7:
1824
+ j ra
1825
+ nop
1826
+ END(jsimd_h2v2_upsample_dspr2)
1827
+
1828
+
1829
+ /*****************************************************************************/
1830
+ LEAF_DSPR2(jsimd_idct_islow_dspr2)
1831
+ /* Two-pass 8x8 inverse DCT: pass 1 (labels 1-3) dequantizes each column into a 256-byte workspace on the stack; pass 2 (labels 4-6) processes each workspace row, maps the results through range_limit and stores 8 bytes per output row.
1832
+ * a0 = coef_block
1833
+ * a1 = compptr->dcttable
1834
+ * a2 = output
1835
+ * a3 = range_limit
1836
+ */
1837
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1838
+
1839
+ addiu sp, sp, -256 // workspace: 64 words
1840
+ move v0, sp // v0 = wsptr
1841
+ addiu v1, zero, 8 // v1 = DCTSIZE = 8
1842
+ 1:
1843
+ lh s4, 32(a0) // s4 = inptr[16]
1844
+ lh s5, 64(a0) // s5 = inptr[32]
1845
+ lh s6, 96(a0) // s6 = inptr[48]
1846
+ lh t1, 112(a0) // t1 = inptr[56]
1847
+ lh t7, 16(a0) // t7 = inptr[8]
1848
+ lh t5, 80(a0) // t5 = inptr[40]
1849
+ lh t3, 48(a0) // t3 = inptr[24]
1850
+ or s4, s4, t1 // OR all seven AC terms of this column together
1851
+ or s4, s4, t3
1852
+ or s4, s4, t5
1853
+ or s4, s4, t7
1854
+ or s4, s4, s5
1855
+ or s4, s4, s6
1856
+ bnez s4, 2f // any AC term non-zero: full column computation
1857
+ addiu v1, v1, -1 // (delay slot) one column done
1858
+ lh s5, 0(a1) // quantptr[DCTSIZE*0]
1859
+ lh s6, 0(a0) // inptr[DCTSIZE*0]
1860
+ mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
1861
+ sll s5, s5, 2 // scale the DC value up by 2 bits
1862
+ sw s5, 0(v0) // AC terms all zero: store the same value in all 8 rows of this column
1863
+ sw s5, 32(v0)
1864
+ sw s5, 64(v0)
1865
+ sw s5, 96(v0)
1866
+ sw s5, 128(v0)
1867
+ sw s5, 160(v0)
1868
+ sw s5, 192(v0)
1869
+ b 3f
1870
+ sw s5, 224(v0) // (delay slot)
1871
+ 2:
1872
+ lh t0, 112(a1)
1873
+ lh t2, 48(a1)
1874
+ lh t4, 80(a1)
1875
+ lh t6, 16(a1)
1876
+ mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
1877
+ mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
1878
+ mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
1879
+ mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
1880
+ lh t4, 32(a1)
1881
+ lh t5, 32(a0)
1882
+ lh t6, 96(a1)
1883
+ lh t7, 96(a0)
1884
+ addu s0, t0, t1 // z3 = tmp0 + tmp2
1885
+ addu s1, t1, t2 // z2 = tmp1 + tmp2
1886
+ addu s2, t2, t3 // z4 = tmp1 + tmp3
1887
+ addu s3, s0, s2 // z3 + z4
1888
+ addiu t9, zero, 9633 // FIX_1_175875602
1889
+ mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1890
+ addu t8, t0, t3 // z1 = tmp0 + tmp3
1891
+ addiu t9, zero, 2446 // FIX_0_298631336
1892
+ mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1893
+ addiu t9, zero, 16819 // FIX_2_053119869
1894
+ mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1895
+ addiu t9, zero, 25172 // FIX_3_072711026
1896
+ mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1897
+ addiu t9, zero, 12299 // FIX_1_501321110
1898
+ mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1899
+ addiu t9, zero, 16069 // FIX_1_961570560
1900
+ mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
1901
+ addiu t9, zero, 3196 // FIX_0_390180644
1902
+ mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
1903
+ addiu t9, zero, 7373 // FIX_0_899976223
1904
+ mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
1905
+ addiu t9, zero, 20995 // FIX_2_562915447
1906
+ mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
1907
+ subu s0, s3, s0 // z3 += z5
1908
+ addu t0, t0, s0 // tmp0 += z3
1909
+ addu t1, t1, s0 // tmp2 += z3
1910
+ subu s2, s3, s2 // z4 += z5
1911
+ addu t2, t2, s2 // tmp1 += z4
1912
+ addu t3, t3, s2 // tmp3 += z4
1913
+ subu t0, t0, t8 // tmp0 += z1
1914
+ subu t1, t1, s1 // tmp2 += z2
1915
+ subu t2, t2, s1 // tmp1 += z2
1916
+ subu t3, t3, t8 // tmp3 += z1
1917
+ mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
1918
+ addiu t9, zero, 6270 // FIX_0_765366865
1919
+ mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
1920
+ lh t4, 0(a1)
1921
+ lh t5, 0(a0)
1922
+ lh t6, 64(a1)
1923
+ lh t7, 64(a0)
1924
+ mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
1925
+ mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
1926
+ mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
1927
+ addiu t9, zero, 4433 // FIX_0_541196100
1928
+ addu s3, s0, s1 // z2 + z3
1929
+ mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1930
+ addiu t9, zero, 15137 // FIX_1_847759065
1931
+ mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
1932
+ addu t4, t5, t6
1933
+ subu t5, t5, t6
1934
+ sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
1935
+ sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
1936
+ addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1937
+ subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
1938
+ addu s0, t4, t7 // tmp10 = tmp0 + tmp3
1939
+ subu s1, t4, t7 // tmp13 = tmp0 - tmp3
1940
+ addu s2, t5, t6 // tmp11 = tmp1 + tmp2
1941
+ subu s3, t5, t6 // tmp12 = tmp1 - tmp2
1942
+ addu t4, s0, t3 // final butterflies: even part +/- odd part
1943
+ subu s0, s0, t3
1944
+ addu t3, s2, t1
1945
+ subu s2, s2, t1
1946
+ addu t1, s3, t2
1947
+ subu s3, s3, t2
1948
+ addu t2, s1, t0
1949
+ subu s1, s1, t0
1950
+ shra_r.w t4, t4, 11 // descale by 11 bits with rounding
1951
+ shra_r.w t3, t3, 11
1952
+ shra_r.w t1, t1, 11
1953
+ shra_r.w t2, t2, 11
1954
+ shra_r.w s1, s1, 11
1955
+ shra_r.w s3, s3, 11
1956
+ shra_r.w s2, s2, 11
1957
+ shra_r.w s0, s0, 11
1958
+ sw t4, 0(v0) // store rows 0..7 of this workspace column (32-byte row stride)
1959
+ sw t3, 32(v0)
1960
+ sw t1, 64(v0)
1961
+ sw t2, 96(v0)
1962
+ sw s1, 128(v0)
1963
+ sw s3, 160(v0)
1964
+ sw s2, 192(v0)
1965
+ sw s0, 224(v0)
1966
+ 3:
1967
+ addiu a1, a1, 2 // next quantization-table column
1968
+ addiu a0, a0, 2 // next coefficient column
1969
+ bgtz v1, 1b
1970
+ addiu v0, v0, 4 // (delay slot) next workspace column
1971
+ move v0, sp // pass 2: restart at the top of the workspace
1972
+ addiu v1, zero, 8 // v1 = row counter
1973
+ 4:
1974
+ lw t0, 8(v0) // z2 = (JLONG)wsptr[2]
1975
+ lw t1, 24(v0) // z3 = (JLONG)wsptr[6]
1976
+ lw t2, 0(v0) // (JLONG)wsptr[0]
1977
+ lw t3, 16(v0) // (JLONG)wsptr[4]
1978
+ lw s4, 4(v0) // (JLONG)wsptr[1]
1979
+ lw s5, 12(v0) // (JLONG)wsptr[3]
1980
+ lw s6, 20(v0) // (JLONG)wsptr[5]
1981
+ lw s7, 28(v0) // (JLONG)wsptr[7]
1982
+ or s4, s4, t0 // OR wsptr[1..7] together
1983
+ or s4, s4, t1
1984
+ or s4, s4, t3
1985
+ or s4, s4, s7
1986
+ or s4, s4, s5
1987
+ or s4, s4, s6
1988
+ bnez s4, 5f // any AC term non-zero: full row computation
1989
+ addiu v1, v1, -1 // (delay slot) one row done
1990
+ shra_r.w s5, t2, 5 // descale wsptr[0] by 5 bits with rounding
1991
+ andi s5, s5, 0x3ff // keep a 10-bit index into range_limit
1992
+ lbux s5, s5(a3) // s5 = range_limit[index]
1993
+ lw s1, 0(a2) // s1 = output row pointer
1994
+ replv.qb s5, s5 // replicate the byte into all four byte lanes
1995
+ usw s5, 0(s1) // AC terms all zero: all 8 output samples are the same
1996
+ usw s5, 4(s1)
1997
+ b 6f
1998
+ nop
1999
+ 5:
2000
+ addu t4, t0, t1 // z2 + z3
2001
+ addiu t8, zero, 4433 // FIX_0_541196100
2002
+ mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
2003
+ addiu t8, zero, 15137 // FIX_1_847759065
2004
+ mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
2005
+ addiu t8, zero, 6270 // FIX_0_765366865
2006
+ mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
2007
+ addu t4, t2, t3 // (JLONG)wsptr[0] + (JLONG)wsptr[4]
2008
+ subu t2, t2, t3 // (JLONG)wsptr[0] - (JLONG)wsptr[4]
2009
+ sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
2010
+ sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
2011
+ subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
2012
+ subu t3, t2, t1 // tmp12 = tmp1 - tmp2
2013
+ addu t2, t2, t1 // tmp11 = tmp1 + tmp2
2014
+ addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
2015
+ subu t1, t4, t5 // tmp13 = tmp0 - tmp3
2016
+ addu t0, t4, t5 // tmp10 = tmp0 + tmp3
2017
+ lw t4, 28(v0) // tmp0 = (JLONG)wsptr[7]
2018
+ lw t6, 12(v0) // tmp2 = (JLONG)wsptr[3]
2019
+ lw t5, 20(v0) // tmp1 = (JLONG)wsptr[5]
2020
+ lw t7, 4(v0) // tmp3 = (JLONG)wsptr[1]
2021
+ addu s0, t4, t6 // z3 = tmp0 + tmp2
2022
+ addiu t8, zero, 9633 // FIX_1_175875602
2023
+ addu s1, t5, t7 // z4 = tmp1 + tmp3
2024
+ addu s2, s0, s1 // z3 + z4
2025
+ mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
2026
+ addu s3, t4, t7 // z1 = tmp0 + tmp3
2027
+ addu t9, t5, t6 // z2 = tmp1 + tmp2
2028
+ addiu t8, zero, 16069 // FIX_1_961570560
2029
+ mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
2030
+ addiu t8, zero, 3196 // FIX_0_390180644
2031
+ mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
2032
+ addiu t8, zero, 2446 // FIX_0_298631336
2033
+ mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
2034
+ addiu t8, zero, 7373 // FIX_0_899976223
2035
+ mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
2036
+ addiu t8, zero, 16819 // FIX_2_053119869
2037
+ mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
2038
+ addiu t8, zero, 20995 // FIX_2_562915447
2039
+ mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
2040
+ addiu t8, zero, 25172 // FIX_3_072711026
2041
+ mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
2042
+ addiu t8, zero, 12299 // FIX_1_501321110
2043
+ mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
2044
+ subu s0, s2, s0 // z3 += z5
2045
+ subu s1, s2, s1 // z4 += z5
2046
+ addu t4, t4, s0
2047
+ subu t4, t4, s3 // tmp0
2048
+ addu t5, t5, s1
2049
+ subu t5, t5, t9 // tmp1
2050
+ addu t6, t6, s0
2051
+ subu t6, t6, t9 // tmp2
2052
+ addu t7, t7, s1
2053
+ subu t7, t7, s3 // tmp3
2054
+ addu s0, t0, t7 // final butterflies: even part +/- odd part
2055
+ subu t0, t0, t7
2056
+ addu t7, t2, t6
2057
+ subu t2, t2, t6
2058
+ addu t6, t3, t5
2059
+ subu t3, t3, t5
2060
+ addu t5, t1, t4
2061
+ subu t1, t1, t4
2062
+ shra_r.w s0, s0, 18 // descale by 18 bits with rounding
2063
+ shra_r.w t7, t7, 18
2064
+ shra_r.w t6, t6, 18
2065
+ shra_r.w t5, t5, 18
2066
+ shra_r.w t1, t1, 18
2067
+ shra_r.w t3, t3, 18
2068
+ shra_r.w t2, t2, 18
2069
+ shra_r.w t0, t0, 18
2070
+ andi s0, s0, 0x3ff // keep 10-bit indices into range_limit
2071
+ andi t7, t7, 0x3ff
2072
+ andi t6, t6, 0x3ff
2073
+ andi t5, t5, 0x3ff
2074
+ andi t1, t1, 0x3ff
2075
+ andi t3, t3, 0x3ff
2076
+ andi t2, t2, 0x3ff
2077
+ andi t0, t0, 0x3ff
2078
+ lw s1, 0(a2) // s1 = output row pointer
2079
+ lbux s0, s0(a3) // look each sample up in range_limit
2080
+ lbux t7, t7(a3)
2081
+ lbux t6, t6(a3)
2082
+ lbux t5, t5(a3)
2083
+ lbux t1, t1(a3)
2084
+ lbux t3, t3(a3)
2085
+ lbux t2, t2(a3)
2086
+ lbux t0, t0(a3)
2087
+ sb s0, 0(s1) // store the 8 output samples of this row
2088
+ sb t7, 1(s1)
2089
+ sb t6, 2(s1)
2090
+ sb t5, 3(s1)
2091
+ sb t1, 4(s1)
2092
+ sb t3, 5(s1)
2093
+ sb t2, 6(s1)
2094
+ sb t0, 7(s1)
2095
+ 6:
2096
+ addiu v0, v0, 32 // next workspace row
2097
+ bgtz v1, 4b
2098
+ addiu a2, a2, 4 // (delay slot) next output row pointer
2099
+ addiu sp, sp, 256 // release the workspace
2100
+
2101
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2102
+
2103
+ j ra
2104
+ nop
2105
+
2106
+ END(jsimd_idct_islow_dspr2)
2107
+
2108
+
2109
+ /*****************************************************************************/
2110
+ LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
2111
+ /*
2112
+ * Column pass of the fast integer inverse DCT.
+ *
+ * Each loop iteration loads one 32-bit word (two packed 16-bit values,
+ * handled with paired-halfword DSP instructions) from each of the eight
+ * 16-byte-strided rows of inptr, multiplies them by the matching words of
+ * quantptr, and stores eight result words to wsptr at the same row stride.
+ * a0/a1/a2 advance by 4 bytes per iteration and the loop stops once a0 has
+ * advanced 16 bytes, i.e. after four iterations.
+ *
+ * Shortcut: when rows 1..7 of the current word are all zero, the
+ * dequantized row-0 word is simply replicated to all eight output rows.
+ *
+ * AT holds a copy of a3 and is used as the base of the constant table.
+ * Instructions placed directly after a branch are intended as delay-slot
+ * instructions (assumes ".set noreorder" is in effect for this file; that
+ * directive is not visible in this block -- TODO confirm).
+ *
+ * a0 = inptr
2113
+ * a1 = quantptr
2114
+ * a2 = wsptr
2115
+ * a3 = mips_idct_ifast_coefs
2116
+ */
2117
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2118
+
2119
+ addiu t9, a0, 16 // end address
2120
+ or AT, a3, zero // AT = a3 (constant table base)
2121
+
2122
+ 0:
2123
+ lw s0, 0(a1) // quantptr[DCTSIZE*0]
2124
+ lw t0, 0(a0) // inptr[DCTSIZE*0]
2125
+ lw t1, 16(a0) // inptr[DCTSIZE*1]
2126
+ muleq_s.w.phl v0, t0, s0 // tmp0 ...
2127
+ lw t2, 32(a0) // inptr[DCTSIZE*2]
2128
+ lw t3, 48(a0) // inptr[DCTSIZE*3]
2129
+ lw t4, 64(a0) // inptr[DCTSIZE*4]
2130
+ lw t5, 80(a0) // inptr[DCTSIZE*5]
2131
+ muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
2132
+ lw t6, 96(a0) // inptr[DCTSIZE*6]
2133
+ lw t7, 112(a0) // inptr[DCTSIZE*7]
2134
+ or s4, t1, t2 // s4 = row1 | row2
2135
+ or s5, t3, t4 // s5 = row3 | row4
2136
+ bnez s4, 1f // nonzero in rows 1-2 -> full path
2137
+ ins t0, v0, 16, 16 // ... tmp0 (delay slot)
2138
+ bnez s5, 1f // nonzero in rows 3-4 -> full path
2139
+ or s6, t5, t6 // s6 = row5 | row6 (delay slot)
2140
+ or s6, s6, t7 // s6 |= row7
2141
+ bnez s6, 1f // nonzero in rows 5-7 -> full path
2142
+ sw t0, 0(a2) // wsptr[DCTSIZE*0]
2143
+ sw t0, 16(a2) // wsptr[DCTSIZE*1]
2144
+ sw t0, 32(a2) // wsptr[DCTSIZE*2]
2145
+ sw t0, 48(a2) // wsptr[DCTSIZE*3]
2146
+ sw t0, 64(a2) // wsptr[DCTSIZE*4]
2147
+ sw t0, 80(a2) // wsptr[DCTSIZE*5]
2148
+ sw t0, 96(a2) // wsptr[DCTSIZE*6]
2149
+ sw t0, 112(a2) // wsptr[DCTSIZE*7]
2150
+ addiu a0, a0, 4 // next input word
2151
+ b 2f
2152
+ addiu a1, a1, 4 // next quant word (delay slot)
2153
+
2154
+ 1:
2155
+ lw s1, 32(a1) // quantptr[DCTSIZE*2]
2156
+ lw s2, 64(a1) // quantptr[DCTSIZE*4]
2157
+ muleq_s.w.phl v0, t2, s1 // tmp1 ...
2158
+ muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
2159
+ lw s0, 16(a1) // quantptr[DCTSIZE*1]
2160
+ lw s1, 48(a1) // quantptr[DCTSIZE*3]
2161
+ lw s3, 96(a1) // quantptr[DCTSIZE*6]
2162
+ muleq_s.w.phl v1, t4, s2 // tmp2 ...
2163
+ muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
2164
+ lw s2, 80(a1) // quantptr[DCTSIZE*5]
2165
+ lw t8, 4(AT) // FIX(1.414213562)
2166
+ ins t2, v0, 16, 16 // ... tmp1
2167
+ muleq_s.w.phl v0, t6, s3 // tmp3 ...
2168
+ muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
2169
+ ins t4, v1, 16, 16 // ... tmp2
2170
+ addq.ph s4, t0, t4 // tmp10
2171
+ subq.ph s5, t0, t4 // tmp11
2172
+ ins t6, v0, 16, 16 // ... tmp3
2173
+ subq.ph s6, t2, t6 // tmp12 ...
2174
+ addq.ph s7, t2, t6 // tmp13
2175
+ mulq_s.ph s6, s6, t8 // ... tmp12 ...
2176
+ addq.ph t0, s4, s7 // tmp0
2177
+ subq.ph t6, s4, s7 // tmp3
2178
+ muleq_s.w.phl v0, t1, s0 // tmp4 ...
2179
+ muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
2180
+ shll_s.ph s6, s6, 1 // x2
2181
+ lw s3, 112(a1) // quantptr[DCTSIZE*7]
2182
+ subq.ph s6, s6, s7 // ... tmp12
2183
+ muleq_s.w.phl v1, t7, s3 // tmp7 ...
2184
+ muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
2185
+ ins t1, v0, 16, 16 // ... tmp4
2186
+ addq.ph t2, s5, s6 // tmp1
2187
+ subq.ph t4, s5, s6 // tmp2
2188
+ muleq_s.w.phl v0, t5, s2 // tmp6 ...
2189
+ muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
2190
+ ins t7, v1, 16, 16 // ... tmp7
2191
+ addq.ph s5, t1, t7 // z11
2192
+ subq.ph s6, t1, t7 // z12
2193
+ muleq_s.w.phl v1, t3, s1 // tmp5 ...
2194
+ muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
2195
+ ins t5, v0, 16, 16 // ... tmp6
2196
+ ins t3, v1, 16, 16 // ... tmp5
2197
+ addq.ph s7, t5, t3 // z13
2198
+ subq.ph v0, t5, t3 // z10
2199
+ addq.ph t7, s5, s7 // tmp7
2200
+ subq.ph s5, s5, s7 // tmp11 ...
2201
+ addq.ph v1, v0, s6 // z5 ...
2202
+ mulq_s.ph s5, s5, t8 // ... tmp11
2203
+ lw t8, 8(AT) // FIX(1.847759065)
2204
+ lw s4, 0(AT) // FIX(1.082392200)
2205
+ addq.ph s0, t0, t7 // tmp0 + tmp7
2206
+ subq.ph s1, t0, t7 // tmp0 - tmp7
2207
+ mulq_s.ph v1, v1, t8 // ... z5
2208
+ shll_s.ph s5, s5, 1 // x2
2209
+ lw t8, 12(AT) // FIX(-2.613125930)
2210
+ sw s0, 0(a2) // wsptr[DCTSIZE*0]
2211
+ shll_s.ph v0, v0, 1 // x4
2212
+ mulq_s.ph v0, v0, t8 // tmp12 ...
2213
+ mulq_s.ph s4, s6, s4 // tmp10 ...
2214
+ shll_s.ph v1, v1, 1 // x2
2215
+ addiu a0, a0, 4 // next input word
2216
+ addiu a1, a1, 4 // next quant word
2217
+ sw s1, 112(a2) // wsptr[DCTSIZE*7]
2218
+ shll_s.ph s6, v0, 1 // x4
2219
+ shll_s.ph s4, s4, 1 // x2
2220
+ addq.ph s6, s6, v1 // ... tmp12
2221
+ subq.ph t5, s6, t7 // tmp6
2222
+ subq.ph s4, s4, v1 // ... tmp10
2223
+ subq.ph t3, s5, t5 // tmp5
2224
+ addq.ph s2, t2, t5 // tmp1 + tmp6
2225
+ addq.ph t1, s4, t3 // tmp4
2226
+ subq.ph s3, t2, t5 // tmp1 - tmp6
2227
+ sw s2, 16(a2) // wsptr[DCTSIZE*1]
2228
+ sw s3, 96(a2) // wsptr[DCTSIZE*6]
2229
+ addq.ph v0, t4, t3 // tmp2 + tmp5
2230
+ subq.ph v1, t4, t3 // tmp2 - tmp5
2231
+ sw v0, 32(a2) // wsptr[DCTSIZE*2]
2232
+ sw v1, 80(a2) // wsptr[DCTSIZE*5]
2233
+ addq.ph v0, t6, t1 // tmp3 + tmp4
2234
+ subq.ph v1, t6, t1 // tmp3 - tmp4
2235
+ sw v0, 64(a2) // wsptr[DCTSIZE*4]
2236
+ sw v1, 48(a2) // wsptr[DCTSIZE*3]
2237
+
2238
+ 2:
2239
+ bne a0, t9, 0b // loop until a0 reaches the end address
2240
+ addiu a2, a2, 4 // next workspace word (delay slot)
2241
+
2242
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2243
+
2244
+ j ra
2245
+ nop
2246
+
2247
+ END(jsimd_idct_ifast_cols_dspr2)
2248
+
2249
+
2250
+ /*****************************************************************************/
2251
+ LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
2252
+ /*
2253
+ * Row pass of the fast integer inverse DCT.
+ *
+ * Each loop iteration consumes two consecutive 16-byte workspace rows
+ * (at a0 and a0+16), regroups them so that every register holds the same
+ * column of both rows as a halfword pair, and writes 8 output bytes to
+ * each of two output rows. The two row pointers are read from a1[0] and
+ * a1[1] and offset by a2. s8 = 0x80808080 is added byte-wise to the
+ * packed results before they are stored. a0 advances by 32 and a1 by 8
+ * per iteration; the loop stops when a0 reaches its start + 128
+ * (four iterations).
+ *
+ * Shortcut: when elements 1..7 of both rows are zero, only the two
+ * element-0 values are shifted, reduced to bytes and replicated across
+ * the 8 output bytes of their row.
+ *
+ * a3 is reused as an output pointer inside the loop, so it is saved by
+ * SAVE_REGS_ON_STACK and the table pointer is re-read into AT from
+ * 36(sp) at the top of each iteration (slot offset per the existing
+ * comment; the macro body is not visible here).
+ * Instructions placed directly after a branch are intended as delay-slot
+ * instructions (assumes ".set noreorder" -- not visible in this block).
+ *
+ * a0 = wsptr
2254
+ * a1 = output_buf
2255
+ * a2 = output_col
2256
+ * a3 = mips_idct_ifast_coefs
2257
+ */
2258
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2259
+
2260
+ addiu t9, a0, 128 // end address
2261
+ lui s8, 0x8080
2262
+ ori s8, s8, 0x8080 // s8 = 0x80808080 (0x80 in every byte)
2263
+
2264
+ 0:
2265
+ lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
2266
+ lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
2267
+ lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
2268
+ lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
2269
+ lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
2270
+ lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
2271
+ lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
2272
+ lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
2273
+ lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
2274
+ precrq.ph.w t1, s0, t0 // B b
2275
+ ins t0, s0, 16, 16 // A a
2276
+ bnez t1, 1f // element 1 nonzero -> full path
2277
+ or s0, t2, s2 // elements 2/3 of both rows (delay slot)
2278
+ bnez s0, 1f
2279
+ or s0, t4, s4 // elements 4/5 of both rows (delay slot)
2280
+ bnez s0, 1f
2281
+ or s0, t6, s6 // elements 6/7 of both rows (delay slot)
2282
+ bnez s0, 1f
2283
+ shll_s.ph s0, t0, 2 // A a
2284
+ lw a3, 0(a1) // a3 = first output row pointer
2285
+ lw AT, 4(a1) // AT = second output row pointer
2286
+ precrq.ph.w t0, s0, s0 // A A
2287
+ ins s0, s0, 16, 16 // a a
2288
+ addu a3, a3, a2 // += output_col
2289
+ addu AT, AT, a2 // += output_col
2290
+ precrq.qb.ph t0, t0, t0 // A A A A
2291
+ precrq.qb.ph s0, s0, s0 // a a a a
2292
+ addu.qb s0, s0, s8 // add 0x80 to every byte
2293
+ addu.qb t0, t0, s8 // add 0x80 to every byte
2294
+ sw s0, 0(a3) // first row, bytes 0..3
2295
+ sw s0, 4(a3) // first row, bytes 4..7
2296
+ sw t0, 0(AT) // second row, bytes 0..3
2297
+ sw t0, 4(AT) // second row, bytes 4..7
2298
+ addiu a0, a0, 32 // next pair of workspace rows
2299
+ bne a0, t9, 0b
2300
+ addiu a1, a1, 8 // next pair of row pointers (delay slot)
2301
+ b 2f
2302
+ nop
2303
+
2304
+ 1:
2305
+ precrq.ph.w t3, s2, t2 // D d
2306
+ ins t2, s2, 16, 16 // C c
2307
+ precrq.ph.w t5, s4, t4 // F f
2308
+ ins t4, s4, 16, 16 // E e
2309
+ precrq.ph.w t7, s6, t6 // H h
2310
+ ins t6, s6, 16, 16 // G g
2311
+ lw t8, 4(AT) // FIX(1.414213562)
2312
+ addq.ph s4, t0, t4 // tmp10
2313
+ subq.ph s5, t0, t4 // tmp11
2314
+ subq.ph s6, t2, t6 // tmp12 ...
2315
+ addq.ph s7, t2, t6 // tmp13
2316
+ mulq_s.ph s6, s6, t8 // ... tmp12 ...
2317
+ addq.ph t0, s4, s7 // tmp0
2318
+ subq.ph t6, s4, s7 // tmp3
2319
+ shll_s.ph s6, s6, 1 // x2
2320
+ subq.ph s6, s6, s7 // ... tmp12
2321
+ addq.ph t2, s5, s6 // tmp1
2322
+ subq.ph t4, s5, s6 // tmp2
2323
+ addq.ph s5, t1, t7 // z11
2324
+ subq.ph s6, t1, t7 // z12
2325
+ addq.ph s7, t5, t3 // z13
2326
+ subq.ph v0, t5, t3 // z10
2327
+ addq.ph t7, s5, s7 // tmp7
2328
+ subq.ph s5, s5, s7 // tmp11 ...
2329
+ addq.ph v1, v0, s6 // z5 ...
2330
+ mulq_s.ph s5, s5, t8 // ... tmp11
2331
+ lw t8, 8(AT) // FIX(1.847759065)
2332
+ lw s4, 0(AT) // FIX(1.082392200)
2333
+ addq.ph s0, t0, t7 // tmp0 + tmp7
2334
+ subq.ph s7, t0, t7 // tmp0 - tmp7
2335
+ mulq_s.ph v1, v1, t8 // ... z5
2336
+ lw a3, 0(a1) // a3 = first output row pointer
2337
+ lw t8, 12(AT) // FIX(-2.613125930)
2338
+ shll_s.ph s5, s5, 1 // x2
2339
+ addu a3, a3, a2 // += output_col
2340
+ shll_s.ph v0, v0, 1 // x4
2341
+ mulq_s.ph v0, v0, t8 // tmp12 ...
2342
+ mulq_s.ph s4, s6, s4 // tmp10 ...
2343
+ shll_s.ph v1, v1, 1 // x2
2344
+ addiu a0, a0, 32 // next pair of workspace rows
2345
+ addiu a1, a1, 8 // next pair of row pointers
2346
+ shll_s.ph s6, v0, 1 // x4
2347
+ shll_s.ph s4, s4, 1 // x2
2348
+ addq.ph s6, s6, v1 // ... tmp12
2349
+ shll_s.ph s0, s0, 2 // pre-shift; precrq.qb.ph keeps the high byte
2350
+ subq.ph t5, s6, t7 // tmp6
2351
+ subq.ph s4, s4, v1 // ... tmp10
2352
+ subq.ph t3, s5, t5 // tmp5
2353
+ shll_s.ph s7, s7, 2
2354
+ addq.ph t1, s4, t3 // tmp4
2355
+ addq.ph s1, t2, t5 // tmp1 + tmp6
2356
+ subq.ph s6, t2, t5 // tmp1 - tmp6
2357
+ addq.ph s2, t4, t3 // tmp2 + tmp5
2358
+ subq.ph s5, t4, t3 // tmp2 - tmp5
2359
+ addq.ph s4, t6, t1 // tmp3 + tmp4
2360
+ subq.ph s3, t6, t1 // tmp3 - tmp4
2361
+ shll_s.ph s1, s1, 2
2362
+ shll_s.ph s2, s2, 2
2363
+ shll_s.ph s3, s3, 2
2364
+ shll_s.ph s4, s4, 2
2365
+ shll_s.ph s5, s5, 2
2366
+ shll_s.ph s6, s6, 2
2367
+ precrq.ph.w t0, s1, s0 // B A
2368
+ ins s0, s1, 16, 16 // b a
2369
+ precrq.ph.w t2, s3, s2 // D C
2370
+ ins s2, s3, 16, 16 // d c
2371
+ precrq.ph.w t4, s5, s4 // F E
2372
+ ins s4, s5, 16, 16 // f e
2373
+ precrq.ph.w t6, s7, s6 // H G
2374
+ ins s6, s7, 16, 16 // h g
2375
+ precrq.qb.ph t0, t2, t0 // D C B A
2376
+ precrq.qb.ph s0, s2, s0 // d c b a
2377
+ precrq.qb.ph t4, t6, t4 // H G F E
2378
+ precrq.qb.ph s4, s6, s4 // h g f e
2379
+ addu.qb s0, s0, s8 // add 0x80 to every byte
2380
+ addu.qb s4, s4, s8
2381
+ sw s0, 0(a3) // outptr[0/1/2/3] d c b a
2382
+ sw s4, 4(a3) // outptr[4/5/6/7] h g f e
2383
+ lw a3, -4(a1) // second row pointer (a1 was already advanced by 8)
2384
+ addu.qb t0, t0, s8
2385
+ addu a3, a3, a2 // += output_col
2386
+ addu.qb t4, t4, s8
2387
+ sw t0, 0(a3) // outptr[0/1/2/3] D C B A
2388
+ bne a0, t9, 0b
2389
+ sw t4, 4(a3) // outptr[4/5/6/7] H G F E
2390
+
2391
+ 2:
2392
+
2393
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2394
+
2395
+ j ra
2396
+ nop
2397
+
2398
+ END(jsimd_idct_ifast_rows_dspr2)
2399
+
2400
+
2401
+ /*****************************************************************************/
2402
+ LEAF_DSPR2(jsimd_fdct_islow_dspr2)
2403
+ /*
2404
+ * In-place forward DCT (accurate integer variant) on the block of
+ * 16-bit values at a0, done in two passes of eight iterations each
+ * (counter in s8):
+ * pass 1 (label 1): one 16-byte row per iteration, a1 walks the rows;
+ * pass 2 (label 2): one column per iteration, a0 advances by 2 bytes
+ * and the eight column elements are 16 bytes apart.
+ * In pass 1, t0..t9 each hold two 16-bit multipliers packed as
+ * (high << 16) | low for use with dpa.w.ph; in pass 2 they are reloaded
+ * as scalar multipliers (called c0..c9 in the comments, with a1 as c10).
+ * Instructions placed directly after a branch are intended as delay-slot
+ * instructions (assumes ".set noreorder" -- not visible in this block).
+ *
+ * a0 = data
2405
+ */
2406
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2407
+
2408
+ lui t0, 6437
2409
+ ori t0, 2260 // t0 = ( 6437 << 16) | 2260
2410
+ lui t1, 9633
2411
+ ori t1, 11363 // t1 = ( 9633 << 16) | 11363
2412
+ lui t2, 0xd39e
2413
+ ori t2, 0xe6dc // t2 = (-11362 << 16) | (-6436 & 0xffff)
2414
+ lui t3, 0xf72d
2415
+ ori t3, 9633 // t3 = ( -2259 << 16) | 9633
2416
+ lui t4, 2261
2417
+ ori t4, 9633 // t4 = ( 2261 << 16) | 9633
2418
+ lui t5, 0xd39e
2419
+ ori t5, 6437 // t5 = (-11362 << 16) | 6437
2420
+ lui t6, 9633
2421
+ ori t6, 0xd39d // t6 = ( 9633 << 16) | (-11363 & 0xffff)
2422
+ lui t7, 0xe6dc
2423
+ ori t7, 2260 // t7 = ( -6436 << 16) | 2260
2424
+ lui t8, 4433
2425
+ ori t8, 10703 // t8 = ( 4433 << 16) | 10703
2426
+ lui t9, 0xd630
2427
+ ori t9, 4433 // t9 = (-10704 << 16) | 4433
2428
+ li s8, 8 // pass 1 iteration count
2429
+ move a1, a0 // a1 = row pointer for pass 1
2430
+ 1:
2431
+ lw s0, 0(a1) // tmp0 = 1|0
2432
+ lw s1, 4(a1) // tmp1 = 3|2
2433
+ lw s2, 8(a1) // tmp2 = 5|4
2434
+ lw s3, 12(a1) // tmp3 = 7|6
2435
+ packrl.ph s1, s1, s1 // tmp1 = 2|3
2436
+ packrl.ph s3, s3, s3 // tmp3 = 6|7
2437
+ subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
2438
+ subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
2439
+ mult $0, $0 // ac0 = 0
2440
+ dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
2441
+ dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
2442
+ mult $ac1, $0, $0 // ac1 = 0
2443
+ dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
2444
+ dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
2445
+ mult $ac2, $0, $0 // ac2 = 0
2446
+ dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
2447
+ dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
2448
+ mult $ac3, $0, $0 // ac3 = 0
2449
+ dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
2450
+ dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
2451
+ addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
2452
+ addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
2453
+ extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2454
+ extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2455
+ extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
2456
+ extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
2457
+ addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
2458
+ subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
2459
+ sh s0, 2(a1) // row[1]
2460
+ sh s1, 6(a1) // row[3]
2461
+ sh s2, 10(a1) // row[5]
2462
+ sh s3, 14(a1) // row[7]
2463
+ mult $0, $0 // ac0 = 0
2464
+ dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
2465
+ mult $ac1, $0, $0 // ac1 = 0
2466
+ dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
2467
+ sra s4, s5, 16 // tmp4 = t11
2468
+ addiu a1, a1, 16 // next row (stores below use negative offsets)
2469
+ addiu s8, s8, -1
2470
+ extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2471
+ extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2472
+ addu s2, s5, s4 // tmp2 = t10 + t11
2473
+ subu s3, s5, s4 // tmp3 = t10 - t11
2474
+ sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
2475
+ sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
2476
+ sh s2, -16(a1) // row[0] (only the low halfword is stored)
2477
+ sh s3, -8(a1) // row[4]
2478
+ sh s0, -12(a1) // row[2]
2479
+ bgtz s8, 1b
2480
+ sh s1, -4(a1) // row[6] (delay slot)
2481
+ li t0, 2260 // c0
2482
+ li t1, 11363 // c1
2483
+ li t2, 9633 // c2
2484
+ li t3, 6436 // c3
2485
+ li t4, 6437 // c4
2486
+ li t5, 2261 // c5
2487
+ li t6, 11362 // c6
2488
+ li t7, 2259 // c7
2489
+ li t8, 4433 // c8
2490
+ li t9, 10703 // c9
2491
+ li a1, 10704 // c10
2492
+ li s8, 8 // pass 2 iteration count
2493
+
2494
+ 2:
2495
+ lh a2, 0(a0) // 0
2496
+ lh a3, 16(a0) // 8
2497
+ lh v0, 32(a0) // 16
2498
+ lh v1, 48(a0) // 24
2499
+ lh s4, 64(a0) // 32
2500
+ lh s5, 80(a0) // 40
2501
+ lh s6, 96(a0) // 48
2502
+ lh s7, 112(a0) // 56
2503
+ addu s2, v0, s5 // tmp2 = 16 + 40
2504
+ subu s5, v0, s5 // tmp5 = 16 - 40
2505
+ addu s3, v1, s4 // tmp3 = 24 + 32
2506
+ subu s4, v1, s4 // tmp4 = 24 - 32
2507
+ addu s0, a2, s7 // tmp0 = 0 + 56
2508
+ subu s7, a2, s7 // tmp7 = 0 - 56
2509
+ addu s1, a3, s6 // tmp1 = 8 + 48
2510
+ subu s6, a3, s6 // tmp6 = 8 - 48
2511
+ addu a2, s0, s3 // tmp10 = tmp0 + tmp3
2512
+ subu v1, s0, s3 // tmp13 = tmp0 - tmp3
2513
+ addu a3, s1, s2 // tmp11 = tmp1 + tmp2
2514
+ subu v0, s1, s2 // tmp12 = tmp1 - tmp2
2515
+ mult s7, t1 // ac0 = tmp7 * c1
2516
+ madd s4, t0 // ac0 += tmp4 * c0
2517
+ madd s5, t4 // ac0 += tmp5 * c4
2518
+ madd s6, t2 // ac0 += tmp6 * c2
2519
+ mult $ac1, s7, t2 // ac1 = tmp7 * c2
2520
+ msub $ac1, s4, t3 // ac1 -= tmp4 * c3
2521
+ msub $ac1, s5, t6 // ac1 -= tmp5 * c6
2522
+ msub $ac1, s6, t7 // ac1 -= tmp6 * c7
2523
+ mult $ac2, s7, t4 // ac2 = tmp7 * c4
2524
+ madd $ac2, s4, t2 // ac2 += tmp4 * c2
2525
+ madd $ac2, s5, t5 // ac2 += tmp5 * c5
2526
+ msub $ac2, s6, t6 // ac2 -= tmp6 * c6
2527
+ mult $ac3, s7, t0 // ac3 = tmp7 * c0
2528
+ msub $ac3, s4, t1 // ac3 -= tmp4 * c1
2529
+ madd $ac3, s5, t2 // ac3 += tmp5 * c2
2530
+ msub $ac3, s6, t3 // ac3 -= tmp6 * c3
2531
+ extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
2532
+ extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
2533
+ extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
2534
+ extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
2535
+ addiu s8, s8, -1
2536
+ addu s4, a2, a3 // tmp4 = tmp10 + tmp11
2537
+ subu s5, a2, a3 // tmp5 = tmp10 - tmp11
2538
+ sh s0, 16(a0) // column element 1
2539
+ sh s1, 48(a0) // column element 3
2540
+ sh s2, 80(a0) // column element 5
2541
+ sh s3, 112(a0) // column element 7
2542
+ mult v0, t8 // ac0 = tmp12 * c8
2543
+ madd v1, t9 // ac0 += tmp13 * c9
2544
+ mult $ac1, v1, t8 // ac1 = tmp13 * c8
2545
+ msub $ac1, v0, a1 // ac1 -= tmp12 * c10
2546
+ addiu a0, a0, 2 // next column (stores below are relative to it)
2547
+ extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
2548
+ extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
2549
+ shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
2550
+ shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
2551
+ sh s4, -2(a0) // column element 0
2552
+ sh s5, 62(a0) // column element 4
2553
+ sh s6, 30(a0) // column element 2
2554
+ bgtz s8, 2b
2555
+ sh s7, 94(a0) // column element 6 (delay slot)
2556
+
2557
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2558
+
2559
+ jr ra
2560
+ nop
2561
+
2562
+ END(jsimd_fdct_islow_dspr2)
2563
+
2564
+
2565
+ /**************************************************************************/
2566
+ LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
2567
+ /*
2568
+ * In-place forward DCT (fast integer variant) on the block of 16-bit
+ * values at a0, in two passes:
+ * pass 1 (label 0): one 16-byte row per iteration, v0 walks from a0 to
+ * a0 + 128;
+ * pass 2 (label 1): one column per iteration, v0 walks from a0 to
+ * a0 + 16 in 2-byte steps, column elements are 16 bytes apart.
+ * a1, a2, a3 and s1 each hold one 8-bit-scaled multiplier duplicated in
+ * both halfwords, so paired-halfword dot products multiply both halves
+ * by the same constant. Products are scaled back with a right shift by 8.
+ * In the pass-1 comments "t4".."t7" and "t10".."t13" on the right-hand
+ * side name DCT temporaries (tmp4.., tmp10..), not registers.
+ *
+ * a0 = data
2569
+ */
2570
+ .set at
2571
+
2572
+ SAVE_REGS_ON_STACK 8, s0, s1
2573
+
2574
+ li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2575
+ li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2576
+ li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2577
+ li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2578
+
2579
+ move v0, a0
2580
+ addiu v1, v0, 128 // end address
2581
+
2582
+ 0:
2583
+ lw t0, 0(v0) // tmp0 = 1|0
2584
+ lw t1, 4(v0) // tmp1 = 3|2
2585
+ lw t2, 8(v0) // tmp2 = 5|4
2586
+ lw t3, 12(v0) // tmp3 = 7|6
2587
+ packrl.ph t1, t1, t1 // tmp1 = 2|3
2588
+ packrl.ph t3, t3, t3 // tmp3 = 6|7
2589
+ subq.ph t7, t1, t2 // t7 = 2-5|3-4 = tmp5|tmp4
2590
+ subq.ph t5, t0, t3 // t5 = 1-6|0-7 = tmp6|tmp7
2591
+ addq.ph t6, t1, t2 // t6 = 2+5|3+4 = tmp2|tmp3
2592
+ addq.ph t4, t0, t3 // t4 = 1+6|0+7 = tmp1|tmp0
2593
+ addq.ph t8, t4, t6 // t8 = tmp1+tmp2|tmp0+tmp3 = tmp11|tmp10
2594
+ subq.ph t9, t4, t6 // t9 = tmp1-tmp2|tmp0-tmp3 = tmp12|tmp13
2595
+ sra t4, t8, 16 // t4 = tmp11
2596
+ mult $0, $0 // ac0 = 0
2597
+ dpa.w.ph $ac0, t9, s1 // ac0 += tmp12*181 + tmp13*181
2598
+ mult $ac1, $0, $0 // ac1 = 0
2599
+ dpa.w.ph $ac1, t7, a3 // ac1 += tmp4*98 + tmp5*98
2600
+ dpsx.w.ph $ac1, t5, a3 // ac1 -= tmp6*98 + tmp7*98
2601
+ mult $ac2, $0, $0 // ac2 = 0
2602
+ dpa.w.ph $ac2, t7, a2 // ac2 += tmp4*139 + tmp5*139
2603
+ mult $ac3, $0, $0 // ac3 = 0
2604
+ dpa.w.ph $ac3, t5, a1 // ac3 += tmp6*334 + tmp7*334
2605
+ precrq.ph.w t0, t5, t7 // t0 = tmp6|tmp5 (high halves of t5 and t7)
2606
+ addq.ph t2, t8, t4 // t2 (low half) = tmp10 + tmp11
2607
+ subq.ph t3, t8, t4 // t3 (low half) = tmp10 - tmp11
2608
+ extr.w t4, $ac0, 8 // t4 = z1 = MULTIPLY(tmp12 + tmp13, 181)
2609
+ mult $0, $0 // ac0 = 0
2610
+ dpa.w.ph $ac0, t0, s1 // ac0 += tmp5*181 + tmp6*181
2611
+ extr.w t0, $ac1, 8 // t0 = z5
2612
+ extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
2613
+ extr.w t7, $ac3, 8 // t7 = MULTIPLY(tmp12, 334)
2614
+ extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
2615
+ add t6, t1, t0 // t6 = z2
2616
+ add t7, t7, t0 // t7 = z4
2617
+ subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
2618
+ addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
2619
+ addq.ph t1, t0, t6 // t1 = z13 + z2
2620
+ subq.ph t6, t0, t6 // t6 = z13 - z2
2621
+ addq.ph t0, t8, t7 // t0 = z11 + z4
2622
+ subq.ph t7, t8, t7 // t7 = z11 - z4
2623
+ addq.ph t5, t4, t9 // t5 (low half) = tmp13 + z1
2624
+ subq.ph t4, t9, t4 // t4 (low half) = tmp13 - z1
2625
+ sh t2, 0(v0) // row[0]
2626
+ sh t5, 4(v0) // row[2]
2627
+ sh t3, 8(v0) // row[4]
2628
+ sh t4, 12(v0) // row[6]
2629
+ sh t1, 10(v0) // row[5]
2630
+ sh t6, 6(v0) // row[3]
2631
+ sh t0, 2(v0) // row[1]
2632
+ sh t7, 14(v0) // row[7]
2633
+ addiu v0, 16 // next row
2634
+ bne v1, v0, 0b
2635
+ nop
2636
+ move v0, a0 // restart at the top of the block for pass 2
2637
+ addiu v1, v0, 16 // end address: one row's worth of columns
2638
+
2639
+ 1:
2640
+ lh t0, 0(v0) // 0
2641
+ lh t1, 16(v0) // 8
2642
+ lh t2, 32(v0) // 16
2643
+ lh t3, 48(v0) // 24
2644
+ lh t4, 64(v0) // 32
2645
+ lh t5, 80(v0) // 40
2646
+ lh t6, 96(v0) // 48
2647
+ lh t7, 112(v0) // 56
2648
+ add t8, t0, t7 // t8 = tmp0
2649
+ sub t7, t0, t7 // t7 = tmp7
2650
+ add t0, t1, t6 // t0 = tmp1
2651
+ sub t1, t1, t6 // t1 = tmp6
2652
+ add t6, t2, t5 // t6 = tmp2
2653
+ sub t5, t2, t5 // t5 = tmp5
2654
+ add t2, t3, t4 // t2 = tmp3
2655
+ sub t3, t3, t4 // t3 = tmp4
2656
+ add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
2657
+ sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
2658
+ sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
2659
+ ins t8, s0, 16, 16 // t8 = tmp12|tmp13
2660
+ add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
2661
+ mult $0, $0 // ac0 = 0
2662
+ dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
2663
+ add s0, t4, t2 // s0 = tmp10+tmp11
2664
+ sub t4, t4, t2 // t4 = tmp10-tmp11
2665
+ sh s0, 0(v0) // column element 0
2666
+ sh t4, 64(v0) // column element 4
2667
+ extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
2668
+ addq.ph t4, t8, t2 // t4 (low half) = tmp13 + z1
2669
+ subq.ph t8, t8, t2 // t8 (low half) = tmp13 - z1
2670
+ sh t4, 32(v0) // column element 2
2671
+ sh t8, 96(v0) // column element 6
2672
+ add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
2673
+ add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
2674
+ add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
2675
+ andi t4, a1, 0xffff // t4 = 334 (low halfword of a1)
2676
+ mul s0, t1, t4
2677
+ sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2678
+ ins t1, t3, 16, 16 // t1 = tmp10|tmp12
2679
+ mult $0, $0 // ac0 = 0
2680
+ mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
2681
+ extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
2682
+ add t2, t7, t8 // t2 = tmp7 + z5
2683
+ sub t7, t7, t8 // t7 = tmp7 - z5
2684
+ andi t4, a2, 0xffff // t4 = 139 (low halfword of a2)
2685
+ mul t8, t3, t4
2686
+ sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2687
+ andi t4, s1, 0xffff // t4 = 181 (low halfword of s1)
2688
+ mul t6, t0, t4
2689
+ sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2690
+ add t0, t6, t8 // t0 = z3 + z2
2691
+ sub t1, t6, t8 // t1 = z3 - z2
2692
+ add t3, t6, s0 // t3 = z3 + z4
2693
+ sub t4, t6, s0 // t4 = z3 - z4
2694
+ sub t5, t2, t1 // t5 = dataptr[5]
2695
+ sub t6, t7, t0 // t6 = dataptr[3]
2696
+ add t3, t2, t3 // t3 = dataptr[1]
2697
+ add t4, t7, t4 // t4 = dataptr[7]
2698
+ sh t5, 80(v0) // column element 5
2699
+ sh t6, 48(v0) // column element 3
2700
+ sh t3, 16(v0) // column element 1
2701
+ sh t4, 112(v0) // column element 7
2702
+ addiu v0, 2 // next column
2703
+ bne v0, v1, 1b
2704
+ nop
2705
+
2706
+ RESTORE_REGS_FROM_STACK 8, s0, s1
2707
+
2708
+ j ra
2709
+ nop
2710
+ END(jsimd_fdct_ifast_dspr2)
2711
+
2712
+
2713
+ /*****************************************************************************/
2714
+ LEAF_DSPR2(jsimd_quantize_dspr2)
2715
+ /*
2716
+ * Quantizes the 16-bit values in workspace into coef_block, two values
+ * per loop iteration. For each value w at index k the code computes
+ * sign = (w < 0) ? -1 : +1
+ * tmp = ((|w| + divisors[64 + k]) & 0xffff) * (divisors[k] & 0xffff)
+ * out = (tmp >> (divisors[192 + k] + 16)) * sign
+ * where divisors is indexed in 16-bit elements (byte offsets 0, 128 and
+ * 384). NOTE(review): these three sub-tables look like reciprocal,
+ * correction and shift values -- confirm against the code that builds
+ * the divisors table.
+ * The loop is software-pipelined: the operands for the next pair are
+ * loaded while the current pair is finished, and the last pair is
+ * completed after the loop exits (when a2 reaches workspace + 124).
+ * Instructions placed directly after a branch are intended as delay-slot
+ * instructions (assumes ".set noreorder" -- not visible in this block).
+ *
+ * a0 = coef_block
2717
+ * a1 = divisors
2718
+ * a2 = workspace
2719
+ */
2720
+ .set at
2721
+
2722
+ SAVE_REGS_ON_STACK 16, s0, s1, s2
2723
+
2724
+ addiu v0, a2, 124 // v0 = workspace_end
2725
+ lh t0, 0(a2) // t0 = workspace[0]
2726
+ lh t1, 0(a1) // t1 = divisors[0]
2727
+ lh t2, 128(a1) // t2 = divisors[64]
2728
+ sra t3, t0, 15 // t3 = -1 if t0 < 0, else 0
2729
+ sll t3, t3, 1
2730
+ addiu t3, t3, 1 // t3 = sign of t0 (-1 or +1)
2731
+ mul t0, t0, t3 // t0 = |t0|
2732
+ lh t4, 384(a1) // t4 = divisors[192]
2733
+ lh t5, 130(a1) // t5 = divisors[65]
2734
+ lh t6, 2(a2) // t6 = workspace[1]
2735
+ lh t7, 2(a1) // t7 = divisors[1]
2736
+ lh t8, 386(a1) // t8 = divisors[193]
2737
+
2738
+ 1:
2739
+ andi t1, 0xffff // use divisors[k] as an unsigned 16-bit value
2740
+ add t9, t0, t2 // |w0| + divisors[64 + k]
2741
+ andi t9, 0xffff
2742
+ mul v1, t9, t1
2743
+ sra s0, t6, 15 // s0 = -1 if second value < 0, else 0
2744
+ sll s0, s0, 1
2745
+ addiu s0, s0, 1 // s0 = sign of second value (-1 or +1)
2746
+ addiu t9, t4, 16 // shift count = divisors[192 + k] + 16
2747
+ srav v1, v1, t9
2748
+ mul v1, v1, t3 // reapply the sign of the first value
2749
+ mul t6, t6, s0 // t6 = |w1|
2750
+ andi t7, 0xffff
2751
+ addiu a2, a2, 4 // advance workspace by two values
2752
+ addiu a1, a1, 4 // advance divisors by two values
2753
+ add s1, t6, t5 // |w1| + divisors[65 + k]
2754
+ andi s1, 0xffff
2755
+ sh v1, 0(a0) // store first result of the pair
2756
+
2757
+ mul s2, s1, t7
2758
+ addiu s1, t8, 16 // shift count for the second value
2759
+ srav s2, s2, s1
2760
+ mul s2, s2, s0 // reapply the sign of the second value
2761
+ lh t0, 0(a2) // preload the next pair (same pattern as the prologue)
2762
+ lh t1, 0(a1)
2763
+ sra t3, t0, 15
2764
+ sll t3, t3, 1
2765
+ addiu t3, t3, 1
2766
+ mul t0, t0, t3
2767
+ lh t2, 128(a1)
2768
+ lh t4, 384(a1)
2769
+ lh t5, 130(a1)
2770
+ lh t8, 386(a1)
2771
+ lh t6, 2(a2)
2772
+ lh t7, 2(a1)
2773
+ sh s2, 2(a0) // store second result of the pair
2774
+ lh t0, 0(a2) // NOTE(review): repeats the load/sign/abs of 0(a2) done just above; appears redundant unless coef_block overlaps workspace -- confirm
2775
+ sra t3, t0, 15
2776
+ sll t3, t3, 1
2777
+ addiu t3, t3, 1
2778
+ mul t0, t0, t3
2779
+ bne a2, v0, 1b
2780
+ addiu a0, a0, 4 // advance coef_block by two values (delay slot)
2781
+
2782
+ andi t1, 0xffff // final pair: same computation as the loop body
2783
+ add t9, t0, t2
2784
+ andi t9, 0xffff
2785
+ mul v1, t9, t1
2786
+ sra s0, t6, 15
2787
+ sll s0, s0, 1
2788
+ addiu s0, s0, 1
2789
+ addiu t9, t4, 16
2790
+ srav v1, v1, t9
2791
+ mul v1, v1, t3
2792
+ mul t6, t6, s0
2793
+ andi t7, 0xffff
2794
+ sh v1, 0(a0)
2795
+ add s1, t6, t5
2796
+ andi s1, 0xffff
2797
+ mul s2, s1, t7
2798
+ addiu s1, t8, 16
2799
+ addiu a2, a2, 4
2800
+ addiu a1, a1, 4
2801
+ srav s2, s2, s1
2802
+ mul s2, s2, s0
2803
+ sh s2, 2(a0)
2804
+
2805
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2806
+
2807
+ j ra
2808
+ nop
2809
+
2810
+ END(jsimd_quantize_dspr2)
2811
+
2812
+
2813
+ #ifndef __mips_soft_float
2814
+
2815
+ /*****************************************************************************/
2816
+ LEAF_DSPR2(jsimd_quantize_float_dspr2)
2817
+ /*
2818
+ * Floating-point quantization. For each of the 64 single-precision
+ * values the code computes
+ * coef_block[k] = trunc(workspace[k] * divisors[k] + 16384.5) - 16384
+ * and stores the low 16 bits. The 16384.5 bias makes the truncation act
+ * as round-to-nearest for results in range; the 16384 part is removed
+ * again in integer arithmetic.
+ * Eight values are handled per iteration in two interleaved groups of
+ * four; t0 counts down from 63 in steps of 8 and the loop repeats while
+ * it is still >= 0 (eight iterations). No callee-saved registers are
+ * used, so nothing is saved on the stack.
+ * Instructions placed directly after a branch are intended as delay-slot
+ * instructions (assumes ".set noreorder" -- not visible in this block).
+ *
+ * a0 = coef_block
2819
+ * a1 = divisors
2820
+ * a2 = workspace
2821
+ */
2822
+ .set at
2823
+
2824
+ li t1, 0x46800100 // integer representation 16384.5
2825
+ mtc1 t1, f0 // f0 = 16384.5f
2826
+ li t0, 63 // remaining-values counter
2827
+ 0:
2828
+ lwc1 f2, 0(a2) // workspace[0..3] -> f2, f4, f6, f8
2829
+ lwc1 f10, 0(a1) // divisors[0..3] -> f10, f12, f14, f16
2830
+ lwc1 f4, 4(a2)
2831
+ lwc1 f12, 4(a1)
2832
+ lwc1 f6, 8(a2)
2833
+ lwc1 f14, 8(a1)
2834
+ lwc1 f8, 12(a2)
2835
+ lwc1 f16, 12(a1)
2836
+ madd.s f2, f0, f2, f10 // f2 = f2 * f10 + 16384.5
2837
+ madd.s f4, f0, f4, f12
2838
+ madd.s f6, f0, f6, f14
2839
+ madd.s f8, f0, f8, f16
2840
+ lwc1 f10, 16(a1) // divisors[4..5] for the second group
2841
+ lwc1 f12, 20(a1)
2842
+ trunc.w.s f2, f2 // convert to integer, truncating
2843
+ trunc.w.s f4, f4
2844
+ trunc.w.s f6, f6
2845
+ trunc.w.s f8, f8
2846
+ lwc1 f14, 24(a1) // divisors[6..7] for the second group
2847
+ lwc1 f16, 28(a1)
2848
+ mfc1 t1, f2 // move first-group integers to t1..t4
2849
+ mfc1 t2, f4
2850
+ mfc1 t3, f6
2851
+ mfc1 t4, f8
2852
+ lwc1 f2, 16(a2) // workspace[4..7]
2853
+ lwc1 f4, 20(a2)
2854
+ lwc1 f6, 24(a2)
2855
+ lwc1 f8, 28(a2)
2856
+ madd.s f2, f0, f2, f10
2857
+ madd.s f4, f0, f4, f12
2858
+ madd.s f6, f0, f6, f14
2859
+ madd.s f8, f0, f8, f16
2860
+ addiu t1, t1, -16384 // remove the integer part of the bias
2861
+ addiu t2, t2, -16384
2862
+ addiu t3, t3, -16384
2863
+ addiu t4, t4, -16384
2864
+ trunc.w.s f2, f2
2865
+ trunc.w.s f4, f4
2866
+ trunc.w.s f6, f6
2867
+ trunc.w.s f8, f8
2868
+ sh t1, 0(a0) // coef_block[0..3]
2869
+ sh t2, 2(a0)
2870
+ sh t3, 4(a0)
2871
+ sh t4, 6(a0)
2872
+ mfc1 t1, f2 // second-group integers
2873
+ mfc1 t2, f4
2874
+ mfc1 t3, f6
2875
+ mfc1 t4, f8
2876
+ addiu t0, t0, -8 // eight values done
2877
+ addiu a2, a2, 32 // advance workspace by eight floats
2878
+ addiu a1, a1, 32 // advance divisors by eight floats
2879
+ addiu t1, t1, -16384
2880
+ addiu t2, t2, -16384
2881
+ addiu t3, t3, -16384
2882
+ addiu t4, t4, -16384
2883
+ sh t1, 8(a0) // coef_block[4..7]
2884
+ sh t2, 10(a0)
2885
+ sh t3, 12(a0)
2886
+ sh t4, 14(a0)
2887
+ bgez t0, 0b
2888
+ addiu a0, a0, 16 // advance coef_block by eight values (delay slot)
2889
+
2890
+ j ra
2891
+ nop
2892
+
2893
+ END(jsimd_quantize_float_dspr2)
2894
+
2895
+ #endif
2896
+
2897
+
2898
+ /*****************************************************************************/
2899
+ LEAF_DSPR2(jsimd_idct_2x2_dspr2)
2900
+ /*
2901
+ * Reduced-size inverse DCT producing a 2x2 block of output bytes.
+ *
+ * Pass 1 (fully unrolled) processes input columns 0, 1, 3, 5 and 7
+ * (byte offsets 0, 2, 6, 10, 14). For each column it forms
+ * tmp10 = (in[row0] * quant[row0]) << 15
+ * tmp0 = d1*29692 + d3*(-10426) + d5*6967 + d7*(-5906)
+ * where dN = in[rowN] * quant[rowN], and stores
+ * (tmp10 + tmp0) and (tmp10 - tmp0), each rounded and shifted right by
+ * 13, into a 10-word scratch area on the stack (v0): words 0..4 hold the
+ * sums and words 5..9 the differences for the five columns. Loads and
+ * multiplies for the next column are interleaved with the current one.
+ *
+ * Pass 2 combines each group of five scratch words the same way (first
+ * word << 15, the other four weighted by s2..s5), rounds with a right
+ * shift by 20, saturates to a signed byte (shll_s.w 24 / sra 24), adds
+ * 128 and stores two bytes to each of the two output rows taken from
+ * output_buf[0] and output_buf[1], offset by output_col.
+ *
+ * ac0 is always cleared only after the last "mul" that precedes its use,
+ * because MIPS32 leaves HI/LO undefined after "mul".
+ *
+ * a0 = compptr->dct_table
2902
+ * a1 = coef_block
2903
+ * a2 = output_buf
2904
+ * a3 = output_col
2905
+ */
2906
+ .set at
2907
+
2908
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2909
+
2910
+ addiu sp, sp, -40 // 10-word scratch area for pass 1 results
2911
+ move v0, sp // v0 = scratch base
2912
+ addiu s2, zero, 29692 // scalar weights used by pass 2
2913
+ addiu s3, zero, -10426
2914
+ addiu s4, zero, 6967
2915
+ addiu s5, zero, -5906
2916
+ lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
2917
+ lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
2918
+ lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
2919
+ lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
2920
+ mul t4, t5, t0 // t4 = dequantized row 0
2921
+ lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
2922
+ lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
2923
+ mul t6, t6, t1 // t6 = dequantized row 3
2924
+ mul t5, t5, t0 // t5 = dequantized row 1
2925
+ lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
2926
+ lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
2927
+ lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
2928
+ lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
2929
+ mul t7, t7, t2 // t7 = dequantized row 5
2930
+ mul t8, t8, t3 // t8 = dequantized row 7
2931
+ mult zero, zero // ac0 = 0 (after the last mul: mul may leave HI/LO undefined)
2932
+ li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
2933
+ li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
2934
+ ins t6, t5, 16, 16 // t6 = t5|t6
2935
+ sll t4, t4, 15 // tmp10 = row 0 << 15
2936
+ dpa.w.ph $ac0, t6, s0 // ac0 += row1*29692 + row3*(-10426)
2937
+ lh t1, 2(a1) // next column (1): row 0 input
2938
+ lh t6, 2(a0) // next column (1): row 0 quant
2939
+ ins t8, t7, 16, 16 // t8 = t7|t8
2940
+ dpa.w.ph $ac0, t8, s1 // ac0 += row5*6967 + row7*(-5906)
2941
+ mflo t0, $ac0 // t0 = tmp0
2942
+ mul t5, t6, t1
2943
+ lh t1, 18(a1)
2944
+ lh t6, 18(a0)
2945
+ lh t2, 50(a1)
2946
+ lh t7, 50(a0)
2947
+ mul t6, t6, t1
2948
+ subu t8, t4, t0 // tmp10 - tmp0
2949
+ mul t7, t7, t2
2950
+ addu t0, t4, t0 // tmp10 + tmp0
2951
+ shra_r.w t0, t0, 13
2952
+ lh t1, 82(a1)
2953
+ lh t2, 82(a0)
2954
+ lh t3, 114(a1)
2955
+ lh t4, 114(a0)
2956
+ shra_r.w t8, t8, 13
2957
+ mul t1, t1, t2
2958
+ mul t3, t3, t4
2959
+ sw t0, 0(v0) // scratch[0]: column 0 sum
2960
+ sw t8, 20(v0) // scratch[5]: column 0 difference
2961
+ sll t4, t5, 15 // column 1: tmp10
2962
+ ins t7, t6, 16, 16 // t7 = row1|row3
2963
+ mult zero, zero // ac0 = 0
2964
+ dpa.w.ph $ac0, t7, s0
2965
+ ins t3, t1, 16, 16 // t3 = row5|row7
2966
+ lh t1, 6(a1) // next column (3): row 0 input
2967
+ lh t6, 6(a0) // next column (3): row 0 quant
2968
+ dpa.w.ph $ac0, t3, s1
2969
+ mflo t0, $ac0 // t0 = tmp0
2970
+ mul t5, t6, t1
2971
+ lh t1, 22(a1)
2972
+ lh t6, 22(a0)
2973
+ lh t2, 54(a1)
2974
+ lh t7, 54(a0)
2975
+ mul t6, t6, t1
2976
+ subu t8, t4, t0
2977
+ mul t7, t7, t2
2978
+ addu t0, t4, t0
2979
+ shra_r.w t0, t0, 13
2980
+ lh t1, 86(a1)
2981
+ lh t2, 86(a0)
2982
+ lh t3, 118(a1)
2983
+ lh t4, 118(a0)
2984
+ shra_r.w t8, t8, 13
2985
+ mul t1, t1, t2
2986
+ mul t3, t3, t4
2987
+ sw t0, 4(v0) // scratch[1]: column 1 sum
2988
+ sw t8, 24(v0) // scratch[6]: column 1 difference
2989
+ sll t4, t5, 15 // column 3: tmp10
2990
+ ins t7, t6, 16, 16
2991
+ mult zero, zero // ac0 = 0
2992
+ dpa.w.ph $ac0, t7, s0
2993
+ ins t3, t1, 16, 16
2994
+ lh t1, 10(a1) // next column (5): row 0 input
2995
+ lh t6, 10(a0) // next column (5): row 0 quant
2996
+ dpa.w.ph $ac0, t3, s1
2997
+ mflo t0, $ac0
2998
+ mul t5, t6, t1
2999
+ lh t1, 26(a1)
3000
+ lh t6, 26(a0)
3001
+ lh t2, 58(a1)
3002
+ lh t7, 58(a0)
3003
+ mul t6, t6, t1
3004
+ subu t8, t4, t0
3005
+ mul t7, t7, t2
3006
+ addu t0, t4, t0
3007
+ shra_r.w t0, t0, 13
3008
+ lh t1, 90(a1)
3009
+ lh t2, 90(a0)
3010
+ lh t3, 122(a1)
3011
+ lh t4, 122(a0)
3012
+ shra_r.w t8, t8, 13
3013
+ mul t1, t1, t2
3014
+ mul t3, t3, t4
3015
+ sw t0, 8(v0) // scratch[2]: column 3 sum
3016
+ sw t8, 28(v0) // scratch[7]: column 3 difference
3017
+ sll t4, t5, 15 // column 5: tmp10
3018
+ ins t7, t6, 16, 16
3019
+ mult zero, zero // ac0 = 0
3020
+ dpa.w.ph $ac0, t7, s0
3021
+ ins t3, t1, 16, 16
3022
+ lh t1, 14(a1) // next column (7): row 0 input
3023
+ lh t6, 14(a0) // next column (7): row 0 quant
3024
+ dpa.w.ph $ac0, t3, s1
3025
+ mflo t0, $ac0
3026
+ mul t5, t6, t1
3027
+ lh t1, 30(a1)
3028
+ lh t6, 30(a0)
3029
+ lh t2, 62(a1)
3030
+ lh t7, 62(a0)
3031
+ mul t6, t6, t1
3032
+ subu t8, t4, t0
3033
+ mul t7, t7, t2
3034
+ addu t0, t4, t0
3035
+ shra_r.w t0, t0, 13
3036
+ lh t1, 94(a1)
3037
+ lh t2, 94(a0)
3038
+ lh t3, 126(a1)
3039
+ lh t4, 126(a0)
3040
+ shra_r.w t8, t8, 13
3041
+ mul t1, t1, t2
3042
+ mul t3, t3, t4
3043
+ sw t0, 12(v0) // scratch[3]: column 5 sum
3044
+ sw t8, 32(v0) // scratch[8]: column 5 difference
3045
+ sll t4, t5, 15 // column 7: tmp10
3046
+ ins t7, t6, 16, 16
3047
+ mult zero, zero // ac0 = 0
3048
+ dpa.w.ph $ac0, t7, s0
3049
+ ins t3, t1, 16, 16
3050
+ dpa.w.ph $ac0, t3, s1
3051
+ mflo t0, $ac0
3052
+ lw t9, 0(a2) // t9 = output_buf[0]
3053
+ lw t3, 0(v0) // pass 2, first row: scratch[0]
3054
+ lw t7, 4(v0) // scratch[1]
3055
+ lw t1, 8(v0) // scratch[2]
3056
+ addu t9, t9, a3 // += output_col
3057
+ sll t3, t3, 15 // tmp10 for the first output row
3058
+ subu t8, t4, t0
3059
+ addu t0, t4, t0
3060
+ shra_r.w t0, t0, 13
3061
+ shra_r.w t8, t8, 13
3062
+ sw t0, 16(v0) // scratch[4]: column 7 sum
3063
+ sw t8, 36(v0) // scratch[9]: column 7 difference
3064
+ lw t5, 12(v0) // scratch[3]
3065
+ lw t6, 16(v0) // scratch[4]
3066
+ mult t7, s2 // ac0 = scratch[1] * 29692
3067
+ madd t1, s3 // ac0 += scratch[2] * -10426
3068
+ madd t5, s4 // ac0 += scratch[3] * 6967
3069
+ madd t6, s5 // ac0 += scratch[4] * -5906
3070
+ lw t5, 24(v0) // second row: scratch[6]
3071
+ lw t7, 28(v0) // scratch[7]
3072
+ mflo t0, $ac0 // t0 = tmp0 for the first output row
3073
+ lw t8, 32(v0) // scratch[8]
3074
+ lw t2, 36(v0) // scratch[9]
3075
+ mult $ac1, t5, s2 // ac1 = scratch[6] * 29692
3076
+ madd $ac1, t7, s3 // ac1 += scratch[7] * -10426
3077
+ madd $ac1, t8, s4 // ac1 += scratch[8] * 6967
3078
+ madd $ac1, t2, s5 // ac1 += scratch[9] * -5906
3079
+ addu t1, t3, t0 // tmp10 + tmp0
3080
+ subu t6, t3, t0 // tmp10 - tmp0
3081
+ shra_r.w t1, t1, 20
3082
+ shra_r.w t6, t6, 20
3083
+ mflo t4, $ac1 // t4 = tmp0 for the second output row
3084
+ shll_s.w t1, t1, 24 // saturate to a signed byte ...
3085
+ shll_s.w t6, t6, 24
3086
+ sra t1, t1, 24 // ... and bring it back down
3087
+ sra t6, t6, 24
3088
+ addiu t1, t1, 128 // shift to the unsigned 0..255 range
3089
+ addiu t6, t6, 128
3090
+ lw t0, 20(v0) // scratch[5]
3091
+ sb t1, 0(t9) // first row, byte 0
3092
+ sb t6, 1(t9) // first row, byte 1
3093
+ sll t0, t0, 15 // tmp10 for the second output row
3094
+ lw t9, 4(a2) // t9 = output_buf[1]
3095
+ addu t1, t0, t4
3096
+ subu t6, t0, t4
3097
+ addu t9, t9, a3 // += output_col
3098
+ shra_r.w t1, t1, 20
3099
+ shra_r.w t6, t6, 20
3100
+ shll_s.w t1, t1, 24
3101
+ shll_s.w t6, t6, 24
3102
+ sra t1, t1, 24
3103
+ sra t6, t6, 24
3104
+ addiu t1, t1, 128
3105
+ addiu t6, t6, 128
3106
+ sb t1, 0(t9) // second row, byte 0
3107
+ sb t6, 1(t9) // second row, byte 1
3108
+ addiu sp, sp, 40 // release the scratch area
3109
+
3110
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
3111
+
3112
+ j ra
3113
+ nop
3114
+
3115
+ END(jsimd_idct_2x2_dspr2)
3116
+
3117
+
3118
+ /*****************************************************************************/
3119
+ LEAF_DSPR2(jsimd_idct_4x4_dspr2)
3120
+ /*
3121
+ * a0 = compptr->dct_table
3122
+ * a1 = coef_block
3123
+ * a2 = output_buf
3124
+ * a3 = output_col
3125
+ * 16(sp) = workspace[DCTSIZE*4]; // buffers data between passes
3126
+ */
3127
+ .set at
3128
+
3129
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3130
+
3131
+ lw v1, 48(sp)
3132
+ move t0, a1
3133
+ move t1, v1
3134
+ li t9, 4
3135
+ li s0, 0x2e75f93e
3136
+ li s1, 0x21f9ba79
3137
+ li s2, 0xecc2efb0
3138
+ li s3, 0x52031ccd
3139
+
3140
+ 0:
3141
+ lh s6, 32(t0) // inptr[DCTSIZE*2]
3142
+ lh t6, 32(a0) // quantptr[DCTSIZE*2]
3143
+ lh s7, 96(t0) // inptr[DCTSIZE*6]
3144
+ lh t7, 96(a0) // quantptr[DCTSIZE*6]
3145
+ mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3146
+ lh s4, 0(t0) // inptr[DCTSIZE*0]
3147
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3148
+ lh s5, 0(a0) // quantptr[0]
3149
+ li s6, 15137
3150
+ li s7, 6270
3151
+ mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3152
+ mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3153
+ lh t5, 112(t0) // inptr[DCTSIZE*7]
3154
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3155
+ lh s4, 112(a0) // quantptr[DCTSIZE*7]
3156
+ lh v0, 80(t0) // inptr[DCTSIZE*5]
3157
+ lh s5, 80(a0) // quantptr[DCTSIZE*5]
3158
+ lh s6, 48(a0) // quantptr[DCTSIZE*3]
3159
+ sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3160
+ lh s7, 16(a0) // quantptr[DCTSIZE*1]
3161
+ lh t8, 16(t0) // inptr[DCTSIZE*1]
3162
+ subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3163
+ lh t7, 48(t0) // inptr[DCTSIZE*3]
3164
+ mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3165
+ mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3166
+ mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3167
+ mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3168
+ addu t3, t2, t6 // tmp10 = tmp0 + z2
3169
+ subu t4, t2, t6 // tmp10 = tmp0 - z2
3170
+ mult $ac0, zero, zero
3171
+ mult $ac1, zero, zero
3172
+ ins t5, v0, 16, 16
3173
+ ins t7, t8, 16, 16
3174
+ addiu t9, t9, -1
3175
+ dpa.w.ph $ac0, t5, s0
3176
+ dpa.w.ph $ac0, t7, s1
3177
+ dpa.w.ph $ac1, t5, s2
3178
+ dpa.w.ph $ac1, t7, s3
3179
+ mflo s4, $ac0
3180
+ mflo s5, $ac1
3181
+ addiu a0, a0, 2
3182
+ addiu t1, t1, 4
3183
+ addiu t0, t0, 2
3184
+ addu t6, t4, s4
3185
+ subu t5, t4, s4
3186
+ addu s6, t3, s5
3187
+ subu s7, t3, s5
3188
+ shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
3189
+ shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
3190
+ shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3191
+ shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3192
+ sw t6, 28(t1)
3193
+ sw t5, 60(t1)
3194
+ sw s6, -4(t1)
3195
+ bgtz t9, 0b
3196
+ sw s7, 92(t1)
3197
+ // second loop three pass
3198
+ li t9, 3
3199
+ 1:
3200
+ lh s6, 34(t0) // inptr[DCTSIZE*2]
3201
+ lh t6, 34(a0) // quantptr[DCTSIZE*2]
3202
+ lh s7, 98(t0) // inptr[DCTSIZE*6]
3203
+ lh t7, 98(a0) // quantptr[DCTSIZE*6]
3204
+ mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3205
+ lh s4, 2(t0) // inptr[DCTSIZE*0]
3206
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3207
+ lh s5, 2(a0) // quantptr[DCTSIZE*0]
3208
+ li s6, 15137
3209
+ li s7, 6270
3210
+ mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3211
+ mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3212
+ lh t5, 114(t0) // inptr[DCTSIZE*7]
3213
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3214
+ lh s4, 114(a0) // quantptr[DCTSIZE*7]
3215
+ lh s5, 82(a0) // quantptr[DCTSIZE*5]
3216
+ lh t6, 82(t0) // inptr[DCTSIZE*5]
3217
+ sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3218
+ lh s6, 50(a0) // quantptr[DCTSIZE*3]
3219
+ lh t8, 18(t0) // inptr[DCTSIZE*1]
3220
+ subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3221
+ lh t7, 50(t0) // inptr[DCTSIZE*3]
3222
+ lh s7, 18(a0) // quantptr[DCTSIZE*1]
3223
+ mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3224
+ mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3225
+ mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3226
+ mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3227
+ addu t3, t2, v0 // tmp10 = tmp0 + z2
3228
+ subu t4, t2, v0 // tmp10 = tmp0 - z2
3229
+ mult $ac0, zero, zero
3230
+ mult $ac1, zero, zero
3231
+ ins t5, t6, 16, 16
3232
+ ins t7, t8, 16, 16
3233
+ dpa.w.ph $ac0, t5, s0
3234
+ dpa.w.ph $ac0, t7, s1
3235
+ dpa.w.ph $ac1, t5, s2
3236
+ dpa.w.ph $ac1, t7, s3
3237
+ mflo t5, $ac0
3238
+ mflo t6, $ac1
3239
+ addiu t9, t9, -1
3240
+ addiu t0, t0, 2
3241
+ addiu a0, a0, 2
3242
+ addiu t1, t1, 4
3243
+ addu s5, t4, t5
3244
+ subu s4, t4, t5
3245
+ addu s6, t3, t6
3246
+ subu s7, t3, t6
3247
+ shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
3248
+ shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
3249
+ shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3250
+ shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3251
+ sw s5, 32(t1)
3252
+ sw s4, 64(t1)
3253
+ sw s6, 0(t1)
3254
+ bgtz t9, 1b
3255
+ sw s7, 96(t1)
3256
+ move t1, v1
3257
+ li s4, 15137
3258
+ lw s6, 8(t1) // wsptr[2]
3259
+ li s5, 6270
3260
+ lw s7, 24(t1) // wsptr[6]
3261
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3262
+ lw t2, 0(t1) // wsptr[0]
3263
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3264
+ lh t5, 28(t1) // wsptr[7]
3265
+ lh t6, 20(t1) // wsptr[5]
3266
+ lh t7, 12(t1) // wsptr[3]
3267
+ lh t8, 4(t1) // wsptr[1]
3268
+ ins t5, t6, 16, 16
3269
+ ins t7, t8, 16, 16
3270
+ mult $ac0, zero, zero
3271
+ dpa.w.ph $ac0, t5, s0
3272
+ dpa.w.ph $ac0, t7, s1
3273
+ mult $ac1, zero, zero
3274
+ dpa.w.ph $ac1, t5, s2
3275
+ dpa.w.ph $ac1, t7, s3
3276
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3277
+ mflo s6, $ac0
3278
+ // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3279
+ subu s4, s4, s5
3280
+ addu t3, t2, s4 // tmp10 = tmp0 + z2
3281
+ mflo s7, $ac1
3282
+ subu t4, t2, s4 // tmp10 = tmp0 - z2
3283
+ addu t7, t4, s6
3284
+ subu t8, t4, s6
3285
+ addu t5, t3, s7
3286
+ subu t6, t3, s7
3287
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3288
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3289
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3290
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3291
+ sll s4, t9, 2
3292
+ lw v0, 0(a2) // output_buf[ctr]
3293
+ shll_s.w t5, t5, 24
3294
+ shll_s.w t6, t6, 24
3295
+ shll_s.w t7, t7, 24
3296
+ shll_s.w t8, t8, 24
3297
+ sra t5, t5, 24
3298
+ sra t6, t6, 24
3299
+ sra t7, t7, 24
3300
+ sra t8, t8, 24
3301
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3302
+ addiu t5, t5, 128
3303
+ addiu t6, t6, 128
3304
+ addiu t7, t7, 128
3305
+ addiu t8, t8, 128
3306
+ sb t5, 0(v0)
3307
+ sb t7, 1(v0)
3308
+ sb t8, 2(v0)
3309
+ sb t6, 3(v0)
3310
+ // 2
3311
+ li s4, 15137
3312
+ lw s6, 40(t1) // wsptr[2]
3313
+ li s5, 6270
3314
+ lw s7, 56(t1) // wsptr[6]
3315
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3316
+ lw t2, 32(t1) // wsptr[0]
3317
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3318
+ lh t5, 60(t1) // wsptr[7]
3319
+ lh t6, 52(t1) // wsptr[5]
3320
+ lh t7, 44(t1) // wsptr[3]
3321
+ lh t8, 36(t1) // wsptr[1]
3322
+ ins t5, t6, 16, 16
3323
+ ins t7, t8, 16, 16
3324
+ mult $ac0, zero, zero
3325
+ dpa.w.ph $ac0, t5, s0
3326
+ dpa.w.ph $ac0, t7, s1
3327
+ mult $ac1, zero, zero
3328
+ dpa.w.ph $ac1, t5, s2
3329
+ dpa.w.ph $ac1, t7, s3
3330
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3331
+ mflo s6, $ac0
3332
+ // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3333
+ subu s4, s4, s5
3334
+ addu t3, t2, s4 // tmp10 = tmp0 + z2
3335
+ mflo s7, $ac1
3336
+ subu t4, t2, s4 // tmp10 = tmp0 - z2
3337
+ addu t7, t4, s6
3338
+ subu t8, t4, s6
3339
+ addu t5, t3, s7
3340
+ subu t6, t3, s7
3341
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
3342
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
3343
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
3344
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
3345
+ sll s4, t9, 2
3346
+ lw v0, 4(a2) // output_buf[ctr]
3347
+ shll_s.w t5, t5, 24
3348
+ shll_s.w t6, t6, 24
3349
+ shll_s.w t7, t7, 24
3350
+ shll_s.w t8, t8, 24
3351
+ sra t5, t5, 24
3352
+ sra t6, t6, 24
3353
+ sra t7, t7, 24
3354
+ sra t8, t8, 24
3355
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3356
+ addiu t5, t5, 128
3357
+ addiu t6, t6, 128
3358
+ addiu t7, t7, 128
3359
+ addiu t8, t8, 128
3360
+ sb t5, 0(v0)
3361
+ sb t7, 1(v0)
3362
+ sb t8, 2(v0)
3363
+ sb t6, 3(v0)
3364
+ // 3
3365
+ li s4, 15137
3366
+ lw s6, 72(t1) // wsptr[2]
3367
+ li s5, 6270
3368
+ lw s7, 88(t1) // wsptr[6]
3369
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3370
+ lw t2, 64(t1) // wsptr[0]
3371
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3372
+ lh t5, 92(t1) // wsptr[7]
3373
+ lh t6, 84(t1) // wsptr[5]
3374
+ lh t7, 76(t1) // wsptr[3]
3375
+ lh t8, 68(t1) // wsptr[1]
3376
+ ins t5, t6, 16, 16
3377
+ ins t7, t8, 16, 16
3378
+ mult $ac0, zero, zero
3379
+ dpa.w.ph $ac0, t5, s0
3380
+ dpa.w.ph $ac0, t7, s1
3381
+ mult $ac1, zero, zero
3382
+ dpa.w.ph $ac1, t5, s2
3383
+ dpa.w.ph $ac1, t7, s3
3384
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3385
+ mflo s6, $ac0
3386
+ // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3387
+ subu s4, s4, s5
3388
+ addu t3, t2, s4 // tmp10 = tmp0 + z2
3389
+ mflo s7, $ac1
3390
+ subu t4, t2, s4 // tmp10 = tmp0 - z2
3391
+ addu t7, t4, s6
3392
+ subu t8, t4, s6
3393
+ addu t5, t3, s7
3394
+ subu t6, t3, s7
3395
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3396
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3397
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3398
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3399
+ sll s4, t9, 2
3400
+ lw v0, 8(a2) // output_buf[ctr]
3401
+ shll_s.w t5, t5, 24
3402
+ shll_s.w t6, t6, 24
3403
+ shll_s.w t7, t7, 24
3404
+ shll_s.w t8, t8, 24
3405
+ sra t5, t5, 24
3406
+ sra t6, t6, 24
3407
+ sra t7, t7, 24
3408
+ sra t8, t8, 24
3409
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3410
+ addiu t5, t5, 128
3411
+ addiu t6, t6, 128
3412
+ addiu t7, t7, 128
3413
+ addiu t8, t8, 128
3414
+ sb t5, 0(v0)
3415
+ sb t7, 1(v0)
3416
+ sb t8, 2(v0)
3417
+ sb t6, 3(v0)
3418
+ li s4, 15137
3419
+ lw s6, 104(t1) // wsptr[2]
3420
+ li s5, 6270
3421
+ lw s7, 120(t1) // wsptr[6]
3422
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3423
+ lw t2, 96(t1) // wsptr[0]
3424
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3425
+ lh t5, 124(t1) // wsptr[7]
3426
+ lh t6, 116(t1) // wsptr[5]
3427
+ lh t7, 108(t1) // wsptr[3]
3428
+ lh t8, 100(t1) // wsptr[1]
3429
+ ins t5, t6, 16, 16
3430
+ ins t7, t8, 16, 16
3431
+ mult $ac0, zero, zero
3432
+ dpa.w.ph $ac0, t5, s0
3433
+ dpa.w.ph $ac0, t7, s1
3434
+ mult $ac1, zero, zero
3435
+ dpa.w.ph $ac1, t5, s2
3436
+ dpa.w.ph $ac1, t7, s3
3437
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3438
+ mflo s6, $ac0
3439
+ // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3440
+ subu s4, s4, s5
3441
+ addu t3, t2, s4 // tmp10 = tmp0 + z2;
3442
+ mflo s7, $ac1
3443
+ subu t4, t2, s4 // tmp10 = tmp0 - z2;
3444
+ addu t7, t4, s6
3445
+ subu t8, t4, s6
3446
+ addu t5, t3, s7
3447
+ subu t6, t3, s7
3448
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3449
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3450
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3451
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3452
+ sll s4, t9, 2
3453
+ lw v0, 12(a2) // output_buf[ctr]
3454
+ shll_s.w t5, t5, 24
3455
+ shll_s.w t6, t6, 24
3456
+ shll_s.w t7, t7, 24
3457
+ shll_s.w t8, t8, 24
3458
+ sra t5, t5, 24
3459
+ sra t6, t6, 24
3460
+ sra t7, t7, 24
3461
+ sra t8, t8, 24
3462
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3463
+ addiu t5, t5, 128
3464
+ addiu t6, t6, 128
3465
+ addiu t7, t7, 128
3466
+ addiu t8, t8, 128
3467
+ sb t5, 0(v0)
3468
+ sb t7, 1(v0)
3469
+ sb t8, 2(v0)
3470
+ sb t6, 3(v0)
3471
+
3472
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3473
+
3474
+ j ra
3475
+ nop
3476
+ END(jsimd_idct_4x4_dspr2)
3477
+
3478
+
3479
+ /*****************************************************************************/
3480
+ LEAF_DSPR2(jsimd_idct_6x6_dspr2)
3481
+ /*
3482
+ * a0 = compptr->dct_table (read as 16-bit entries; advanced 2 bytes per pass-1 iteration)
3483
+ * a1 = coef_block (read as 16-bit entries; advanced 2 bytes per pass-1 iteration)
3484
+ * a2 = output_buf (array of row pointers; advanced 4 bytes per pass-2 iteration)
3485
+ * a3 = output_col (byte offset added to each row pointer)
3486
+ */
3487
+ .set at
3488
+
3489
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3490
+
3491
+ addiu sp, sp, -144 // reserve a 144-byte work array (6 rows * 6 words) on the stack
3492
+ move v0, sp // v0 = wsptr, write pointer into the work array
3493
+ addiu v1, v0, 24 // v1 = end marker: pass 1 runs 6 times (v0 advances by 4)
3494
+ addiu t9, zero, 5793 // t9 = 5793 (~ 0.707106781 * 2^13)
3495
+ addiu s0, zero, 10033 // s0 = 10033 (~ 1.224744871 * 2^13)
3496
+ addiu s1, zero, 2998 // s1 = 2998 (~ 0.366025404 * 2^13)
3497
+
3498
+ 1: // Pass 1: one iteration per input column, results go to the work array
3499
+ lh s2, 0(a0) // q0 = quantptr[ 0]
3500
+ lh s3, 32(a0) // q1 = quantptr[16]
3501
+ lh s4, 64(a0) // q2 = quantptr[32]
3502
+ lh t2, 64(a1) // tmp2 = inptr[32]
3503
+ lh t1, 32(a1) // tmp1 = inptr[16]
3504
+ lh t0, 0(a1) // tmp0 = inptr[ 0]
3505
+ mul t2, t2, s4 // tmp2 = tmp2 * q2
3506
+ mul t1, t1, s3 // tmp1 = tmp1 * q1
3507
+ mul t0, t0, s2 // tmp0 = tmp0 * q0
3508
+ lh t6, 16(a1) // z1 = inptr[ 8]
3509
+ lh t8, 80(a1) // z3 = inptr[40]
3510
+ lh t7, 48(a1) // z2 = inptr[24]
3511
+ lh s2, 16(a0) // q0 = quantptr[ 8]
3512
+ lh s4, 80(a0) // q2 = quantptr[40]
3513
+ lh s3, 48(a0) // q1 = quantptr[24]
3514
+ mul t2, t2, t9 // tmp2 = tmp2 * 5793
3515
+ mul t1, t1, s0 // tmp1 = tmp1 * 10033
3516
+ sll t0, t0, 13 // tmp0 = tmp0 << 13
3517
+ mul t6, t6, s2 // z1 = z1 * q0
3518
+ mul t8, t8, s4 // z3 = z3 * q2
3519
+ mul t7, t7, s3 // z2 = z2 * q1
3520
+ addu t3, t0, t2 // tmp10 = tmp0 + tmp2
3521
+ sll t2, t2, 1 // tmp2 = tmp2 << 1 (i.e. 2 * tmp2)
3522
+ subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
3523
+ subu t5, t3, t1 // tmp12 = tmp10 - tmp1
3524
+ addu t3, t3, t1 // tmp10 = tmp10 + tmp1
3525
+ addu t1, t6, t8 // tmp1 = z1 + z3
3526
+ mul t1, t1, s1 // tmp1 = tmp1 * 2998
3527
+ shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3528
+ subu t2, t6, t8 // tmp2 = z1 - z3
3529
+ subu t2, t2, t7 // tmp2 = tmp2 - z2
3530
+ sll t2, t2, 2 // tmp2 = tmp2 << 2
3531
+ addu t0, t6, t7 // tmp0 = z1 + z2
3532
+ sll t0, t0, 13 // tmp0 = tmp0 << 13
3533
+ subu s2, t8, t7 // q0 = z3 - z2
3534
+ sll s2, s2, 13 // q0 = q0 << 13
3535
+ addu t0, t0, t1 // tmp0 = tmp0 + tmp1
3536
+ addu t1, s2, t1 // tmp1 = q0 + tmp1
3537
+ addu s2, t4, t2 // q0 = tmp11 + tmp2
3538
+ subu s3, t4, t2 // q1 = tmp11 - tmp2
3539
+ addu t6, t3, t0 // z1 = tmp10 + tmp0
3540
+ subu t7, t3, t0 // z2 = tmp10 - tmp0
3541
+ addu t4, t5, t1 // tmp11 = tmp12 + tmp1
3542
+ subu t5, t5, t1 // tmp12 = tmp12 - tmp1
3543
+ shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
3544
+ shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
3545
+ shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3546
+ shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
3547
+ sw s2, 24(v0) // wsptr[6*1] = q0
3548
+ sw s3, 96(v0) // wsptr[6*4] = q1
3549
+ sw t6, 0(v0) // wsptr[6*0] = z1
3550
+ sw t7, 120(v0) // wsptr[6*5] = z2
3551
+ sw t4, 48(v0) // wsptr[6*2] = tmp11
3552
+ sw t5, 72(v0) // wsptr[6*3] = tmp12
3553
+ addiu v0, v0, 4 // next work-array column
3554
+ addiu a1, a1, 2 // next coefficient column
3555
+ bne v0, v1, 1b // loop until 6 columns are done
3556
+ addiu a0, a0, 2 // next quantization column (follows the branch; presumably its delay slot)
3557
+
3558
+ /* Pass 2: process 6 rows from work array, store into output array. */
3559
+ move v0, sp // v0 = wsptr, start of the work array
3560
+ addiu v1, v0, 144 // v1 = end of the work array (6 rows * 24 bytes)
3561
+
3562
+ 2: // one iteration per output row
3563
+ lw t0, 0(v0) // tmp0 = wsptr[0]
3564
+ lw t2, 16(v0) // t2 = wsptr[4]
3565
+ lw s5, 0(a2) // s5 = current row pointer from output_buf
3566
+ addiu t0, t0, 16 // +16 becomes 2^17 after the << 13 below (rounding term for the final >> 18)
3567
+ sll t0, t0, 13 // tmp0 = tmp0 << 13
3568
+ mul t3, t2, t9 // t3 = wsptr[4] * 5793
3569
+ lw t6, 4(v0) // z1 = wsptr[1]
3570
+ lw t8, 20(v0) // z3 = wsptr[5]
3571
+ lw t7, 12(v0) // z2 = wsptr[3]
3572
+ addu s5, s5, a3 // outptr = row pointer + output_col
3573
+ addu s6, t6, t8 // s6 = z1 + z3
3574
+ mul s6, s6, s1 // s6 = (z1 + z3) * 2998
3575
+ addu t1, t0, t3 // t1 = tmp0 + t3
3576
+ subu t4, t0, t3 // t4 = tmp0 - t3
3577
+ subu t4, t4, t3 // t4 = tmp0 - 2 * t3
3578
+ lw t3, 8(v0) // t3 = wsptr[2]
3579
+ mul t0, t3, s0 // t0 = wsptr[2] * 10033
3580
+ addu s7, t6, t7 // s7 = z1 + z2
3581
+ sll s7, s7, 13 // s7 = (z1 + z2) << 13
3582
+ addu s7, s6, s7 // s7 = s6 + ((z1 + z2) << 13)
3583
+ subu t2, t8, t7 // t2 = z3 - z2
3584
+ sll t2, t2, 13 // t2 = (z3 - z2) << 13
3585
+ addu t2, s6, t2 // t2 = s6 + ((z3 - z2) << 13)
3586
+ subu s6, t6, t7 // s6 = z1 - z2
3587
+ subu s6, s6, t8 // s6 = z1 - z2 - z3
3588
+ sll s6, s6, 13 // s6 = (z1 - z2 - z3) << 13
3589
+ addu t3, t1, t0 // t3 = t1 + t0 (even-part sum)
3590
+ subu t5, t1, t0 // t5 = t1 - t0 (even-part difference)
3591
+ addu t6, t3, s7 // value for outptr[0]
3592
+ subu t3, t3, s7 // value for outptr[5]
3593
+ addu t7, t4, s6 // value for outptr[1]
3594
+ subu t4, t4, s6 // value for outptr[4]
3595
+ addu t8, t5, t2 // value for outptr[2]
3596
+ subu t5, t5, t2 // value for outptr[3]
3597
+ shll_s.w t6, t6, 6 // saturating << 6; with the sra 24 below this yields a clamped (x >> 18)
3598
+ shll_s.w t3, t3, 6 // saturating << 6
3599
+ shll_s.w t7, t7, 6 // saturating << 6
3600
+ shll_s.w t4, t4, 6 // saturating << 6
3601
+ shll_s.w t8, t8, 6 // saturating << 6
3602
+ shll_s.w t5, t5, 6 // saturating << 6
3603
+ sra t6, t6, 24 // keep the signed top byte
3604
+ addiu t6, t6, 128 // re-center: +128
3605
+ sra t3, t3, 24 // keep the signed top byte
3606
+ addiu t3, t3, 128 // re-center: +128
3607
+ sb t6, 0(s5) // outptr[0]
3608
+ sra t7, t7, 24 // keep the signed top byte
3609
+ addiu t7, t7, 128 // re-center: +128
3610
+ sb t3, 5(s5) // outptr[5]
3611
+ sra t4, t4, 24 // keep the signed top byte
3612
+ addiu t4, t4, 128 // re-center: +128
3613
+ sb t7, 1(s5) // outptr[1]
3614
+ sra t8, t8, 24 // keep the signed top byte
3615
+ addiu t8, t8, 128 // re-center: +128
3616
+ sb t4, 4(s5) // outptr[4]
3617
+ addiu v0, v0, 24 // next work-array row (6 words)
3618
+ sra t5, t5, 24 // keep the signed top byte
3619
+ addiu t5, t5, 128 // re-center: +128
3620
+ sb t8, 2(s5) // outptr[2]
3621
+ addiu a2, a2, 4 // next output row pointer
3622
+ bne v0, v1, 2b // loop until all 6 rows are done
3623
+ sb t5, 3(s5) // outptr[3] (follows the branch; presumably its delay slot)
3624
+
3625
+ addiu sp, sp, 144 // release the work array
3626
+
3627
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3628
+
3629
+ j ra // return to caller
3630
+ nop
3631
+
3632
+ END(jsimd_idct_6x6_dspr2)
3633
+
3634
+
3635
+ /*****************************************************************************/
3636
+ LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
3637
+ /*
3638
+ * a0 = compptr->dct_table (read as 16-bit entries; advanced 2 bytes per iteration)
3639
+ * a1 = coef_block (read as 16-bit entries; advanced 2 bytes per iteration)
3640
+ * a2 = workspace (12 words written per iteration, 32 bytes apart; advanced 4 bytes per iteration)
3641
+ */
3642
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3643
+
3644
+ li a3, 8 // a3 = loop counter: 8 columns
3645
+
3646
+ 1: // one iteration per input column
3647
+ // odd part
3648
+ lh t0, 48(a1) // inptr[DCTSIZE*3]
3649
+ lh t1, 48(a0) // quantptr[DCTSIZE*3]
3650
+ lh t2, 16(a1) // inptr[DCTSIZE*1]
3651
+ lh t3, 16(a0) // quantptr[DCTSIZE*1]
3652
+ lh t4, 80(a1) // inptr[DCTSIZE*5]
3653
+ lh t5, 80(a0) // quantptr[DCTSIZE*5]
3654
+ lh t6, 112(a1) // inptr[DCTSIZE*7]
3655
+ lh t7, 112(a0) // quantptr[DCTSIZE*7]
3656
+ mul t0, t0, t1 // z2
3657
+ mul t1, t2, t3 // z1
3658
+ mul t2, t4, t5 // z3
3659
+ mul t3, t6, t7 // z4
3660
+ li t4, 10703 // FIX(1.306562965)
3661
+ li t5, 4433 // FIX_0_541196100
3662
+ li t6, 7053 // FIX(0.860918669)
3663
+ mul t4, t0, t4 // tmp11
3664
+ mul t5, t0, t5 // -tmp14
3665
+ addu t7, t1, t2 // tmp10
3666
+ addu t8, t7, t3 // tmp10 + z4
3667
+ mul t6, t6, t8 // tmp15
3668
+ li t8, 2139 // FIX(0.261052384)
3669
+ mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
3670
+ li t7, 2295 // FIX(0.280143716)
3671
+ mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
3672
+ addu t9, t2, t3 // z3 + z4
3673
+ li s0, 8565 // FIX(1.045510580)
3674
+ mul t9, t9, s0 // -tmp13
3675
+ li s0, 12112 // FIX(1.478575242)
3676
+ mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3677
+ li s1, 12998 // FIX(1.586706681)
3678
+ mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3679
+ li s2, 5540 // FIX(0.676326758)
3680
+ mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3681
+ li s3, 16244 // FIX(1.982889723)
3682
+ mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3683
+ subu t1, t1, t3 // z1 -= z4
3684
+ subu t0, t0, t2 // z2 -= z3
3685
+ addu t2, t0, t1 // z1 + z2
3686
+ li t3, 4433 // FIX_0_541196100
3687
+ mul t2, t2, t3 // z3
3688
+ li t3, 6270 // FIX_0_765366865
3689
+ mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3690
+ li t3, 15137 // FIX_1_847759065
3691
+ mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3692
+ addu t8, t6, t8 // tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384))
3693
+ addu t3, t8, t4 // tmp12 + tmp11
3694
+ addu t3, t3, t7 // tmp10
3695
+ subu t8, t8, t9 // tmp12 + tmp13 (t9 holds -tmp13)
3696
+ addu s0, t5, s0 // s0 = -tmp14 + MULTIPLY(z3, FIX(1.478575242))
3697
+ subu t8, t8, s0 // tmp12
3698
+ subu t9, t6, t9 // tmp15 + tmp13 (t9 held -tmp13)
3699
+ subu s1, s1, t4 // MULTIPLY(z4, FIX(1.586706681)) - tmp11
3700
+ addu t9, t9, s1 // tmp13
3701
+ subu t6, t6, t5 // tmp15 + tmp14 (t5 holds -tmp14)
3702
+ subu t6, t6, s2 // - MULTIPLY(z1, FIX(0.676326758))
3703
+ subu t6, t6, s3 // tmp15
3704
+ // even part start
3705
+ lh t4, 64(a1) // inptr[DCTSIZE*4]
3706
+ lh t5, 64(a0) // quantptr[DCTSIZE*4]
3707
+ lh t7, 32(a1) // inptr[DCTSIZE*2]
3708
+ lh s0, 32(a0) // quantptr[DCTSIZE*2]
3709
+ lh s1, 0(a1) // inptr[DCTSIZE*0]
3710
+ lh s2, 0(a0) // quantptr[DCTSIZE*0]
3711
+ lh s3, 96(a1) // inptr[DCTSIZE*6]
3712
+ lh v0, 96(a0) // quantptr[DCTSIZE*6]
3713
+ mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
3714
+ mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
3715
+ mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
3716
+ mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
3717
+ // odd part end
3718
+ addu t1, t2, t1 // tmp11
3719
+ subu t0, t2, t0 // tmp14
3720
+ // update counter and pointers
3721
+ addiu a3, a3, -1 // one column fewer to go
3722
+ addiu a0, a0, 2 // next quantization column
3723
+ addiu a1, a1, 2 // next coefficient column
3724
+ // even part rest
3725
+ li s1, 10033 // FIX(1.224744871)
3726
+ li s2, 11190 // FIX(1.366025404)
3727
+ mul t4, t4, s1 // z4 = coefficient 4 * FIX(1.224744871)
3728
+ mul s1, t5, s2 // z4 = z1 * FIX(1.366025404) (z1 still unshifted here)
3729
+ sll t5, t5, 13 // z1
3730
+ sll t7, t7, 13 // z3 = coefficient 0 << 13
3731
+ addiu t7, t7, 1024 // z3 (+1024: rounding term for the >> 11 below)
3732
+ sll s0, s0, 13 // z2
3733
+ addu s2, t7, t4 // tmp10
3734
+ subu t4, t7, t4 // tmp11
3735
+ subu s3, t5, s0 // tmp12
3736
+ addu t2, t7, s3 // tmp21
3737
+ subu s3, t7, s3 // tmp24
3738
+ addu t7, s1, s0 // tmp12
3739
+ addu v0, s2, t7 // tmp20
3740
+ subu s2, s2, t7 // tmp25
3741
+ subu s1, s1, t5 // z4 - z1
3742
+ subu s1, s1, s0 // tmp12
3743
+ addu s0, t4, s1 // tmp22
3744
+ subu t4, t4, s1 // tmp23
3745
+ // final output stage
3746
+ addu t5, v0, t3 // tmp20 + tmp10
3747
+ subu v0, v0, t3 // tmp20 - tmp10
3748
+ addu t3, t2, t1 // tmp21 + tmp11
3749
+ subu t2, t2, t1 // tmp21 - tmp11
3750
+ addu t1, s0, t8 // tmp22 + tmp12
3751
+ subu s0, s0, t8 // tmp22 - tmp12
3752
+ addu t8, t4, t9 // tmp23 + tmp13
3753
+ subu t4, t4, t9 // tmp23 - tmp13
3754
+ addu t9, s3, t0 // tmp24 + tmp14
3755
+ subu s3, s3, t0 // tmp24 - tmp14
3756
+ addu t0, s2, t6 // tmp25 + tmp15
3757
+ subu s2, s2, t6 // tmp25 - tmp15
3758
+ sra t5, t5, 11 // descale every result by >> 11
3759
+ sra t3, t3, 11
3760
+ sra t1, t1, 11
3761
+ sra t8, t8, 11
3762
+ sra t9, t9, 11
3763
+ sra t0, t0, 11
3764
+ sra s2, s2, 11
3765
+ sra s3, s3, 11
3766
+ sra t4, t4, 11
3767
+ sra s0, s0, 11
3768
+ sra t2, t2, 11
3769
+ sra v0, v0, 11
3770
+ sw t5, 0(a2) // wsptr[8*0] = tmp20 + tmp10
3771
+ sw t3, 32(a2) // wsptr[8*1] = tmp21 + tmp11
3772
+ sw t1, 64(a2) // wsptr[8*2] = tmp22 + tmp12
3773
+ sw t8, 96(a2) // wsptr[8*3] = tmp23 + tmp13
3774
+ sw t9, 128(a2) // wsptr[8*4] = tmp24 + tmp14
3775
+ sw t0, 160(a2) // wsptr[8*5] = tmp25 + tmp15
3776
+ sw s2, 192(a2) // wsptr[8*6] = tmp25 - tmp15
3777
+ sw s3, 224(a2) // wsptr[8*7] = tmp24 - tmp14
3778
+ sw t4, 256(a2) // wsptr[8*8] = tmp23 - tmp13
3779
+ sw s0, 288(a2) // wsptr[8*9] = tmp22 - tmp12
3780
+ sw t2, 320(a2) // wsptr[8*10] = tmp21 - tmp11
3781
+ sw v0, 352(a2) // wsptr[8*11] = tmp20 - tmp10
3782
+ bgtz a3, 1b // loop while columns remain
3783
+ addiu a2, a2, 4 // next workspace column (follows the branch; presumably its delay slot)
3784
+
3785
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3786
+
3787
+ j ra // return to caller
3788
+ nop
3789
+
3790
+ END(jsimd_idct_12x12_pass1_dspr2)
3791
+
3792
+
3793
+ /*****************************************************************************/
3794
+ LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
3795
+ /*
3796
+ * a0 = workspace (8 words read per row; advanced 32 bytes per iteration)
3797
+ * a1 = output (array of row pointers; advanced 4 bytes per iteration, 12 bytes stored per row)
3798
+ */
3799
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3800
+
3801
+ li a3, 12 // a3 = loop counter: 12 rows
3802
+
3803
+ 1: // one iteration per output row
3804
+ // Odd part
3805
+ lw t0, 12(a0) // z2 = wsptr[3]
3806
+ lw t1, 4(a0) // z1 = wsptr[1]
3807
+ lw t2, 20(a0) // z3 = wsptr[5]
3808
+ lw t3, 28(a0) // z4 = wsptr[7]
3809
+ li t4, 10703 // FIX(1.306562965)
3810
+ li t5, 4433 // FIX_0_541196100
3811
+ mul t4, t0, t4 // tmp11
3812
+ mul t5, t0, t5 // -tmp14
3813
+ addu t6, t1, t2 // tmp10
3814
+ li t7, 2139 // FIX(0.261052384)
3815
+ mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
3816
+ addu t6, t6, t3 // tmp10 + z4
3817
+ li t8, 7053 // FIX(0.860918669)
3818
+ mul t6, t6, t8 // tmp15
3819
+ li t8, 2295 // FIX(0.280143716)
3820
+ mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
3821
+ addu t9, t2, t3 // z3 + z4
3822
+ li s0, 8565 // FIX(1.045510580)
3823
+ mul t9, t9, s0 // -tmp13
3824
+ li s0, 12112 // FIX(1.478575242)
3825
+ mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3826
+ li s1, 12998 // FIX(1.586706681)
3827
+ mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3828
+ li s2, 5540 // FIX(0.676326758)
3829
+ mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3830
+ li s3, 16244 // FIX(1.982889723)
3831
+ mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3832
+ subu t1, t1, t3 // z1 -= z4
3833
+ subu t0, t0, t2 // z2 -= z3
3834
+ addu t2, t1, t0 // z1 + z2
3835
+ li t3, 4433 // FIX_0_541196100
3836
+ mul t2, t2, t3 // z3
3837
+ li t3, 6270 // FIX_0_765366865
3838
+ mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3839
+ li t3, 15137 // FIX_1_847759065
3840
+ mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3841
+ addu t3, t6, t7 // tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384))
3842
+ addu t7, t3, t4 // tmp12 + tmp11
3843
+ addu t7, t7, t8 // tmp10
3844
+ subu t3, t3, t9 // tmp12 + tmp13 (t9 holds -tmp13)
3845
+ subu t3, t3, t5 // + tmp14 (t5 holds -tmp14)
3846
+ subu t3, t3, s0 // tmp12
3847
+ subu t9, t6, t9 // tmp15 + tmp13 (t9 held -tmp13)
3848
+ subu t9, t9, t4 // - tmp11
3849
+ addu t9, t9, s1 // tmp13
3850
+ subu t6, t6, t5 // tmp15 + tmp14 (t5 holds -tmp14)
3851
+ subu t6, t6, s2 // - MULTIPLY(z1, FIX(0.676326758))
3852
+ subu t6, t6, s3 // tmp15
3853
+ addu t1, t2, t1 // tmp11
3854
+ subu t0, t2, t0 // tmp14
3855
+ // even part
3856
+ lw t2, 16(a0) // z4
3857
+ lw t4, 8(a0) // z1
3858
+ lw t5, 0(a0) // z3
3859
+ lw t8, 24(a0) // z2
3860
+ li s0, 10033 // FIX(1.224744871)
3861
+ li s1, 11190 // FIX(1.366025404)
3862
+ mul t2, t2, s0 // z4 = wsptr[4] * FIX(1.224744871)
3863
+ mul s0, t4, s1 // z4 = z1 * FIX(1.366025404) (z1 still unshifted here)
3864
+ addiu t5, t5, 0x10 // +16 becomes 2^17 after the << 13 below (rounding term for the final scaling)
3865
+ sll t5, t5, 13 // z3
3866
+ sll t4, t4, 13 // z1
3867
+ sll t8, t8, 13 // z2
3868
+ subu s1, t4, t8 // tmp12
3869
+ addu s2, t5, t2 // tmp10
3870
+ subu t2, t5, t2 // tmp11
3871
+ addu s3, t5, s1 // tmp21
3872
+ subu s1, t5, s1 // tmp24
3873
+ addu t5, s0, t8 // tmp12
3874
+ addu v0, s2, t5 // tmp20
3875
+ subu t5, s2, t5 // tmp25
3876
+ subu t4, s0, t4 // z4 - z1
3877
+ subu t4, t4, t8 // tmp12
3878
+ addu t8, t2, t4 // tmp22
3879
+ subu t2, t2, t4 // tmp23
3880
+ // update counter and pointers
3881
+ addiu a3, a3, -1 // one row fewer to go
3882
+ addiu a0, a0, 32 // next workspace row (8 words)
3883
+ // Final stage
3884
+ addu t4, v0, t7 // tmp20 + tmp10
3885
+ subu v0, v0, t7 // tmp20 - tmp10
3886
+ addu t7, s3, t1 // tmp21 + tmp11
3887
+ subu s3, s3, t1 // tmp21 - tmp11
3888
+ addu t1, t8, t3 // tmp22 + tmp12
3889
+ subu t8, t8, t3 // tmp22 - tmp12
3890
+ addu t3, t2, t9 // tmp23 + tmp13
3891
+ subu t2, t2, t9 // tmp23 - tmp13
3892
+ addu t9, s1, t0 // tmp24 + tmp14
3893
+ subu s1, s1, t0 // tmp24 - tmp14
3894
+ addu t0, t5, t6 // tmp25 + tmp15
3895
+ subu t5, t5, t6 // tmp25 - tmp15
3896
+ sll t4, t4, 4 // plain << 4 on every result (no saturation at this step)
3897
+ sll t7, t7, 4
3898
+ sll t1, t1, 4
3899
+ sll t3, t3, 4
3900
+ sll t9, t9, 4
3901
+ sll t0, t0, 4
3902
+ sll t5, t5, 4
3903
+ sll s1, s1, 4
3904
+ sll t2, t2, 4
3905
+ sll t8, t8, 4
3906
+ sll s3, s3, 4
3907
+ sll v0, v0, 4
3908
+ shll_s.w t4, t4, 2 // saturating << 2 on every result (total left shift is now 6)
3909
+ shll_s.w t7, t7, 2
3910
+ shll_s.w t1, t1, 2
3911
+ shll_s.w t3, t3, 2
3912
+ shll_s.w t9, t9, 2
3913
+ shll_s.w t0, t0, 2
3914
+ shll_s.w t5, t5, 2
3915
+ shll_s.w s1, s1, 2
3916
+ shll_s.w t2, t2, 2
3917
+ shll_s.w t8, t8, 2
3918
+ shll_s.w s3, s3, 2
3919
+ shll_s.w v0, v0, 2
3920
+ srl t4, t4, 24 // keep the top byte of every result
3921
+ srl t7, t7, 24
3922
+ srl t1, t1, 24
3923
+ srl t3, t3, 24
3924
+ srl t9, t9, 24
3925
+ srl t0, t0, 24
3926
+ srl t5, t5, 24
3927
+ srl s1, s1, 24
3928
+ srl t2, t2, 24
3929
+ srl t8, t8, 24
3930
+ srl s3, s3, 24
3931
+ srl v0, v0, 24
3932
+ lw t6, 0(a1) // t6 = current output row pointer
3933
+ addiu t4, t4, 0x80 // re-center every result: +128 (only the low byte is stored below)
3934
+ addiu t7, t7, 0x80
3935
+ addiu t1, t1, 0x80
3936
+ addiu t3, t3, 0x80
3937
+ addiu t9, t9, 0x80
3938
+ addiu t0, t0, 0x80
3939
+ addiu t5, t5, 0x80
3940
+ addiu s1, s1, 0x80
3941
+ addiu t2, t2, 0x80
3942
+ addiu t8, t8, 0x80
3943
+ addiu s3, s3, 0x80
3944
+ addiu v0, v0, 0x80
3945
+ sb t4, 0(t6) // outptr[0] <- tmp20 + tmp10
3946
+ sb t7, 1(t6) // outptr[1] <- tmp21 + tmp11
3947
+ sb t1, 2(t6) // outptr[2] <- tmp22 + tmp12
3948
+ sb t3, 3(t6) // outptr[3] <- tmp23 + tmp13
3949
+ sb t9, 4(t6) // outptr[4] <- tmp24 + tmp14
3950
+ sb t0, 5(t6) // outptr[5] <- tmp25 + tmp15
3951
+ sb t5, 6(t6) // outptr[6] <- tmp25 - tmp15
3952
+ sb s1, 7(t6) // outptr[7] <- tmp24 - tmp14
3953
+ sb t2, 8(t6) // outptr[8] <- tmp23 - tmp13
3954
+ sb t8, 9(t6) // outptr[9] <- tmp22 - tmp12
3955
+ sb s3, 10(t6) // outptr[10] <- tmp21 - tmp11
3956
+ sb v0, 11(t6) // outptr[11] <- tmp20 - tmp10
3957
+ bgtz a3, 1b // loop while rows remain
3958
+ addiu a1, a1, 4 // next output row pointer (follows the branch; presumably its delay slot)
3959
+
3960
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3961
+
3962
+ jr ra // return to caller
3963
+ nop
3964
+
3965
+ END(jsimd_idct_12x12_pass2_dspr2)
3966
+
3967
+
3968
+ /*****************************************************************************/
3969
+ LEAF_DSPR2(jsimd_convsamp_dspr2)
3970
+ /*
3971
+ * a0 = sample_data (array of 8 row pointers, read at offsets 0..28)
3972
+ * a1 = start_col (byte offset added to each row pointer)
3973
+ * a2 = workspace (8 rows of 8 halfwords are written, 128 bytes in total)
3974
+ */
3975
+ lw t0, 0(a0) // t0 = sample_data[0]
3976
+ li t7, 0xff80ff80 // two packed halfwords, each 0xff80 (-128)
3977
+ addu t0, t0, a1 // t0 = sample_data[0] + start_col
3978
+ ulw t1, 0(t0) // unaligned load: first 4 sample bytes of row 0
3979
+ ulw t2, 4(t0) // unaligned load: last 4 sample bytes of row 0
3980
+ preceu.ph.qbr t3, t1 // zero-extend the two right (low-order) bytes to halfwords
3981
+ preceu.ph.qbl t4, t1 // zero-extend the two left (high-order) bytes to halfwords
3982
+ lw t0, 4(a0) // t0 = sample_data[1] (loaded early for the next row)
3983
+ preceu.ph.qbr t5, t2
3984
+ preceu.ph.qbl t6, t2
3985
+ addu t0, t0, a1 // t0 = sample_data[1] + start_col
3986
+ addu.ph t3, t3, t7 // subtract 128 from each halfword (adds 0xff80)
3987
+ addu.ph t4, t4, t7
3988
+ ulw t1, 0(t0) // row 1, first 4 bytes
3989
+ ulw t2, 4(t0) // row 1, last 4 bytes
3990
+ addu.ph t5, t5, t7
3991
+ addu.ph t6, t6, t7
3992
+ usw t3, 0(a2) // workspace row 0 (bytes 0..15)
3993
+ usw t4, 4(a2)
3994
+ preceu.ph.qbr t3, t1 // start expanding row 1 while row 0 is still being stored
3995
+ preceu.ph.qbl t4, t1
3996
+ usw t5, 8(a2)
3997
+ usw t6, 12(a2)
3998
+
3999
+ lw t0, 8(a0) // t0 = sample_data[2]
4000
+ preceu.ph.qbr t5, t2
4001
+ preceu.ph.qbl t6, t2
4002
+ addu t0, t0, a1 // + start_col
4003
+ addu.ph t3, t3, t7 // row 1: subtract 128 per halfword
4004
+ addu.ph t4, t4, t7
4005
+ ulw t1, 0(t0) // row 2, first 4 bytes
4006
+ ulw t2, 4(t0) // row 2, last 4 bytes
4007
+ addu.ph t5, t5, t7
4008
+ addu.ph t6, t6, t7
4009
+ usw t3, 16(a2) // workspace row 1 (bytes 16..31)
4010
+ usw t4, 20(a2)
4011
+ preceu.ph.qbr t3, t1
4012
+ preceu.ph.qbl t4, t1
4013
+ usw t5, 24(a2)
4014
+ usw t6, 28(a2)
4015
+
4016
+ lw t0, 12(a0) // t0 = sample_data[3]
4017
+ preceu.ph.qbr t5, t2
4018
+ preceu.ph.qbl t6, t2
4019
+ addu t0, t0, a1 // + start_col
4020
+ addu.ph t3, t3, t7 // row 2: subtract 128 per halfword
4021
+ addu.ph t4, t4, t7
4022
+ ulw t1, 0(t0) // row 3, first 4 bytes
4023
+ ulw t2, 4(t0) // row 3, last 4 bytes
4024
+ addu.ph t5, t5, t7
4025
+ addu.ph t6, t6, t7
4026
+ usw t3, 32(a2) // workspace row 2 (bytes 32..47)
4027
+ usw t4, 36(a2)
4028
+ preceu.ph.qbr t3, t1
4029
+ preceu.ph.qbl t4, t1
4030
+ usw t5, 40(a2)
4031
+ usw t6, 44(a2)
4032
+
4033
+ lw t0, 16(a0) // t0 = sample_data[4]
4034
+ preceu.ph.qbr t5, t2
4035
+ preceu.ph.qbl t6, t2
4036
+ addu t0, t0, a1 // + start_col
4037
+ addu.ph t3, t3, t7 // row 3: subtract 128 per halfword
4038
+ addu.ph t4, t4, t7
4039
+ ulw t1, 0(t0) // row 4, first 4 bytes
4040
+ ulw t2, 4(t0) // row 4, last 4 bytes
4041
+ addu.ph t5, t5, t7
4042
+ addu.ph t6, t6, t7
4043
+ usw t3, 48(a2) // workspace row 3 (bytes 48..63)
4044
+ usw t4, 52(a2)
4045
+ preceu.ph.qbr t3, t1
4046
+ preceu.ph.qbl t4, t1
4047
+ usw t5, 56(a2)
4048
+ usw t6, 60(a2)
4049
+
4050
+ lw t0, 20(a0) // t0 = sample_data[5]
4051
+ preceu.ph.qbr t5, t2
4052
+ preceu.ph.qbl t6, t2
4053
+ addu t0, t0, a1 // + start_col
4054
+ addu.ph t3, t3, t7 // row 4: subtract 128 per halfword
4055
+ addu.ph t4, t4, t7
4056
+ ulw t1, 0(t0) // row 5, first 4 bytes
4057
+ ulw t2, 4(t0) // row 5, last 4 bytes
4058
+ addu.ph t5, t5, t7
4059
+ addu.ph t6, t6, t7
4060
+ usw t3, 64(a2) // workspace row 4 (bytes 64..79)
4061
+ usw t4, 68(a2)
4062
+ preceu.ph.qbr t3, t1
4063
+ preceu.ph.qbl t4, t1
4064
+ usw t5, 72(a2)
4065
+ usw t6, 76(a2)
4066
+
4067
+ lw t0, 24(a0) // t0 = sample_data[6]
4068
+ preceu.ph.qbr t5, t2
4069
+ preceu.ph.qbl t6, t2
4070
+ addu t0, t0, a1 // + start_col
4071
+ addu.ph t3, t3, t7 // row 5: subtract 128 per halfword
4072
+ addu.ph t4, t4, t7
4073
+ ulw t1, 0(t0) // row 6, first 4 bytes
4074
+ ulw t2, 4(t0) // row 6, last 4 bytes
4075
+ addu.ph t5, t5, t7
4076
+ addu.ph t6, t6, t7
4077
+ usw t3, 80(a2) // workspace row 5 (bytes 80..95)
4078
+ usw t4, 84(a2)
4079
+ preceu.ph.qbr t3, t1
4080
+ preceu.ph.qbl t4, t1
4081
+ usw t5, 88(a2)
4082
+ usw t6, 92(a2)
4083
+
4084
+ lw t0, 28(a0) // t0 = sample_data[7]
4085
+ preceu.ph.qbr t5, t2
4086
+ preceu.ph.qbl t6, t2
4087
+ addu t0, t0, a1 // + start_col
4088
+ addu.ph t3, t3, t7 // row 6: subtract 128 per halfword
4089
+ addu.ph t4, t4, t7
4090
+ ulw t1, 0(t0) // row 7, first 4 bytes
4091
+ ulw t2, 4(t0) // row 7, last 4 bytes
4092
+ addu.ph t5, t5, t7
4093
+ addu.ph t6, t6, t7
4094
+ usw t3, 96(a2) // workspace row 6 (bytes 96..111)
4095
+ usw t4, 100(a2)
4096
+ preceu.ph.qbr t3, t1
4097
+ preceu.ph.qbl t4, t1
4098
+ usw t5, 104(a2)
4099
+ usw t6, 108(a2)
4100
+ preceu.ph.qbr t5, t2 // last row: nothing left to prefetch, finish expanding row 7
4101
+ preceu.ph.qbl t6, t2
4102
+ addu.ph t3, t3, t7 // row 7: subtract 128 per halfword
4103
+ addu.ph t4, t4, t7
4104
+ addu.ph t5, t5, t7
4105
+ addu.ph t6, t6, t7
4106
+ usw t3, 112(a2) // workspace row 7 (bytes 112..127)
4107
+ usw t4, 116(a2)
4108
+ usw t5, 120(a2)
4109
+ usw t6, 124(a2)
4110
+
4111
+ j ra // return to caller
4112
+ nop
4113
+
4114
+ END(jsimd_convsamp_dspr2)
4115
+
4116
+
4117
+ #ifndef __mips_soft_float
4118
+
4119
+ /*****************************************************************************/
4120
+ LEAF_DSPR2(jsimd_convsamp_float_dspr2)
4121
+ /*
4122
+ * a0 = sample_data
4123
+ * a1 = start_col
4124
+ * a2 = workspace
4125
+ */
4126
+ .set at
4127
+
4128
+ lw t0, 0(a0)
4129
+ addu t0, t0, a1
4130
+ lbu t1, 0(t0)
4131
+ lbu t2, 1(t0)
4132
+ lbu t3, 2(t0)
4133
+ lbu t4, 3(t0)
4134
+ lbu t5, 4(t0)
4135
+ lbu t6, 5(t0)
4136
+ lbu t7, 6(t0)
4137
+ lbu t8, 7(t0)
4138
+ addiu t1, t1, -128
4139
+ addiu t2, t2, -128
4140
+ addiu t3, t3, -128
4141
+ addiu t4, t4, -128
4142
+ addiu t5, t5, -128
4143
+ addiu t6, t6, -128
4144
+ addiu t7, t7, -128
4145
+ addiu t8, t8, -128
4146
+ mtc1 t1, f2
4147
+ mtc1 t2, f4
4148
+ mtc1 t3, f6
4149
+ mtc1 t4, f8
4150
+ mtc1 t5, f10
4151
+ mtc1 t6, f12
4152
+ mtc1 t7, f14
4153
+ mtc1 t8, f16
4154
+ cvt.s.w f2, f2
4155
+ cvt.s.w f4, f4
4156
+ cvt.s.w f6, f6
4157
+ cvt.s.w f8, f8
4158
+ cvt.s.w f10, f10
4159
+ cvt.s.w f12, f12
4160
+ cvt.s.w f14, f14
4161
+ cvt.s.w f16, f16
4162
+ lw t0, 4(a0)
4163
+ swc1 f2, 0(a2)
4164
+ swc1 f4, 4(a2)
4165
+ swc1 f6, 8(a2)
4166
+ addu t0, t0, a1
4167
+ swc1 f8, 12(a2)
4168
+ swc1 f10, 16(a2)
4169
+ swc1 f12, 20(a2)
4170
+ swc1 f14, 24(a2)
4171
+ swc1 f16, 28(a2)
4172
+ // elemr 1
4173
+ lbu t1, 0(t0)
4174
+ lbu t2, 1(t0)
4175
+ lbu t3, 2(t0)
4176
+ lbu t4, 3(t0)
4177
+ lbu t5, 4(t0)
4178
+ lbu t6, 5(t0)
4179
+ lbu t7, 6(t0)
4180
+ lbu t8, 7(t0)
4181
+ addiu t1, t1, -128
4182
+ addiu t2, t2, -128
4183
+ addiu t3, t3, -128
4184
+ addiu t4, t4, -128
4185
+ addiu t5, t5, -128
4186
+ addiu t6, t6, -128
4187
+ addiu t7, t7, -128
4188
+ addiu t8, t8, -128
4189
+ mtc1 t1, f2
4190
+ mtc1 t2, f4
4191
+ mtc1 t3, f6
4192
+ mtc1 t4, f8
4193
+ mtc1 t5, f10
4194
+ mtc1 t6, f12
4195
+ mtc1 t7, f14
4196
+ mtc1 t8, f16
4197
+ cvt.s.w f2, f2
4198
+ cvt.s.w f4, f4
4199
+ cvt.s.w f6, f6
4200
+ cvt.s.w f8, f8
4201
+ cvt.s.w f10, f10
4202
+ cvt.s.w f12, f12
4203
+ cvt.s.w f14, f14
4204
+ cvt.s.w f16, f16
4205
+ lw t0, 8(a0)
4206
+ swc1 f2, 32(a2)
4207
+ swc1 f4, 36(a2)
4208
+ swc1 f6, 40(a2)
4209
+ addu t0, t0, a1
4210
+ swc1 f8, 44(a2)
4211
+ swc1 f10, 48(a2)
4212
+ swc1 f12, 52(a2)
4213
+ swc1 f14, 56(a2)
4214
+ swc1 f16, 60(a2)
4215
+ // elemr 2
4216
+ lbu t1, 0(t0)
4217
+ lbu t2, 1(t0)
4218
+ lbu t3, 2(t0)
4219
+ lbu t4, 3(t0)
4220
+ lbu t5, 4(t0)
4221
+ lbu t6, 5(t0)
4222
+ lbu t7, 6(t0)
4223
+ lbu t8, 7(t0)
4224
+ addiu t1, t1, -128
4225
+ addiu t2, t2, -128
4226
+ addiu t3, t3, -128
4227
+ addiu t4, t4, -128
4228
+ addiu t5, t5, -128
4229
+ addiu t6, t6, -128
4230
+ addiu t7, t7, -128
4231
+ addiu t8, t8, -128
4232
+ mtc1 t1, f2
4233
+ mtc1 t2, f4
4234
+ mtc1 t3, f6
4235
+ mtc1 t4, f8
4236
+ mtc1 t5, f10
4237
+ mtc1 t6, f12
4238
+ mtc1 t7, f14
4239
+ mtc1 t8, f16
4240
+ cvt.s.w f2, f2
4241
+ cvt.s.w f4, f4
4242
+ cvt.s.w f6, f6
4243
+ cvt.s.w f8, f8
4244
+ cvt.s.w f10, f10
4245
+ cvt.s.w f12, f12
4246
+ cvt.s.w f14, f14
4247
+ cvt.s.w f16, f16
4248
+ lw t0, 12(a0)
4249
+ swc1 f2, 64(a2)
4250
+ swc1 f4, 68(a2)
4251
+ swc1 f6, 72(a2)
4252
+ addu t0, t0, a1
4253
+ swc1 f8, 76(a2)
4254
+ swc1 f10, 80(a2)
4255
+ swc1 f12, 84(a2)
4256
+ swc1 f14, 88(a2)
4257
+ swc1 f16, 92(a2)
4258
+ // elemr 3
4259
+ lbu t1, 0(t0)
4260
+ lbu t2, 1(t0)
4261
+ lbu t3, 2(t0)
4262
+ lbu t4, 3(t0)
4263
+ lbu t5, 4(t0)
4264
+ lbu t6, 5(t0)
4265
+ lbu t7, 6(t0)
4266
+ lbu t8, 7(t0)
4267
+ addiu t1, t1, -128
4268
+ addiu t2, t2, -128
4269
+ addiu t3, t3, -128
4270
+ addiu t4, t4, -128
4271
+ addiu t5, t5, -128
4272
+ addiu t6, t6, -128
4273
+ addiu t7, t7, -128
4274
+ addiu t8, t8, -128
4275
+ mtc1 t1, f2
4276
+ mtc1 t2, f4
4277
+ mtc1 t3, f6
4278
+ mtc1 t4, f8
4279
+ mtc1 t5, f10
4280
+ mtc1 t6, f12
4281
+ mtc1 t7, f14
4282
+ mtc1 t8, f16
4283
+ cvt.s.w f2, f2
4284
+ cvt.s.w f4, f4
4285
+ cvt.s.w f6, f6
4286
+ cvt.s.w f8, f8
4287
+ cvt.s.w f10, f10
4288
+ cvt.s.w f12, f12
4289
+ cvt.s.w f14, f14
4290
+ cvt.s.w f16, f16
4291
+ lw t0, 16(a0)
4292
+ swc1 f2, 96(a2)
4293
+ swc1 f4, 100(a2)
4294
+ swc1 f6, 104(a2)
4295
+ addu t0, t0, a1
4296
+ swc1 f8, 108(a2)
4297
+ swc1 f10, 112(a2)
4298
+ swc1 f12, 116(a2)
4299
+ swc1 f14, 120(a2)
4300
+ swc1 f16, 124(a2)
4301
+ // elemr 4
4302
+ lbu t1, 0(t0)
4303
+ lbu t2, 1(t0)
4304
+ lbu t3, 2(t0)
4305
+ lbu t4, 3(t0)
4306
+ lbu t5, 4(t0)
4307
+ lbu t6, 5(t0)
4308
+ lbu t7, 6(t0)
4309
+ lbu t8, 7(t0)
4310
+ addiu t1, t1, -128
4311
+ addiu t2, t2, -128
4312
+ addiu t3, t3, -128
4313
+ addiu t4, t4, -128
4314
+ addiu t5, t5, -128
4315
+ addiu t6, t6, -128
4316
+ addiu t7, t7, -128
4317
+ addiu t8, t8, -128
4318
+ mtc1 t1, f2
4319
+ mtc1 t2, f4
4320
+ mtc1 t3, f6
4321
+ mtc1 t4, f8
4322
+ mtc1 t5, f10
4323
+ mtc1 t6, f12
4324
+ mtc1 t7, f14
4325
+ mtc1 t8, f16
4326
+ cvt.s.w f2, f2
4327
+ cvt.s.w f4, f4
4328
+ cvt.s.w f6, f6
4329
+ cvt.s.w f8, f8
4330
+ cvt.s.w f10, f10
4331
+ cvt.s.w f12, f12
4332
+ cvt.s.w f14, f14
4333
+ cvt.s.w f16, f16
4334
+ lw t0, 20(a0)
4335
+ swc1 f2, 128(a2)
4336
+ swc1 f4, 132(a2)
4337
+ swc1 f6, 136(a2)
4338
+ addu t0, t0, a1
4339
+ swc1 f8, 140(a2)
4340
+ swc1 f10, 144(a2)
4341
+ swc1 f12, 148(a2)
4342
+ swc1 f14, 152(a2)
4343
+ swc1 f16, 156(a2)
4344
+ // elemr 5
4345
+ lbu t1, 0(t0)
4346
+ lbu t2, 1(t0)
4347
+ lbu t3, 2(t0)
4348
+ lbu t4, 3(t0)
4349
+ lbu t5, 4(t0)
4350
+ lbu t6, 5(t0)
4351
+ lbu t7, 6(t0)
4352
+ lbu t8, 7(t0)
4353
+ addiu t1, t1, -128
4354
+ addiu t2, t2, -128
4355
+ addiu t3, t3, -128
4356
+ addiu t4, t4, -128
4357
+ addiu t5, t5, -128
4358
+ addiu t6, t6, -128
4359
+ addiu t7, t7, -128
4360
+ addiu t8, t8, -128
4361
+ mtc1 t1, f2
4362
+ mtc1 t2, f4
4363
+ mtc1 t3, f6
4364
+ mtc1 t4, f8
4365
+ mtc1 t5, f10
4366
+ mtc1 t6, f12
4367
+ mtc1 t7, f14
4368
+ mtc1 t8, f16
4369
+ cvt.s.w f2, f2
4370
+ cvt.s.w f4, f4
4371
+ cvt.s.w f6, f6
4372
+ cvt.s.w f8, f8
4373
+ cvt.s.w f10, f10
4374
+ cvt.s.w f12, f12
4375
+ cvt.s.w f14, f14
4376
+ cvt.s.w f16, f16
4377
+ lw t0, 24(a0)
4378
+ swc1 f2, 160(a2)
4379
+ swc1 f4, 164(a2)
4380
+ swc1 f6, 168(a2)
4381
+ addu t0, t0, a1
4382
+ swc1 f8, 172(a2)
4383
+ swc1 f10, 176(a2)
4384
+ swc1 f12, 180(a2)
4385
+ swc1 f14, 184(a2)
4386
+ swc1 f16, 188(a2)
4387
+ // elemr 6
4388
+ lbu t1, 0(t0)
4389
+ lbu t2, 1(t0)
4390
+ lbu t3, 2(t0)
4391
+ lbu t4, 3(t0)
4392
+ lbu t5, 4(t0)
4393
+ lbu t6, 5(t0)
4394
+ lbu t7, 6(t0)
4395
+ lbu t8, 7(t0)
4396
+ addiu t1, t1, -128
4397
+ addiu t2, t2, -128
4398
+ addiu t3, t3, -128
4399
+ addiu t4, t4, -128
4400
+ addiu t5, t5, -128
4401
+ addiu t6, t6, -128
4402
+ addiu t7, t7, -128
4403
+ addiu t8, t8, -128
4404
+ mtc1 t1, f2
4405
+ mtc1 t2, f4
4406
+ mtc1 t3, f6
4407
+ mtc1 t4, f8
4408
+ mtc1 t5, f10
4409
+ mtc1 t6, f12
4410
+ mtc1 t7, f14
4411
+ mtc1 t8, f16
4412
+ cvt.s.w f2, f2
4413
+ cvt.s.w f4, f4
4414
+ cvt.s.w f6, f6
4415
+ cvt.s.w f8, f8
4416
+ cvt.s.w f10, f10
4417
+ cvt.s.w f12, f12
4418
+ cvt.s.w f14, f14
4419
+ cvt.s.w f16, f16
4420
+ lw t0, 28(a0)
4421
+ swc1 f2, 192(a2)
4422
+ swc1 f4, 196(a2)
4423
+ swc1 f6, 200(a2)
4424
+ addu t0, t0, a1
4425
+ swc1 f8, 204(a2)
4426
+ swc1 f10, 208(a2)
4427
+ swc1 f12, 212(a2)
4428
+ swc1 f14, 216(a2)
4429
+ swc1 f16, 220(a2)
4430
+ // elemr 7
4431
+ lbu t1, 0(t0)
4432
+ lbu t2, 1(t0)
4433
+ lbu t3, 2(t0)
4434
+ lbu t4, 3(t0)
4435
+ lbu t5, 4(t0)
4436
+ lbu t6, 5(t0)
4437
+ lbu t7, 6(t0)
4438
+ lbu t8, 7(t0)
4439
+ addiu t1, t1, -128
4440
+ addiu t2, t2, -128
4441
+ addiu t3, t3, -128
4442
+ addiu t4, t4, -128
4443
+ addiu t5, t5, -128
4444
+ addiu t6, t6, -128
4445
+ addiu t7, t7, -128
4446
+ addiu t8, t8, -128
4447
+ mtc1 t1, f2
4448
+ mtc1 t2, f4
4449
+ mtc1 t3, f6
4450
+ mtc1 t4, f8
4451
+ mtc1 t5, f10
4452
+ mtc1 t6, f12
4453
+ mtc1 t7, f14
4454
+ mtc1 t8, f16
4455
+ cvt.s.w f2, f2
4456
+ cvt.s.w f4, f4
4457
+ cvt.s.w f6, f6
4458
+ cvt.s.w f8, f8
4459
+ cvt.s.w f10, f10
4460
+ cvt.s.w f12, f12
4461
+ cvt.s.w f14, f14
4462
+ cvt.s.w f16, f16
4463
+ swc1 f2, 224(a2)
4464
+ swc1 f4, 228(a2)
4465
+ swc1 f6, 232(a2)
4466
+ swc1 f8, 236(a2)
4467
+ swc1 f10, 240(a2)
4468
+ swc1 f12, 244(a2)
4469
+ swc1 f14, 248(a2)
4470
+ swc1 f16, 252(a2)
4471
+
4472
+ j ra
4473
+ nop
4474
+
4475
+ END(jsimd_convsamp_float_dspr2)
4476
+
4477
+ #endif
4478
+
4479
+ /*****************************************************************************/