epeg 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (504)
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/MANIFEST +5 -0
  4. data/TODO +1 -0
  5. data/epeg/.dockerignore +4 -0
  6. data/epeg/.gitignore +5 -0
  7. data/epeg/CMakeLists.txt +30 -0
  8. data/epeg/Dockerfile +23 -0
  9. data/epeg/Epeg.h +90 -0
  10. data/epeg/README.md +42 -0
  11. data/epeg/epeg_main.c +1642 -0
  12. data/epeg/epeg_private.h +85 -0
  13. data/epeg/example/.gitignore +1 -0
  14. data/epeg/example/CMakeLists.txt +20 -0
  15. data/epeg/example/example.jpg +0 -0
  16. data/epeg/example/rotatetest.c +29 -0
  17. data/epeg/example/scaletest.c +48 -0
  18. data/epeg/vendor/libjpeg-turbo-2.0.4/BUILDING.md +828 -0
  19. data/epeg/vendor/libjpeg-turbo-2.0.4/CMakeLists.txt +1420 -0
  20. data/epeg/vendor/libjpeg-turbo-2.0.4/ChangeLog.md +1494 -0
  21. data/epeg/vendor/libjpeg-turbo-2.0.4/LICENSE.md +132 -0
  22. data/epeg/vendor/libjpeg-turbo-2.0.4/README.ijg +277 -0
  23. data/epeg/vendor/libjpeg-turbo-2.0.4/README.md +356 -0
  24. data/epeg/vendor/libjpeg-turbo-2.0.4/cderror.h +137 -0
  25. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.c +145 -0
  26. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.h +157 -0
  27. data/epeg/vendor/libjpeg-turbo-2.0.4/change.log +315 -0
  28. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.1 +354 -0
  29. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.c +695 -0
  30. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/BuildPackages.cmake +182 -0
  31. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/GNUInstallDirs.cmake +416 -0
  32. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/cmake_uninstall.cmake.in +24 -0
  33. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/testclean.cmake +41 -0
  34. data/epeg/vendor/libjpeg-turbo-2.0.4/cmyk.h +61 -0
  35. data/epeg/vendor/libjpeg-turbo-2.0.4/coderules.txt +78 -0
  36. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.1 +296 -0
  37. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.c +822 -0
  38. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/annotated.html +104 -0
  39. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bc_s.png +0 -0
  40. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bdwn.png +0 -0
  41. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/classes.html +106 -0
  42. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/closed.png +0 -0
  43. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen-extra.css +3 -0
  44. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.css +1184 -0
  45. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.png +0 -0
  46. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/dynsections.js +97 -0
  47. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2blank.png +0 -0
  48. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2cl.png +0 -0
  49. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2doc.png +0 -0
  50. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderclosed.png +0 -0
  51. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderopen.png +0 -0
  52. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2lastnode.png +0 -0
  53. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2link.png +0 -0
  54. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mlastnode.png +0 -0
  55. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mnode.png +0 -0
  56. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mo.png +0 -0
  57. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2node.png +0 -0
  58. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2ns.png +0 -0
  59. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2plastnode.png +0 -0
  60. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2pnode.png +0 -0
  61. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2splitbar.png +0 -0
  62. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2vertline.png +0 -0
  63. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions.html +134 -0
  64. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions_vars.html +134 -0
  65. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/group___turbo_j_p_e_g.html +2775 -0
  66. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/index.html +90 -0
  67. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/jquery.js +8 -0
  68. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/modules.html +95 -0
  69. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_f.png +0 -0
  70. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_g.png +0 -0
  71. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_h.png +0 -0
  72. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/open.png +0 -0
  73. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.html +26 -0
  74. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.js +4 -0
  75. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.html +26 -0
  76. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.js +5 -0
  77. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.html +26 -0
  78. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.js +4 -0
  79. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.html +26 -0
  80. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.js +4 -0
  81. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.html +26 -0
  82. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.js +5 -0
  83. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.html +26 -0
  84. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.js +4 -0
  85. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.html +26 -0
  86. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.js +102 -0
  87. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.html +26 -0
  88. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.js +4 -0
  89. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.html +26 -0
  90. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.js +4 -0
  91. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.html +26 -0
  92. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.js +4 -0
  93. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.html +26 -0
  94. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.js +6 -0
  95. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/close.png +0 -0
  96. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.html +26 -0
  97. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.js +8 -0
  98. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.html +26 -0
  99. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.js +37 -0
  100. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.html +26 -0
  101. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.js +31 -0
  102. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.html +26 -0
  103. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.js +4 -0
  104. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/mag_sel.png +0 -0
  105. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/nomatches.html +12 -0
  106. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.css +271 -0
  107. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.js +809 -0
  108. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_l.png +0 -0
  109. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_m.png +0 -0
  110. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_r.png +0 -0
  111. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.html +26 -0
  112. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.js +5 -0
  113. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.html +26 -0
  114. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.js +4 -0
  115. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.html +26 -0
  116. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.js +5 -0
  117. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.html +26 -0
  118. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.js +4 -0
  119. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.html +26 -0
  120. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.js +4 -0
  121. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.html +26 -0
  122. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.js +5 -0
  123. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.html +26 -0
  124. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.js +4 -0
  125. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.html +26 -0
  126. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.js +10 -0
  127. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.html +26 -0
  128. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.js +4 -0
  129. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.html +26 -0
  130. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.js +4 -0
  131. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.html +26 -0
  132. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.js +4 -0
  133. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjregion.html +186 -0
  134. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjscalingfactor.html +148 -0
  135. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjtransform.html +212 -0
  136. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_off.png +0 -0
  137. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_on.png +0 -0
  138. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_a.png +0 -0
  139. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_b.png +0 -0
  140. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_h.png +0 -0
  141. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_s.png +0 -0
  142. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tabs.css +60 -0
  143. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen-extra.css +3 -0
  144. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen.config +16 -0
  145. data/epeg/vendor/libjpeg-turbo-2.0.4/example.txt +464 -0
  146. data/epeg/vendor/libjpeg-turbo-2.0.4/jaricom.c +157 -0
  147. data/epeg/vendor/libjpeg-turbo-2.0.4/java/CMakeLists.txt +88 -0
  148. data/epeg/vendor/libjpeg-turbo-2.0.4/java/MANIFEST.MF +2 -0
  149. data/epeg/vendor/libjpeg-turbo-2.0.4/java/README +52 -0
  150. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJBench.java +1021 -0
  151. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJExample.java +405 -0
  152. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJUnitTest.java +960 -0
  153. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-frame.html +24 -0
  154. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-noframe.html +24 -0
  155. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/constant-values.html +532 -0
  156. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/deprecated-list.html +252 -0
  157. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/help-doc.html +210 -0
  158. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index-all.html +1029 -0
  159. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index.html +71 -0
  160. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJ.html +1356 -0
  161. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html +926 -0
  162. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html +241 -0
  163. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html +1255 -0
  164. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJException.html +340 -0
  165. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html +343 -0
  166. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html +751 -0
  167. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html +421 -0
  168. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html +765 -0
  169. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-frame.html +31 -0
  170. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-summary.html +202 -0
  171. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-tree.html +160 -0
  172. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/overview-tree.html +164 -0
  173. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/package-list +1 -0
  174. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/background.gif +0 -0
  175. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/tab.gif +0 -0
  176. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar.gif +0 -0
  177. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar_end.gif +0 -0
  178. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/script.js +30 -0
  179. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/serialized-form.html +176 -0
  180. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/stylesheet.css +474 -0
  181. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJ.java +584 -0
  182. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCompressor.java +677 -0
  183. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java +76 -0
  184. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJDecompressor.java +931 -0
  185. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJException.java +78 -0
  186. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in +59 -0
  187. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-win.java.in +35 -0
  188. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java +115 -0
  189. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransform.java +227 -0
  190. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransformer.java +163 -0
  191. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/YUVImage.java +445 -0
  192. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJ.h +129 -0
  193. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJCompressor.h +101 -0
  194. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJDecompressor.h +101 -0
  195. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJTransformer.h +29 -0
  196. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapimin.c +295 -0
  197. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapistd.c +162 -0
  198. data/epeg/vendor/libjpeg-turbo-2.0.4/jcarith.c +932 -0
  199. data/epeg/vendor/libjpeg-turbo-2.0.4/jccoefct.c +449 -0
  200. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolext.c +144 -0
  201. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolor.c +710 -0
  202. data/epeg/vendor/libjpeg-turbo-2.0.4/jcdctmgr.c +721 -0
  203. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.c +1096 -0
  204. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.h +42 -0
  205. data/epeg/vendor/libjpeg-turbo-2.0.4/jcicc.c +105 -0
  206. data/epeg/vendor/libjpeg-turbo-2.0.4/jcinit.c +77 -0
  207. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmainct.c +162 -0
  208. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmarker.c +664 -0
  209. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmaster.c +640 -0
  210. data/epeg/vendor/libjpeg-turbo-2.0.4/jcomapi.c +109 -0
  211. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.h.in +73 -0
  212. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.txt +143 -0
  213. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfigint.h.in +31 -0
  214. data/epeg/vendor/libjpeg-turbo-2.0.4/jcparam.c +541 -0
  215. data/epeg/vendor/libjpeg-turbo-2.0.4/jcphuff.c +1105 -0
  216. data/epeg/vendor/libjpeg-turbo-2.0.4/jcprepct.c +351 -0
  217. data/epeg/vendor/libjpeg-turbo-2.0.4/jcsample.c +539 -0
  218. data/epeg/vendor/libjpeg-turbo-2.0.4/jcstest.c +126 -0
  219. data/epeg/vendor/libjpeg-turbo-2.0.4/jctrans.c +400 -0
  220. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapimin.c +407 -0
  221. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapistd.c +639 -0
  222. data/epeg/vendor/libjpeg-turbo-2.0.4/jdarith.c +773 -0
  223. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst-tj.c +203 -0
  224. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst.c +293 -0
  225. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc-tj.c +194 -0
  226. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc.c +295 -0
  227. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.c +692 -0
  228. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.h +82 -0
  229. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcol565.c +384 -0
  230. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolext.c +143 -0
  231. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolor.c +883 -0
  232. data/epeg/vendor/libjpeg-turbo-2.0.4/jdct.h +208 -0
  233. data/epeg/vendor/libjpeg-turbo-2.0.4/jddctmgr.c +352 -0
  234. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.c +831 -0
  235. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.h +238 -0
  236. data/epeg/vendor/libjpeg-turbo-2.0.4/jdicc.c +171 -0
  237. data/epeg/vendor/libjpeg-turbo-2.0.4/jdinput.c +408 -0
  238. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.c +460 -0
  239. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.h +71 -0
  240. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmarker.c +1377 -0
  241. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.c +737 -0
  242. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.h +28 -0
  243. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmerge.c +617 -0
  244. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrg565.c +354 -0
  245. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrgext.c +184 -0
  246. data/epeg/vendor/libjpeg-turbo-2.0.4/jdphuff.c +687 -0
  247. data/epeg/vendor/libjpeg-turbo-2.0.4/jdpostct.c +294 -0
  248. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.c +518 -0
  249. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.h +50 -0
  250. data/epeg/vendor/libjpeg-turbo-2.0.4/jdtrans.c +155 -0
  251. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.c +251 -0
  252. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.h +316 -0
  253. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctflt.c +169 -0
  254. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctfst.c +227 -0
  255. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctint.c +288 -0
  256. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctflt.c +240 -0
  257. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctfst.c +371 -0
  258. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctint.c +2627 -0
  259. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctred.c +409 -0
  260. data/epeg/vendor/libjpeg-turbo-2.0.4/jinclude.h +88 -0
  261. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemmgr.c +1179 -0
  262. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemnobs.c +115 -0
  263. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemsys.h +178 -0
  264. data/epeg/vendor/libjpeg-turbo-2.0.4/jmorecfg.h +421 -0
  265. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeg_nbits_table.h +4098 -0
  266. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegcomp.h +31 -0
  267. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegint.h +368 -0
  268. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeglib.h +1132 -0
  269. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.1 +295 -0
  270. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.c +601 -0
  271. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant1.c +859 -0
  272. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant2.c +1285 -0
  273. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd.h +117 -0
  274. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd_none.c +418 -0
  275. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimddct.h +70 -0
  276. data/epeg/vendor/libjpeg-turbo-2.0.4/jstdhuff.c +143 -0
  277. data/epeg/vendor/libjpeg-turbo-2.0.4/jutils.c +133 -0
  278. data/epeg/vendor/libjpeg-turbo-2.0.4/jversion.h +52 -0
  279. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.map.in +11 -0
  280. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.txt +3144 -0
  281. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/CMakeLists.txt +1 -0
  282. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.c +275 -0
  283. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.h +57 -0
  284. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5cmp.c +59 -0
  285. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5hl.c +125 -0
  286. data/epeg/vendor/libjpeg-turbo-2.0.4/rdbmp.c +689 -0
  287. data/epeg/vendor/libjpeg-turbo-2.0.4/rdcolmap.c +254 -0
  288. data/epeg/vendor/libjpeg-turbo-2.0.4/rdgif.c +39 -0
  289. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.1 +63 -0
  290. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.c +510 -0
  291. data/epeg/vendor/libjpeg-turbo-2.0.4/rdppm.c +766 -0
  292. data/epeg/vendor/libjpeg-turbo-2.0.4/rdrle.c +389 -0
  293. data/epeg/vendor/libjpeg-turbo-2.0.4/rdswitch.c +424 -0
  294. data/epeg/vendor/libjpeg-turbo-2.0.4/rdtarga.c +509 -0
  295. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Distribution.xml.in +24 -0
  296. data/epeg/vendor/libjpeg-turbo-2.0.4/release/License.rtf +20 -0
  297. data/epeg/vendor/libjpeg-turbo-2.0.4/release/ReadMe.txt +5 -0
  298. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Welcome.rtf +17 -0
  299. data/epeg/vendor/libjpeg-turbo-2.0.4/release/deb-control.in +31 -0
  300. data/epeg/vendor/libjpeg-turbo-2.0.4/release/installer.nsi.in +191 -0
  301. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libjpeg.pc.in +10 -0
  302. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libturbojpeg.pc.in +10 -0
  303. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makecygwinpkg.in +66 -0
  304. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makedpkg.in +115 -0
  305. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makemacpkg.in +284 -0
  306. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makerpm.in +30 -0
  307. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makesrpm.in +48 -0
  308. data/epeg/vendor/libjpeg-turbo-2.0.4/release/maketarball.in +51 -0
  309. data/epeg/vendor/libjpeg-turbo-2.0.4/release/rpm.spec.in +221 -0
  310. data/epeg/vendor/libjpeg-turbo-2.0.4/release/uninstall.in +113 -0
  311. data/epeg/vendor/libjpeg-turbo-2.0.4/sharedlib/CMakeLists.txt +99 -0
  312. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/CMakeLists.txt +385 -0
  313. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd.c +721 -0
  314. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd_neon.S +2878 -0
  315. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd.c +798 -0
  316. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd_neon.S +3433 -0
  317. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/gas-preprocessor.in +1 -0
  318. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-avx2.asm +578 -0
  319. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-mmx.asm +476 -0
  320. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-sse2.asm +503 -0
  321. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-avx2.asm +121 -0
  322. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-mmx.asm +121 -0
  323. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-sse2.asm +120 -0
  324. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-avx2.asm +113 -0
  325. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-mmx.asm +113 -0
  326. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-sse2.asm +112 -0
  327. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-avx2.asm +457 -0
  328. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-mmx.asm +355 -0
  329. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-sse2.asm +382 -0
  330. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jchuff-sse2.asm +424 -0
  331. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcphuff-sse2.asm +660 -0
  332. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-avx2.asm +388 -0
  333. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-mmx.asm +324 -0
  334. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-sse2.asm +351 -0
  335. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-avx2.asm +515 -0
  336. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-mmx.asm +404 -0
  337. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-sse2.asm +458 -0
  338. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-avx2.asm +118 -0
  339. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-mmx.asm +117 -0
  340. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-sse2.asm +117 -0
  341. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-avx2.asm +136 -0
  342. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-mmx.asm +123 -0
  343. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-sse2.asm +135 -0
  344. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-avx2.asm +575 -0
  345. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-mmx.asm +460 -0
  346. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-sse2.asm +517 -0
  347. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-avx2.asm +760 -0
  348. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-mmx.asm +731 -0
  349. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-sse2.asm +724 -0
  350. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-3dn.asm +318 -0
  351. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-sse.asm +369 -0
  352. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-mmx.asm +395 -0
  353. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-sse2.asm +403 -0
  354. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-avx2.asm +331 -0
  355. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-mmx.asm +620 -0
  356. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-sse2.asm +633 -0
  357. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-3dn.asm +451 -0
  358. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse.asm +571 -0
  359. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse2.asm +497 -0
  360. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-mmx.asm +499 -0
  361. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-sse2.asm +501 -0
  362. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-avx2.asm +453 -0
  363. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-mmx.asm +851 -0
  364. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-sse2.asm +858 -0
  365. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-mmx.asm +704 -0
  366. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-sse2.asm +592 -0
  367. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-3dn.asm +230 -0
  368. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-mmx.asm +276 -0
  369. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-sse.asm +208 -0
  370. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquantf-sse2.asm +168 -0
  371. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-avx2.asm +188 -0
  372. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-sse2.asm +201 -0
  373. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimd.c +1253 -0
  374. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimdcpu.asm +135 -0
  375. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/jsimd.h +1083 -0
  376. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolext-mmi.c +483 -0
  377. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolor-mmi.c +148 -0
  378. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample-mmi.c +100 -0
  379. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample.h +28 -0
  380. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolext-mmi.c +424 -0
  381. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolor-mmi.c +139 -0
  382. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdsample-mmi.c +245 -0
  383. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jfdctint-mmi.c +398 -0
  384. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jidctint-mmi.c +571 -0
  385. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jquanti-mmi.c +130 -0
  386. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd.c +610 -0
  387. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd_mmi.h +57 -0
  388. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/loongson-mmintrin.h +1324 -0
  389. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd.c +1123 -0
  390. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2.S +4479 -0
  391. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2_asm.h +292 -0
  392. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jcolsamp.inc +135 -0
  393. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jdct.inc +31 -0
  394. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jpeg_nbits_table.inc +4097 -0
  395. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc +93 -0
  396. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc.h +131 -0
  397. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdext.inc +479 -0
  398. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolext-altivec.c +269 -0
  399. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolor-altivec.c +116 -0
  400. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgray-altivec.c +111 -0
  401. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgryext-altivec.c +228 -0
  402. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample-altivec.c +159 -0
  403. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample.h +28 -0
  404. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolext-altivec.c +276 -0
  405. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolor-altivec.c +106 -0
  406. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmerge-altivec.c +130 -0
  407. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmrgext-altivec.c +329 -0
  408. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdsample-altivec.c +400 -0
  409. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctfst-altivec.c +154 -0
  410. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctint-altivec.c +258 -0
  411. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctfst-altivec.c +255 -0
  412. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctint-altivec.c +357 -0
  413. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jquanti-altivec.c +250 -0
  414. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd.c +872 -0
  415. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd_altivec.h +98 -0
  416. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-avx2.asm +558 -0
  417. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-sse2.asm +483 -0
  418. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-avx2.asm +121 -0
  419. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-sse2.asm +120 -0
  420. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-avx2.asm +113 -0
  421. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-sse2.asm +112 -0
  422. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-avx2.asm +437 -0
  423. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-sse2.asm +362 -0
  424. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jchuff-sse2.asm +346 -0
  425. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcphuff-sse2.asm +637 -0
  426. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-avx2.asm +366 -0
  427. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-sse2.asm +329 -0
  428. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-avx2.asm +495 -0
  429. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-sse2.asm +438 -0
  430. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-avx2.asm +118 -0
  431. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-sse2.asm +117 -0
  432. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-avx2.asm +136 -0
  433. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-sse2.asm +135 -0
  434. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-avx2.asm +593 -0
  435. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-sse2.asm +535 -0
  436. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-avx2.asm +695 -0
  437. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-sse2.asm +664 -0
  438. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctflt-sse.asm +355 -0
  439. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctfst-sse2.asm +389 -0
  440. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-avx2.asm +320 -0
  441. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-sse2.asm +619 -0
  442. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctflt-sse2.asm +481 -0
  443. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctfst-sse2.asm +490 -0
  444. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-avx2.asm +417 -0
  445. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-sse2.asm +846 -0
  446. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctred-sse2.asm +573 -0
  447. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquantf-sse2.asm +154 -0
  448. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-avx2.asm +162 -0
  449. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-sse2.asm +187 -0
  450. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimd.c +1076 -0
  451. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimdcpu.asm +86 -0
  452. data/epeg/vendor/libjpeg-turbo-2.0.4/structure.txt +904 -0
  453. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.bmp +0 -0
  454. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.txt +25 -0
  455. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test.scan +5 -0
  456. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc +0 -0
  457. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc.txt +20 -0
  458. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc +0 -0
  459. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc.txt +20 -0
  460. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgari.jpg +0 -0
  461. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgint.jpg +0 -0
  462. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.jpg +0 -0
  463. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.ppm +4 -0
  464. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig12.jpg +0 -0
  465. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_5674_0098.bmp +0 -0
  466. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6434_0018a.bmp +0 -0
  467. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6548_0026a.bmp +0 -0
  468. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbench.c +1031 -0
  469. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.in +256 -0
  470. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.java.in +215 -0
  471. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexample.c +396 -0
  472. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.in +149 -0
  473. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.java.in +151 -0
  474. data/epeg/vendor/libjpeg-turbo-2.0.4/tjunittest.c +931 -0
  475. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.c +70 -0
  476. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.h +47 -0
  477. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.c +1628 -0
  478. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.h +210 -0
  479. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-jni.c +1246 -0
  480. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile +65 -0
  481. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile.jni +101 -0
  482. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.c +2152 -0
  483. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.h +1744 -0
  484. data/epeg/vendor/libjpeg-turbo-2.0.4/usage.txt +635 -0
  485. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jconfig.h.in +34 -0
  486. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62-memsrcdst.def +108 -0
  487. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62.def +106 -0
  488. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7-memsrcdst.def +110 -0
  489. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7.def +108 -0
  490. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg8.def +111 -0
  491. data/epeg/vendor/libjpeg-turbo-2.0.4/wizard.txt +212 -0
  492. data/epeg/vendor/libjpeg-turbo-2.0.4/wrbmp.c +558 -0
  493. data/epeg/vendor/libjpeg-turbo-2.0.4/wrgif.c +413 -0
  494. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.1 +103 -0
  495. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.c +591 -0
  496. data/epeg/vendor/libjpeg-turbo-2.0.4/wrppm.c +365 -0
  497. data/epeg/vendor/libjpeg-turbo-2.0.4/wrrle.c +309 -0
  498. data/epeg/vendor/libjpeg-turbo-2.0.4/wrtarga.c +261 -0
  499. data/epeg.c +131 -0
  500. data/epeg.gemspec +18 -0
  501. data/extconf.rb +80 -0
  502. data/test.jpg +0 -0
  503. data/test.rb +42 -0
  504. metadata +546 -0
@@ -0,0 +1,4479 @@
1
+ /*
2
+ * MIPS DSPr2 optimizations for libjpeg-turbo
3
+ *
4
+ * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
5
+ * All Rights Reserved.
6
+ * Authors: Teodora Novkovic <teodora.novkovic@imgtec.com>
7
+ * Darko Laus <darko.laus@imgtec.com>
8
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
9
+ *
10
+ * This software is provided 'as-is', without any express or implied
11
+ * warranty. In no event will the authors be held liable for any damages
12
+ * arising from the use of this software.
13
+ *
14
+ * Permission is granted to anyone to use this software for any purpose,
15
+ * including commercial applications, and to alter it and redistribute it
16
+ * freely, subject to the following restrictions:
17
+ *
18
+ * 1. The origin of this software must not be misrepresented; you must not
19
+ * claim that you wrote the original software. If you use this software
20
+ * in a product, an acknowledgment in the product documentation would be
21
+ * appreciated but is not required.
22
+ * 2. Altered source versions must be plainly marked as such, and must not be
23
+ * misrepresented as being the original software.
24
+ * 3. This notice may not be removed or altered from any source distribution.
25
+ */
26
+
27
+ #include "jsimd_dspr2_asm.h"
28
+
29
+
30
+ /*****************************************************************************/
31
+ LEAF_DSPR2(jsimd_c_null_convert_dspr2)
32
+ /*
33
+ * a0 = cinfo->image_width
34
+ * a1 = input_buf
35
+ * a2 = output_buf
36
+ * a3 = output_row
37
+ * 16(sp) = num_rows
38
+ * 20(sp) = cinfo->num_components
39
+ *
40
+ * Null conversion for compression: for every row, component ci of each input pixel is copied to output_buf[ci][output_row].
41
+ */
42
+ SAVE_REGS_ON_STACK 8, s0, s1
43
+
44
+ lw t9, 24(sp) // t9 = num_rows (16(sp) on entry, assuming the macro above lowers sp by 8)
45
+ lw s0, 28(sp) // s0 = cinfo->num_components (20(sp) on entry, same assumption)
46
+ andi t0, a0, 3 // t0 = cinfo->image_width & 3
47
+ beqz t0, 4f // no residual
48
+ nop
49
+ 0: // row loop used when image_width is not a multiple of 4
50
+ addiu t9, t9, -1 // --num_rows
51
+ bltz t9, 7f // all rows done
52
+ li t1, 0 // delay slot: ci = 0
53
+ 1: // component loop
54
+ sll t3, t1, 2 // t3 = ci * 4 (byte offset into output_buf)
55
+ lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
56
+ lw t2, 0(a1) // t2 = inptr = *input_buf
57
+ sll t4, a3, 2 // t4 = output_row * 4
58
+ lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
59
+ addu t2, t2, t1 // inptr += ci
60
+ addu s1, t5, a0 // s1 = outptr + image_width (end of output row)
61
+ addu t6, t5, t0 // t6 = outptr + (image_width & 3)
62
+ 2: // residual loop: one sample per iteration until outptr == t6
63
+ lbu t3, 0(t2)
64
+ addiu t5, t5, 1
65
+ sb t3, -1(t5)
66
+ bne t6, t5, 2b
67
+ addu t2, t2, s0 // delay slot: inptr += num_components
68
+ 3: // four samples per iteration. NOTE(review): entered unconditionally; if image_width < 4, t5 already equals s1 here and this loop copies past the row end - confirm callers guarantee image_width >= 4
69
+ lbu t3, 0(t2)
70
+ addu t4, t2, s0 // t4 = inptr + 1 * num_components
71
+ addu t7, t4, s0 // t7 = inptr + 2 * num_components
72
+ addu t8, t7, s0 // t8 = inptr + 3 * num_components
73
+ addu t2, t8, s0 // inptr += 4 * num_components
74
+ lbu t4, 0(t4)
75
+ lbu t7, 0(t7)
76
+ lbu t8, 0(t8)
77
+ addiu t5, t5, 4
78
+ sb t3, -4(t5)
79
+ sb t4, -3(t5)
80
+ sb t7, -2(t5)
81
+ bne s1, t5, 3b // until outptr reaches the end of the row
82
+ sb t8, -1(t5) // delay slot: store the fourth sample
83
+ addiu t1, t1, 1 // ++ci
84
+ bne t1, s0, 1b // next component
85
+ nop
86
+ addiu a1, a1, 4 // ++input_buf
87
+ bgez t9, 0b // next row
88
+ addiu a3, a3, 1 // delay slot: ++output_row
89
+ b 7f
90
+ nop
91
+ 4: // row loop used when image_width is a multiple of 4 (t0 == 0)
92
+ addiu t9, t9, -1 // --num_rows
93
+ bltz t9, 7f // all rows done
94
+ li t1, 0 // delay slot: ci = 0
95
+ 5: // component loop
96
+ sll t3, t1, 2 // t3 = ci * 4 (byte offset into output_buf)
97
+ lwx t5, t3(a2) // t5 = outptr = output_buf[ci]
98
+ lw t2, 0(a1) // t2 = inptr = *input_buf
99
+ sll t4, a3, 2 // t4 = output_row * 4
100
+ lwx t5, t4(t5) // t5 = outptr = output_buf[ci][output_row]
101
+ addu t2, t2, t1 // inptr += ci
102
+ addu s1, t5, a0 // s1 = outptr + image_width (end of output row)
103
+ addu t6, t5, t0 // t6 = outptr (t0 == 0 on this path); not read by loop 6
104
+ 6: // four samples per iteration
105
+ lbu t3, 0(t2)
106
+ addu t4, t2, s0 // t4 = inptr + 1 * num_components
107
+ addu t7, t4, s0 // t7 = inptr + 2 * num_components
108
+ addu t8, t7, s0 // t8 = inptr + 3 * num_components
109
+ addu t2, t8, s0 // inptr += 4 * num_components
110
+ lbu t4, 0(t4)
111
+ lbu t7, 0(t7)
112
+ lbu t8, 0(t8)
113
+ addiu t5, t5, 4
114
+ sb t3, -4(t5)
115
+ sb t4, -3(t5)
116
+ sb t7, -2(t5)
117
+ bne s1, t5, 6b // until outptr reaches the end of the row
118
+ sb t8, -1(t5) // delay slot: store the fourth sample
119
+ addiu t1, t1, 1 // ++ci
120
+ bne t1, s0, 5b // next component
121
+ nop
122
+ addiu a1, a1, 4 // ++input_buf
123
+ bgez t9, 4b // next row
124
+ addiu a3, a3, 1 // delay slot: ++output_row
125
+ 7:
126
+ RESTORE_REGS_FROM_STACK 8, s0, s1
127
+
128
+ j ra
129
+ nop
130
+
131
+ END(jsimd_c_null_convert_dspr2)
132
+
133
+
134
+ /*****************************************************************************/
135
+ /*
136
+ * jsimd_extrgb_ycc_convert_dspr2
137
+ * jsimd_extbgr_ycc_convert_dspr2
138
+ * jsimd_extrgbx_ycc_convert_dspr2
139
+ * jsimd_extbgrx_ycc_convert_dspr2
140
+ * jsimd_extxbgr_ycc_convert_dspr2
141
+ * jsimd_extxrgb_ycc_convert_dspr2
142
+ *
143
+ * Colorspace conversion RGB -> YCbCr
144
+ */
145
+
146
+ .macro GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 colorid, pixel_size, \
147
+ r_offs, g_offs, b_offs
148
+
149
+ .macro DO_RGB_TO_YCC r, g, b, inptr
150
+ lbu \r, \r_offs(\inptr) // load the three channel bytes of one pixel
151
+ lbu \g, \g_offs(\inptr)
152
+ lbu \b, \b_offs(\inptr)
153
+ addiu \inptr, \pixel_size // advance to the next input pixel
154
+ .endm
155
+
156
+ LEAF_DSPR2(jsimd_\colorid\()_ycc_convert_dspr2)
157
+ /*
158
+ * a0 = cinfo->image_width
159
+ * a1 = input_buf
160
+ * a2 = output_buf
161
+ * a3 = output_row
162
+ * 16(sp) = num_rows
163
+ */
164
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
165
+
166
+ lw t7, 48(sp) // t7 = num_rows
167
+ li s0, 0x4c8b // FIX(0.29900)
168
+ li s1, 0x9646 // FIX(0.58700)
169
+ li s2, 0x1d2f // FIX(0.11400)
170
+ li s3, 0xffffd4cd // -FIX(0.16874)
171
+ li s4, 0xffffab33 // -FIX(0.33126)
172
+ li s5, 0x8000 // FIX(0.50000)
173
+ li s6, 0xffff94d1 // -FIX(0.41869)
174
+ li s7, 0xffffeb2f // -FIX(0.08131)
175
+ li t8, 0x807fff // CBCR_OFFSET + ONE_HALF-1
176
+
177
+ 0: // row loop
178
+ addiu t7, -1 // --num_rows
179
+ lw t6, 0(a1) // t6 = input_buf[0]
180
+ lw t0, 0(a2) // t0 = output_buf[0]
181
+ lw t1, 4(a2) // t1 = output_buf[1]
182
+ lw t2, 8(a2) // t2 = output_buf[2]
183
+ sll t3, a3, 2 // t3 = output_row * 4
184
+ lwx t0, t3(t0) // t0 = output_buf[0][output_row]
185
+ lwx t1, t3(t1) // t1 = output_buf[1][output_row]
186
+ lwx t2, t3(t2) // t2 = output_buf[2][output_row]
187
+
188
+ addu t9, t2, a0 // t9 = end address
189
+ addiu a3, 1 // ++output_row
190
+
191
+ 1: // pixel loop: t3 = r, t4 = g, t5 = b
192
+ DO_RGB_TO_YCC t3, t4, t5, t6
193
+
194
+ mtlo s5, $ac0 // ac0 (Y) starts at 0x8000
195
+ mtlo t8, $ac1 // ac1 (Cb) starts at CBCR_OFFSET + ONE_HALF-1
196
+ mtlo t8, $ac2 // ac2 (Cr) starts at CBCR_OFFSET + ONE_HALF-1
197
+ maddu $ac0, s2, t5 // Y += FIX(0.11400) * b
198
+ maddu $ac1, s5, t5 // Cb += FIX(0.50000) * b
199
+ maddu $ac2, s5, t3 // Cr += FIX(0.50000) * r
200
+ maddu $ac0, s0, t3 // Y += FIX(0.29900) * r
201
+ maddu $ac1, s3, t3 // Cb += -FIX(0.16874) * r
202
+ maddu $ac2, s6, t4 // Cr += -FIX(0.41869) * g
203
+ maddu $ac0, s1, t4 // Y += FIX(0.58700) * g
204
+ maddu $ac1, s4, t4 // Cb += -FIX(0.33126) * g
205
+ maddu $ac2, s7, t5 // Cr += -FIX(0.08131) * b
206
+ extr.w t3, $ac0, 16 // t3 = Y accumulator >> 16
207
+ extr.w t4, $ac1, 16 // t4 = Cb accumulator >> 16
208
+ extr.w t5, $ac2, 16 // t5 = Cr accumulator >> 16
209
+ sb t3, 0(t0) // only the low byte of each result is stored
210
+ sb t4, 0(t1)
211
+ sb t5, 0(t2)
212
+ addiu t0, 1
213
+ addiu t2, 1
214
+ bne t2, t9, 1b // until the output_buf[2] pointer reaches the row end
215
+ addiu t1, 1 // delay slot: advance the output_buf[1] pointer
216
+ bgtz t7, 0b // more rows remaining
217
+ addiu a1, 4 // delay slot: ++input_buf
218
+
219
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
220
+
221
+ j ra
222
+ nop
223
+ END(jsimd_\colorid\()_ycc_convert_dspr2)
224
+
225
+ .purgem DO_RGB_TO_YCC
226
+
227
+ .endm
228
+
229
+ /*-------------------------------------id -- pix R G B */
230
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
231
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
232
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
233
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
234
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
235
+ GENERATE_JSIMD_RGB_YCC_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
236
+
237
+
238
+ /*****************************************************************************/
239
+ /*
240
+ * jsimd_ycc_extrgb_convert_dspr2
241
+ * jsimd_ycc_extbgr_convert_dspr2
242
+ * jsimd_ycc_extrgbx_convert_dspr2
243
+ * jsimd_ycc_extbgrx_convert_dspr2
244
+ * jsimd_ycc_extxbgr_convert_dspr2
245
+ * jsimd_ycc_extxrgb_convert_dspr2
246
+ *
247
+ * Colorspace conversion YCbCr -> RGB
248
+ */
249
+
250
+ .macro GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 colorid, pixel_size, \
251
+ r_offs, g_offs, b_offs, a_offs
252
+
253
+ .macro STORE_YCC_TO_RGB scratch0 scratch1 scratch2 outptr
254
+ sb \scratch0, \r_offs(\outptr) // scratch0/1/2 hold red/green/blue
255
+ sb \scratch1, \g_offs(\outptr)
256
+ sb \scratch2, \b_offs(\outptr)
257
+ .if (\pixel_size == 4)
258
+ li t0, 0xFF // clobbers t0, after the three channel bytes have been stored
259
+ sb t0, \a_offs(\outptr) // fourth byte of a 4-byte pixel is set to 0xFF
260
+ .endif
261
+ addiu \outptr, \pixel_size // advance to the next output pixel
262
+ .endm
263
+
264
+ LEAF_DSPR2(jsimd_ycc_\colorid\()_convert_dspr2)
265
+ /*
266
+ * a0 = cinfo->image_width
267
+ * a1 = input_buf
268
+ * a2 = input_row
269
+ * a3 = output_buf
270
+ * 16(sp) = num_rows
271
+ */
272
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
273
+
274
+ lw s1, 48(sp) // s1 = num_rows
275
+ li t3, 0x8000 // rounding term added to the green sum before >> 16
276
+ li t4, 0x166e9 // FIX(1.40200)
277
+ li t5, 0x1c5a2 // FIX(1.77200)
278
+ li t6, 0xffff492e // -FIX(0.71414)
279
+ li t7, 0xffffa7e6 // -FIX(0.34414)
280
+ repl.ph t8, 128 // t8 = 128 in both halfwords
281
+
282
+ 0: // row loop
283
+ lw s0, 0(a3) // s0 = outptr = *output_buf
284
+ lw t0, 0(a1) // t0 = input_buf[0]
285
+ lw t1, 4(a1) // t1 = input_buf[1]
286
+ lw t2, 8(a1) // t2 = input_buf[2]
287
+ sll s5, a2, 2 // s5 = input_row * 4
288
+ addiu s1, -1 // --num_rows
289
+ lwx s2, s5(t0) // s2 = input_buf[0][input_row] (y)
290
+ lwx s3, s5(t1) // s3 = input_buf[1][input_row] (cb)
291
+ lwx s4, s5(t2) // s4 = input_buf[2][input_row] (cr)
292
+ addu t9, s2, a0 // t9 = end address of the y row
293
+ addiu a2, 1 // ++input_row
294
+
295
+ 1: // pixel loop
296
+ lbu s7, 0(s4) // cr
297
+ lbu s6, 0(s3) // cb
298
+ lbu s5, 0(s2) // y
299
+ addiu s2, 1
300
+ addiu s4, 1
301
+ addiu s7, -128 // cr - 128
302
+ addiu s6, -128 // cb - 128
303
+ mul t2, t7, s6 // t2 = -FIX(0.34414) * (cb - 128)
304
+ mul t0, t6, s7 // Crgtab[cr]
305
+ sll s7, 15
306
+ mulq_rs.w t1, t4, s7 // Crrtab[cr]
307
+ sll s6, 15
308
+ addu t2, t3 // Cbgtab[cb]
309
+ addu t2, t0 // t2 = Cbgtab[cb] + Crgtab[cr]
310
+
311
+ mulq_rs.w t0, t5, s6 // Cbbtab[cb]
312
+ sra t2, 16
313
+ addu t1, s5 // red = Crrtab[cr] + y
314
+ addu t2, s5 // add y
315
+ ins t2, t1, 16, 16 // t2 = red (upper halfword) | green (lower halfword)
316
+ subu.ph t2, t2, t8
317
+ addu t0, s5 // blue = Cbbtab[cb] + y
318
+ shll_s.ph t2, t2, 8
319
+ subu t0, 128
320
+ shra.ph t2, t2, 8
321
+ shll_s.w t0, t0, 24
322
+ addu.ph t2, t2, t8 // clip & store
323
+ sra t0, t0, 24
324
+ sra t1, t2, 16 // t1 = clipped red
325
+ addiu t0, 128 // t0 = clipped blue
326
+
327
+ STORE_YCC_TO_RGB t1, t2, t0, s0
328
+
329
+ bne s2, t9, 1b // until the y pointer reaches the row end
330
+ addiu s3, 1 // delay slot: advance the cb pointer
331
+ bgtz s1, 0b // more rows remaining
332
+ addiu a3, 4 // delay slot: ++output_buf
333
+
334
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
335
+
336
+ j ra
337
+ nop
338
+ END(jsimd_ycc_\colorid\()_convert_dspr2)
339
+
340
+ .purgem STORE_YCC_TO_RGB
341
+
342
+ .endm
343
+
344
+ /*-------------------------------------id -- pix R G B A */
345
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgb, 3, 0, 1, 2, 3
346
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgr, 3, 2, 1, 0, 3
347
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2, 3
348
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0, 3
349
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1, 0
350
+ GENERATE_JSIMD_YCC_RGB_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3, 0
351
+
352
+
353
+ /*****************************************************************************/
354
+ /*
355
+ * jsimd_extrgb_gray_convert_dspr2
356
+ * jsimd_extbgr_gray_convert_dspr2
357
+ * jsimd_extrgbx_gray_convert_dspr2
358
+ * jsimd_extbgrx_gray_convert_dspr2
359
+ * jsimd_extxbgr_gray_convert_dspr2
360
+ * jsimd_extxrgb_gray_convert_dspr2
361
+ *
362
+ * Colorspace conversion RGB -> GRAY
363
+ */
364
+
365
+ .macro GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 colorid, pixel_size, \
366
+ r_offs, g_offs, b_offs
367
+
368
+ .macro DO_RGB_TO_GRAY r, g, b, inptr
369
+ lbu \r, \r_offs(\inptr) // load the three channel bytes of one pixel
370
+ lbu \g, \g_offs(\inptr)
371
+ lbu \b, \b_offs(\inptr)
372
+ addiu \inptr, \pixel_size // advance to the next input pixel
373
+ .endm
374
+
375
+ LEAF_DSPR2(jsimd_\colorid\()_gray_convert_dspr2)
376
+ /*
377
+ * a0 = cinfo->image_width
378
+ * a1 = input_buf
379
+ * a2 = output_buf
380
+ * a3 = output_row
381
+ * 16(sp) = num_rows
382
+ */
383
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
384
+
385
+ li s0, 0x4c8b // s0 = FIX(0.29900)
386
+ li s1, 0x9646 // s1 = FIX(0.58700)
387
+ li s2, 0x1d2f // s2 = FIX(0.11400)
388
+ li s7, 0x8000 // s7 = FIX(0.50000)
389
+ lw s6, 48(sp) // s6 = num_rows
390
+ andi t7, a0, 3 // t7 = image_width & 3 (residual pixels)
391
+
392
+ 0: // row loop
393
+ addiu s6, -1 // --num_rows
394
+ lw t0, 0(a1) // t0 = inptr = *input_buf
395
+ lw t1, 0(a2) // t1 = output_buf[0]
396
+ sll t3, a3, 2 // t3 = output_row * 4
397
+ lwx t1, t3(t1) // t1 = outptr = output_buf[0][output_row]
398
+ addiu a3, 1 // ++output_row
399
+ addu t9, t1, a0 // t9 = end of the output row
400
+ subu t8, t9, t7 // t8 = end of the part handled four pixels at a time
401
+ beq t1, t8, 2f // fewer than four pixels: residual loop only
402
+ nop
403
+
404
+ 1: // four pixels per iteration, alternating between ac0 and ac1
405
+ DO_RGB_TO_GRAY t3, t4, t5, t0
406
+ DO_RGB_TO_GRAY s3, s4, s5, t0
407
+
408
+ mtlo s7, $ac0 // pixel 0: ac0 starts at 0x8000
409
+ maddu $ac0, s2, t5
410
+ maddu $ac0, s1, t4
411
+ maddu $ac0, s0, t3
412
+ mtlo s7, $ac1 // pixel 1: ac1 starts at 0x8000
413
+ maddu $ac1, s2, s5
414
+ maddu $ac1, s1, s4
415
+ maddu $ac1, s0, s3
416
+ extr.w t6, $ac0, 16 // t6 = gray value of pixel 0
417
+
418
+ DO_RGB_TO_GRAY t3, t4, t5, t0
419
+ DO_RGB_TO_GRAY s3, s4, s5, t0
420
+
421
+ mtlo s7, $ac0 // pixel 2 reuses ac0
422
+ maddu $ac0, s2, t5
423
+ maddu $ac0, s1, t4
424
+ extr.w t2, $ac1, 16 // t2 = gray value of pixel 1 (read before ac1 is reused)
425
+ maddu $ac0, s0, t3
426
+ mtlo s7, $ac1 // pixel 3 reuses ac1
427
+ maddu $ac1, s2, s5
428
+ maddu $ac1, s1, s4
429
+ maddu $ac1, s0, s3
430
+ extr.w t5, $ac0, 16 // t5 = gray value of pixel 2
431
+ sb t6, 0(t1)
432
+ sb t2, 1(t1)
433
+ extr.w t3, $ac1, 16 // t3 = gray value of pixel 3
434
+ addiu t1, 4
435
+ sb t5, -2(t1)
436
+ sb t3, -1(t1)
437
+ bne t1, t8, 1b
438
+ nop
439
+
440
+ 2:
441
+ beqz t7, 4f // no residual pixels
442
+ nop
443
+
444
+ 3: // residual loop: one pixel per iteration until outptr == t9
445
+ DO_RGB_TO_GRAY t3, t4, t5, t0
446
+
447
+ mtlo s7, $ac0
448
+ maddu $ac0, s2, t5
449
+ maddu $ac0, s1, t4
450
+ maddu $ac0, s0, t3
451
+ extr.w t6, $ac0, 16
452
+ sb t6, 0(t1)
453
+ addiu t1, 1
454
+ bne t1, t9, 3b
455
+ nop
456
+
457
+ 4:
458
+ bgtz s6, 0b // more rows remaining
459
+ addiu a1, 4 // delay slot: ++input_buf
460
+
461
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
462
+
463
+ j ra
464
+ nop
465
+ END(jsimd_\colorid\()_gray_convert_dspr2)
466
+
467
+ .purgem DO_RGB_TO_GRAY
468
+
469
+ .endm
470
+
471
+ /*-------------------------------------id -- pix R G B */
472
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgb, 3, 0, 1, 2
473
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgr, 3, 2, 1, 0
474
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extrgbx, 4, 0, 1, 2
475
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extbgrx, 4, 2, 1, 0
476
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxbgr, 4, 3, 2, 1
477
+ GENERATE_JSIMD_RGB_GRAY_CONVERT_DSPR2 extxrgb, 4, 1, 2, 3
478
+
479
+
480
+ /*****************************************************************************/
481
+ /*
482
+ * jsimd_h2v2_merged_upsample_dspr2
483
+ * jsimd_h2v2_extrgb_merged_upsample_dspr2
484
+ * jsimd_h2v2_extrgbx_merged_upsample_dspr2
485
+ * jsimd_h2v2_extbgr_merged_upsample_dspr2
486
+ * jsimd_h2v2_extbgrx_merged_upsample_dspr2
487
+ * jsimd_h2v2_extxbgr_merged_upsample_dspr2
488
+ * jsimd_h2v2_extxrgb_merged_upsample_dspr2
489
+ *
490
+ * Merged h2v2 upsample routines
491
+ */
492
+ .macro GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
493
+ r1_offs, g1_offs, \
494
+ b1_offs, a1_offs, \
495
+ r2_offs, g2_offs, \
496
+ b2_offs, a2_offs
497
+
498
+ .macro STORE_H2V2_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
499
+ scratch5 outptr
500
+ sb \scratch0, \r1_offs(\outptr) // scratch0..2 = first pixel, scratch3..5 = second pixel
501
+ sb \scratch1, \g1_offs(\outptr)
502
+ sb \scratch2, \b1_offs(\outptr)
503
+ sb \scratch3, \r2_offs(\outptr)
504
+ sb \scratch4, \g2_offs(\outptr)
505
+ sb \scratch5, \b2_offs(\outptr)
506
+ .if (\pixel_size == 8)
507
+ li \scratch0, 0xFF // overwrites scratch0, which has already been stored
508
+ sb \scratch0, \a1_offs(\outptr)
509
+ sb \scratch0, \a2_offs(\outptr)
510
+ .endif
511
+ addiu \outptr, \pixel_size // pixel_size here is the byte size of two pixels
512
+ .endm
513
+
514
+ .macro STORE_H2V2_1_PIXEL scratch0 scratch1 scratch2 outptr
515
+ sb \scratch0, \r1_offs(\outptr)
516
+ sb \scratch1, \g1_offs(\outptr)
517
+ sb \scratch2, \b1_offs(\outptr) // outptr is not advanced by this macro
518
+
519
+ .if (\pixel_size == 8)
520
+ li t0, 0xFF // clobbers t0
521
+ sb t0, \a1_offs(\outptr)
522
+ .endif
523
+ .endm
524
+
525
+ LEAF_DSPR2(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
526
+ /*
527
+ * a0 = cinfo->output_width
528
+ * a1 = input_buf
529
+ * a2 = in_row_group_ctr
530
+ * a3 = output_buf
531
+ * 16(sp) = cinfo->sample_range_limit
532
+ */
533
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
534
+
535
+ lw t9, 56(sp) // cinfo->sample_range_limit
536
+ lw v0, 0(a1) // v0 = input_buf[0]
537
+ lw v1, 4(a1) // v1 = input_buf[1]
538
+ lw t0, 8(a1) // t0 = input_buf[2]
539
+ sll t1, a2, 3 // t1 = byte offset of row in_row_group_ctr*2
540
+ addiu t2, t1, 4 // t2 = byte offset of row in_row_group_ctr*2 + 1
541
+ sll t3, a2, 2 // t3 = byte offset of row in_row_group_ctr
542
+ lw t4, 0(a3) // t4 = output_buf[0]
543
+ lwx t1, t1(v0) // t1 = input_buf[0][in_row_group_ctr*2]
544
+ lwx t2, t2(v0) // t2 = input_buf[0][in_row_group_ctr*2 + 1]
545
+ lwx t5, t3(v1) // t5 = input_buf[1][in_row_group_ctr]
546
+ lwx t6, t3(t0) // t6 = input_buf[2][in_row_group_ctr]
547
+ lw t7, 4(a3) // t7 = output_buf[1]
548
+ li s1, 0xe6ea
549
+ addiu t8, s1, 0x7fff // t8 = 0x166e9 [FIX(1.40200)]
550
+ addiu s0, t8, 0x5eb9 // s0 = 0x1c5a2 [FIX(1.77200)]
551
+ addiu s1, zero, 0xa7e6 // s1 = 0xffffa7e6 [-FIX(0.34414)]
552
+ xori s2, s1, 0xeec8 // s2 = 0xffff492e [-FIX(0.71414)]
553
+ srl t3, a0, 1 // t3 = output_width / 2 (number of pixel pairs)
554
+ blez t3, 2f // no full pair: go to the odd-width tail check
555
+ addu t0, t5, t3 // t0 = end address
556
+ 1: // one cb/cr sample -> 2x2 output pixels per iteration
557
+ lbu t3, 0(t5) // t3 = cb
558
+ lbu s3, 0(t6) // s3 = cr
559
+ addiu t5, t5, 1
560
+ addiu t3, t3, -128 // (cb - 128)
561
+ addiu s3, s3, -128 // (cr - 128)
562
+ mult $ac1, s1, t3 // ac1 = -FIX(0.34414) * (cb - 128)
563
+ madd $ac1, s2, s3 // ac1 += -FIX(0.71414) * (cr - 128)
564
+ sll s3, s3, 15
565
+ sll t3, t3, 15
566
+ mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
567
+ extr_r.w s5, $ac1, 16 // s5 = cgreen (ac1 rounded >> 16)
568
+ mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
569
+ lbu v0, 0(t1) // y of upper row, first pixel
570
+ addiu t6, t6, 1
571
+ addiu t1, t1, 2
572
+ addu t3, v0, s4 // y+cred
573
+ addu s3, v0, s5 // y+cgreen
574
+ addu v1, v0, s6 // y+cblue
575
+ addu t3, t9, t3 // y+cred
576
+ addu s3, t9, s3 // y+cgreen
577
+ addu v1, t9, v1 // y+cblue
578
+ lbu AT, 0(t3) // range-limit table lookups; AT and ra serve as scratch
579
+ lbu s7, 0(s3)
580
+ lbu ra, 0(v1) // ra is saved/restored by the stack macros of this routine
581
+ lbu v0, -1(t1) // y of upper row, second pixel
582
+ addu t3, v0, s4 // y+cred
583
+ addu s3, v0, s5 // y+cgreen
584
+ addu v1, v0, s6 // y+cblue
585
+ addu t3, t9, t3 // y+cred
586
+ addu s3, t9, s3 // y+cgreen
587
+ addu v1, t9, v1 // y+cblue
588
+ lbu t3, 0(t3)
589
+ lbu s3, 0(s3)
590
+ lbu v1, 0(v1)
591
+ lbu v0, 0(t2) // y of lower row, first pixel
592
+
593
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t4
594
+
595
+ addu t3, v0, s4 // y+cred
596
+ addu s3, v0, s5 // y+cgreen
597
+ addu v1, v0, s6 // y+cblue
598
+ addu t3, t9, t3 // y+cred
599
+ addu s3, t9, s3 // y+cgreen
600
+ addu v1, t9, v1 // y+cblue
601
+ lbu AT, 0(t3)
602
+ lbu s7, 0(s3)
603
+ lbu ra, 0(v1)
604
+ lbu v0, 1(t2) // y of lower row, second pixel
605
+ addiu t2, t2, 2
606
+ addu t3, v0, s4 // y+cred
607
+ addu s3, v0, s5 // y+cgreen
608
+ addu v1, v0, s6 // y+cblue
609
+ addu t3, t9, t3 // y+cred
610
+ addu s3, t9, s3 // y+cgreen
611
+ addu v1, t9, v1 // y+cblue
612
+ lbu t3, 0(t3)
613
+ lbu s3, 0(s3)
614
+ lbu v1, 0(v1)
615
+
616
+ STORE_H2V2_2_PIXELS AT, s7, ra, t3, s3, v1, t7
617
+
618
+ bne t0, t5, 1b // until the cb pointer reaches the end address
619
+ nop
620
+ 2: // tail: one remaining column when output_width is odd
621
+ andi t0, a0, 1
622
+ beqz t0, 4f // even width: done
623
+ lbu t3, 0(t5) // delay slot: t3 = cb (loaded even when the branch is taken)
624
+ lbu s3, 0(t6) // s3 = cr
625
+ addiu t3, t3, -128 // (cb - 128)
626
+ addiu s3, s3, -128 // (cr - 128)
627
+ mult $ac1, s1, t3
628
+ madd $ac1, s2, s3
629
+ sll s3, s3, 15
630
+ sll t3, t3, 15
631
+ lbu v0, 0(t1) // y of upper row
632
+ extr_r.w s5, $ac1, 16 // s5 = cgreen (ac1 rounded >> 16)
633
+ mulq_rs.w s4, t8, s3 // s4 = (C1 * cr + ONE_HALF)>> SCALEBITS
634
+ mulq_rs.w s6, s0, t3 // s6 = (C2 * cb + ONE_HALF)>> SCALEBITS
635
+ addu t3, v0, s4 // y+cred
636
+ addu s3, v0, s5 // y+cgreen
637
+ addu v1, v0, s6 // y+cblue
638
+ addu t3, t9, t3 // y+cred
639
+ addu s3, t9, s3 // y+cgreen
640
+ addu v1, t9, v1 // y+cblue
641
+ lbu t3, 0(t3)
642
+ lbu s3, 0(s3)
643
+ lbu v1, 0(v1)
644
+ lbu v0, 0(t2) // y of lower row
645
+
646
+ STORE_H2V2_1_PIXEL t3, s3, v1, t4
647
+
648
+ addu t3, v0, s4 // y+cred
649
+ addu s3, v0, s5 // y+cgreen
650
+ addu v1, v0, s6 // y+cblue
651
+ addu t3, t9, t3 // y+cred
652
+ addu s3, t9, s3 // y+cgreen
653
+ addu v1, t9, v1 // y+cblue
654
+ lbu t3, 0(t3)
655
+ lbu s3, 0(s3)
656
+ lbu v1, 0(v1)
657
+
658
+ STORE_H2V2_1_PIXEL t3, s3, v1, t7
659
+ 4:
660
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
661
+
662
+ j ra
663
+ nop
664
+
665
+ END(jsimd_h2v2_\colorid\()_merged_upsample_dspr2)
666
+
667
+ .purgem STORE_H2V2_1_PIXEL
668
+ .purgem STORE_H2V2_2_PIXELS
669
+ .endm
670
+
671
+ /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
672
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
673
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
674
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
675
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
676
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
677
+ GENERATE_H2V2_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
678
+
679
+
680
+ /*****************************************************************************/
681
+ /*
682
+ * jsimd_h2v1_merged_upsample_dspr2
683
+ * jsimd_h2v1_extrgb_merged_upsample_dspr2
684
+ * jsimd_h2v1_extrgbx_merged_upsample_dspr2
685
+ * jsimd_h2v1_extbgr_merged_upsample_dspr2
686
+ * jsimd_h2v1_extbgrx_merged_upsample_dspr2
687
+ * jsimd_h2v1_extxbgr_merged_upsample_dspr2
688
+ * jsimd_h2v1_extxrgb_merged_upsample_dspr2
689
+ *
690
+ * Merged h2v1 upsample routines
691
+ */
692
+
693
+ .macro GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 colorid, pixel_size, \
694
+ r1_offs, g1_offs, \
695
+ b1_offs, a1_offs, \
696
+ r2_offs, g2_offs, \
697
+ b2_offs, a2_offs
698
+
699
+ .macro STORE_H2V1_2_PIXELS scratch0 scratch1 scratch2 scratch3 scratch4 \
700
+ scratch5 outptr
701
+ sb \scratch0, \r1_offs(\outptr) // scratch0..2 = first pixel, scratch3..5 = second pixel
702
+ sb \scratch1, \g1_offs(\outptr)
703
+ sb \scratch2, \b1_offs(\outptr)
704
+ sb \scratch3, \r2_offs(\outptr)
705
+ sb \scratch4, \g2_offs(\outptr)
706
+ sb \scratch5, \b2_offs(\outptr)
707
+ .if (\pixel_size == 8)
708
+ li t0, 0xFF // clobbers t0
709
+ sb t0, \a1_offs(\outptr)
710
+ sb t0, \a2_offs(\outptr)
711
+ .endif
712
+ addiu \outptr, \pixel_size // pixel_size here is the byte size of two pixels
713
+ .endm
714
+
715
+ .macro STORE_H2V1_1_PIXEL scratch0 scratch1 scratch2 outptr
716
+ sb \scratch0, \r1_offs(\outptr)
717
+ sb \scratch1, \g1_offs(\outptr)
718
+ sb \scratch2, \b1_offs(\outptr) // outptr is not advanced by this macro
719
+ .if (\pixel_size == 8)
720
+ li t0, 0xFF // clobbers t0
721
+ sb t0, \a1_offs(\outptr)
722
+ .endif
723
+ .endm
724
+
725
+ LEAF_DSPR2(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
726
+ /*
727
+ * a0 = cinfo->output_width
728
+ * a1 = input_buf
729
+ * a2 = in_row_group_ctr
730
+ * a3 = output_buf
731
+ * 16(sp) = range_limit
732
+ */
733
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
734
+
735
+ li t0, 0xe6ea
736
+ lw t1, 0(a1) // t1 = input_buf[0]
737
+ lw t2, 4(a1) // t2 = input_buf[1]
738
+ lw t3, 8(a1) // t3 = input_buf[2]
739
+ lw t8, 56(sp) // t8 = range_limit
740
+ addiu s1, t0, 0x7fff // s1 = 0x166e9 [FIX(1.40200)]
741
+ addiu s2, s1, 0x5eb9 // s2 = 0x1c5a2 [FIX(1.77200)]
742
+ addiu s0, t0, 0x9916 // s0 = 0x8000
743
+ addiu s4, zero, 0xa7e6 // s4 = 0xffffa7e6 [-FIX(0.34414)]
744
+ xori s3, s4, 0xeec8 // s3 = 0xffff492e [-FIX(0.71414)]
745
+ srl t0, a0, 1 // t0 = output_width / 2 (number of pixel pairs)
746
+ sll t4, a2, 2 // t4 = in_row_group_ctr * 4
747
+ lwx s5, t4(t1) // s5 = inptr0
748
+ lwx s6, t4(t2) // s6 = inptr1
749
+ lwx s7, t4(t3) // s7 = inptr2
750
+ lw t7, 0(a3) // t7 = outptr
751
+ blez t0, 2f // no full pair: go to the odd-width tail check
752
+ addu t9, s6, t0 // t9 = end address
753
+ 1: // one cb/cr sample -> two output pixels per iteration
754
+ lbu t2, 0(s6) // t2 = cb
755
+ lbu t0, 0(s7) // t0 = cr
756
+ lbu t1, 0(s5) // t1 = y
757
+ addiu t2, t2, -128 // t2 = cb - 128
758
+ addiu t0, t0, -128 // t0 = cr - 128
759
+ mult $ac1, s4, t2 // ac1 = -FIX(0.34414) * (cb - 128)
760
+ madd $ac1, s3, t0 // ac1 += -FIX(0.71414) * (cr - 128)
761
+ sll t0, t0, 15
762
+ sll t2, t2, 15
763
+ mulq_rs.w t0, s1, t0 // t0 = (C1*cr + ONE_HALF)>> SCALEBITS
764
+ extr_r.w t5, $ac1, 16 // t5 = cgreen (ac1 rounded >> 16)
765
+ mulq_rs.w t6, s2, t2 // t6 = (C2*cb + ONE_HALF)>> SCALEBITS
766
+ addiu s7, s7, 1
767
+ addiu s6, s6, 1
768
+ addu t2, t1, t0 // t2 = y + cred
769
+ addu t3, t1, t5 // t3 = y + cgreen
770
+ addu t4, t1, t6 // t4 = y + cblue
771
+ addu t2, t8, t2 // add the range_limit table base
772
+ addu t3, t8, t3
773
+ addu t4, t8, t4
774
+ lbu t1, 1(s5) // t1 = y of the second pixel
775
+ lbu v0, 0(t2) // range-limited red, green, blue of the first pixel
776
+ lbu v1, 0(t3)
777
+ lbu ra, 0(t4) // ra is used as scratch; it is saved/restored by the stack macros of this routine
778
+ addu t2, t1, t0 // second pixel: y + cred
779
+ addu t3, t1, t5 // second pixel: y + cgreen
780
+ addu t4, t1, t6 // second pixel: y + cblue
781
+ addu t2, t8, t2
782
+ addu t3, t8, t3
783
+ addu t4, t8, t4
784
+ lbu t2, 0(t2)
785
+ lbu t3, 0(t3)
786
+ lbu t4, 0(t4)
787
+
788
+ STORE_H2V1_2_PIXELS v0, v1, ra, t2, t3, t4, t7
789
+
790
+ bne t9, s6, 1b // until the cb pointer reaches the end address
791
+ addiu s5, s5, 2 // delay slot: advance the y pointer by two samples
792
+ 2: // tail: one remaining pixel when output_width is odd
793
+ andi t0, a0, 1
794
+ beqz t0, 4f // even width: done
795
+ nop
796
+ 3:
797
+ lbu t2, 0(s6) // t2 = cb
798
+ lbu t0, 0(s7) // t0 = cr
799
+ lbu t1, 0(s5) // t1 = y
800
+ addiu t2, t2, -128 // (cb - 128)
801
+ addiu t0, t0, -128 // (cr - 128)
802
+ mul t3, s4, t2 // t3 = -FIX(0.34414) * (cb - 128)
803
+ mul t4, s3, t0 // t4 = -FIX(0.71414) * (cr - 128)
804
+ sll t0, t0, 15
805
+ sll t2, t2, 15
806
+ mulq_rs.w t0, s1, t0 // (C1*cr + ONE_HALF)>> SCALEBITS
807
+ mulq_rs.w t6, s2, t2 // (C2*cb + ONE_HALF)>> SCALEBITS
808
+ addu t3, t3, s0 // add the s0 rounding term
809
+ addu t3, t4, t3
810
+ sra t5, t3, 16 // (C4*cb + ONE_HALF + C3*cr)>> SCALEBITS
811
+ addu t2, t1, t0 // y + cred
812
+ addu t3, t1, t5 // y + cgreen
813
+ addu t4, t1, t6 // y + cblue
814
+ addu t2, t8, t2 // add the range_limit table base
815
+ addu t3, t8, t3
816
+ addu t4, t8, t4
817
+ lbu t2, 0(t2)
818
+ lbu t3, 0(t3)
819
+ lbu t4, 0(t4)
820
+
821
+ STORE_H2V1_1_PIXEL t2, t3, t4, t7
822
+ 4:
823
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, ra
824
+
825
+ j ra
826
+ nop
827
+
828
+ END(jsimd_h2v1_\colorid\()_merged_upsample_dspr2)
829
+
830
+ .purgem STORE_H2V1_1_PIXEL
831
+ .purgem STORE_H2V1_2_PIXELS
832
+ .endm
833
+
834
+ /*------------------------------------id -- pix R1 G1 B1 A1 R2 G2 B2 A2 */
835
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgb, 6, 0, 1, 2, 6, 3, 4, 5, 6
836
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgr, 6, 2, 1, 0, 3, 5, 4, 3, 6
837
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extrgbx, 8, 0, 1, 2, 3, 4, 5, 6, 7
838
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extbgrx, 8, 2, 1, 0, 3, 6, 5, 4, 7
839
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxbgr, 8, 3, 2, 1, 0, 7, 6, 5, 4
840
+ GENERATE_H2V1_MERGED_UPSAMPLE_DSPR2 extxrgb, 8, 1, 2, 3, 0, 5, 6, 7, 4
841
+
842
+
843
+ /*****************************************************************************/
844
+ /*
845
+ * jsimd_h2v2_fancy_upsample_dspr2
846
+ *
847
+ * Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
848
+ */
849
+ LEAF_DSPR2(jsimd_h2v2_fancy_upsample_dspr2)
850
+ /*
851
+ * a0 = cinfo->max_v_samp_factor
852
+ * a1 = downsampled_width
853
+ * a2 = input_data
854
+ * a3 = output_data_ptr
855
+ */
856
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
857
+
858
+ li s4, 0 // s4 = byte offset of the current output row pointer
859
+ lw s2, 0(a3) // s2 = *output_data_ptr
860
+ 0: // loop over input rows
861
+ li t9, 2 // t9 = output rows still to produce for this input row
862
+ lw s1, -4(a2) // s1 = inptr1
863
+
864
+ 1: // one output row per pass
865
+ lw s0, 0(a2) // s0 = inptr0
866
+ lwx s3, s4(s2) // s3 = outptr, loaded from the output row array at byte offset s4
867
+ addiu s5, a1, -2 // s5 = downsampled_width - 2
868
+ srl t4, s5, 1
869
+ sll t4, t4, 1 // t4 = (downsampled_width - 2) & ~1
870
+ lbu t0, 0(s0)
871
+ lbu t1, 1(s0)
872
+ lbu t2, 0(s1)
873
+ lbu t3, 1(s1)
874
+ addiu s0, 2
875
+ addiu s1, 2
876
+ addu t8, s0, t4 // t8 = end address
877
+ andi s5, s5, 1 // s5 = residual
878
+ sll t4, t0, 1
879
+ sll t6, t1, 1
880
+ addu t0, t0, t4 // t0 = (*inptr0++) * 3
881
+ addu t1, t1, t6 // t1 = (*inptr0++) * 3
882
+ addu t7, t0, t2 // t7 = thiscolsum
883
+ addu t6, t1, t3 // t6 = nextcolsum
884
+ sll t0, t7, 2 // t0 = thiscolsum * 4
885
+ subu t1, t0, t7 // t1 = thiscolsum * 3
886
+ shra_r.w t0, t0, 4 // t0 = (thiscolsum * 4 + 8) >> 4
887
+ addiu t1, 7
888
+ addu t1, t1, t6
889
+ srl t1, t1, 4 // t1 = (thiscolsum * 3 + nextcolsum + 7) >> 4
890
+ sb t0, 0(s3)
891
+ sb t1, 1(s3)
892
+ beq t8, s0, 22f // skip the two-column loop when (downsampled_width - 2) & ~1 is 0
893
+ addiu s3, 2
894
+ 2: // two input columns -> four output samples per iteration
895
+ lh t0, 0(s0) // t0 = A3|A2
896
+ lh t2, 0(s1) // t2 = B3|B2
897
+ addiu s0, 2
898
+ addiu s1, 2
899
+ preceu.ph.qbr t0, t0 // t0 = 0|A3|0|A2
900
+ preceu.ph.qbr t2, t2 // t2 = 0|B3|0|B2
901
+ shll.ph t1, t0, 1
902
+ sll t3, t6, 1
903
+ addu.ph t0, t1, t0 // t0 = A3*3|A2*3
904
+ addu t3, t3, t6 // t3 = this * 3
905
+ addu.ph t0, t0, t2 // t0 = next2|next1
906
+ addu t1, t3, t7 // t1 = this * 3 + last
907
+ andi t7, t0, 0xFFFF // t7 = next1
908
+ sll t2, t7, 1
909
+ addu t2, t7, t2 // t2 = next1*3
910
+ addu t4, t2, t6 // t4 = next1*3 + this
911
+ srl t6, t0, 16 // t6 = next2
912
+ shra_r.w t1, t1, 4 // t1 = (this*3 + last + 8) >> 4
913
+ addu t0, t3, t7
914
+ addiu t0, 7
915
+ srl t0, t0, 4 // t0 = (this*3 + next1 + 7) >> 4
916
+ shra_r.w t4, t4, 4 // t4 = (next1*3 + this + 8) >> 4
917
+ addu t2, t2, t6
918
+ addiu t2, 7
919
+ srl t2, t2, 4 // t2 = (next1*3 + next2 + 7) >> 4
920
+ sb t1, 0(s3)
921
+ sb t0, 1(s3)
922
+ sb t4, 2(s3)
923
+ sb t2, 3(s3)
924
+ bne t8, s0, 2b
925
+ addiu s3, 4
926
+ 22:
927
+ beqz s5, 4f // no residual column
928
+ addu t8, s0, s5 // t8 = end address for the residual loop
929
+ 3: // residual loop: one input column per iteration (t7 = last, t6 = this, t5 = next)
930
+ lbu t0, 0(s0)
931
+ lbu t2, 0(s1)
932
+ addiu s0, 1
933
+ addiu s1, 1
934
+ sll t3, t6, 1
935
+ sll t1, t0, 1
936
+ addu t1, t0, t1 // t1 = inptr0 * 3
937
+ addu t3, t3, t6 // t3 = thiscolsum * 3
938
+ addu t5, t1, t2 // t5 = nextcolsum
939
+ addu t1, t3, t7
940
+ shra_r.w t1, t1, 4 // t1 = (thiscolsum * 3 + lastcolsum + 8) >> 4
941
+ addu t0, t3, t5
942
+ addiu t0, 7
943
+ srl t0, t0, 4 // t0 = (thiscolsum * 3 + nextcolsum + 7) >> 4
944
+ sb t1, 0(s3)
945
+ sb t0, 1(s3)
946
+ addiu s3, 2
947
+ move t7, t6 // lastcolsum = thiscolsum
948
+ bne t8, s0, 3b
949
+ move t6, t5 // delay slot: thiscolsum = nextcolsum
950
+ 4: // last column of the row
951
+ sll t0, t6, 2 // t0 = thiscolsum * 4
952
+ subu t1, t0, t6 // t1 = thiscolsum * 3
953
+ addu t1, t1, t7
954
+ addiu s4, 4 // advance to the next output row pointer
955
+ shra_r.w t1, t1, 4 // t1 = (thiscolsum * 3 + lastcolsum + 8) >> 4
956
+ addiu t0, 7
957
+ srl t0, t0, 4 // t0 = (thiscolsum * 4 + 7) >> 4
958
+ sb t1, 0(s3)
959
+ sb t0, 1(s3)
960
+ addiu t9, -1
961
+ addiu s3, 2
962
+ bnez t9, 1b // second pass for this input row
963
+ lw s1, 4(a2) // delay slot: inptr1 = input_data[1] for the second pass
964
+ srl t0, s4, 2 // t0 = number of output rows produced so far
965
+ subu t0, a0, t0
966
+ bgtz t0, 0b // continue while fewer than max_v_samp_factor rows were produced
967
+ addiu a2, 4 // delay slot: ++input_data
968
+
969
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
970
+
971
+ j ra
972
+ nop
973
+ END(jsimd_h2v2_fancy_upsample_dspr2)
974
+
975
+
976
+ /*****************************************************************************/
977
+ LEAF_DSPR2(jsimd_h2v1_fancy_upsample_dspr2)
978
+ /* Horizontal 2x upsampling with 3:1 weighting: each interior input sample P[n] yields a left output (3*P[n] + P[n-1] + 1) >> 2 and a right output (3*P[n] + P[n+1] + 2) >> 2; the first and the last output of a row are plain copies of the edge samples. The instruction after each branch is written for the branch delay slot.
979
+ * a0 = cinfo->max_v_samp_factor (number of rows walked)
980
+ * a1 = downsampled_width (input samples per row)
981
+ * a2 = input_data (array of input row pointers, advanced by 4 per row)
982
+ * a3 = output_data_ptr (points to the array of output row pointers)
983
+ */
984
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
985
+
986
+ .set at
987
+
988
+ beqz a0, 3f // no rows: go straight to the epilogue
989
+ sll t0, a0, 2 // t0 = a0 * 4 (bytes of row pointers to walk)
990
+ lw s1, 0(a3) // s1 = *output_data_ptr, cursor over the output row pointers
991
+ li s3, 0x10001 // s3 = 1 in each halfword (the +1 bias of the paired left outputs)
992
+ addu s0, s1, t0 // s0 = end of the output row-pointer range
993
+ 0:
994
+ addiu t8, a1, -2 // t8 = interior samples (first and last are handled separately)
995
+ srl t9, t8, 2 // t9 = iterations of the 4-samples-at-a-time loop
996
+ lw t7, 0(a2) // t7 = inptr for this row
997
+ lw s2, 0(s1) // s2 = outptr for this row
998
+ lbu t0, 0(t7) // t0 = inptr[0]
999
+ lbu t1, 1(t7) // t1 = inptr[1]
1000
+ sll t2, t0, 1
1001
+ addu t2, t2, t0 // t2 = invalue*3
1002
+ addu t2, t2, t1 // t2 = inptr[0]*3 + inptr[1]
1003
+ shra_r.w t2, t2, 2 // t2 = (inptr[0]*3 + inptr[1] + 2) >> 2
1004
+ sb t0, 0(s2) // first output is a copy of the first input sample
1005
+ sb t2, 1(s2)
1006
+ beqz t9, 11f // fewer than 4 interior samples: skip the packed loop
1007
+ addiu s2, 2 // advance outptr past the two outputs just stored
1008
+ 1:
1009
+ ulw t0, 0(t7) // t0 = |P3|P2|P1|P0|
1010
+ ulw t1, 1(t7) // t1 = |P4|P3|P2|P1|
1011
+ ulh t2, 4(t7) // t2 = |0|0|P5|P4|
1012
+ preceu.ph.qbl t3, t0 // t3 = |0|P3|0|P2|
1013
+ preceu.ph.qbr t0, t0 // t0 = |0|P1|0|P0|
1014
+ preceu.ph.qbr t2, t2 // t2 = |0|P5|0|P4|
1015
+ preceu.ph.qbl t4, t1 // t4 = |0|P4|0|P3|
1016
+ preceu.ph.qbr t1, t1 // t1 = |0|P2|0|P1|
1017
+ shll.ph t5, t4, 1
1018
+ shll.ph t6, t1, 1
1019
+ addu.ph t5, t5, t4 // t5 = |P4*3|P3*3|
1020
+ addu.ph t6, t6, t1 // t6 = |P2*3|P1*3|
1021
+ addu.ph t4, t3, s3 // t4 = |P3+1|P2+1|
1022
+ addu.ph t0, t0, s3 // t0 = |P1+1|P0+1|
1023
+ addu.ph t4, t4, t5 // t4 = |P4*3+P3+1|P3*3+P2+1|
1024
+ addu.ph t0, t0, t6 // t0 = |P2*3+P1+1|P1*3+P0+1|
1025
+ shrl.ph t4, t4, 2 // t4 = left outputs of P4 (high half) and P3 (low half)
1026
+ shrl.ph t0, t0, 2 // t0 = left outputs of P2 (high half) and P1 (low half)
1027
+ addu.ph t2, t2, t5 // t2 = |P4*3+P5|P3*3+P4|
1028
+ addu.ph t3, t3, t6 // t3 = |P2*3+P3|P1*3+P2|
1029
+ shra_r.ph t2, t2, 2 // t2 = right outputs of P4 and P3 (rounding shift adds 2)
1030
+ shra_r.ph t3, t3, 2 // t3 = right outputs of P2 and P1 (rounding shift adds 2)
1031
+ shll.ph t2, t2, 8 // move the right outputs into the odd bytes
1032
+ shll.ph t3, t3, 8
1033
+ or t2, t4, t2 // t2 = left/right output bytes for P3 and P4 interleaved
1034
+ or t3, t3, t0 // t3 = left/right output bytes for P1 and P2 interleaved
1035
+ addiu t9, -1
1036
+ usw t3, 0(s2)
1037
+ usw t2, 4(s2)
1038
+ addiu s2, 8 // eight outputs stored
1039
+ bgtz t9, 1b
1040
+ addiu t7, 4 // four input samples consumed
1041
+ 11:
1042
+ andi t8, 3 // t8 = interior samples left for the scalar loop
1043
+ beqz t8, 22f
1044
+ addiu t7, 1 // t7 now points at the next unprocessed sample
1045
+
1046
+ 2:
1047
+ lbu t0, 0(t7) // t0 = current sample
1048
+ addiu t7, 1
1049
+ sll t1, t0, 1
1050
+ addu t2, t0, t1 // t2 = invalue*3
1051
+ lbu t3, -2(t7) // t3 = previous sample (t7 has already been advanced)
1052
+ lbu t4, 0(t7) // t4 = next sample
1053
+ addiu t3, 1 // bias of the left output
1054
+ addiu t4, 2 // bias of the right output
1055
+ addu t3, t3, t2
1056
+ addu t4, t4, t2
1057
+ srl t3, 2 // t3 = (invalue*3 + previous + 1) >> 2
1058
+ srl t4, 2 // t4 = (invalue*3 + next + 2) >> 2
1059
+ sb t3, 0(s2)
1060
+ sb t4, 1(s2)
1061
+ addiu t8, -1
1062
+ bgtz t8, 2b
1063
+ addiu s2, 2
1064
+
1065
+ 22:
1066
+ lbu t0, 0(t7) // t0 = last sample of the row
1067
+ lbu t2, -1(t7) // t2 = sample before it
1068
+ sll t1, t0, 1
1069
+ addu t1, t1, t0 // t1 = invalue * 3
1070
+ addu t1, t1, t2
1071
+ addiu t1, 1
1072
+ srl t1, t1, 2 // t1 = (invalue*3 + previous + 1) >> 2
1073
+ sb t1, 0(s2)
1074
+ sb t0, 1(s2) // final output is a copy of the last input sample
1075
+ addiu s1, 4 // next output row pointer
1076
+ bne s1, s0, 0b
1077
+ addiu a2, 4 // next input row pointer
1078
+ 3:
1079
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
1080
+
1081
+ j ra
1082
+ nop
1083
+ END(jsimd_h2v1_fancy_upsample_dspr2)
1084
+
1085
+
1086
+ /*****************************************************************************/
1087
LEAF_DSPR2(jsimd_h2v1_downsample_dspr2)
/*
 * 2:1 horizontal downsampling of v_samp_factor rows:
 *   out[i] = (in[2*i] + in[2*i + 1] + bias) >> 1,  bias = 0,1,0,1,...
 * Output columns past image_width / 2 are synthesised from the last input
 * sample of the row until width_in_blocks * 8 columns have been written.
 * The instruction after each branch sits in the branch delay slot.
 *
 * a0 = cinfo->image_width
 * a1 = cinfo->max_v_samp_factor (not referenced by this routine)
 * a2 = compptr->v_samp_factor (number of rows produced)
 * a3 = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 *
 * Row-invariant registers: s2 = padding halfwords per row, t9 = image_width & 2
 * (non-zero when one extra padding byte is needed).
 *
 * Changes from the previous revision:
 *  - the padding loop counts down a scratch copy (t5) instead of s2, so every
 *    row is padded, not just the first one;
 *  - the 4-column loop is skipped when image_width / 2 < 4 instead of running
 *    once and writing four columns too many;
 *  - the unused "loop_end2" computation was dropped.
 */
    .set at

    SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4

    beqz            a2, 7f
     lw             s1, 44(sp)      // s1 = output_data
    lw              s0, 40(sp)      // s0 = input_data
    srl             s2, a0, 2
    andi            t9, a0, 2
    srl             t7, t9, 1
    addu            s2, t7, s2
    sll             t0, a3, 3       // t0 = width_in_blocks*DCT
    srl             t7, t0, 1
    subu            s2, t7, s2      // s2 = padding halfwords per row
0:
    andi            t6, a0, 1       // t6 = temp_index
    addiu           t6, -1          // 0 for odd widths, -1 for even widths
    lw              t4, 0(s1)       // t4 = outptr
    lw              t5, 0(s0)       // t5 = inptr0
    li              s3, 0           // s3 = bias
    srl             t7, a0, 1       // t7 = image_width1
    srl             s4, t7, 2       // s4 = groups of four output columns
    beqz            s4, 12f         // narrow row: no full group to process
     andi           t8, t7, 3       // t8 = leftover output columns
1:
    ulhu            t0, 0(t5)
    ulhu            t1, 2(t5)
    ulhu            t2, 4(t5)
    ulhu            t3, 6(t5)
    raddu.w.qb      t0, t0          // sum of the two samples of each pair
    raddu.w.qb      t1, t1
    raddu.w.qb      t2, t2
    raddu.w.qb      t3, t3
    shra.ph         t0, t0, 1       // bias 0
    shra_r.ph       t1, t1, 1       // bias 1 (rounding shift)
    shra.ph         t2, t2, 1       // bias 0
    shra_r.ph       t3, t3, 1       // bias 1 (rounding shift)
    sb              t0, 0(t4)
    sb              t1, 1(t4)
    sb              t2, 2(t4)
    sb              t3, 3(t4)
    addiu           s4, -1
    addiu           t4, 4
    bgtz            s4, 1b
     addiu          t5, 8
12:
    beqz            t8, 3f
     addu           s4, t4, t8      // s4 = end address of the leftover columns
2:
    ulhu            t0, 0(t5)
    raddu.w.qb      t0, t0
    addqh.w         t0, t0, s3      // (sum + bias) >> 1
    xori            s3, s3, 1
    sb              t0, 0(t4)
    addiu           t4, 1
    bne             t4, s4, 2b
     addiu          t5, 2
3:
    lbux            t1, t6(t5)      // last real input sample of the row
    sll             t1, 1
    addqh.w         t2, t1, s3      // t2 = pixval1
    xori            s3, s3, 1
    addqh.w         t3, t1, s3      // t3 = pixval2
    blez            s2, 5f
     append         t3, t2, 8       // t3 = (pixval2 << 8) | pixval1
    move            t5, s2          // t5 = per-row padding counter (s2 stays intact)
4:
    ush             t3, 0(t4)
    addiu           t5, -1
    bgtz            t5, 4b
     addiu          t4, 2
5:
    beqz            t9, 6f
     nop
    sb              t2, 0(t4)       // odd number of padding bytes: one more pixval1
6:
    addiu           s1, 4
    addiu           a2, -1
    bnez            a2, 0b
     addiu          s0, 4
7:
    RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4

    j               ra
     nop
END(jsimd_h2v1_downsample_dspr2)
1180
+
1181
+
1182
+ /*****************************************************************************/
1183
LEAF_DSPR2(jsimd_h2v2_downsample_dspr2)
/*
 * 2:1 horizontal and vertical downsampling.  Each output row is built from
 * two consecutive input rows:
 *   out[i] = (in0[2i] + in0[2i+1] + in1[2i] + in1[2i+1] + bias) >> 2,
 *   bias = 1,2,1,2,...
 * Output columns past image_width / 2 are synthesised from the last input
 * sample of both rows until width_in_blocks * 8 columns have been written.
 * The instruction after each branch sits in the branch delay slot.
 *
 * a0 = cinfo->image_width
 * a1 = cinfo->max_v_samp_factor (not referenced by this routine)
 * a2 = compptr->v_samp_factor (number of output rows produced)
 * a3 = compptr->width_in_blocks
 * 16(sp) = input_data
 * 20(sp) = output_data
 *
 * Row-invariant registers: s5 = image_width / 2, s2 = padding halfwords per
 * row, t6 = index of the last real sample relative to the input cursor,
 * t9 = image_width & 2.
 *
 * Changes from the previous revision:
 *  - the per-row trip counts (s4, t8) are re-derived from s5 for every output
 *    row and the padding loop counts down a scratch copy (s3); previously all
 *    three were computed once and destroyed by the first row, so a second row
 *    ran with garbage loop bounds;
 *  - the 4-column loop is skipped when image_width / 2 < 4 instead of running
 *    once and writing four columns too many.
 * s3 and s5 were already saved/restored by the prologue/epilogue.
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    beqz            a2, 8f
     lw             s1, 52(sp)      // s1 = output_data
    lw              s0, 48(sp)      // s0 = input_data

    andi            t6, a0, 1       // t6 = temp_index
    addiu           t6, -1          // 0 for odd widths, -1 for even widths
    srl             s5, a0, 1       // s5 = image_width1
    andi            t9, a0, 2
    srl             s2, a0, 2
    srl             t7, t9, 1
    addu            s2, t7, s2
    sll             t0, a3, 3       // t0 = width_in_blocks*DCT
    srl             t7, t0, 1
    subu            s2, t7, s2      // s2 = padding halfwords per row
0:
    lw              t4, 0(s1)       // t4 = outptr
    lw              t5, 0(s0)       // t5 = inptr0
    lw              s7, 4(s0)       // s7 = inptr1
    srl             s4, s5, 2       // s4 = groups of four output columns
    andi            t8, s5, 3       // t8 = leftover output columns
    move            s3, s2          // s3 = per-row padding counter
    beqz            s4, 21f         // narrow row: no full group to process
     li             s6, 1           // s6 = bias
2:
    ulw             t0, 0(t5)       // t0 = |P3|P2|P1|P0|
    ulw             t1, 0(s7)       // t1 = |Q3|Q2|Q1|Q0|
    ulw             t2, 4(t5)
    ulw             t3, 4(s7)
    precrq.ph.w     t7, t0, t1      // t7 = |P3|P2|Q3|Q2|
    ins             t0, t1, 16, 16  // t0 = |Q1|Q0|P1|P0|
    raddu.w.qb      t1, t7
    raddu.w.qb      t0, t0
    shra_r.w        t1, t1, 2       // bias 2 (rounding shift)
    addiu           t0, 1           // bias 1
    srl             t0, 2
    precrq.ph.w     t7, t2, t3
    ins             t2, t3, 16, 16
    raddu.w.qb      t7, t7
    raddu.w.qb      t2, t2
    shra_r.w        t7, t7, 2       // bias 2 (rounding shift)
    addiu           t2, 1           // bias 1
    srl             t2, 2
    sb              t0, 0(t4)
    sb              t1, 1(t4)
    sb              t2, 2(t4)
    sb              t7, 3(t4)
    addiu           t4, 4
    addiu           t5, 8
    addiu           s4, s4, -1
    bgtz            s4, 2b
     addiu          s7, 8
21:
    beqz            t8, 4f
     addu           t8, t4, t8      // t8 = end address of the leftover columns
3:
    ulhu            t0, 0(t5)
    ulhu            t1, 0(s7)
    ins             t0, t1, 16, 16
    raddu.w.qb      t0, t0
    addu            t0, t0, s6
    srl             t0, 2
    xori            s6, s6, 3       // bias 1 <-> 2
    sb              t0, 0(t4)
    addiu           t5, 2
    addiu           t4, 1
    bne             t8, t4, 3b
     addiu          s7, 2
4:
    lbux            t1, t6(t5)      // last real sample of row 0
    sll             t1, 1
    lbux            t0, t6(s7)      // last real sample of row 1
    sll             t0, 1
    addu            t1, t1, t0
    addu            t3, t1, s6
    srl             t0, t3, 2       // t0 = pixval1
    xori            s6, s6, 3
    addu            t2, t1, s6
    srl             t1, t2, 2       // t1 = pixval2
    blez            s3, 6f
     append         t1, t0, 8       // t1 = (pixval2 << 8) | pixval1
5:
    ush             t1, 0(t4)
    addiu           s3, -1
    bgtz            s3, 5b
     addiu          t4, 2
6:
    beqz            t9, 7f
     nop
    sb              t0, 0(t4)       // odd number of padding bytes: one more pixval1
7:
    addiu           s1, 4
    addiu           a2, -1
    bnez            a2, 0b
     addiu          s0, 8           // two input rows consumed per output row
8:
    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j               ra
     nop
END(jsimd_h2v2_downsample_dspr2)
1293
+
1294
+
1295
+ /*****************************************************************************/
1296
LEAF_DSPR2(jsimd_h2v2_smooth_downsample_dspr2)
/*
 * 2:1 horizontal and vertical downsampling with smoothing.  Each output
 * sample is
 *   (membersum * (16384 - SF*80) + neighsum * (SF*16) + 32768) >> 16
 * where membersum is the sum of the 2x2 source samples and neighsum weights
 * the edge-adjacent neighbours twice and the corner neighbours once.
 * The routine first replicates the last sample of each of the
 * max_v_samp_factor + 2 input rows (starting at input_data[-1]) out to
 * 2 * width_in_blocks * 8 columns.
 * The instruction after each branch sits in the branch delay slot.
 *
 * a0 = input_data
 * a1 = output_data
 * a2 = compptr->v_samp_factor
 * a3 = cinfo->max_v_samp_factor
 * 16(sp) = cinfo->smoothing_factor
 * 20(sp) = compptr->width_in_blocks
 * 24(sp) = cinfo->image_width
 *
 * Row-loop registers: t4 = outrow, t5 = inrow, t6 = memberscale,
 * t7 = neighscale, t8 = outptr, s2/t9 = inptr0/inptr1,
 * s0/s1 = above_ptr/below_ptr, s7 = output_cols.
 *
 * Change from the previous revision: inrow (t5) is now advanced by 2 per
 * output row.  It used to be incremented by 2 twice, so the second output
 * row read input_data[4..] instead of input_data[2..].
 */
    .set at

    SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    lw              s7, 52(sp)      // compptr->width_in_blocks
    lw              s0, 56(sp)      // cinfo->image_width
    lw              s6, 48(sp)      // cinfo->smoothing_factor
    sll             s7, 3           // output_cols = width_in_blocks * DCTSIZE
    sll             v0, s7, 1
    subu            v0, v0, s0      // v0 = output_cols * 2 - image_width
    blez            v0, 2f          // no columns to replicate
     move           v1, zero
    addiu           t0, a3, 2       // t0 = cinfo->max_v_samp_factor + 2
0:
    addiu           t1, a0, -4
    sll             t2, v1, 2
    lwx             t1, t2(t1)      // t1 = input_data[v1 - 1]
    move            t3, v0
    addu            t1, t1, s0      // first column past the image
    lbu             t2, -1(t1)      // last real sample of the row
1:
    addiu           t3, t3, -1
    sb              t2, 0(t1)
    bgtz            t3, 1b
     addiu          t1, t1, 1
    addiu           v1, v1, 1
    bne             v1, t0, 0b
     nop
2:
    li              v0, 80
    mul             v0, s6, v0
    li              v1, 16384
    move            t4, zero        // t4 = outrow
    move            t5, zero        // t5 = inrow
    subu            t6, v1, v0      // t6 = 16384 - tmp_smoot_f * 80
    sll             t7, s6, 4       // t7 = tmp_smoot_f * 16
3:
    /* Special case for first column: pretend column -1 is same as column 0 */
    sll             v0, t4, 2
    lwx             t8, v0(a1)      // outptr = output_data[outrow]
    sll             v1, t5, 2
    addiu           t9, v1, 4
    addiu           s0, v1, -4
    addiu           s1, v1, 8
    lwx             s2, v1(a0)      // inptr0 = input_data[inrow]
    lwx             t9, t9(a0)      // inptr1 = input_data[inrow+1]
    lwx             s0, s0(a0)      // above_ptr = input_data[inrow-1]
    lwx             s1, s1(a0)      // below_ptr = input_data[inrow+2]
    lh              v0, 0(s2)
    lh              v1, 0(t9)
    lh              t0, 0(s0)
    lh              t1, 0(s1)
    ins             v0, v1, 16, 16
    ins             t0, t1, 16, 16
    raddu.w.qb      t2, v0          // t2 = membersum
    raddu.w.qb      s3, t0          // s3 = above[0..1] + below[0..1]
    lbu             v0, 0(s2)
    lbu             v1, 2(s2)
    lbu             t0, 0(t9)
    lbu             t1, 2(t9)
    addu            v0, v0, v1
    mult            $ac1, t2, t6    // ac1 = membersum * memberscale
    addu            t0, t0, t1
    lbu             t2, 2(s0)
    addu            t0, t0, v0
    lbu             t3, 2(s1)
    addu            s3, t0, s3
    lbu             v0, 0(s0)
    lbu             t0, 0(s1)
    sll             s3, s3, 1       // edge neighbours count twice
    addu            v0, v0, t2
    addu            t0, t0, t3
    addu            t0, t0, v0
    addu            s3, t0, s3      // s3 = neighsum
    madd            $ac1, s3, t7    // ac1 += neighsum * neighscale
    extr_r.w        v0, $ac1, 16    // (ac1 + 32768) >> 16
    addiu           t8, t8, 1
    addiu           s2, s2, 2
    addiu           t9, t9, 2
    addiu           s0, s0, 2
    addiu           s1, s1, 2
    sb              v0, -1(t8)
    addiu           s4, s7, -2
    and             s4, s4, 3       // columns handled one at a time before the unrolled loop
    addu            s5, s4, t8      // end address
4:
    lh              v0, 0(s2)
    lh              v1, 0(t9)
    lh              t0, 0(s0)
    lh              t1, 0(s1)
    ins             v0, v1, 16, 16
    ins             t0, t1, 16, 16
    raddu.w.qb      t2, v0
    raddu.w.qb      s3, t0
    lbu             v0, -1(s2)
    lbu             v1, 2(s2)
    lbu             t0, -1(t9)
    lbu             t1, 2(t9)
    addu            v0, v0, v1
    mult            $ac1, t2, t6
    addu            t0, t0, t1
    lbu             t2, 2(s0)
    addu            t0, t0, v0
    lbu             t3, 2(s1)
    addu            s3, t0, s3
    lbu             v0, -1(s0)
    lbu             t0, -1(s1)
    sll             s3, s3, 1
    addu            v0, v0, t2
    addu            t0, t0, t3
    addu            t0, t0, v0
    addu            s3, t0, s3
    madd            $ac1, s3, t7
    extr_r.w        t2, $ac1, 16
    addiu           t8, t8, 1
    addiu           s2, s2, 2
    addiu           t9, t9, 2
    addiu           s0, s0, 2
    sb              t2, -1(t8)
    bne             s5, t8, 4b
     addiu          s1, s1, 2
    addiu           s5, s7, -2
    subu            s5, s5, s4
    addu            s5, s5, t8      // end address
5:
    /* Four output columns per iteration */
    lh              v0, 0(s2)
    lh              v1, 0(t9)
    lh              t0, 0(s0)
    lh              t1, 0(s1)
    ins             v0, v1, 16, 16
    ins             t0, t1, 16, 16
    raddu.w.qb      t2, v0
    raddu.w.qb      s3, t0
    lbu             v0, -1(s2)
    lbu             v1, 2(s2)
    lbu             t0, -1(t9)
    lbu             t1, 2(t9)
    addu            v0, v0, v1
    mult            $ac1, t2, t6
    addu            t0, t0, t1
    lbu             t2, 2(s0)
    addu            t0, t0, v0
    lbu             t3, 2(s1)
    addu            s3, t0, s3
    lbu             v0, -1(s0)
    lbu             t0, -1(s1)
    sll             s3, s3, 1
    addu            v0, v0, t2
    addu            t0, t0, t3
    lh              v1, 2(t9)
    addu            t0, t0, v0
    lh              v0, 2(s2)
    addu            s3, t0, s3
    lh              t0, 2(s0)
    lh              t1, 2(s1)
    madd            $ac1, s3, t7
    extr_r.w        t2, $ac1, 16
    ins             t0, t1, 16, 16
    ins             v0, v1, 16, 16
    raddu.w.qb      s3, t0
    lbu             v1, 4(s2)
    lbu             t0, 1(t9)
    lbu             t1, 4(t9)
    sb              t2, 0(t8)
    raddu.w.qb      t3, v0
    lbu             v0, 1(s2)
    addu            t0, t0, t1
    mult            $ac1, t3, t6
    addu            v0, v0, v1
    lbu             t2, 4(s0)
    addu            t0, t0, v0
    lbu             v0, 1(s0)
    addu            s3, t0, s3
    lbu             t0, 1(s1)
    lbu             t3, 4(s1)
    addu            v0, v0, t2
    sll             s3, s3, 1
    addu            t0, t0, t3
    lh              v1, 4(t9)
    addu            t0, t0, v0
    lh              v0, 4(s2)
    addu            s3, t0, s3
    lh              t0, 4(s0)
    lh              t1, 4(s1)
    madd            $ac1, s3, t7
    extr_r.w        t2, $ac1, 16
    ins             t0, t1, 16, 16
    ins             v0, v1, 16, 16
    raddu.w.qb      s3, t0
    lbu             v1, 6(s2)
    lbu             t0, 3(t9)
    lbu             t1, 6(t9)
    sb              t2, 1(t8)
    raddu.w.qb      t3, v0
    lbu             v0, 3(s2)
    addu            t0, t0, t1
    mult            $ac1, t3, t6
    addu            v0, v0, v1
    lbu             t2, 6(s0)
    addu            t0, t0, v0
    lbu             v0, 3(s0)
    addu            s3, t0, s3
    lbu             t0, 3(s1)
    lbu             t3, 6(s1)
    addu            v0, v0, t2
    sll             s3, s3, 1
    addu            t0, t0, t3
    lh              v1, 6(t9)
    addu            t0, t0, v0
    lh              v0, 6(s2)
    addu            s3, t0, s3
    lh              t0, 6(s0)
    lh              t1, 6(s1)
    madd            $ac1, s3, t7
    extr_r.w        t3, $ac1, 16
    ins             t0, t1, 16, 16
    ins             v0, v1, 16, 16
    raddu.w.qb      s3, t0
    lbu             v1, 8(s2)
    lbu             t0, 5(t9)
    lbu             t1, 8(t9)
    sb              t3, 2(t8)
    raddu.w.qb      t2, v0
    lbu             v0, 5(s2)
    addu            t0, t0, t1
    mult            $ac1, t2, t6
    addu            v0, v0, v1
    lbu             t2, 8(s0)
    addu            t0, t0, v0
    lbu             v0, 5(s0)
    addu            s3, t0, s3
    lbu             t0, 5(s1)
    lbu             t3, 8(s1)
    addu            v0, v0, t2
    sll             s3, s3, 1
    addu            t0, t0, t3
    addiu           t8, t8, 4
    addu            t0, t0, v0
    addiu           s2, s2, 8
    addu            s3, t0, s3
    addiu           t9, t9, 8
    madd            $ac1, s3, t7
    extr_r.w        t1, $ac1, 16
    addiu           s0, s0, 8
    addiu           s1, s1, 8
    bne             s5, t8, 5b
     sb             t1, -1(t8)
    /* Special case for last column */
    lh              v0, 0(s2)
    lh              v1, 0(t9)
    lh              t0, 0(s0)
    lh              t1, 0(s1)
    ins             v0, v1, 16, 16
    ins             t0, t1, 16, 16
    raddu.w.qb      t2, v0
    raddu.w.qb      s3, t0
    lbu             v0, -1(s2)
    lbu             v1, 1(s2)
    lbu             t0, -1(t9)
    lbu             t1, 1(t9)
    addu            v0, v0, v1
    mult            $ac1, t2, t6
    addu            t0, t0, t1
    lbu             t2, 1(s0)
    addu            t0, t0, v0
    lbu             t3, 1(s1)
    addu            s3, t0, s3
    lbu             v0, -1(s0)
    lbu             t0, -1(s1)
    sll             s3, s3, 1
    addu            v0, v0, t2
    addu            t0, t0, t3
    addu            t0, t0, v0
    addu            s3, t0, s3
    madd            $ac1, s3, t7
    extr_r.w        t0, $ac1, 16
    sb              t0, 0(t8)
    addiu           t4, t4, 1       // outrow++
    bne             t4, a2, 3b
     addiu          t5, t5, 2       // inrow += 2 (exactly once per output row)

    RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7

    j               ra
     nop

END(jsimd_h2v2_smooth_downsample_dspr2)
1594
+
1595
+
1596
+ /*****************************************************************************/
1597
LEAF_DSPR2(jsimd_int_upsample_dspr2)
/*
 * Integer-factor upsampling: every input sample is written h_expand times
 * horizontally, and each generated output row is then copied into the
 * following v_expand - 1 output rows.
 * The instruction after each branch sits in the branch delay slot.
 *
 * a0 = upsample->h_expand[compptr->component_index]
 * a1 = upsample->v_expand[compptr->component_index]
 * a2 = input_data
 * a3 = output_data_ptr
 * 16(sp) = cinfo->output_width
 * 20(sp) = cinfo->max_v_samp_factor
 *
 * Registers: s0 = output_data, s1 = output_width, s2 = max_v_samp_factor,
 * s3 = outrow, t6 = inrow * 4 (byte offset into input_data),
 * v0 = &output_data[outrow], v1 = &output_data[row being filled by the copy].
 *
 * Changes from the previous revision:
 *  - row counters are scaled to pointer size before indexing the row-pointer
 *    arrays (inrow used to advance by one byte and outrow was added
 *    unscaled, producing misaligned lw addresses on any second pass);
 *  - vertical replication always copies from output_data[outrow] into
 *    outrow+1 .. outrow+v_expand-1 and no longer modifies s0 (it used to
 *    step s0 by 8 and copy row k to row k+1, reading rows that had not been
 *    written when v_expand > 2);
 *  - the outer loop continues while outrow < max_v_samp_factor instead of
 *    until exact equality, and a zero output_width skips the copy stage.
 */
    .set at

    SAVE_REGS_ON_STACK 16, s0, s1, s2, s3

    lw              s0, 0(a3)       // s0 = output_data
    lw              s1, 32(sp)      // s1 = cinfo->output_width
    lw              s2, 36(sp)      // s2 = cinfo->max_v_samp_factor
    li              t6, 0           // t6 = inrow * 4
    blez            s2, 10f
     li             s3, 0           // s3 = outrow
0:
    sll             t7, s3, 2       // t7 = outrow * 4
    addu            t0, a2, t6      // t0 = &input_data[inrow]
    addu            v0, s0, t7      // v0 = &output_data[outrow]
    lw              t3, 0(t0)       // t3 = inptr
    lw              t8, 0(v0)       // t8 = outptr
    blez            s1, 9f          // empty row: nothing to expand or copy
     addu           t5, t8, s1      // t5 = outend
1:
    lb              t2, 0(t3)       // t2 = invalue = *inptr++
    addiu           t3, 1
    beqz            a0, 3f
     move           t0, a0          // t0 = h_expand
2:
    sb              t2, 0(t8)
    addiu           t0, -1
    bgtz            t0, 2b
     addiu          t8, 1
3:
    bgt             t5, t8, 1b
     nop
4:
    addiu           t9, a1, -1      // t9 = v_expand - 1
    blez            t9, 9f
     addiu          v1, v0, 4       // v1 = &output_data[outrow + 1]
5:
    lw              t3, 0(v0)       // t3 = source row (output_data[outrow])
    lw              t4, 0(v1)       // t4 = destination row
    subu            t0, s1, 0xF
    blez            t0, 7f          // width <= 15: byte copy only
     addu           t5, t3, s1      // t5 = end address
    andi            t7, s1, 0xF     // t7 = residual
    subu            t8, t5, t7      // t8 = end of the 16-byte chunks
6:
    ulw             t0, 0(t3)
    ulw             t1, 4(t3)
    ulw             t2, 8(t3)
    usw             t0, 0(t4)
    ulw             t0, 12(t3)
    usw             t1, 4(t4)
    usw             t2, 8(t4)
    usw             t0, 12(t4)
    addiu           t3, 16
    bne             t3, t8, 6b
     addiu          t4, 16
    beqz            t7, 8f
     nop
7:
    lbu             t0, 0(t3)
    sb              t0, 0(t4)
    addiu           t3, 1
    bne             t3, t5, 7b
     addiu          t4, 1
8:
    addiu           t9, -1
    bgtz            t9, 5b
     addiu          v1, 4           // next destination row pointer
9:
    addu            s3, s3, a1      // outrow += v_expand
    slt             t0, s3, s2
    bnez            t0, 0b          // while (outrow < max_v_samp_factor)
     addiu          t6, 4           // inrow++ (scaled to pointer size)
10:
    RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3

    j               ra
     nop
END(jsimd_int_upsample_dspr2)
1682
+
1683
+
1684
+ /*****************************************************************************/
1685
+ LEAF_DSPR2(jsimd_h2v1_upsample_dspr2)
1686
+ /* Horizontal 2x replication: every input byte is stored twice in the output row. Sixteen output bytes are produced per pass of the main loop; the remaining (output_width & 0xf) bytes are produced two at a time. The instruction after each branch is written for the branch delay slot.
1687
+ * a0 = cinfo->max_v_samp_factor (number of rows walked)
1688
+ * a1 = cinfo->output_width
1689
+ * a2 = input_data (array of input row pointers)
1690
+ * a3 = output_data_ptr (points to the array of output row pointers)
1691
+ */
1692
+ lw t7, 0(a3) // t7 = output_data
1693
+ andi t8, a1, 0xf // t8 = residual
1694
+ sll t0, a0, 2 // t0 = a0 * 4 (bytes of row pointers to walk)
1695
+ blez a0, 4f // no rows: return
1696
+ addu t9, t7, t0 // t9 = output_data end address
1697
+ 0:
1698
+ lw t5, 0(t7) // t5 = outptr
1699
+ lw t6, 0(a2) // t6 = inptr
1700
+ addu t3, t5, a1 // t3 = outptr + output_width (end address)
1701
+ subu t3, t8 // t3 = end address - residual
1702
+ beq t5, t3, 2f // output_width < 16: only the residual loop runs
1703
+ move t4, t8 // t4 = residual byte counter
1704
+ 1:
1705
+ ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
1706
+ ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
1707
+ srl t1, t0, 16 // t1 = |X|X|P3|P2|
1708
+ ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
1709
+ ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
1710
+ ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
1711
+ ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
1712
+ usw t0, 0(t5)
1713
+ usw t1, 4(t5)
1714
+ srl t0, t2, 16 // t0 = |X|X|P7|P6|
1715
+ ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
1716
+ ins t0, t0, 16, 16 // t0 = |P7|P6|P7|P6|
1717
+ ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
1718
+ ins t0, t0, 8, 16 // t0 = |P7|P7|P6|P6|
1719
+ usw t2, 8(t5)
1720
+ usw t0, 12(t5)
1721
+ addiu t5, 16 // sixteen output bytes stored
1722
+ bne t5, t3, 1b
1723
+ addiu t6, 8 // eight input bytes consumed
1724
+ beqz t8, 3f // no residual: row finished
1725
+ move t4, t8 // t4 = residual byte counter
1726
+ 2:
1727
+ lbu t1, 0(t6)
1728
+ sb t1, 0(t5) // each input byte is stored twice
1729
+ sb t1, 1(t5)
1730
+ addiu t4, -2 // two output bytes produced
1731
+ addiu t6, 1
1732
+ bgtz t4, 2b
1733
+ addiu t5, 2
1734
+ 3:
1735
+ addiu t7, 4 // next output row pointer
1736
+ bne t9, t7, 0b
1737
+ addiu a2, 4 // next input row pointer
1738
+ 4:
1739
+ j ra
1740
+ nop
1741
+ END(jsimd_h2v1_upsample_dspr2)
1742
+
1743
+
1744
+ /*****************************************************************************/
1745
+ LEAF_DSPR2(jsimd_h2v2_upsample_dspr2)
1746
+ /* 2x replication in both directions: each input row is expanded horizontally (every byte stored twice) into one output row, which is then copied to the output row that follows it. The row counter in a0 is decremented by 2 per input row. The instruction after each branch is written for the branch delay slot.
1747
+ * a0 = cinfo->max_v_samp_factor
1748
+ * a1 = cinfo->output_width
1749
+ * a2 = input_data (array of input row pointers)
1750
+ * a3 = output_data_ptr (points to the array of output row pointers)
1751
+ */
1752
+ lw t7, 0(a3) // t7 = output_data
1753
+ blez a0, 7f // no rows: return
1754
+ andi t9, a1, 0xf // t9 = residual
1755
+ 0:
1756
+ lw t6, 0(a2) // t6 = inptr
1757
+ lw t5, 0(t7) // t5 = outptr
1758
+ addu t8, t5, a1 // t8 = outptr end address
1759
+ subu t8, t9 // t8 = end address - residual
1760
+ beq t5, t8, 2f // output_width < 16: only the residual loop runs
1761
+ move t4, t9 // t4 = residual byte counter
1762
+ 1:
1763
+ ulw t0, 0(t6) // t0 = |P3|P2|P1|P0|
1764
+ srl t1, t0, 16 // t1 = |X|X|P3|P2|
1765
+ ins t0, t0, 16, 16 // t0 = |P1|P0|P1|P0|
1766
+ ins t0, t0, 8, 16 // t0 = |P1|P1|P0|P0|
1767
+ ins t1, t1, 16, 16 // t1 = |P3|P2|P3|P2|
1768
+ ins t1, t1, 8, 16 // t1 = |P3|P3|P2|P2|
1769
+ ulw t2, 4(t6) // t2 = |P7|P6|P5|P4|
1770
+ usw t0, 0(t5)
1771
+ usw t1, 4(t5)
1772
+ srl t3, t2, 16 // t3 = |X|X|P7|P6|
1773
+ ins t2, t2, 16, 16 // t2 = |P5|P4|P5|P4|
1774
+ ins t2, t2, 8, 16 // t2 = |P5|P5|P4|P4|
1775
+ ins t3, t3, 16, 16 // t3 = |P7|P6|P7|P6|
1776
+ ins t3, t3, 8, 16 // t3 = |P7|P7|P6|P6|
1777
+ usw t2, 8(t5)
1778
+ usw t3, 12(t5)
1779
+ addiu t5, 16 // sixteen output bytes stored
1780
+ bne t5, t8, 1b
1781
+ addiu t6, 8 // eight input bytes consumed
1782
+ beqz t9, 3f // no residual: horizontal pass finished
1783
+ move t4, t9 // t4 = residual byte counter
1784
+ 2:
1785
+ lbu t0, 0(t6)
1786
+ sb t0, 0(t5) // each input byte is stored twice
1787
+ sb t0, 1(t5)
1788
+ addiu t4, -2 // two output bytes produced
1789
+ addiu t6, 1
1790
+ bgtz t4, 2b
1791
+ addiu t5, 2
1792
+ 3:
1793
+ lw t6, 0(t7) // t6 = outptr[0]
1794
+ lw t5, 4(t7) // t5 = outptr[1]
1795
+ addu t4, t6, a1 // t4 = new end address
1796
+ beq a1, t9, 5f // output_width < 16: byte copy only
1797
+ subu t8, t4, t9 // t8 = end of the 16-byte chunks
1798
+ 4:
1799
+ ulw t0, 0(t6) // copy 16 bytes of outptr[0] to outptr[1]
1800
+ ulw t1, 4(t6)
1801
+ ulw t2, 8(t6)
1802
+ usw t0, 0(t5)
1803
+ ulw t0, 12(t6)
1804
+ usw t1, 4(t5)
1805
+ usw t2, 8(t5)
1806
+ usw t0, 12(t5)
1807
+ addiu t6, 16
1808
+ bne t6, t8, 4b
1809
+ addiu t5, 16
1810
+ beqz t9, 6f // no residual: row copy finished
1811
+ nop
1812
+ 5:
1813
+ lbu t0, 0(t6) // copy the remaining bytes one at a time
1814
+ sb t0, 0(t5)
1815
+ addiu t6, 1
1816
+ bne t6, t4, 5b
1817
+ addiu t5, 1
1818
+ 6:
1819
+ addiu t7, 8 // two output row pointers consumed
1820
+ addiu a0, -2 // two output rows produced
1821
+ bgtz a0, 0b
1822
+ addiu a2, 4 // next input row pointer
1823
+ 7:
1824
+ j ra
1825
+ nop
1826
+ END(jsimd_h2v2_upsample_dspr2)
1827
+
1828
+
1829
+ /*****************************************************************************/
1830
+ LEAF_DSPR2(jsimd_idct_islow_dspr2)
1831
+ /* 8x8 inverse DCT in two passes. Pass 1 (label 1) dequantizes one column of coef_block per iteration and writes 32-bit intermediate results into a 256-byte workspace reserved on the stack. Pass 2 (label 4) transforms one workspace row per iteration, looks each result up in range_limit after masking with 0x3ff, and stores 8 bytes to the next output row. Each pass has a shortcut for the case where every term except the first is zero. The instruction after each branch is written for the branch delay slot.
1832
+ * a0 = coef_block
1833
+ * a1 = compptr->dcttable
1834
+ * a2 = output (array of output row pointers)
1835
+ * a3 = range_limit
1836
+ */
1837
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
1838
+
1839
+ addiu sp, sp, -256 // reserve the workspace (64 words)
1840
+ move v0, sp // v0 = wsptr
1841
+ addiu v1, zero, 8 // v1 = DCTSIZE = 8
1842
+ 1:
1843
+ lh s4, 32(a0) // s4 = inptr[16]
1844
+ lh s5, 64(a0) // s5 = inptr[32]
1845
+ lh s6, 96(a0) // s6 = inptr[48]
1846
+ lh t1, 112(a0) // t1 = inptr[56]
1847
+ lh t7, 16(a0) // t7 = inptr[8]
1848
+ lh t5, 80(a0) // t5 = inptr[40]
1849
+ lh t3, 48(a0) // t3 = inptr[24]
1850
+ or s4, s4, t1
1851
+ or s4, s4, t3
1852
+ or s4, s4, t5
1853
+ or s4, s4, t7
1854
+ or s4, s4, s5
1855
+ or s4, s4, s6 // s4 = OR of the seven non-first terms of this column
1856
+ bnez s4, 2f // any of them non-zero: full column transform
1857
+ addiu v1, v1, -1 // one column fewer to go
1858
+ lh s5, 0(a1) // quantptr[DCTSIZE*0]
1859
+ lh s6, 0(a0) // inptr[DCTSIZE*0]
1860
+ mul s5, s5, s6 // DEQUANTIZE(inptr[0], quantptr[0])
1861
+ sll s5, s5, 2 // scale the first term up by 2 bits
1862
+ sw s5, 0(v0) // the whole workspace column gets the same value
1863
+ sw s5, 32(v0)
1864
+ sw s5, 64(v0)
1865
+ sw s5, 96(v0)
1866
+ sw s5, 128(v0)
1867
+ sw s5, 160(v0)
1868
+ sw s5, 192(v0)
1869
+ b 3f
1870
+ sw s5, 224(v0)
1871
+ 2:
1872
+ lh t0, 112(a1)
1873
+ lh t2, 48(a1)
1874
+ lh t4, 80(a1)
1875
+ lh t6, 16(a1)
1876
+ mul t0, t0, t1 // DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7])
1877
+ mul t1, t2, t3 // DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3])
1878
+ mul t2, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5])
1879
+ mul t3, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1])
1880
+ lh t4, 32(a1)
1881
+ lh t5, 32(a0)
1882
+ lh t6, 96(a1)
1883
+ lh t7, 96(a0)
1884
+ addu s0, t0, t1 // z3 = tmp0 + tmp2
1885
+ addu s1, t1, t2 // z2 = tmp1 + tmp2
1886
+ addu s2, t2, t3 // z4 = tmp1 + tmp3
1887
+ addu s3, s0, s2 // z3 + z4
1888
+ addiu t9, zero, 9633 // FIX_1_175875602
1889
+ mul s3, s3, t9 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
1890
+ addu t8, t0, t3 // z1 = tmp0 + tmp3
1891
+ addiu t9, zero, 2446 // FIX_0_298631336
1892
+ mul t0, t0, t9 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
1893
+ addiu t9, zero, 16819 // FIX_2_053119869
1894
+ mul t2, t2, t9 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
1895
+ addiu t9, zero, 25172 // FIX_3_072711026
1896
+ mul t1, t1, t9 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
1897
+ addiu t9, zero, 12299 // FIX_1_501321110
1898
+ mul t3, t3, t9 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
1899
+ addiu t9, zero, 16069 // FIX_1_961570560
1900
+ mul s0, s0, t9 // -z3 = MULTIPLY(z3, FIX_1_961570560)
1901
+ addiu t9, zero, 3196 // FIX_0_390180644
1902
+ mul s2, s2, t9 // -z4 = MULTIPLY(z4, FIX_0_390180644)
1903
+ addiu t9, zero, 7373 // FIX_0_899976223
1904
+ mul t8, t8, t9 // -z1 = MULTIPLY(z1, FIX_0_899976223)
1905
+ addiu t9, zero, 20995 // FIX_2_562915447
1906
+ mul s1, s1, t9 // -z2 = MULTIPLY(z2, FIX_2_562915447)
1907
+ subu s0, s3, s0 // z3 += z5
1908
+ addu t0, t0, s0 // tmp0 += z3
1909
+ addu t1, t1, s0 // tmp2 += z3
1910
+ subu s2, s3, s2 // z4 += z5
1911
+ addu t2, t2, s2 // tmp1 += z4
1912
+ addu t3, t3, s2 // tmp3 += z4
1913
+ subu t0, t0, t8 // tmp0 += z1
1914
+ subu t1, t1, s1 // tmp2 += z2
1915
+ subu t2, t2, s1 // tmp1 += z2
1916
+ subu t3, t3, t8 // tmp3 += z1
1917
+ mul s0, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
1918
+ addiu t9, zero, 6270 // FIX_0_765366865
1919
+ mul s1, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
1920
+ lh t4, 0(a1)
1921
+ lh t5, 0(a0)
1922
+ lh t6, 64(a1)
1923
+ lh t7, 64(a0)
1924
+ mul s2, t9, s0 // MULTIPLY(z2, FIX_0_765366865)
1925
+ mul t5, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
1926
+ mul t6, t6, t7 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
1927
+ addiu t9, zero, 4433 // FIX_0_541196100
1928
+ addu s3, s0, s1 // z2 + z3
1929
+ mul s3, s3, t9 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
1930
+ addiu t9, zero, 15137 // FIX_1_847759065
1931
+ mul t8, s1, t9 // MULTIPLY(z3, FIX_1_847759065)
1932
+ addu t4, t5, t6
1933
+ subu t5, t5, t6
1934
+ sll t4, t4, 13 // tmp0 = (z2 + z3) << CONST_BITS
1935
+ sll t5, t5, 13 // tmp1 = (z2 - z3) << CONST_BITS
1936
+ addu t7, s3, s2 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
1937
+ subu t6, s3, t8 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
1938
+ addu s0, t4, t7 // tmp10 = tmp0 + tmp3
1939
+ subu s1, t4, t7 // tmp13 = tmp0 - tmp3
1940
+ addu s2, t5, t6 // tmp11 = tmp1 + tmp2
1941
+ subu s3, t5, t6 // tmp12 = tmp1 - tmp2
1942
+ addu t4, s0, t3 // even part +/- odd part for each of the eight outputs
1943
+ subu s0, s0, t3
1944
+ addu t3, s2, t1
1945
+ subu s2, s2, t1
1946
+ addu t1, s3, t2
1947
+ subu s3, s3, t2
1948
+ addu t2, s1, t0
1949
+ subu s1, s1, t0
1950
+ shra_r.w t4, t4, 11 // descale by 11 bits with rounding
1951
+ shra_r.w t3, t3, 11
1952
+ shra_r.w t1, t1, 11
1953
+ shra_r.w t2, t2, 11
1954
+ shra_r.w s1, s1, 11
1955
+ shra_r.w s3, s3, 11
1956
+ shra_r.w s2, s2, 11
1957
+ shra_r.w s0, s0, 11
1958
+ sw t4, 0(v0) // store the workspace column, one word per row
1959
+ sw t3, 32(v0)
1960
+ sw t1, 64(v0)
1961
+ sw t2, 96(v0)
1962
+ sw s1, 128(v0)
1963
+ sw s3, 160(v0)
1964
+ sw s2, 192(v0)
1965
+ sw s0, 224(v0)
1966
+ 3:
1967
+ addiu a1, a1, 2 // next quantization-table column
1968
+ addiu a0, a0, 2 // next coefficient column
1969
+ bgtz v1, 1b
1970
+ addiu v0, v0, 4 // next workspace column
1971
+ move v0, sp // v0 = wsptr, back at the first workspace row
1972
+ addiu v1, zero, 8 // v1 = rows left for pass 2
1973
+ 4:
1974
+ lw t0, 8(v0) // z2 = (JLONG)wsptr[2]
1975
+ lw t1, 24(v0) // z3 = (JLONG)wsptr[6]
1976
+ lw t2, 0(v0) // (JLONG)wsptr[0]
1977
+ lw t3, 16(v0) // (JLONG)wsptr[4]
1978
+ lw s4, 4(v0) // (JLONG)wsptr[1]
1979
+ lw s5, 12(v0) // (JLONG)wsptr[3]
1980
+ lw s6, 20(v0) // (JLONG)wsptr[5]
1981
+ lw s7, 28(v0) // (JLONG)wsptr[7]
1982
+ or s4, s4, t0
1983
+ or s4, s4, t1
1984
+ or s4, s4, t3
1985
+ or s4, s4, s7
1986
+ or s4, s4, s5
1987
+ or s4, s4, s6 // s4 = OR of wsptr[1..7]
1988
+ bnez s4, 5f // any of them non-zero: full row transform
1989
+ addiu v1, v1, -1 // one row fewer to go
1990
+ shra_r.w s5, t2, 5 // descale wsptr[0] by 5 bits with rounding
1991
+ andi s5, s5, 0x3ff // mask to a range_limit index
1992
+ lbux s5, s5(a3) // s5 = range_limit[index]
1993
+ lw s1, 0(a2) // s1 = output row pointer
1994
+ replv.qb s5, s5 // replicate the byte into all four byte lanes
1995
+ usw s5, 0(s1) // the same value fills all eight output bytes
1996
+ usw s5, 4(s1)
1997
+ b 6f
1998
+ nop
1999
+ 5:
2000
+ addu t4, t0, t1 // z2 + z3
2001
+ addiu t8, zero, 4433 // FIX_0_541196100
2002
+ mul t5, t4, t8 // z1 = MULTIPLY(z2 + z3, FIX_0_541196100)
2003
+ addiu t8, zero, 15137 // FIX_1_847759065
2004
+ mul t1, t1, t8 // MULTIPLY(z3, FIX_1_847759065)
2005
+ addiu t8, zero, 6270 // FIX_0_765366865
2006
+ mul t0, t0, t8 // MULTIPLY(z2, FIX_0_765366865)
2007
+ addu t4, t2, t3 // (JLONG)wsptr[0] + (JLONG)wsptr[4]
2008
+ subu t2, t2, t3 // (JLONG)wsptr[0] - (JLONG)wsptr[4]
2009
+ sll t4, t4, 13 // tmp0 = (wsptr[0] + wsptr[4]) << CONST_BITS
2010
+ sll t2, t2, 13 // tmp1 = (wsptr[0] - wsptr[4]) << CONST_BITS
2011
+ subu t1, t5, t1 // tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065)
2012
+ subu t3, t2, t1 // tmp12 = tmp1 - tmp2
2013
+ addu t2, t2, t1 // tmp11 = tmp1 + tmp2
2014
+ addu t5, t5, t0 // tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865)
2015
+ subu t1, t4, t5 // tmp13 = tmp0 - tmp3
2016
+ addu t0, t4, t5 // tmp10 = tmp0 + tmp3
2017
+ lw t4, 28(v0) // tmp0 = (JLONG)wsptr[7]
2018
+ lw t6, 12(v0) // tmp2 = (JLONG)wsptr[3]
2019
+ lw t5, 20(v0) // tmp1 = (JLONG)wsptr[5]
2020
+ lw t7, 4(v0) // tmp3 = (JLONG)wsptr[1]
2021
+ addu s0, t4, t6 // z3 = tmp0 + tmp2
2022
+ addiu t8, zero, 9633 // FIX_1_175875602
2023
+ addu s1, t5, t7 // z4 = tmp1 + tmp3
2024
+ addu s2, s0, s1 // z3 + z4
2025
+ mul s2, s2, t8 // z5 = MULTIPLY(z3 + z4, FIX_1_175875602)
2026
+ addu s3, t4, t7 // z1 = tmp0 + tmp3
2027
+ addu t9, t5, t6 // z2 = tmp1 + tmp2
2028
+ addiu t8, zero, 16069 // FIX_1_961570560
2029
+ mul s0, s0, t8 // -z3 = MULTIPLY(z3, FIX_1_961570560)
2030
+ addiu t8, zero, 3196 // FIX_0_390180644
2031
+ mul s1, s1, t8 // -z4 = MULTIPLY(z4, FIX_0_390180644)
2032
+ addiu t8, zero, 2446 // FIX_0_298631336
2033
+ mul t4, t4, t8 // tmp0 = MULTIPLY(tmp0, FIX_0_298631336)
2034
+ addiu t8, zero, 7373 // FIX_0_899976223
2035
+ mul s3, s3, t8 // -z1 = MULTIPLY(z1, FIX_0_899976223)
2036
+ addiu t8, zero, 16819 // FIX_2_053119869
2037
+ mul t5, t5, t8 // tmp1 = MULTIPLY(tmp1, FIX_2_053119869)
2038
+ addiu t8, zero, 20995 // FIX_2_562915447
2039
+ mul t9, t9, t8 // -z2 = MULTIPLY(z2, FIX_2_562915447)
2040
+ addiu t8, zero, 25172 // FIX_3_072711026
2041
+ mul t6, t6, t8 // tmp2 = MULTIPLY(tmp2, FIX_3_072711026)
2042
+ addiu t8, zero, 12299 // FIX_1_501321110
2043
+ mul t7, t7, t8 // tmp3 = MULTIPLY(tmp3, FIX_1_501321110)
2044
+ subu s0, s2, s0 // z3 += z5
2045
+ subu s1, s2, s1 // z4 += z5
2046
+ addu t4, t4, s0
2047
+ subu t4, t4, s3 // tmp0
2048
+ addu t5, t5, s1
2049
+ subu t5, t5, t9 // tmp1
2050
+ addu t6, t6, s0
2051
+ subu t6, t6, t9 // tmp2
2052
+ addu t7, t7, s1
2053
+ subu t7, t7, s3 // tmp3
2054
+ addu s0, t0, t7 // tmp10 + tmp3 -> output byte 0
2055
+ subu t0, t0, t7 // tmp10 - tmp3 -> output byte 7
2056
+ addu t7, t2, t6 // tmp11 + tmp2 -> output byte 1
2057
+ subu t2, t2, t6 // tmp11 - tmp2 -> output byte 6
2058
+ addu t6, t3, t5 // tmp12 + tmp1 -> output byte 2
2059
+ subu t3, t3, t5 // tmp12 - tmp1 -> output byte 5
2060
+ addu t5, t1, t4 // tmp13 + tmp0 -> output byte 3
2061
+ subu t1, t1, t4 // tmp13 - tmp0 -> output byte 4
2062
+ shra_r.w s0, s0, 18 // descale by 18 bits with rounding
2063
+ shra_r.w t7, t7, 18
2064
+ shra_r.w t6, t6, 18
2065
+ shra_r.w t5, t5, 18
2066
+ shra_r.w t1, t1, 18
2067
+ shra_r.w t3, t3, 18
2068
+ shra_r.w t2, t2, 18
2069
+ shra_r.w t0, t0, 18
2070
+ andi s0, s0, 0x3ff // mask each result to a range_limit index
2071
+ andi t7, t7, 0x3ff
2072
+ andi t6, t6, 0x3ff
2073
+ andi t5, t5, 0x3ff
2074
+ andi t1, t1, 0x3ff
2075
+ andi t3, t3, 0x3ff
2076
+ andi t2, t2, 0x3ff
2077
+ andi t0, t0, 0x3ff
2078
+ lw s1, 0(a2) // s1 = output row pointer
2079
+ lbux s0, s0(a3) // look each index up in range_limit
2080
+ lbux t7, t7(a3)
2081
+ lbux t6, t6(a3)
2082
+ lbux t5, t5(a3)
2083
+ lbux t1, t1(a3)
2084
+ lbux t3, t3(a3)
2085
+ lbux t2, t2(a3)
2086
+ lbux t0, t0(a3)
2087
+ sb s0, 0(s1)
2088
+ sb t7, 1(s1)
2089
+ sb t6, 2(s1)
2090
+ sb t5, 3(s1)
2091
+ sb t1, 4(s1)
2092
+ sb t3, 5(s1)
2093
+ sb t2, 6(s1)
2094
+ sb t0, 7(s1)
2095
+ 6:
2096
+ addiu v0, v0, 32 // next workspace row (8 words)
2097
+ bgtz v1, 4b
2098
+ addiu a2, a2, 4 // next output row pointer
2099
+ addiu sp, sp, 256 // release the workspace
2100
+
2101
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2102
+
2103
+ j ra
2104
+ nop
2105
+
2106
+ END(jsimd_idct_islow_dspr2)
2107
+
2108
+
2109
+ /*****************************************************************************/
2110
+ LEAF_DSPR2(jsimd_idct_ifast_cols_dspr2)
2111
+ /* Column pass of the ifast IDCT: every loop iteration dequantizes and transforms one 32-bit word, i.e. two packed 16-bit columns.
2112
+ * a0 = inptr
2113
+ * a1 = quantptr
2114
+ * a2 = wsptr
2115
+ * a3 = mips_idct_ifast_coefs
2116
+ */
2117
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2118
+
2119
+ addiu t9, a0, 16 // end address
2120
+ or AT, a3, zero // AT = a3, the constant table base used by the lw ..(AT) loads below
2121
+
2122
+ 0:
2123
+ lw s0, 0(a1) // quantptr[DCTSIZE*0]
2124
+ lw t0, 0(a0) // inptr[DCTSIZE*0]
2125
+ lw t1, 16(a0) // inptr[DCTSIZE*1]
2126
+ muleq_s.w.phl v0, t0, s0 // tmp0 ...
2127
+ lw t2, 32(a0) // inptr[DCTSIZE*2]
2128
+ lw t3, 48(a0) // inptr[DCTSIZE*3]
2129
+ lw t4, 64(a0) // inptr[DCTSIZE*4]
2130
+ lw t5, 80(a0) // inptr[DCTSIZE*5]
2131
+ muleq_s.w.phr t0, t0, s0 // ... tmp0 ...
2132
+ lw t6, 96(a0) // inptr[DCTSIZE*6]
2133
+ lw t7, 112(a0) // inptr[DCTSIZE*7]
2134
+ or s4, t1, t2 // s4 != 0 if row 1 or row 2 holds any nonzero term
2135
+ or s5, t3, t4 // s5 != 0 if row 3 or row 4 holds any nonzero term
2136
+ bnez s4, 1f
2137
+ ins t0, v0, 16, 16 // ... tmp0
2138
+ bnez s5, 1f
2139
+ or s6, t5, t6 // s6 = rows 5|6 ...
2140
+ or s6, s6, t7 // ... |7
2141
+ bnez s6, 1f
2142
+ sw t0, 0(a2) // wsptr[DCTSIZE*0]
2143
+ sw t0, 16(a2) // wsptr[DCTSIZE*1]
2144
+ sw t0, 32(a2) // wsptr[DCTSIZE*2]
2145
+ sw t0, 48(a2) // wsptr[DCTSIZE*3]
2146
+ sw t0, 64(a2) // wsptr[DCTSIZE*4]
2147
+ sw t0, 80(a2) // wsptr[DCTSIZE*5]
2148
+ sw t0, 96(a2) // wsptr[DCTSIZE*6]
2149
+ sw t0, 112(a2) // wsptr[DCTSIZE*7]
2150
+ addiu a0, a0, 4 // rows 1..7 were all zero: the dequantized row-0 value was copied to every row
2151
+ b 2f
2152
+ addiu a1, a1, 4
2153
+
2154
+ 1:
2155
+ lw s1, 32(a1) // quantptr[DCTSIZE*2]
2156
+ lw s2, 64(a1) // quantptr[DCTSIZE*4]
2157
+ muleq_s.w.phl v0, t2, s1 // tmp1 ...
2158
+ muleq_s.w.phr t2, t2, s1 // ... tmp1 ...
2159
+ lw s0, 16(a1) // quantptr[DCTSIZE*1]
2160
+ lw s1, 48(a1) // quantptr[DCTSIZE*3]
2161
+ lw s3, 96(a1) // quantptr[DCTSIZE*6]
2162
+ muleq_s.w.phl v1, t4, s2 // tmp2 ...
2163
+ muleq_s.w.phr t4, t4, s2 // ... tmp2 ...
2164
+ lw s2, 80(a1) // quantptr[DCTSIZE*5]
2165
+ lw t8, 4(AT) // FIX(1.414213562)
2166
+ ins t2, v0, 16, 16 // ... tmp1
2167
+ muleq_s.w.phl v0, t6, s3 // tmp3 ...
2168
+ muleq_s.w.phr t6, t6, s3 // ... tmp3 ...
2169
+ ins t4, v1, 16, 16 // ... tmp2
2170
+ addq.ph s4, t0, t4 // tmp10
2171
+ subq.ph s5, t0, t4 // tmp11
2172
+ ins t6, v0, 16, 16 // ... tmp3
2173
+ subq.ph s6, t2, t6 // tmp12 ...
2174
+ addq.ph s7, t2, t6 // tmp13
2175
+ mulq_s.ph s6, s6, t8 // ... tmp12 ...
2176
+ addq.ph t0, s4, s7 // tmp0
2177
+ subq.ph t6, s4, s7 // tmp3
2178
+ muleq_s.w.phl v0, t1, s0 // tmp4 ...
2179
+ muleq_s.w.phr t1, t1, s0 // ... tmp4 ...
2180
+ shll_s.ph s6, s6, 1 // x2
2181
+ lw s3, 112(a1) // quantptr[DCTSIZE*7]
2182
+ subq.ph s6, s6, s7 // ... tmp12
2183
+ muleq_s.w.phl v1, t7, s3 // tmp7 ...
2184
+ muleq_s.w.phr t7, t7, s3 // ... tmp7 ...
2185
+ ins t1, v0, 16, 16 // ... tmp4
2186
+ addq.ph t2, s5, s6 // tmp1
2187
+ subq.ph t4, s5, s6 // tmp2
2188
+ muleq_s.w.phl v0, t5, s2 // tmp6 ...
2189
+ muleq_s.w.phr t5, t5, s2 // ... tmp6 ...
2190
+ ins t7, v1, 16, 16 // ... tmp7
2191
+ addq.ph s5, t1, t7 // z11
2192
+ subq.ph s6, t1, t7 // z12
2193
+ muleq_s.w.phl v1, t3, s1 // tmp5 ...
2194
+ muleq_s.w.phr t3, t3, s1 // ... tmp5 ...
2195
+ ins t5, v0, 16, 16 // ... tmp6
2196
+ ins t3, v1, 16, 16 // ... tmp5
2197
+ addq.ph s7, t5, t3 // z13
2198
+ subq.ph v0, t5, t3 // z10
2199
+ addq.ph t7, s5, s7 // tmp7
2200
+ subq.ph s5, s5, s7 // tmp11 ...
2201
+ addq.ph v1, v0, s6 // z5 ...
2202
+ mulq_s.ph s5, s5, t8 // ... tmp11
2203
+ lw t8, 8(AT) // FIX(1.847759065)
2204
+ lw s4, 0(AT) // FIX(1.082392200)
2205
+ addq.ph s0, t0, t7 // tmp0 + tmp7
2206
+ subq.ph s1, t0, t7 // tmp0 - tmp7
2207
+ mulq_s.ph v1, v1, t8 // ... z5
2208
+ shll_s.ph s5, s5, 1 // x2
2209
+ lw t8, 12(AT) // FIX(-2.613125930)
2210
+ sw s0, 0(a2) // wsptr[DCTSIZE*0]
2211
+ shll_s.ph v0, v0, 1 // x4
2212
+ mulq_s.ph v0, v0, t8 // tmp12 ...
2213
+ mulq_s.ph s4, s6, s4 // tmp10 ...
2214
+ shll_s.ph v1, v1, 1 // x2
2215
+ addiu a0, a0, 4 // advance inptr by one word (two 16-bit columns)
2216
+ addiu a1, a1, 4 // advance quantptr by the same amount
2217
+ sw s1, 112(a2) // wsptr[DCTSIZE*7]
2218
+ shll_s.ph s6, v0, 1 // x4
2219
+ shll_s.ph s4, s4, 1 // x2
2220
+ addq.ph s6, s6, v1 // ... tmp12
2221
+ subq.ph t5, s6, t7 // tmp6
2222
+ subq.ph s4, s4, v1 // ... tmp10
2223
+ subq.ph t3, s5, t5 // tmp5
2224
+ addq.ph s2, t2, t5 // tmp1 + tmp6
2225
+ addq.ph t1, s4, t3 // tmp4
2226
+ subq.ph s3, t2, t5 // tmp1 - tmp6
2227
+ sw s2, 16(a2) // wsptr[DCTSIZE*1]
2228
+ sw s3, 96(a2) // wsptr[DCTSIZE*6]
2229
+ addq.ph v0, t4, t3 // tmp2 + tmp5
2230
+ subq.ph v1, t4, t3 // tmp2 - tmp5
2231
+ sw v0, 32(a2) // wsptr[DCTSIZE*2]
2232
+ sw v1, 80(a2) // wsptr[DCTSIZE*5]
2233
+ addq.ph v0, t6, t1 // tmp3 + tmp4
2234
+ subq.ph v1, t6, t1 // tmp3 - tmp4
2235
+ sw v0, 64(a2) // wsptr[DCTSIZE*4]
2236
+ sw v1, 48(a2) // wsptr[DCTSIZE*3]
2237
+
2238
+ 2:
2239
+ bne a0, t9, 0b // loop until inptr reaches the end address
2240
+ addiu a2, a2, 4 // advance wsptr by one word (runs with the branch)
2241
+
2242
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
2243
+
2244
+ j ra
2245
+ nop
2246
+
2247
+ END(jsimd_idct_ifast_cols_dspr2)
2248
+
2249
+
2250
+ /*****************************************************************************/
2251
+ LEAF_DSPR2(jsimd_idct_ifast_rows_dspr2)
2252
+ /* Row pass of the ifast IDCT: every loop iteration consumes two workspace rows (32 bytes) and writes two 8-byte output rows.
2253
+ * a0 = wsptr
2254
+ * a1 = output_buf
2255
+ * a2 = output_col
2256
+ * a3 = mips_idct_ifast_coefs
2257
+ */
2258
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2259
+
2260
+ addiu t9, a0, 128 // end address
2261
+ lui s8, 0x8080
2262
+ ori s8, s8, 0x8080 // s8 = 0x80808080, added to every output byte with addu.qb before the stores
2263
+
2264
+ 0:
2265
+ lw AT, 36(sp) // restore $a3 (mips_idct_ifast_coefs)
2266
+ lw t0, 0(a0) // wsptr[DCTSIZE*0+0/1] b a
2267
+ lw s0, 16(a0) // wsptr[DCTSIZE*1+0/1] B A
2268
+ lw t2, 4(a0) // wsptr[DCTSIZE*0+2/3] d c
2269
+ lw s2, 20(a0) // wsptr[DCTSIZE*1+2/3] D C
2270
+ lw t4, 8(a0) // wsptr[DCTSIZE*0+4/5] f e
2271
+ lw s4, 24(a0) // wsptr[DCTSIZE*1+4/5] F E
2272
+ lw t6, 12(a0) // wsptr[DCTSIZE*0+6/7] h g
2273
+ lw s6, 28(a0) // wsptr[DCTSIZE*1+6/7] H G
2274
+ precrq.ph.w t1, s0, t0 // B b
2275
+ ins t0, s0, 16, 16 // A a
2276
+ bnez t1, 1f // element 1 of either row is nonzero: take the full path
2277
+ or s0, t2, s2 // elements 2/3 of both rows
2278
+ bnez s0, 1f
2279
+ or s0, t4, s4 // elements 4/5 of both rows
2280
+ bnez s0, 1f
2281
+ or s0, t6, s6 // elements 6/7 of both rows
2282
+ bnez s0, 1f
2283
+ shll_s.ph s0, t0, 2 // A a
2284
+ lw a3, 0(a1) // fast path (only element 0 nonzero): a3 = first output row pointer
2285
+ lw AT, 4(a1) // AT = second output row pointer
2286
+ precrq.ph.w t0, s0, s0 // A A
2287
+ ins s0, s0, 16, 16 // a a
2288
+ addu a3, a3, a2 // + output_col
2289
+ addu AT, AT, a2 // + output_col
2290
+ precrq.qb.ph t0, t0, t0 // A A A A
2291
+ precrq.qb.ph s0, s0, s0 // a a a a
2292
+ addu.qb s0, s0, s8
2293
+ addu.qb t0, t0, s8
2294
+ sw s0, 0(a3)
2295
+ sw s0, 4(a3)
2296
+ sw t0, 0(AT)
2297
+ sw t0, 4(AT)
2298
+ addiu a0, a0, 32 // next pair of workspace rows
2299
+ bne a0, t9, 0b
2300
+ addiu a1, a1, 8 // next pair of output row pointers (runs with the branch)
2301
+ b 2f
2302
+ nop
2303
+
2304
+ 1:
2305
+ precrq.ph.w t3, s2, t2
2306
+ ins t2, s2, 16, 16
2307
+ precrq.ph.w t5, s4, t4
2308
+ ins t4, s4, 16, 16
2309
+ precrq.ph.w t7, s6, t6
2310
+ ins t6, s6, 16, 16
2311
+ lw t8, 4(AT) // FIX(1.414213562)
2312
+ addq.ph s4, t0, t4 // tmp10
2313
+ subq.ph s5, t0, t4 // tmp11
2314
+ subq.ph s6, t2, t6 // tmp12 ...
2315
+ addq.ph s7, t2, t6 // tmp13
2316
+ mulq_s.ph s6, s6, t8 // ... tmp12 ...
2317
+ addq.ph t0, s4, s7 // tmp0
2318
+ subq.ph t6, s4, s7 // tmp3
2319
+ shll_s.ph s6, s6, 1 // x2
2320
+ subq.ph s6, s6, s7 // ... tmp12
2321
+ addq.ph t2, s5, s6 // tmp1
2322
+ subq.ph t4, s5, s6 // tmp2
2323
+ addq.ph s5, t1, t7 // z11
2324
+ subq.ph s6, t1, t7 // z12
2325
+ addq.ph s7, t5, t3 // z13
2326
+ subq.ph v0, t5, t3 // z10
2327
+ addq.ph t7, s5, s7 // tmp7
2328
+ subq.ph s5, s5, s7 // tmp11 ...
2329
+ addq.ph v1, v0, s6 // z5 ...
2330
+ mulq_s.ph s5, s5, t8 // ... tmp11
2331
+ lw t8, 8(AT) // FIX(1.847759065)
2332
+ lw s4, 0(AT) // FIX(1.082392200)
2333
+ addq.ph s0, t0, t7 // tmp0 + tmp7
2334
+ subq.ph s7, t0, t7 // tmp0 - tmp7
2335
+ mulq_s.ph v1, v1, t8 // ... z5
2336
+ lw a3, 0(a1) // a3 = first output row pointer
2337
+ lw t8, 12(AT) // FIX(-2.613125930)
2338
+ shll_s.ph s5, s5, 1 // x2
2339
+ addu a3, a3, a2 // + output_col
2340
+ shll_s.ph v0, v0, 1 // x4
2341
+ mulq_s.ph v0, v0, t8 // tmp12 ...
2342
+ mulq_s.ph s4, s6, s4 // tmp10 ...
2343
+ shll_s.ph v1, v1, 1 // x2
2344
+ addiu a0, a0, 32 // next pair of workspace rows
2345
+ addiu a1, a1, 8 // next pair of output row pointers (second row is read via -4(a1) below)
2346
+ shll_s.ph s6, v0, 1 // x4
2347
+ shll_s.ph s4, s4, 1 // x2
2348
+ addq.ph s6, s6, v1 // ... tmp12
2349
+ shll_s.ph s0, s0, 2
2350
+ subq.ph t5, s6, t7 // tmp6
2351
+ subq.ph s4, s4, v1 // ... tmp10
2352
+ subq.ph t3, s5, t5 // tmp5
2353
+ shll_s.ph s7, s7, 2
2354
+ addq.ph t1, s4, t3 // tmp4
2355
+ addq.ph s1, t2, t5 // tmp1 + tmp6
2356
+ subq.ph s6, t2, t5 // tmp1 - tmp6
2357
+ addq.ph s2, t4, t3 // tmp2 + tmp5
2358
+ subq.ph s5, t4, t3 // tmp2 - tmp5
2359
+ addq.ph s4, t6, t1 // tmp3 + tmp4
2360
+ subq.ph s3, t6, t1 // tmp3 - tmp4
2361
+ shll_s.ph s1, s1, 2
2362
+ shll_s.ph s2, s2, 2
2363
+ shll_s.ph s3, s3, 2
2364
+ shll_s.ph s4, s4, 2
2365
+ shll_s.ph s5, s5, 2
2366
+ shll_s.ph s6, s6, 2
2367
+ precrq.ph.w t0, s1, s0 // B A
2368
+ ins s0, s1, 16, 16 // b a
2369
+ precrq.ph.w t2, s3, s2 // D C
2370
+ ins s2, s3, 16, 16 // d c
2371
+ precrq.ph.w t4, s5, s4 // F E
2372
+ ins s4, s5, 16, 16 // f e
2373
+ precrq.ph.w t6, s7, s6 // H G
2374
+ ins s6, s7, 16, 16 // h g
2375
+ precrq.qb.ph t0, t2, t0 // D C B A
2376
+ precrq.qb.ph s0, s2, s0 // d c b a
2377
+ precrq.qb.ph t4, t6, t4 // H G F E
2378
+ precrq.qb.ph s4, s6, s4 // h g f e
2379
+ addu.qb s0, s0, s8
2380
+ addu.qb s4, s4, s8
2381
+ sw s0, 0(a3) // outptr[0/1/2/3] d c b a
2382
+ sw s4, 4(a3) // outptr[4/5/6/7] h g f e
2383
+ lw a3, -4(a1) // a3 = second output row pointer (a1 was already advanced by 8)
2384
+ addu.qb t0, t0, s8
2385
+ addu a3, a3, a2 // + output_col
2386
+ addu.qb t4, t4, s8
2387
+ sw t0, 0(a3) // outptr[0/1/2/3] D C B A
2388
+ bne a0, t9, 0b
2389
+ sw t4, 4(a3) // outptr[4/5/6/7] H G F E
2390
+
2391
+ 2:
2392
+
2393
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8, a3
2394
+
2395
+ j ra
2396
+ nop
2397
+
2398
+ END(jsimd_idct_ifast_rows_dspr2)
2399
+
2400
+
2401
+ /*****************************************************************************/
2402
+ LEAF_DSPR2(jsimd_fdct_islow_dspr2)
2403
+ /* In-place islow forward DCT on the 64 16-bit values at a0: loop 1 walks 8 rows of 16 bytes, loop 2 walks 8 columns 2 bytes apart.
2404
+ * a0 = data
2405
+ */
2406
+ SAVE_REGS_ON_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2407
+
2408
+ lui t0, 6437 // t0..t9 each pack two 16-bit multipliers (high|low) for the dpa.w.ph dot products in loop 1
2409
+ ori t0, 2260
2410
+ lui t1, 9633
2411
+ ori t1, 11363
2412
+ lui t2, 0xd39e
2413
+ ori t2, 0xe6dc
2414
+ lui t3, 0xf72d
2415
+ ori t3, 9633
2416
+ lui t4, 2261
2417
+ ori t4, 9633
2418
+ lui t5, 0xd39e
2419
+ ori t5, 6437
2420
+ lui t6, 9633
2421
+ ori t6, 0xd39d
2422
+ lui t7, 0xe6dc
2423
+ ori t7, 2260
2424
+ lui t8, 4433
2425
+ ori t8, 10703
2426
+ lui t9, 0xd630
2427
+ ori t9, 4433
2428
+ li s8, 8 // row counter
2429
+ move a1, a0 // a1 = current row pointer; a0 is kept for loop 2
2430
+ 1:
2431
+ lw s0, 0(a1) // tmp0 = 1|0
2432
+ lw s1, 4(a1) // tmp1 = 3|2
2433
+ lw s2, 8(a1) // tmp2 = 5|4
2434
+ lw s3, 12(a1) // tmp3 = 7|6
2435
+ packrl.ph s1, s1, s1 // tmp1 = 2|3
2436
+ packrl.ph s3, s3, s3 // tmp3 = 6|7
2437
+ subq.ph s7, s1, s2 // tmp7 = 2-5|3-4 = t5|t4
2438
+ subq.ph s5, s0, s3 // tmp5 = 1-6|0-7 = t6|t7
2439
+ mult $0, $0 // ac0 = 0
2440
+ dpa.w.ph $ac0, s7, t0 // ac0 += t5* 6437 + t4* 2260
2441
+ dpa.w.ph $ac0, s5, t1 // ac0 += t6* 9633 + t7* 11363
2442
+ mult $ac1, $0, $0 // ac1 = 0
2443
+ dpa.w.ph $ac1, s7, t2 // ac1 += t5*-11362 + t4* -6436
2444
+ dpa.w.ph $ac1, s5, t3 // ac1 += t6* -2259 + t7* 9633
2445
+ mult $ac2, $0, $0 // ac2 = 0
2446
+ dpa.w.ph $ac2, s7, t4 // ac2 += t5* 2261 + t4* 9633
2447
+ dpa.w.ph $ac2, s5, t5 // ac2 += t6*-11362 + t7* 6437
2448
+ mult $ac3, $0, $0 // ac3 = 0
2449
+ dpa.w.ph $ac3, s7, t6 // ac3 += t5* 9633 + t4*-11363
2450
+ dpa.w.ph $ac3, s5, t7 // ac3 += t6* -6436 + t7* 2260
2451
+ addq.ph s6, s1, s2 // tmp6 = 2+5|3+4 = t2|t3
2452
+ addq.ph s4, s0, s3 // tmp4 = 1+6|0+7 = t1|t0
2453
+ extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2454
+ extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2455
+ extr_r.w s2, $ac2, 11 // tmp2 = (ac2 + 1024) >> 11
2456
+ extr_r.w s3, $ac3, 11 // tmp3 = (ac3 + 1024) >> 11
2457
+ addq.ph s5, s4, s6 // tmp5 = t1+t2|t0+t3 = t11|t10
2458
+ subq.ph s7, s4, s6 // tmp7 = t1-t2|t0-t3 = t12|t13
2459
+ sh s0, 2(a1) // odd outputs of this row: elements 1, 3, 5, 7
2460
+ sh s1, 6(a1)
2461
+ sh s2, 10(a1)
2462
+ sh s3, 14(a1)
2463
+ mult $0, $0 // ac0 = 0
2464
+ dpa.w.ph $ac0, s7, t8 // ac0 += t12* 4433 + t13* 10703
2465
+ mult $ac1, $0, $0 // ac1 = 0
2466
+ dpa.w.ph $ac1, s7, t9 // ac1 += t12*-10704 + t13* 4433
2467
+ sra s4, s5, 16 // tmp4 = t11
2468
+ addiu a1, a1, 16 // next row; the stores below use negative offsets to reach this row
2469
+ addiu s8, s8, -1
2470
+ extr_r.w s0, $ac0, 11 // tmp0 = (ac0 + 1024) >> 11
2471
+ extr_r.w s1, $ac1, 11 // tmp1 = (ac1 + 1024) >> 11
2472
+ addu s2, s5, s4 // tmp2 = t10 + t11
2473
+ subu s3, s5, s4 // tmp3 = t10 - t11
2474
+ sll s2, s2, 2 // tmp2 = (t10 + t11) << 2
2475
+ sll s3, s3, 2 // tmp3 = (t10 - t11) << 2
2476
+ sh s2, -16(a1) // even outputs of this row: elements 0, 4, 2, 6
2477
+ sh s3, -8(a1)
2478
+ sh s0, -12(a1)
2479
+ bgtz s8, 1b
2480
+ sh s1, -4(a1)
2481
+ li t0, 2260 // c0 (names c0..c10 match the comments in loop 2)
2482
+ li t1, 11363 // c1
2483
+ li t2, 9633 // c2
2484
+ li t3, 6436 // c3
2485
+ li t4, 6437 // c4
2486
+ li t5, 2261 // c5
2487
+ li t6, 11362 // c6
2488
+ li t7, 2259 // c7
2489
+ li t8, 4433 // c8
2490
+ li t9, 10703 // c9
2491
+ li a1, 10704 // c10
2492
+ li s8, 8 // column counter
2493
+
2494
+ 2:
2495
+ lh a2, 0(a0) // 0
2496
+ lh a3, 16(a0) // 8
2497
+ lh v0, 32(a0) // 16
2498
+ lh v1, 48(a0) // 24
2499
+ lh s4, 64(a0) // 32
2500
+ lh s5, 80(a0) // 40
2501
+ lh s6, 96(a0) // 48
2502
+ lh s7, 112(a0) // 56
2503
+ addu s2, v0, s5 // tmp2 = 16 + 40
2504
+ subu s5, v0, s5 // tmp5 = 16 - 40
2505
+ addu s3, v1, s4 // tmp3 = 24 + 32
2506
+ subu s4, v1, s4 // tmp4 = 24 - 32
2507
+ addu s0, a2, s7 // tmp0 = 0 + 56
2508
+ subu s7, a2, s7 // tmp7 = 0 - 56
2509
+ addu s1, a3, s6 // tmp1 = 8 + 48
2510
+ subu s6, a3, s6 // tmp6 = 8 - 48
2511
+ addu a2, s0, s3 // tmp10 = tmp0 + tmp3
2512
+ subu v1, s0, s3 // tmp13 = tmp0 - tmp3
2513
+ addu a3, s1, s2 // tmp11 = tmp1 + tmp2
2514
+ subu v0, s1, s2 // tmp12 = tmp1 - tmp2
2515
+ mult s7, t1 // ac0 = tmp7 * c1
2516
+ madd s4, t0 // ac0 += tmp4 * c0
2517
+ madd s5, t4 // ac0 += tmp5 * c4
2518
+ madd s6, t2 // ac0 += tmp6 * c2
2519
+ mult $ac1, s7, t2 // ac1 = tmp7 * c2
2520
+ msub $ac1, s4, t3 // ac1 -= tmp4 * c3
2521
+ msub $ac1, s5, t6 // ac1 -= tmp5 * c6
2522
+ msub $ac1, s6, t7 // ac1 -= tmp6 * c7
2523
+ mult $ac2, s7, t4 // ac2 = tmp7 * c4
2524
+ madd $ac2, s4, t2 // ac2 += tmp4 * c2
2525
+ madd $ac2, s5, t5 // ac2 += tmp5 * c5
2526
+ msub $ac2, s6, t6 // ac2 -= tmp6 * c6
2527
+ mult $ac3, s7, t0 // ac3 = tmp7 * c0
2528
+ msub $ac3, s4, t1 // ac3 -= tmp4 * c1
2529
+ madd $ac3, s5, t2 // ac3 += tmp5 * c2
2530
+ msub $ac3, s6, t3 // ac3 -= tmp6 * c3
2531
+ extr_r.w s0, $ac0, 15 // tmp0 = (ac0 + 16384) >> 15
2532
+ extr_r.w s1, $ac1, 15 // tmp1 = (ac1 + 16384) >> 15
2533
+ extr_r.w s2, $ac2, 15 // tmp2 = (ac2 + 16384) >> 15
2534
+ extr_r.w s3, $ac3, 15 // tmp3 = (ac3 + 16384) >> 15
2535
+ addiu s8, s8, -1
2536
+ addu s4, a2, a3 // tmp4 = tmp10 + tmp11
2537
+ subu s5, a2, a3 // tmp5 = tmp10 - tmp11
2538
+ sh s0, 16(a0) // odd outputs of this column: rows 1, 3, 5, 7
2539
+ sh s1, 48(a0)
2540
+ sh s2, 80(a0)
2541
+ sh s3, 112(a0)
2542
+ mult v0, t8 // ac0 = tmp12 * c8
2543
+ madd v1, t9 // ac0 += tmp13 * c9
2544
+ mult $ac1, v1, t8 // ac1 = tmp13 * c8
2545
+ msub $ac1, v0, a1 // ac1 -= tmp12 * c10
2546
+ addiu a0, a0, 2 // next column; the stores below are offset by -2 to reach this column
2547
+ extr_r.w s6, $ac0, 15 // tmp6 = (ac0 + 16384) >> 15
2548
+ extr_r.w s7, $ac1, 15 // tmp7 = (ac1 + 16384) >> 15
2549
+ shra_r.w s4, s4, 2 // tmp4 = (tmp4 + 2) >> 2
2550
+ shra_r.w s5, s5, 2 // tmp5 = (tmp5 + 2) >> 2
2551
+ sh s4, -2(a0) // even outputs of this column: rows 0, 4, 2, 6
2552
+ sh s5, 62(a0)
2553
+ sh s6, 30(a0)
2554
+ bgtz s8, 2b
2555
+ sh s7, 94(a0)
2556
+
2557
+ RESTORE_REGS_FROM_STACK 40, s0, s1, s2, s3, s4, s5, s6, s7, s8
2558
+
2559
+ jr ra
2560
+ nop
2561
+
2562
+ END(jsimd_fdct_islow_dspr2)
2563
+
2564
+
2565
+ /**************************************************************************/
2566
+ LEAF_DSPR2(jsimd_fdct_ifast_dspr2)
2567
+ /* In-place ifast forward DCT on the 64 16-bit values at a0: loop 0 walks 8 rows of 16 bytes, loop 1 walks 8 columns 2 bytes apart.
2568
+ * a0 = data
2569
+ */
2570
+ .set at
2571
+
2572
+ SAVE_REGS_ON_STACK 8, s0, s1
2573
+
2574
+ li a1, 0x014e014e // FIX_1_306562965 (334 << 16)|(334 & 0xffff)
2575
+ li a2, 0x008b008b // FIX_0_541196100 (139 << 16)|(139 & 0xffff)
2576
+ li a3, 0x00620062 // FIX_0_382683433 (98 << 16) |(98 & 0xffff)
2577
+ li s1, 0x00b500b5 // FIX_0_707106781 (181 << 16)|(181 & 0xffff)
2578
+
2579
+ move v0, a0 // v0 = current row pointer
2580
+ addiu v1, v0, 128 // end address
2581
+
2582
+ 0:
2583
+ lw t0, 0(v0) // tmp0 = 1|0
2584
+ lw t1, 4(v0) // tmp1 = 3|2
2585
+ lw t2, 8(v0) // tmp2 = 5|4
2586
+ lw t3, 12(v0) // tmp3 = 7|6
2587
+ packrl.ph t1, t1, t1 // tmp1 = 2|3
2588
+ packrl.ph t3, t3, t3 // tmp3 = 6|7
2589
+ subq.ph t7, t1, t2 // tmp7 = 2-5|3-4 = t5|t4
2590
+ subq.ph t5, t0, t3 // tmp5 = 1-6|0-7 = t6|t7
2591
+ addq.ph t6, t1, t2 // tmp6 = 2+5|3+4 = t2|t3
2592
+ addq.ph t4, t0, t3 // tmp4 = 1+6|0+7 = t1|t0
2593
+ addq.ph t8, t4, t6 // t8 = t1+t2|t0+t3 = t11|t10
2594
+ subq.ph t9, t4, t6 // t9 = t1-t2|t0-t3 = t12|t13
2595
+ sra t4, t8, 16 // tmp4 = t11
2596
+ mult $0, $0 // ac0 = 0
2597
+ dpa.w.ph $ac0, t9, s1 // ac0 += t12*181 + t13*181
2598
+ mult $ac1, $0, $0 // ac1 = 0
2599
+ dpa.w.ph $ac1, t7, a3 // ac1 += t4*98 + t5*98
2600
+ dpsx.w.ph $ac1, t5, a3 // ac1 -= t6*98 + t7*98
2601
+ mult $ac2, $0, $0 // ac2 = 0
2602
+ dpa.w.ph $ac2, t7, a2 // ac2 += t4*139 + t5*139
2603
+ mult $ac3, $0, $0 // ac3 = 0
2604
+ dpa.w.ph $ac3, t5, a1 // ac3 += t6*334 + t7*334
2605
+ precrq.ph.w t0, t5, t7 // t0 = t6|t5
2606
+ addq.ph t2, t8, t4 // tmp2 = t10 + t11
2607
+ subq.ph t3, t8, t4 // tmp3 = t10 - t11
2608
+ extr.w t4, $ac0, 8 // t4 = z1 = ac0 >> 8
2609
+ mult $0, $0 // ac0 = 0
2610
+ dpa.w.ph $ac0, t0, s1 // ac0 += t5*181 + t6*181
2611
+ extr.w t0, $ac1, 8 // t0 = z5
2612
+ extr.w t1, $ac2, 8 // t1 = MULTIPLY(tmp10, 139)
2613
+ extr.w t7, $ac3, 8 // t7 = MULTIPLY(tmp12, 334)
2614
+ extr.w t8, $ac0, 8 // t8 = z3 = MULTIPLY(tmp11, 181)
2615
+ add t6, t1, t0 // t6 = z2
2616
+ add t7, t7, t0 // t7 = z4
2617
+ subq.ph t0, t5, t8 // t0 = z13 = tmp7 - z3
2618
+ addq.ph t8, t5, t8 // t8 = z11 = tmp7 + z3
2619
+ addq.ph t1, t0, t6 // t1 = z13 + z2
2620
+ subq.ph t6, t0, t6 // t6 = z13 - z2
2621
+ addq.ph t0, t8, t7 // t0 = z11 + z4
2622
+ subq.ph t7, t8, t7 // t7 = z11 - z4
2623
+ addq.ph t5, t4, t9 // t5 = tmp13 + z1 (low halfword is what gets stored)
2624
+ subq.ph t4, t9, t4 // t4 = tmp13 - z1 (low halfword is what gets stored)
2625
+ sh t2, 0(v0)
2626
+ sh t5, 4(v0)
2627
+ sh t3, 8(v0)
2628
+ sh t4, 12(v0)
2629
+ sh t1, 10(v0)
2630
+ sh t6, 6(v0)
2631
+ sh t0, 2(v0)
2632
+ sh t7, 14(v0)
2633
+ addiu v0, 16 // next row
2634
+ bne v1, v0, 0b
2635
+ nop
2636
+ move v0, a0 // v0 = current column pointer
2637
+ addiu v1, v0, 16 // end address for the column loop (8 columns of 2 bytes)
2638
+
2639
+ 1:
2640
+ lh t0, 0(v0) // 0
2641
+ lh t1, 16(v0) // 8
2642
+ lh t2, 32(v0) // 16
2643
+ lh t3, 48(v0) // 24
2644
+ lh t4, 64(v0) // 32
2645
+ lh t5, 80(v0) // 40
2646
+ lh t6, 96(v0) // 48
2647
+ lh t7, 112(v0) // 56
2648
+ add t8, t0, t7 // t8 = tmp0
2649
+ sub t7, t0, t7 // t7 = tmp7
2650
+ add t0, t1, t6 // t0 = tmp1
2651
+ sub t1, t1, t6 // t1 = tmp6
2652
+ add t6, t2, t5 // t6 = tmp2
2653
+ sub t5, t2, t5 // t5 = tmp5
2654
+ add t2, t3, t4 // t2 = tmp3
2655
+ sub t3, t3, t4 // t3 = tmp4
2656
+ add t4, t8, t2 // t4 = tmp10 = tmp0 + tmp3
2657
+ sub t8, t8, t2 // t8 = tmp13 = tmp0 - tmp3
2658
+ sub s0, t0, t6 // s0 = tmp12 = tmp1 - tmp2
2659
+ ins t8, s0, 16, 16 // t8 = tmp12|tmp13
2660
+ add t2, t0, t6 // t2 = tmp11 = tmp1 + tmp2
2661
+ mult $0, $0 // ac0 = 0
2662
+ dpa.w.ph $ac0, t8, s1 // ac0 += t12*181 + t13*181
2663
+ add s0, t4, t2 // s0 = tmp10+tmp11
2664
+ sub t4, t4, t2 // t4 = tmp10-tmp11
2665
+ sh s0, 0(v0)
2666
+ sh t4, 64(v0)
2667
+ extr.w t2, $ac0, 8 // z1 = MULTIPLY(tmp12+tmp13, FIX_0_707106781)
2668
+ addq.ph t4, t8, t2 // t4 = tmp13 + z1
2669
+ subq.ph t8, t8, t2 // t8 = tmp13 - z1
2670
+ sh t4, 32(v0)
2671
+ sh t8, 96(v0)
2672
+ add t3, t3, t5 // t3 = tmp10 = tmp4 + tmp5
2673
+ add t0, t5, t1 // t0 = tmp11 = tmp5 + tmp6
2674
+ add t1, t1, t7 // t1 = tmp12 = tmp6 + tmp7
2675
+ andi t4, a1, 0xffff // t4 = 334 (low halfword of the packed constant)
2676
+ mul s0, t1, t4
2677
+ sra s0, s0, 8 // s0 = z4 = MULTIPLY(tmp12, FIX_1_306562965)
2678
+ ins t1, t3, 16, 16 // t1 = tmp10|tmp12
2679
+ mult $0, $0 // ac0 = 0
2680
+ mulsa.w.ph $ac0, t1, a3 // ac0 += t10*98 - t12*98
2681
+ extr.w t8, $ac0, 8 // z5 = MULTIPLY(tmp10-tmp12, FIX_0_382683433)
2682
+ add t2, t7, t8 // t2 = tmp7 + z5
2683
+ sub t7, t7, t8 // t7 = tmp7 - z5
2684
+ andi t4, a2, 0xffff // t4 = 139 (low halfword of the packed constant)
2685
+ mul t8, t3, t4
2686
+ sra t8, t8, 8 // t8 = z2 = MULTIPLY(tmp10, FIX_0_541196100)
2687
+ andi t4, s1, 0xffff // t4 = 181 (low halfword of the packed constant)
2688
+ mul t6, t0, t4
2689
+ sra t6, t6, 8 // t6 = z3 = MULTIPLY(tmp11, FIX_0_707106781)
2690
+ add t0, t6, t8 // t0 = z3 + z2
2691
+ sub t1, t6, t8 // t1 = z3 - z2
2692
+ add t3, t6, s0 // t3 = z3 + z4
2693
+ sub t4, t6, s0 // t4 = z3 - z4
2694
+ sub t5, t2, t1 // t5 = dataptr[5]
2695
+ sub t6, t7, t0 // t6 = dataptr[3]
2696
+ add t3, t2, t3 // t3 = dataptr[1]
2697
+ add t4, t7, t4 // t4 = dataptr[7]
2698
+ sh t5, 80(v0)
2699
+ sh t6, 48(v0)
2700
+ sh t3, 16(v0)
2701
+ sh t4, 112(v0)
2702
+ addiu v0, 2 // next column
2703
+ bne v0, v1, 1b
2704
+ nop
2705
+
2706
+ RESTORE_REGS_FROM_STACK 8, s0, s1
2707
+
2708
+ j ra
2709
+ nop
2710
+ END(jsimd_fdct_ifast_dspr2)
2711
+
2712
+
2713
+ /*****************************************************************************/
2714
+ LEAF_DSPR2(jsimd_quantize_dspr2)
2715
+ /* Integer quantization of 64 16-bit workspace values into coef_block, two values per loop iteration (31 iterations plus one peeled tail pair).
2716
+ * a0 = coef_block
2717
+ * a1 = divisors
2718
+ * a2 = workspace
2719
+ * Per value x: out = sign(x) * ((((|x| + d128) & 0xffff) * (d0 & 0xffff)) >> (d384 + 16)), where d0, d128 and d384 are the halfwords at byte offsets 0, 128 and 384 from the current divisors position (presumably reciprocal, correction and shift tables -- confirm against the code that builds the divisor table).
2720
+ */
2721
+ .set at
2722
+
2723
+ SAVE_REGS_ON_STACK 16, s0, s1, s2
2724
+
2725
+ addiu v0, a2, 124 // v0 = workspace_end
2726
+ lh t0, 0(a2) // t0 = first value of the pair
2727
+ lh t1, 0(a1) // t1 = d0 for the first value
2728
+ lh t2, 128(a1) // t2 = d128 for the first value
2729
+ sra t3, t0, 15 // t3 = -1 if t0 < 0, else 0
2730
+ sll t3, t3, 1
2731
+ addiu t3, t3, 1 // t3 = sign: -1 or +1
2732
+ mul t0, t0, t3 // t0 = |t0|
2733
+ lh t4, 384(a1) // t4 = d384 for the first value
2734
+ lh t5, 130(a1) // t5 = d128 for the second value
2735
+ lh t6, 2(a2) // t6 = second value of the pair
2736
+ lh t7, 2(a1) // t7 = d0 for the second value
2737
+ lh t8, 386(a1) // t8 = d384 for the second value
2738
+
2739
+ 1:
2740
+ andi t1, 0xffff
2741
+ add t9, t0, t2
2742
+ andi t9, 0xffff
2743
+ mul v1, t9, t1
2744
+ sra s0, t6, 15
2745
+ sll s0, s0, 1
2746
+ addiu s0, s0, 1 // s0 = sign of the second value
2747
+ addiu t9, t4, 16
2748
+ srav v1, v1, t9
2749
+ mul v1, v1, t3 // re-apply the sign of the first value
2750
+ mul t6, t6, s0 // t6 = |second value|
2751
+ andi t7, 0xffff
2752
+ addiu a2, a2, 4 // advance workspace and divisors to the next pair
2753
+ addiu a1, a1, 4
2754
+ add s1, t6, t5
2755
+ andi s1, 0xffff
2756
+ sh v1, 0(a0)
2757
+
2758
+ mul s2, s1, t7
2759
+ addiu s1, t8, 16
2760
+ srav s2, s2, s1
2761
+ mul s2, s2, s0 // re-apply the sign of the second value
2762
+ lh t0, 0(a2) // preload the next pair: t0/t3 become |x| and sign for the next iteration
2763
+ lh t1, 0(a1)
2764
+ sra t3, t0, 15
2765
+ sll t3, t3, 1
2766
+ addiu t3, t3, 1
2767
+ mul t0, t0, t3
2768
+ lh t2, 128(a1)
2769
+ lh t4, 384(a1)
2770
+ lh t5, 130(a1)
2771
+ lh t8, 386(a1)
2772
+ lh t6, 2(a2)
2773
+ lh t7, 2(a1)
2774
+ sh s2, 2(a0) // t0/t3 from the preload above are still live here; the former second, identical reload and recompute was redundant and has been dropped
2778
+ bne a2, v0, 1b
2779
+ addiu a0, a0, 4 // advance coef_block (runs with the branch)
2780
+
2781
+ andi t1, 0xffff
2782
+ add t9, t0, t2
2783
+ andi t9, 0xffff
2784
+ mul v1, t9, t1
2785
+ sra s0, t6, 15
2786
+ sll s0, s0, 1
2787
+ addiu s0, s0, 1
2788
+ addiu t9, t4, 16
2789
+ srav v1, v1, t9
2790
+ mul v1, v1, t3
2791
+ mul t6, t6, s0
2792
+ andi t7, 0xffff
2793
+ sh v1, 0(a0)
2794
+ add s1, t6, t5
2795
+ andi s1, 0xffff
2796
+ mul s2, s1, t7
2797
+ addiu s1, t8, 16
2798
+ addiu a2, a2, 4
2799
+ addiu a1, a1, 4
2800
+ srav s2, s2, s1
2801
+ mul s2, s2, s0
2802
+ sh s2, 2(a0)
2803
+
2804
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2
2805
+
2806
+ j ra
2807
+ nop
2808
+
2809
+ END(jsimd_quantize_dspr2)
2811
+
2812
+
2813
+ #ifndef __mips_soft_float
2814
+
2815
+ /*****************************************************************************/
2816
+ LEAF_DSPR2(jsimd_quantize_float_dspr2)
2817
+ /* Float quantization of 64 values, 8 per loop iteration: out[i] = trunc(workspace[i] * divisors[i] + 16384.5) - 16384, stored as a halfword.
2818
+ * a0 = coef_block
2819
+ * a1 = divisors
2820
+ * a2 = workspace
2821
+ */
2822
+ .set at
2823
+
2824
+ li t1, 0x46800100 // integer representation 16384.5
2825
+ mtc1 t1, f0 // f0 = 16384.5; the bias keeps the sum positive so truncation acts as rounding
2826
+ li t0, 63 // loop counter, decremented by 8 per iteration; loop runs while it is >= 0
2827
+ 0:
2828
+ lwc1 f2, 0(a2)
2829
+ lwc1 f10, 0(a1)
2830
+ lwc1 f4, 4(a2)
2831
+ lwc1 f12, 4(a1)
2832
+ lwc1 f6, 8(a2)
2833
+ lwc1 f14, 8(a1)
2834
+ lwc1 f8, 12(a2)
2835
+ lwc1 f16, 12(a1)
2836
+ madd.s f2, f0, f2, f10 // f2 = workspace[0] * divisors[0] + 16384.5
2837
+ madd.s f4, f0, f4, f12
2838
+ madd.s f6, f0, f6, f14
2839
+ madd.s f8, f0, f8, f16
2840
+ lwc1 f10, 16(a1) // start loading divisors for the second group of four
2841
+ lwc1 f12, 20(a1)
2842
+ trunc.w.s f2, f2
2843
+ trunc.w.s f4, f4
2844
+ trunc.w.s f6, f6
2845
+ trunc.w.s f8, f8
2846
+ lwc1 f14, 24(a1)
2847
+ lwc1 f16, 28(a1)
2848
+ mfc1 t1, f2 // move the truncated integers of the first group to t1..t4
2849
+ mfc1 t2, f4
2850
+ mfc1 t3, f6
2851
+ mfc1 t4, f8
2852
+ lwc1 f2, 16(a2)
2853
+ lwc1 f4, 20(a2)
2854
+ lwc1 f6, 24(a2)
2855
+ lwc1 f8, 28(a2)
2856
+ madd.s f2, f0, f2, f10
2857
+ madd.s f4, f0, f4, f12
2858
+ madd.s f6, f0, f6, f14
2859
+ madd.s f8, f0, f8, f16
2860
+ addiu t1, t1, -16384 // remove the integer part of the bias
2861
+ addiu t2, t2, -16384
2862
+ addiu t3, t3, -16384
2863
+ addiu t4, t4, -16384
2864
+ trunc.w.s f2, f2
2865
+ trunc.w.s f4, f4
2866
+ trunc.w.s f6, f6
2867
+ trunc.w.s f8, f8
2868
+ sh t1, 0(a0)
2869
+ sh t2, 2(a0)
2870
+ sh t3, 4(a0)
2871
+ sh t4, 6(a0)
2872
+ mfc1 t1, f2
2873
+ mfc1 t2, f4
2874
+ mfc1 t3, f6
2875
+ mfc1 t4, f8
2876
+ addiu t0, t0, -8
2877
+ addiu a2, a2, 32 // advance workspace and divisors by 8 words
2878
+ addiu a1, a1, 32
2879
+ addiu t1, t1, -16384
2880
+ addiu t2, t2, -16384
2881
+ addiu t3, t3, -16384
2882
+ addiu t4, t4, -16384
2883
+ sh t1, 8(a0)
2884
+ sh t2, 10(a0)
2885
+ sh t3, 12(a0)
2886
+ sh t4, 14(a0)
2887
+ bgez t0, 0b
2888
+ addiu a0, a0, 16 // advance coef_block by 8 halfwords (runs with the branch)
2889
+
2890
+ j ra
2891
+ nop
2892
+
2893
+ END(jsimd_quantize_float_dspr2)
2894
+
2895
+ #endif
2896
+
2897
+
2898
+ /*****************************************************************************/
2899
+ LEAF_DSPR2(jsimd_idct_2x2_dspr2)
2900
+ /* Reduced-size IDCT producing 2x2 output bytes: a column pass over columns 0, 1, 3, 5, 7 into a 40-byte stack scratch area, then a row pass writing two bytes to each of two output rows.
2901
+ * a0 = compptr->dct_table
2902
+ * a1 = coef_block
2903
+ * a2 = output_buf
2904
+ * a3 = output_col
2905
+ */
2906
+ .set at
2907
+
2908
+ SAVE_REGS_ON_STACK 24, s0, s1, s2, s3, s4, s5
2909
+
2910
+ addiu sp, sp, -40 // reserve 10 words of scratch: words 0..4 = row 0 terms, words 5..9 = row 1 terms
2911
+ move v0, sp // v0 = scratch base
2912
+ addiu s2, zero, 29692 // s2..s5: the same four multipliers as s0/s1 below, unpacked for the row pass
2913
+ addiu s3, zero, -10426
2914
+ addiu s4, zero, 6967
2915
+ addiu s5, zero, -5906
2916
+ lh t0, 0(a1) // t0 = inptr[DCTSIZE*0]
2917
+ lh t5, 0(a0) // t5 = quantptr[DCTSIZE*0]
2918
+ lh t1, 48(a1) // t1 = inptr[DCTSIZE*3]
2919
+ lh t6, 48(a0) // t6 = quantptr[DCTSIZE*3]
2920
+ mul t4, t5, t0
2921
+ lh t0, 16(a1) // t0 = inptr[DCTSIZE*1]
2922
+ lh t5, 16(a0) // t5 = quantptr[DCTSIZE*1]
2923
+ mul t6, t6, t1
2924
+ mul t5, t5, t0
2925
+ lh t2, 80(a1) // t2 = inptr[DCTSIZE*5]
2926
+ lh t7, 80(a0) // t7 = quantptr[DCTSIZE*5]
2927
+ lh t3, 112(a1) // t3 = inptr[DCTSIZE*7]
2928
+ lh t8, 112(a0) // t8 = quantptr[DCTSIZE*7]
2929
+ mul t7, t7, t2
2930
+ mult zero, zero // ac0 = 0
2931
+ mul t8, t8, t3
2932
+ li s0, 0x73FCD746 // s0 = (29692 << 16) | (-10426 & 0xffff)
2933
+ li s1, 0x1B37E8EE // s1 = (6967 << 16) | (-5906 & 0xffff)
2934
+ ins t6, t5, 16, 16 // t6 = t5|t6
2935
+ sll t4, t4, 15 // row-0 term << 15
2936
+ dpa.w.ph $ac0, t6, s0
2937
+ lh t1, 2(a1) // start loading column 1 (byte offset 2)
2938
+ lh t6, 2(a0)
2939
+ ins t8, t7, 16, 16 // t8 = t7|t8
2940
+ dpa.w.ph $ac0, t8, s1
2941
+ mflo t0, $ac0 // t0 = weighted sum of the row 1, 3, 5, 7 terms
2942
+ mul t5, t6, t1
2943
+ lh t1, 18(a1)
2944
+ lh t6, 18(a0)
2945
+ lh t2, 50(a1)
2946
+ lh t7, 50(a0)
2947
+ mul t6, t6, t1
2948
+ subu t8, t4, t0
2949
+ mul t7, t7, t2
2950
+ addu t0, t4, t0
2951
+ shra_r.w t0, t0, 13 // rounding shift right by 13
2952
+ lh t1, 82(a1)
2953
+ lh t2, 82(a0)
2954
+ lh t3, 114(a1)
2955
+ lh t4, 114(a0)
2956
+ shra_r.w t8, t8, 13
2957
+ mul t1, t1, t2
2958
+ mul t3, t3, t4
2959
+ sw t0, 0(v0) // column 0: sum -> scratch word 0
2960
+ sw t8, 20(v0) // column 0: difference -> scratch word 5
2961
+ sll t4, t5, 15
2962
+ ins t7, t6, 16, 16
2963
+ mult zero, zero // ac0 = 0
2964
+ dpa.w.ph $ac0, t7, s0
2965
+ ins t3, t1, 16, 16
2966
+ lh t1, 6(a1) // start loading column 3 (byte offset 6)
2967
+ lh t6, 6(a0)
2968
+ dpa.w.ph $ac0, t3, s1
2969
+ mflo t0, $ac0
2970
+ mul t5, t6, t1
2971
+ lh t1, 22(a1)
2972
+ lh t6, 22(a0)
2973
+ lh t2, 54(a1)
2974
+ lh t7, 54(a0)
2975
+ mul t6, t6, t1
2976
+ subu t8, t4, t0
2977
+ mul t7, t7, t2
2978
+ addu t0, t4, t0
2979
+ shra_r.w t0, t0, 13
2980
+ lh t1, 86(a1)
2981
+ lh t2, 86(a0)
2982
+ lh t3, 118(a1)
2983
+ lh t4, 118(a0)
2984
+ shra_r.w t8, t8, 13
2985
+ mul t1, t1, t2
2986
+ mul t3, t3, t4
2987
+ sw t0, 4(v0) // column 1: sum -> scratch word 1
2988
+ sw t8, 24(v0) // column 1: difference -> scratch word 6
2989
+ sll t4, t5, 15
2990
+ ins t7, t6, 16, 16
2991
+ mult zero, zero // ac0 = 0
2992
+ dpa.w.ph $ac0, t7, s0
2993
+ ins t3, t1, 16, 16
2994
+ lh t1, 10(a1) // start loading column 5 (byte offset 10)
2995
+ lh t6, 10(a0)
2996
+ dpa.w.ph $ac0, t3, s1
2997
+ mflo t0, $ac0
2998
+ mul t5, t6, t1
2999
+ lh t1, 26(a1)
3000
+ lh t6, 26(a0)
3001
+ lh t2, 58(a1)
3002
+ lh t7, 58(a0)
3003
+ mul t6, t6, t1
3004
+ subu t8, t4, t0
3005
+ mul t7, t7, t2
3006
+ addu t0, t4, t0
3007
+ shra_r.w t0, t0, 13
3008
+ lh t1, 90(a1)
3009
+ lh t2, 90(a0)
3010
+ lh t3, 122(a1)
3011
+ lh t4, 122(a0)
3012
+ shra_r.w t8, t8, 13
3013
+ mul t1, t1, t2
3014
+ mul t3, t3, t4
3015
+ sw t0, 8(v0) // column 3: sum -> scratch word 2
3016
+ sw t8, 28(v0) // column 3: difference -> scratch word 7
3017
+ sll t4, t5, 15
3018
+ ins t7, t6, 16, 16
3019
+ mult zero, zero // ac0 = 0
3020
+ dpa.w.ph $ac0, t7, s0
3021
+ ins t3, t1, 16, 16
3022
+ lh t1, 14(a1) // start loading column 7 (byte offset 14)
3023
+ lh t6, 14(a0)
3024
+ dpa.w.ph $ac0, t3, s1
3025
+ mflo t0, $ac0
3026
+ mul t5, t6, t1
3027
+ lh t1, 30(a1)
3028
+ lh t6, 30(a0)
3029
+ lh t2, 62(a1)
3030
+ lh t7, 62(a0)
3031
+ mul t6, t6, t1
3032
+ subu t8, t4, t0
3033
+ mul t7, t7, t2
3034
+ addu t0, t4, t0
3035
+ shra_r.w t0, t0, 13
3036
+ lh t1, 94(a1)
3037
+ lh t2, 94(a0)
3038
+ lh t3, 126(a1)
3039
+ lh t4, 126(a0)
3040
+ shra_r.w t8, t8, 13
3041
+ mul t1, t1, t2
3042
+ mul t3, t3, t4
3043
+ sw t0, 12(v0) // column 5: sum -> scratch word 3
3044
+ sw t8, 32(v0) // column 5: difference -> scratch word 8
3045
+ sll t4, t5, 15
3046
+ ins t7, t6, 16, 16
3047
+ mult zero, zero // ac0 = 0
3048
+ dpa.w.ph $ac0, t7, s0
3049
+ ins t3, t1, 16, 16
3050
+ dpa.w.ph $ac0, t3, s1
3051
+ mflo t0, $ac0
3052
+ lw t9, 0(a2) // t9 = output_buf[0]
3053
+ lw t3, 0(v0) // begin the row pass: reload scratch words 0..2 while column 7 finishes
3054
+ lw t7, 4(v0)
3055
+ lw t1, 8(v0)
3056
+ addu t9, t9, a3 // + output_col
3057
+ sll t3, t3, 15
3058
+ subu t8, t4, t0
3059
+ addu t0, t4, t0
3060
+ shra_r.w t0, t0, 13
3061
+ shra_r.w t8, t8, 13
3062
+ sw t0, 16(v0) // column 7: sum -> scratch word 4
3063
+ sw t8, 36(v0) // column 7: difference -> scratch word 9
3064
+ lw t5, 12(v0)
3065
+ lw t6, 16(v0)
3066
+ mult t7, s2 // ac0 = weighted sum of scratch words 1..4
3067
+ madd t1, s3
3068
+ madd t5, s4
3069
+ madd t6, s5
3070
+ lw t5, 24(v0)
3071
+ lw t7, 28(v0)
3072
+ mflo t0, $ac0
3073
+ lw t8, 32(v0)
3074
+ lw t2, 36(v0)
3075
+ mult $ac1, t5, s2 // ac1 = weighted sum of scratch words 6..9
3076
+ madd $ac1, t7, s3
3077
+ madd $ac1, t8, s4
3078
+ madd $ac1, t2, s5
3079
+ addu t1, t3, t0
3080
+ subu t6, t3, t0
3081
+ shra_r.w t1, t1, 20
3082
+ shra_r.w t6, t6, 20
3083
+ mflo t4, $ac1
3084
+ shll_s.w t1, t1, 24 // saturating shift left then sra 24: limits the value to the signed 8-bit range
3085
+ shll_s.w t6, t6, 24
3086
+ sra t1, t1, 24
3087
+ sra t6, t6, 24
3088
+ addiu t1, t1, 128 // bias by +128 before the byte store
3089
+ addiu t6, t6, 128
3090
+ lw t0, 20(v0)
3091
+ sb t1, 0(t9) // first output row, two bytes
3092
+ sb t6, 1(t9)
3093
+ sll t0, t0, 15
3094
+ lw t9, 4(a2) // t9 = output_buf[1]
3095
+ addu t1, t0, t4
3096
+ subu t6, t0, t4
3097
+ addu t9, t9, a3 // + output_col
3098
+ shra_r.w t1, t1, 20
3099
+ shra_r.w t6, t6, 20
3100
+ shll_s.w t1, t1, 24
3101
+ shll_s.w t6, t6, 24
3102
+ sra t1, t1, 24
3103
+ sra t6, t6, 24
3104
+ addiu t1, t1, 128
3105
+ addiu t6, t6, 128
3106
+ sb t1, 0(t9) // second output row, two bytes
3107
+ sb t6, 1(t9)
3108
+ addiu sp, sp, 40 // release the scratch area
3109
+
3110
+ RESTORE_REGS_FROM_STACK 24, s0, s1, s2, s3, s4, s5
3111
+
3112
+ j ra
3113
+ nop
3114
+
3115
+ END(jsimd_idct_2x2_dspr2)
3116
+
3117
+
3118
+ /*****************************************************************************/
3119
+ LEAF_DSPR2(jsimd_idct_4x4_dspr2)
3120
+ /*
3121
+ * a0 = compptr->dct_table
3122
+ * a1 = coef_block
3123
+ * a2 = output_buf
3124
+ * a3 = output_col
3125
+ * 16(sp) = workspace[DCTSIZE*4]; // buffers data between passes
3126
+ */
3127
+ .set at
3128
+
3129
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3130
+
3131
+ lw v1, 48(sp)
3132
+ move t0, a1
3133
+ move t1, v1
3134
+ li t9, 4
3135
+ li s0, 0x2e75f93e
3136
+ li s1, 0x21f9ba79
3137
+ li s2, 0xecc2efb0
3138
+ li s3, 0x52031ccd
3139
+
3140
+ 0:
3141
+ lh s6, 32(t0) // inptr[DCTSIZE*2]
3142
+ lh t6, 32(a0) // quantptr[DCTSIZE*2]
3143
+ lh s7, 96(t0) // inptr[DCTSIZE*6]
3144
+ lh t7, 96(a0) // quantptr[DCTSIZE*6]
3145
+ mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3146
+ lh s4, 0(t0) // inptr[DCTSIZE*0]
3147
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3148
+ lh s5, 0(a0) // quantptr[0]
3149
+ li s6, 15137
3150
+ li s7, 6270
3151
+ mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3152
+ mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3153
+ lh t5, 112(t0) // inptr[DCTSIZE*7]
3154
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3155
+ lh s4, 112(a0) // quantptr[DCTSIZE*7]
3156
+ lh v0, 80(t0) // inptr[DCTSIZE*5]
3157
+ lh s5, 80(a0) // quantptr[DCTSIZE*5]
3158
+ lh s6, 48(a0) // quantptr[DCTSIZE*3]
3159
+ sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3160
+ lh s7, 16(a0) // quantptr[DCTSIZE*1]
3161
+ lh t8, 16(t0) // inptr[DCTSIZE*1]
3162
+ subu t6, t6, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3163
+ lh t7, 48(t0) // inptr[DCTSIZE*3]
3164
+ mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3165
+ mul v0, s5, v0 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3166
+ mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3167
+ mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3168
+ addu t3, t2, t6 // tmp10 = tmp0 + z2
3169
+ subu t4, t2, t6 // tmp10 = tmp0 - z2
3170
+ mult $ac0, zero, zero
3171
+ mult $ac1, zero, zero
3172
+ ins t5, v0, 16, 16
3173
+ ins t7, t8, 16, 16
3174
+ addiu t9, t9, -1
3175
+ dpa.w.ph $ac0, t5, s0
3176
+ dpa.w.ph $ac0, t7, s1
3177
+ dpa.w.ph $ac1, t5, s2
3178
+ dpa.w.ph $ac1, t7, s3
3179
+ mflo s4, $ac0
3180
+ mflo s5, $ac1
3181
+ addiu a0, a0, 2
3182
+ addiu t1, t1, 4
3183
+ addiu t0, t0, 2
3184
+ addu t6, t4, s4
3185
+ subu t5, t4, s4
3186
+ addu s6, t3, s5
3187
+ subu s7, t3, s5
3188
+ shra_r.w t6, t6, 12 // DESCALE(tmp12 + temp1, 12)
3189
+ shra_r.w t5, t5, 12 // DESCALE(tmp12 - temp1, 12)
3190
+ shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3191
+ shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3192
+ sw t6, 28(t1)
3193
+ sw t5, 60(t1)
3194
+ sw s6, -4(t1)
3195
+ bgtz t9, 0b
3196
+ sw s7, 92(t1)
3197
+ // second loop three pass
3198
+ li t9, 3
3199
+ 1:
3200
+ lh s6, 34(t0) // inptr[DCTSIZE*2]
3201
+ lh t6, 34(a0) // quantptr[DCTSIZE*2]
3202
+ lh s7, 98(t0) // inptr[DCTSIZE*6]
3203
+ lh t7, 98(a0) // quantptr[DCTSIZE*6]
3204
+ mul t6, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3205
+ lh s4, 2(t0) // inptr[DCTSIZE*0]
3206
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3207
+ lh s5, 2(a0) // quantptr[DCTSIZE*0]
3208
+ li s6, 15137
3209
+ li s7, 6270
3210
+ mul t2, s4, s5 // tmp0 = (inptr[0] * quantptr[0])
3211
+ mul v0, s6, t6 // z2 = (inptr[DCTSIZE*2] * quantptr[DCTSIZE*2])
3212
+ lh t5, 114(t0) // inptr[DCTSIZE*7]
3213
+ mul t7, s7, t7 // z3 = (inptr[DCTSIZE*6] * quantptr[DCTSIZE*6])
3214
+ lh s4, 114(a0) // quantptr[DCTSIZE*7]
3215
+ lh s5, 82(a0) // quantptr[DCTSIZE*5]
3216
+ lh t6, 82(t0) // inptr[DCTSIZE*5]
3217
+ sll t2, t2, 14 // tmp0 <<= (CONST_BITS+1)
3218
+ lh s6, 50(a0) // quantptr[DCTSIZE*3]
3219
+ lh t8, 18(t0) // inptr[DCTSIZE*1]
3220
+ subu v0, v0, t7 // tmp2 = MULTIPLY(z2, t5) - MULTIPLY(z3, t6)
3221
+ lh t7, 50(t0) // inptr[DCTSIZE*3]
3222
+ lh s7, 18(a0) // quantptr[DCTSIZE*1]
3223
+ mul t5, s4, t5 // z1 = (inptr[DCTSIZE*7] * quantptr[DCTSIZE*7])
3224
+ mul t6, s5, t6 // z2 = (inptr[DCTSIZE*5] * quantptr[DCTSIZE*5])
3225
+ mul t7, s6, t7 // z3 = (inptr[DCTSIZE*3] * quantptr[DCTSIZE*3])
3226
+ mul t8, s7, t8 // z4 = (inptr[DCTSIZE*1] * quantptr[DCTSIZE*1])
3227
+ addu t3, t2, v0 // tmp10 = tmp0 + z2
3228
+ subu t4, t2, v0 // tmp10 = tmp0 - z2
3229
+ mult $ac0, zero, zero
3230
+ mult $ac1, zero, zero
3231
+ ins t5, t6, 16, 16
3232
+ ins t7, t8, 16, 16
3233
+ dpa.w.ph $ac0, t5, s0
3234
+ dpa.w.ph $ac0, t7, s1
3235
+ dpa.w.ph $ac1, t5, s2
3236
+ dpa.w.ph $ac1, t7, s3
3237
+ mflo t5, $ac0
3238
+ mflo t6, $ac1
3239
+ addiu t9, t9, -1
3240
+ addiu t0, t0, 2
3241
+ addiu a0, a0, 2
3242
+ addiu t1, t1, 4
3243
+ addu s5, t4, t5
3244
+ subu s4, t4, t5
3245
+ addu s6, t3, t6
3246
+ subu s7, t3, t6
3247
+ shra_r.w s5, s5, 12 // DESCALE(tmp12 + temp1, 12)
3248
+ shra_r.w s4, s4, 12 // DESCALE(tmp12 - temp1, 12)
3249
+ shra_r.w s6, s6, 12 // DESCALE(tmp10 + temp2, 12)
3250
+ shra_r.w s7, s7, 12 // DESCALE(tmp10 - temp2, 12)
3251
+ sw s5, 32(t1)
3252
+ sw s4, 64(t1)
3253
+ sw s6, 0(t1)
3254
+ bgtz t9, 1b
3255
+ sw s7, 96(t1)
3256
+ move t1, v1
3257
+ li s4, 15137
3258
+ lw s6, 8(t1) // wsptr[2]
3259
+ li s5, 6270
3260
+ lw s7, 24(t1) // wsptr[6]
3261
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3262
+ lw t2, 0(t1) // wsptr[0]
3263
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3264
+ lh t5, 28(t1) // wsptr[7]
3265
+ lh t6, 20(t1) // wsptr[5]
3266
+ lh t7, 12(t1) // wsptr[3]
3267
+ lh t8, 4(t1) // wsptr[1]
3268
+ ins t5, t6, 16, 16
3269
+ ins t7, t8, 16, 16
3270
+ mult $ac0, zero, zero
3271
+ dpa.w.ph $ac0, t5, s0
3272
+ dpa.w.ph $ac0, t7, s1
3273
+ mult $ac1, zero, zero
3274
+ dpa.w.ph $ac1, t5, s2
3275
+ dpa.w.ph $ac1, t7, s3
3276
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3277
+ mflo s6, $ac0
3278
+ // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3279
+ subu s4, s4, s5
3280
+ addu t3, t2, s4 // tmp10 = tmp0 + z2
3281
+ mflo s7, $ac1
3282
+ subu t4, t2, s4 // tmp10 = tmp0 - z2
3283
+ addu t7, t4, s6
3284
+ subu t8, t4, s6
3285
+ addu t5, t3, s7
3286
+ subu t6, t3, s7
3287
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3288
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3289
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3290
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3291
+ sll s4, t9, 2
3292
+ lw v0, 0(a2) // output_buf[ctr]
3293
+ shll_s.w t5, t5, 24
3294
+ shll_s.w t6, t6, 24
3295
+ shll_s.w t7, t7, 24
3296
+ shll_s.w t8, t8, 24
3297
+ sra t5, t5, 24
3298
+ sra t6, t6, 24
3299
+ sra t7, t7, 24
3300
+ sra t8, t8, 24
3301
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3302
+ addiu t5, t5, 128
3303
+ addiu t6, t6, 128
3304
+ addiu t7, t7, 128
3305
+ addiu t8, t8, 128
3306
+ sb t5, 0(v0)
3307
+ sb t7, 1(v0)
3308
+ sb t8, 2(v0)
3309
+ sb t6, 3(v0)
3310
+ // 2
3311
+ li s4, 15137
3312
+ lw s6, 40(t1) // wsptr[2]
3313
+ li s5, 6270
3314
+ lw s7, 56(t1) // wsptr[6]
3315
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3316
+ lw t2, 32(t1) // wsptr[0]
3317
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3318
+ lh t5, 60(t1) // wsptr[7]
3319
+ lh t6, 52(t1) // wsptr[5]
3320
+ lh t7, 44(t1) // wsptr[3]
3321
+ lh t8, 36(t1) // wsptr[1]
3322
+ ins t5, t6, 16, 16
3323
+ ins t7, t8, 16, 16
3324
+ mult $ac0, zero, zero
3325
+ dpa.w.ph $ac0, t5, s0
3326
+ dpa.w.ph $ac0, t7, s1
3327
+ mult $ac1, zero, zero
3328
+ dpa.w.ph $ac1, t5, s2
3329
+ dpa.w.ph $ac1, t7, s3
3330
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3331
+ mflo s6, $ac0
3332
+ // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3333
+ subu s4, s4, s5
3334
+ addu t3, t2, s4 // tmp10 = tmp0 + z2
3335
+ mflo s7, $ac1
3336
+ subu t4, t2, s4 // tmp10 = tmp0 - z2
3337
+ addu t7, t4, s6
3338
+ subu t8, t4, s6
3339
+ addu t5, t3, s7
3340
+ subu t6, t3, s7
3341
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, CONST_BITS-PASS1_BITS+1)
3342
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, CONST_BITS-PASS1_BITS+1)
3343
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, CONST_BITS-PASS1_BITS+1)
3344
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, CONST_BITS-PASS1_BITS+1)
3345
+ sll s4, t9, 2
3346
+ lw v0, 4(a2) // output_buf[ctr]
3347
+ shll_s.w t5, t5, 24
3348
+ shll_s.w t6, t6, 24
3349
+ shll_s.w t7, t7, 24
3350
+ shll_s.w t8, t8, 24
3351
+ sra t5, t5, 24
3352
+ sra t6, t6, 24
3353
+ sra t7, t7, 24
3354
+ sra t8, t8, 24
3355
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3356
+ addiu t5, t5, 128
3357
+ addiu t6, t6, 128
3358
+ addiu t7, t7, 128
3359
+ addiu t8, t8, 128
3360
+ sb t5, 0(v0)
3361
+ sb t7, 1(v0)
3362
+ sb t8, 2(v0)
3363
+ sb t6, 3(v0)
3364
+ // 3
3365
+ li s4, 15137
3366
+ lw s6, 72(t1) // wsptr[2]
3367
+ li s5, 6270
3368
+ lw s7, 88(t1) // wsptr[6]
3369
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3370
+ lw t2, 64(t1) // wsptr[0]
3371
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3372
+ lh t5, 92(t1) // wsptr[7]
3373
+ lh t6, 84(t1) // wsptr[5]
3374
+ lh t7, 76(t1) // wsptr[3]
3375
+ lh t8, 68(t1) // wsptr[1]
3376
+ ins t5, t6, 16, 16
3377
+ ins t7, t8, 16, 16
3378
+ mult $ac0, zero, zero
3379
+ dpa.w.ph $ac0, t5, s0
3380
+ dpa.w.ph $ac0, t7, s1
3381
+ mult $ac1, zero, zero
3382
+ dpa.w.ph $ac1, t5, s2
3383
+ dpa.w.ph $ac1, t7, s3
3384
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3385
+ mflo s6, $ac0
3386
+ // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3387
+ subu s4, s4, s5
3388
+ addu t3, t2, s4 // tmp10 = tmp0 + z2
3389
+ mflo s7, $ac1
3390
+ subu t4, t2, s4 // tmp10 = tmp0 - z2
3391
+ addu t7, t4, s6
3392
+ subu t8, t4, s6
3393
+ addu t5, t3, s7
3394
+ subu t6, t3, s7
3395
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3396
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3397
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3398
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3399
+ sll s4, t9, 2
3400
+ lw v0, 8(a2) // output_buf[ctr]
3401
+ shll_s.w t5, t5, 24
3402
+ shll_s.w t6, t6, 24
3403
+ shll_s.w t7, t7, 24
3404
+ shll_s.w t8, t8, 24
3405
+ sra t5, t5, 24
3406
+ sra t6, t6, 24
3407
+ sra t7, t7, 24
3408
+ sra t8, t8, 24
3409
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3410
+ addiu t5, t5, 128
3411
+ addiu t6, t6, 128
3412
+ addiu t7, t7, 128
3413
+ addiu t8, t8, 128
3414
+ sb t5, 0(v0)
3415
+ sb t7, 1(v0)
3416
+ sb t8, 2(v0)
3417
+ sb t6, 3(v0)
3418
+ li s4, 15137
3419
+ lw s6, 104(t1) // wsptr[2]
3420
+ li s5, 6270
3421
+ lw s7, 120(t1) // wsptr[6]
3422
+ mul s4, s4, s6 // MULTIPLY((JLONG)wsptr[2], FIX_1_847759065)
3423
+ lw t2, 96(t1) // wsptr[0]
3424
+ mul s5, s5, s7 // MULTIPLY((JLONG)wsptr[6], -FIX_0_765366865)
3425
+ lh t5, 124(t1) // wsptr[7]
3426
+ lh t6, 116(t1) // wsptr[5]
3427
+ lh t7, 108(t1) // wsptr[3]
3428
+ lh t8, 100(t1) // wsptr[1]
3429
+ ins t5, t6, 16, 16
3430
+ ins t7, t8, 16, 16
3431
+ mult $ac0, zero, zero
3432
+ dpa.w.ph $ac0, t5, s0
3433
+ dpa.w.ph $ac0, t7, s1
3434
+ mult $ac1, zero, zero
3435
+ dpa.w.ph $ac1, t5, s2
3436
+ dpa.w.ph $ac1, t7, s3
3437
+ sll t2, t2, 14 // tmp0 = ((JLONG)wsptr[0]) << (CONST_BITS+1)
3438
+ mflo s6, $ac0
3439
+ // MULTIPLY(wsptr[2], FIX_1_847759065 + MULTIPLY(wsptr[6], -FIX_0_765366865)
3440
+ subu s4, s4, s5
3441
+ addu t3, t2, s4 // tmp10 = tmp0 + z2;
3442
+ mflo s7, $ac1
3443
+ subu t4, t2, s4 // tmp10 = tmp0 - z2;
3444
+ addu t7, t4, s6
3445
+ subu t8, t4, s6
3446
+ addu t5, t3, s7
3447
+ subu t6, t3, s7
3448
+ shra_r.w t5, t5, 19 // DESCALE(tmp10 + temp2, 19)
3449
+ shra_r.w t6, t6, 19 // DESCALE(tmp10 - temp2, 19)
3450
+ shra_r.w t7, t7, 19 // DESCALE(tmp12 + temp1, 19)
3451
+ shra_r.w t8, t8, 19 // DESCALE(tmp12 - temp1, 19)
3452
+ sll s4, t9, 2
3453
+ lw v0, 12(a2) // output_buf[ctr]
3454
+ shll_s.w t5, t5, 24
3455
+ shll_s.w t6, t6, 24
3456
+ shll_s.w t7, t7, 24
3457
+ shll_s.w t8, t8, 24
3458
+ sra t5, t5, 24
3459
+ sra t6, t6, 24
3460
+ sra t7, t7, 24
3461
+ sra t8, t8, 24
3462
+ addu v0, v0, a3 // outptr = output_buf[ctr] + output_col
3463
+ addiu t5, t5, 128
3464
+ addiu t6, t6, 128
3465
+ addiu t7, t7, 128
3466
+ addiu t8, t8, 128
3467
+ sb t5, 0(v0)
3468
+ sb t7, 1(v0)
3469
+ sb t8, 2(v0)
3470
+ sb t6, 3(v0)
3471
+
3472
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3473
+
3474
+ j ra
3475
+ nop
3476
+ END(jsimd_idct_4x4_dspr2)
3477
+
3478
+
3479
+ /*****************************************************************************/
3480
+ LEAF_DSPR2(jsimd_idct_6x6_dspr2)
3481
+ /* 6x6 inverse DCT with dequantization: a column pass fills a 6x6 int workspace on the stack, then a row pass writes six rows of six clamped bytes.
3482
+ * a0 = compptr->dct_table (16-bit multipliers, read with lh; advanced one column per pass-1 iteration)
3483
+ * a1 = coef_block (16-bit coefficients, read with lh; advanced one column per pass-1 iteration)
3484
+ * a2 = output_buf (array of row pointers; one 4-byte entry consumed per output row)
3485
+ * a3 = output_col (byte offset added to every row pointer)
3486
+ */
3487
+ .set at
3488
+
3489
+ SAVE_REGS_ON_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3490
+
3491
+ addiu sp, sp, -144 // reserve the workspace: 6 rows * 6 ints * 4 bytes
3492
+ move v0, sp // v0 = wsptr, walks one column (4 bytes) per iteration
3493
+ addiu v1, v0, 24 // v1 = loop end marker after 6 columns
3494
+ addiu t9, zero, 5793 // ~0.7071 * 2^13
3495
+ addiu s0, zero, 10033 // ~1.2247 * 2^13
3496
+ addiu s1, zero, 2998 // ~0.3660 * 2^13
3497
+
3498
+ 1: // Pass 1: process one column of coefficients per iteration, store into the workspace
3499
+ lh s2, 0(a0) // q0 = quantptr[ 0]
3500
+ lh s3, 32(a0) // q1 = quantptr[16]
3501
+ lh s4, 64(a0) // q2 = quantptr[32]
3502
+ lh t2, 64(a1) // tmp2 = inptr[32]
3503
+ lh t1, 32(a1) // tmp1 = inptr[16]
3504
+ lh t0, 0(a1) // tmp0 = inptr[ 0]
3505
+ mul t2, t2, s4 // tmp2 = tmp2 * q2
3506
+ mul t1, t1, s3 // tmp1 = tmp1 * q1
3507
+ mul t0, t0, s2 // tmp0 = tmp0 * q0
3508
+ lh t6, 16(a1) // z1 = inptr[ 8]
3509
+ lh t8, 80(a1) // z3 = inptr[40]
3510
+ lh t7, 48(a1) // z2 = inptr[24]
3511
+ lh s2, 16(a0) // q0 = quantptr[ 8]
3512
+ lh s4, 80(a0) // q2 = quantptr[40]
3513
+ lh s3, 48(a0) // q1 = quantptr[24]
3514
+ mul t2, t2, t9 // tmp2 = tmp2 * 5793
3515
+ mul t1, t1, s0 // tmp1 = tmp1 * 10033
3516
+ sll t0, t0, 13 // tmp0 = tmp0 << 13
3517
+ mul t6, t6, s2 // z1 = z1 * q0
3518
+ mul t8, t8, s4 // z3 = z3 * q2
3519
+ mul t7, t7, s3 // z2 = z2 * q1
3520
+ addu t3, t0, t2 // tmp10 = tmp0 + tmp2
3521
+ sll t2, t2, 1 // tmp2 = tmp2 << 1 (i.e. 2 * tmp2)
3522
+ subu t4, t0, t2 // tmp11 = tmp0 - tmp2;
3523
+ subu t5, t3, t1 // tmp12 = tmp10 - tmp1
3524
+ addu t3, t3, t1 // tmp10 = tmp10 + tmp1
3525
+ addu t1, t6, t8 // tmp1 = z1 + z3
3526
+ mul t1, t1, s1 // tmp1 = tmp1 * 2998
3527
+ shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3528
+ subu t2, t6, t8 // tmp2 = z1 - z3
3529
+ subu t2, t2, t7 // tmp2 = tmp2 - z2
3530
+ sll t2, t2, 2 // tmp2 = tmp2 << 2
3531
+ addu t0, t6, t7 // tmp0 = z1 + z2
3532
+ sll t0, t0, 13 // tmp0 = tmp0 << 13
3533
+ subu s2, t8, t7 // q0 = z3 - z2
3534
+ sll s2, s2, 13 // q0 = q0 << 13
3535
+ addu t0, t0, t1 // tmp0 = tmp0 + tmp1
3536
+ addu t1, s2, t1 // tmp1 = q0 + tmp1
3537
+ addu s2, t4, t2 // q0 = tmp11 + tmp2
3538
+ subu s3, t4, t2 // q1 = tmp11 - tmp2
3539
+ addu t6, t3, t0 // z1 = tmp10 + tmp0
3540
+ subu t7, t3, t0 // z2 = tmp10 - tmp0
3541
+ addu t4, t5, t1 // tmp11 = tmp12 + tmp1
3542
+ subu t5, t5, t1 // tmp12 = tmp12 - tmp1
3543
+ shra_r.w t6, t6, 11 // z1 = (z1 + 1024) >> 11
3544
+ shra_r.w t7, t7, 11 // z2 = (z2 + 1024) >> 11
3545
+ shra_r.w t4, t4, 11 // tmp11 = (tmp11 + 1024) >> 11
3546
+ shra_r.w t5, t5, 11 // tmp12 = (tmp12 + 1024) >> 11
3547
+ sw s2, 24(v0) // workspace row 1 (row stride is 24 bytes)
3548
+ sw s3, 96(v0) // workspace row 4
3549
+ sw t6, 0(v0) // workspace row 0
3550
+ sw t7, 120(v0) // workspace row 5
3551
+ sw t4, 48(v0) // workspace row 2
3552
+ sw t5, 72(v0) // workspace row 3
3553
+ addiu v0, v0, 4 // next workspace column
3554
+ addiu a1, a1, 2 // next coefficient column
3555
+ bne v0, v1, 1b // repeat until all 6 columns are done
3556
+ addiu a0, a0, 2 // (branch delay slot) next dct_table column
3557
+
3558
+ /* Pass 2: process 6 rows from work array, store into output array. */
3559
+ move v0, sp // v0 = wsptr, walks one row (24 bytes) per iteration
3560
+ addiu v1, v0, 144 // v1 = end of the workspace
3561
+
3562
+ 2:
3563
+ lw t0, 0(v0) // wsptr[0]
3564
+ lw t2, 16(v0) // wsptr[4]
3565
+ lw s5, 0(a2) // s5 = current output_buf row pointer
3566
+ addiu t0, t0, 16 // rounding bias: becomes 1 << 17 after the shift below
3567
+ sll t0, t0, 13 // tmp0 = (wsptr[0] + 16) << 13
3568
+ mul t3, t2, t9 // t3 = wsptr[4] * 5793
3569
+ lw t6, 4(v0) // z1 = wsptr[1]
3570
+ lw t8, 20(v0) // z3 = wsptr[5]
3571
+ lw t7, 12(v0) // z2 = wsptr[3]
3572
+ addu s5, s5, a3 // outptr = row pointer + output_col
3573
+ addu s6, t6, t8 // z1 + z3
3574
+ mul s6, s6, s1 // s6 = (z1 + z3) * 2998
3575
+ addu t1, t0, t3 // t1 = tmp0 + t3
3576
+ subu t4, t0, t3
3577
+ subu t4, t4, t3 // t4 = tmp0 - 2 * t3
3578
+ lw t3, 8(v0) // wsptr[2]
3579
+ mul t0, t3, s0 // t0 = wsptr[2] * 10033
3580
+ addu s7, t6, t7 // z1 + z2
3581
+ sll s7, s7, 13
3582
+ addu s7, s6, s7 // s7 = s6 + ((z1 + z2) << 13)
3583
+ subu t2, t8, t7 // z3 - z2
3584
+ sll t2, t2, 13
3585
+ addu t2, s6, t2 // t2 = s6 + ((z3 - z2) << 13)
3586
+ subu s6, t6, t7 // z1 - z2
3587
+ subu s6, s6, t8 // z1 - z2 - z3
3588
+ sll s6, s6, 13 // s6 = (z1 - z2 - z3) << 13
3589
+ addu t3, t1, t0 // t3 = t1 + t0
3590
+ subu t5, t1, t0 // t5 = t1 - t0
3591
+ addu t6, t3, s7 // value for outptr[0]
3592
+ subu t3, t3, s7 // value for outptr[5]
3593
+ addu t7, t4, s6 // value for outptr[1]
3594
+ subu t4, t4, s6 // value for outptr[4]
3595
+ addu t8, t5, t2 // value for outptr[2]
3596
+ subu t5, t5, t2 // value for outptr[3]
3597
+ shll_s.w t6, t6, 6 // saturating << 6; with the sra 24 below this is >> 18 clamped to [-128, 127]
3598
+ shll_s.w t3, t3, 6
3599
+ shll_s.w t7, t7, 6
3600
+ shll_s.w t4, t4, 6
3601
+ shll_s.w t8, t8, 6
3602
+ shll_s.w t5, t5, 6
3603
+ sra t6, t6, 24 // keep the signed top byte
3604
+ addiu t6, t6, 128 // re-centre to 0..255
3605
+ sra t3, t3, 24
3606
+ addiu t3, t3, 128
3607
+ sb t6, 0(s5)
3608
+ sra t7, t7, 24
3609
+ addiu t7, t7, 128
3610
+ sb t3, 5(s5)
3611
+ sra t4, t4, 24
3612
+ addiu t4, t4, 128
3613
+ sb t7, 1(s5)
3614
+ sra t8, t8, 24
3615
+ addiu t8, t8, 128
3616
+ sb t4, 4(s5)
3617
+ addiu v0, v0, 24 // next workspace row
3618
+ sra t5, t5, 24
3619
+ addiu t5, t5, 128
3620
+ sb t8, 2(s5)
3621
+ addiu a2, a2, 4 // next output_buf entry
3622
+ bne v0, v1, 2b // repeat until all 6 rows are done
3623
+ sb t5, 3(s5) // (branch delay slot) last store of the row
3624
+
3625
+ addiu sp, sp, 144 // release the workspace
3626
+
3627
+ RESTORE_REGS_FROM_STACK 32, s0, s1, s2, s3, s4, s5, s6, s7
3628
+
3629
+ j ra
3630
+ nop
3631
+
3632
+ END(jsimd_idct_6x6_dspr2)
3633
+
3634
+
3635
+ /*****************************************************************************/
3636
+ LEAF_DSPR2(jsimd_idct_12x12_pass1_dspr2)
3637
+ /* Column pass of the 12x12 inverse DCT: dequantizes 8 input columns and writes 12 ints per column into the workspace.
3638
+ * a0 = compptr->dct_table (16-bit multipliers, read with lh; advanced 2 bytes per column)
3639
+ * a1 = coef_block (16-bit coefficients, read with lh; advanced 2 bytes per column)
3640
+ * a2 = workspace (32-bit results, row stride 32 bytes; advanced 4 bytes per column)
3641
+ */
3642
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3643
+
3644
+ li a3, 8 // column counter
3645
+
3646
+ 1:
3647
+ // odd part
3648
+ lh t0, 48(a1) // inptr[DCTSIZE*3]
3649
+ lh t1, 48(a0) // quantptr[DCTSIZE*3]
3650
+ lh t2, 16(a1) // inptr[DCTSIZE*1]
3651
+ lh t3, 16(a0) // quantptr[DCTSIZE*1]
3652
+ lh t4, 80(a1) // inptr[DCTSIZE*5]
3653
+ lh t5, 80(a0) // quantptr[DCTSIZE*5]
3654
+ lh t6, 112(a1) // inptr[DCTSIZE*7]
3655
+ lh t7, 112(a0) // quantptr[DCTSIZE*7]
3656
+ mul t0, t0, t1 // z2
3657
+ mul t1, t2, t3 // z1
3658
+ mul t2, t4, t5 // z3
3659
+ mul t3, t6, t7 // z4
3660
+ li t4, 10703 // FIX(1.306562965)
3661
+ li t5, 4433 // FIX_0_541196100
3662
+ li t6, 7053 // FIX(0.860918669)
3663
+ mul t4, t0, t4 // tmp11
3664
+ mul t5, t0, t5 // -tmp14
3665
+ addu t7, t1, t2 // tmp10
3666
+ addu t8, t7, t3 // tmp10 + z4
3667
+ mul t6, t6, t8 // tmp15
3668
+ li t8, 2139 // FIX(0.261052384)
3669
+ mul t8, t7, t8 // MULTIPLY(tmp10, FIX(0.261052384))
3670
+ li t7, 2295 // FIX(0.280143716)
3671
+ mul t7, t1, t7 // MULTIPLY(z1, FIX(0.280143716))
3672
+ addu t9, t2, t3 // z3 + z4
3673
+ li s0, 8565 // FIX(1.045510580)
3674
+ mul t9, t9, s0 // -tmp13
3675
+ li s0, 12112 // FIX(1.478575242)
3676
+ mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3677
+ li s1, 12998 // FIX(1.586706681)
3678
+ mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3679
+ li s2, 5540 // FIX(0.676326758)
3680
+ mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3681
+ li s3, 16244 // FIX(1.982889723)
3682
+ mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3683
+ subu t1, t1, t3 // z1-=z4
3684
+ subu t0, t0, t2 // z2-=z3
3685
+ addu t2, t0, t1 // z1+z2
3686
+ li t3, 4433 // FIX_0_541196100
3687
+ mul t2, t2, t3 // z3
3688
+ li t3, 6270 // FIX_0_765366865
3689
+ mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3690
+ li t3, 15137 // FIX_1_847759065
3691
+ mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3692
+ addu t8, t6, t8 // tmp12
3693
+ addu t3, t8, t4 // tmp12 + tmp11
3694
+ addu t3, t3, t7 // tmp10
3695
+ subu t8, t8, t9 // tmp12 + tmp13
3696
+ addu s0, t5, s0 // (-tmp14) + MULTIPLY(z3, FIX(1.478575242))
3697
+ subu t8, t8, s0 // tmp12
3698
+ subu t9, t6, t9 // tmp15 + tmp13 (t9 held -tmp13)
3699
+ subu s1, s1, t4 // MULTIPLY(z4, FIX(1.586706681)) - tmp11
3700
+ addu t9, t9, s1 // tmp13
3701
+ subu t6, t6, t5 // tmp15 + tmp14 (t5 holds -tmp14)
3702
+ subu t6, t6, s2 // - MULTIPLY(z1, FIX(0.676326758))
3703
+ subu t6, t6, s3 // tmp15
3704
+ // even part start
3705
+ lh t4, 64(a1) // inptr[DCTSIZE*4]
3706
+ lh t5, 64(a0) // quantptr[DCTSIZE*4]
3707
+ lh t7, 32(a1) // inptr[DCTSIZE*2]
3708
+ lh s0, 32(a0) // quantptr[DCTSIZE*2]
3709
+ lh s1, 0(a1) // inptr[DCTSIZE*0]
3710
+ lh s2, 0(a0) // quantptr[DCTSIZE*0]
3711
+ lh s3, 96(a1) // inptr[DCTSIZE*6]
3712
+ lh v0, 96(a0) // quantptr[DCTSIZE*6]
3713
+ mul t4, t4, t5 // DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4])
3714
+ mul t5, t7, s0 // DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2])
3715
+ mul t7, s1, s2 // DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0])
3716
+ mul s0, s3, v0 // DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6])
3717
+ // odd part end
3718
+ addu t1, t2, t1 // tmp11
3719
+ subu t0, t2, t0 // tmp14
3720
+ // update counter and pointers
3721
+ addiu a3, a3, -1
3722
+ addiu a0, a0, 2
3723
+ addiu a1, a1, 2
3724
+ // even part rest
3725
+ li s1, 10033 // FIX(1.224744871)
3726
+ li s2, 11190 // FIX(1.366025404)
3727
+ mul t4, t4, s1 // z4 = MULTIPLY(dequantized inptr[DCTSIZE*4], 10033)
3728
+ mul s1, t5, s2 // z4 = MULTIPLY(z1, 11190), computed before z1 is shifted
3729
+ sll t5, t5, 13 // z1
3730
+ sll t7, t7, 13
3731
+ addiu t7, t7, 1024 // z3 (1024 = rounding bias for the final >> 11)
3732
+ sll s0, s0, 13 // z2
3733
+ addu s2, t7, t4 // tmp10
3734
+ subu t4, t7, t4 // tmp11
3735
+ subu s3, t5, s0 // tmp12
3736
+ addu t2, t7, s3 // tmp21
3737
+ subu s3, t7, s3 // tmp24
3738
+ addu t7, s1, s0 // tmp12
3739
+ addu v0, s2, t7 // tmp20
3740
+ subu s2, s2, t7 // tmp25
3741
+ subu s1, s1, t5 // z4 - z1
3742
+ subu s1, s1, s0 // tmp12
3743
+ addu s0, t4, s1 // tmp22
3744
+ subu t4, t4, s1 // tmp23
3745
+ // final output stage
3746
+ addu t5, v0, t3 // tmp20 + tmp10
3747
+ subu v0, v0, t3 // tmp20 - tmp10
3748
+ addu t3, t2, t1 // tmp21 + tmp11
3749
+ subu t2, t2, t1 // tmp21 - tmp11
3750
+ addu t1, s0, t8 // tmp22 + tmp12
3751
+ subu s0, s0, t8 // tmp22 - tmp12
3752
+ addu t8, t4, t9 // tmp23 + tmp13
3753
+ subu t4, t4, t9 // tmp23 - tmp13
3754
+ addu t9, s3, t0 // tmp24 + tmp14
3755
+ subu s3, s3, t0 // tmp24 - tmp14
3756
+ addu t0, s2, t6 // tmp25 + tmp15
3757
+ subu s2, s2, t6 // tmp25 - tmp15
3758
+ sra t5, t5, 11 // descale all 12 results by 11 bits (bias was added to z3 above)
3759
+ sra t3, t3, 11
3760
+ sra t1, t1, 11
3761
+ sra t8, t8, 11
3762
+ sra t9, t9, 11
3763
+ sra t0, t0, 11
3764
+ sra s2, s2, 11
3765
+ sra s3, s3, 11
3766
+ sra t4, t4, 11
3767
+ sra s0, s0, 11
3768
+ sra t2, t2, 11
3769
+ sra v0, v0, 11
3770
+ sw t5, 0(a2) // workspace row 0 (row stride is 32 bytes)
3771
+ sw t3, 32(a2) // row 1
3772
+ sw t1, 64(a2) // row 2
3773
+ sw t8, 96(a2) // row 3
3774
+ sw t9, 128(a2) // row 4
3775
+ sw t0, 160(a2) // row 5
3776
+ sw s2, 192(a2) // row 6
3777
+ sw s3, 224(a2) // row 7
3778
+ sw t4, 256(a2) // row 8
3779
+ sw s0, 288(a2) // row 9
3780
+ sw t2, 320(a2) // row 10
3781
+ sw v0, 352(a2) // row 11
3782
+ bgtz a3, 1b // repeat until all 8 columns are done
3783
+ addiu a2, a2, 4 // (branch delay slot) next workspace column
3784
+
3785
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3786
+
3787
+ j ra
3788
+ nop
3789
+
3790
+ END(jsimd_idct_12x12_pass1_dspr2)
3791
+
3792
+
3793
+ /*****************************************************************************/
3794
+ LEAF_DSPR2(jsimd_idct_12x12_pass2_dspr2)
3795
+ /* Row pass of the 12x12 inverse DCT: turns each of 12 workspace rows (8 ints) into 12 clamped output bytes.
3796
+ * a0 = workspace (32-bit values, 32 bytes consumed per row)
3797
+ * a1 = output (array of row pointers; one 4-byte entry consumed per row, bytes stored at offsets 0..11)
3798
+ */
3799
+ SAVE_REGS_ON_STACK 16, s0, s1, s2, s3
3800
+
3801
+ li a3, 12 // row counter
3802
+
3803
+ 1:
3804
+ // Odd part
3805
+ lw t0, 12(a0) // z2 = wsptr[3]
3806
+ lw t1, 4(a0) // z1 = wsptr[1]
3807
+ lw t2, 20(a0) // z3 = wsptr[5]
3808
+ lw t3, 28(a0) // z4 = wsptr[7]
3809
+ li t4, 10703 // FIX(1.306562965)
3810
+ li t5, 4433 // FIX_0_541196100
3811
+ mul t4, t0, t4 // tmp11
3812
+ mul t5, t0, t5 // -tmp14
3813
+ addu t6, t1, t2 // tmp10
3814
+ li t7, 2139 // FIX(0.261052384)
3815
+ mul t7, t6, t7 // MULTIPLY(tmp10, FIX(0.261052384))
3816
+ addu t6, t6, t3 // tmp10 + z4
3817
+ li t8, 7053 // FIX(0.860918669)
3818
+ mul t6, t6, t8 // tmp15
3819
+ li t8, 2295 // FIX(0.280143716)
3820
+ mul t8, t1, t8 // MULTIPLY(z1, FIX(0.280143716))
3821
+ addu t9, t2, t3 // z3 + z4
3822
+ li s0, 8565 // FIX(1.045510580)
3823
+ mul t9, t9, s0 // -tmp13
3824
+ li s0, 12112 // FIX(1.478575242)
3825
+ mul s0, t2, s0 // MULTIPLY(z3, FIX(1.478575242))
3826
+ li s1, 12998 // FIX(1.586706681)
3827
+ mul s1, t3, s1 // MULTIPLY(z4, FIX(1.586706681))
3828
+ li s2, 5540 // FIX(0.676326758)
3829
+ mul s2, t1, s2 // MULTIPLY(z1, FIX(0.676326758))
3830
+ li s3, 16244 // FIX(1.982889723)
3831
+ mul s3, t3, s3 // MULTIPLY(z4, FIX(1.982889723))
3832
+ subu t1, t1, t3 // z1 -= z4
3833
+ subu t0, t0, t2 // z2 -= z3
3834
+ addu t2, t1, t0 // z1 + z2
3835
+ li t3, 4433 // FIX_0_541196100
3836
+ mul t2, t2, t3 // z3
3837
+ li t3, 6270 // FIX_0_765366865
3838
+ mul t1, t1, t3 // MULTIPLY(z1, FIX_0_765366865)
3839
+ li t3, 15137 // FIX_1_847759065
3840
+ mul t0, t0, t3 // MULTIPLY(z2, FIX_1_847759065)
3841
+ addu t3, t6, t7 // tmp12
3842
+ addu t7, t3, t4 // tmp12 + tmp11
3843
+ addu t7, t7, t8 // tmp10
3844
+ subu t3, t3, t9 // tmp12 + tmp13 (t9 holds -tmp13)
3845
+ subu t3, t3, t5 // + tmp14 (t5 holds -tmp14)
3846
+ subu t3, t3, s0 // tmp12
3847
+ subu t9, t6, t9 // tmp15 + tmp13
3848
+ subu t9, t9, t4 // - tmp11
3849
+ addu t9, t9, s1 // tmp13
3850
+ subu t6, t6, t5 // tmp15 + tmp14
3851
+ subu t6, t6, s2 // - MULTIPLY(z1, FIX(0.676326758))
3852
+ subu t6, t6, s3 // tmp15
3853
+ addu t1, t2, t1 // tmp11
3854
+ subu t0, t2, t0 // tmp14
3855
+ // even part
3856
+ lw t2, 16(a0) // z4
3857
+ lw t4, 8(a0) // z1
3858
+ lw t5, 0(a0) // z3
3859
+ lw t8, 24(a0) // z2
3860
+ li s0, 10033 // FIX(1.224744871)
3861
+ li s1, 11190 // FIX(1.366025404)
3862
+ mul t2, t2, s0 // z4
3863
+ mul s0, t4, s1 // z4
3864
+ addiu t5, t5, 0x10 // rounding bias: becomes 1 << 17 after the shift below
3865
+ sll t5, t5, 13 // z3
3866
+ sll t4, t4, 13 // z1
3867
+ sll t8, t8, 13 // z2
3868
+ subu s1, t4, t8 // tmp12
3869
+ addu s2, t5, t2 // tmp10
3870
+ subu t2, t5, t2 // tmp11
3871
+ addu s3, t5, s1 // tmp21
3872
+ subu s1, t5, s1 // tmp24
3873
+ addu t5, s0, t8 // tmp12
3874
+ addu v0, s2, t5 // tmp20
3875
+ subu t5, s2, t5 // tmp25
3876
+ subu t4, s0, t4 // z4 - z1
3877
+ subu t4, t4, t8 // tmp12
3878
+ addu t8, t2, t4 // tmp22
3879
+ subu t2, t2, t4 // tmp23
3880
+ // increment counter and pointers
3881
+ addiu a3, a3, -1
3882
+ addiu a0, a0, 32
3883
+ // Final stage
3884
+ addu t4, v0, t7 // tmp20 + tmp10 -> byte 0
3885
+ subu v0, v0, t7 // tmp20 - tmp10 -> byte 11
3886
+ addu t7, s3, t1 // tmp21 + tmp11 -> byte 1
3887
+ subu s3, s3, t1 // tmp21 - tmp11 -> byte 10
3888
+ addu t1, t8, t3 // tmp22 + tmp12 -> byte 2
3889
+ subu t8, t8, t3 // tmp22 - tmp12 -> byte 9
3890
+ addu t3, t2, t9 // tmp23 + tmp13 -> byte 3
3891
+ subu t2, t2, t9 // tmp23 - tmp13 -> byte 8
3892
+ addu t9, s1, t0 // tmp24 + tmp14 -> byte 4
3893
+ subu s1, s1, t0 // tmp24 - tmp14 -> byte 7
3894
+ addu t0, t5, t6 // tmp25 + tmp15 -> byte 5
3895
+ subu t5, t5, t6 // tmp25 - tmp15 -> byte 6
3896
+ sll t4, t4, 4 // plain << 4; the saturating << 2 below completes a 6-bit left shift
3897
+ sll t7, t7, 4
3898
+ sll t1, t1, 4
3899
+ sll t3, t3, 4
3900
+ sll t9, t9, 4
3901
+ sll t0, t0, 4
3902
+ sll t5, t5, 4
3903
+ sll s1, s1, 4
3904
+ sll t2, t2, 4
3905
+ sll t8, t8, 4
3906
+ sll s3, s3, 4
3907
+ sll v0, v0, 4
3908
+ shll_s.w t4, t4, 2 // saturating << 2
3909
+ shll_s.w t7, t7, 2
3910
+ shll_s.w t1, t1, 2
3911
+ shll_s.w t3, t3, 2
3912
+ shll_s.w t9, t9, 2
3913
+ shll_s.w t0, t0, 2
3914
+ shll_s.w t5, t5, 2
3915
+ shll_s.w s1, s1, 2
3916
+ shll_s.w t2, t2, 2
3917
+ shll_s.w t8, t8, 2
3918
+ shll_s.w s3, s3, 2
3919
+ shll_s.w v0, v0, 2
3920
+ srl t4, t4, 24 // move the top byte to bits 7..0 (only the low byte is stored)
3921
+ srl t7, t7, 24
3922
+ srl t1, t1, 24
3923
+ srl t3, t3, 24
3924
+ srl t9, t9, 24
3925
+ srl t0, t0, 24
3926
+ srl t5, t5, 24
3927
+ srl s1, s1, 24
3928
+ srl t2, t2, 24
3929
+ srl t8, t8, 24
3930
+ srl s3, s3, 24
3931
+ srl v0, v0, 24
3932
+ lw t6, 0(a1) // t6 = current output row pointer (tmp15 is no longer needed)
3933
+ addiu t4, t4, 0x80 // re-centre each sample by +128 before the byte store
3934
+ addiu t7, t7, 0x80
3935
+ addiu t1, t1, 0x80
3936
+ addiu t3, t3, 0x80
3937
+ addiu t9, t9, 0x80
3938
+ addiu t0, t0, 0x80
3939
+ addiu t5, t5, 0x80
3940
+ addiu s1, s1, 0x80
3941
+ addiu t2, t2, 0x80
3942
+ addiu t8, t8, 0x80
3943
+ addiu s3, s3, 0x80
3944
+ addiu v0, v0, 0x80
3945
+ sb t4, 0(t6)
3946
+ sb t7, 1(t6)
3947
+ sb t1, 2(t6)
3948
+ sb t3, 3(t6)
3949
+ sb t9, 4(t6)
3950
+ sb t0, 5(t6)
3951
+ sb t5, 6(t6)
3952
+ sb s1, 7(t6)
3953
+ sb t2, 8(t6)
3954
+ sb t8, 9(t6)
3955
+ sb s3, 10(t6)
3956
+ sb v0, 11(t6)
3957
+ bgtz a3, 1b // repeat until all 12 rows are done
3958
+ addiu a1, a1, 4 // (branch delay slot) next output row pointer
3959
+
3960
+ RESTORE_REGS_FROM_STACK 16, s0, s1, s2, s3
3961
+
3962
+ jr ra
3963
+ nop
3964
+
3965
+ END(jsimd_idct_12x12_pass2_dspr2)
3966
+
3967
+
3968
+ /*****************************************************************************/
3969
+ LEAF_DSPR2(jsimd_convsamp_dspr2)
3970
+ /* Converts an 8x8 block of unsigned bytes to 16-bit values with 128 subtracted; the loads for the next row are interleaved with the stores of the current one.
3971
+ * a0 = sample_data (array of 8 row pointers)
3972
+ * a1 = start_col (byte offset added to every row pointer)
3973
+ * a2 = workspace (output, 16 bytes per row, 128 bytes in total)
3974
+ */
3975
+ lw t0, 0(a0) // row 0 pointer
3976
+ li t7, 0xff80ff80 // -128 in each 16-bit half
3977
+ addu t0, t0, a1 // row 0 + start_col
3978
+ ulw t1, 0(t0) // bytes 0..3 of the row (unaligned load)
3979
+ ulw t2, 4(t0) // bytes 4..7 of the row
3980
+ preceu.ph.qbr t3, t1 // zero-extend the two right (low) bytes to halfwords
3981
+ preceu.ph.qbl t4, t1 // zero-extend the two left (high) bytes to halfwords
3982
+ lw t0, 4(a0) // row 1 pointer
3983
+ preceu.ph.qbr t5, t2
3984
+ preceu.ph.qbl t6, t2
3985
+ addu t0, t0, a1
3986
+ addu.ph t3, t3, t7 // subtract 128 from both halfwords
3987
+ addu.ph t4, t4, t7
3988
+ ulw t1, 0(t0) // row 1 bytes are loaded while row 0 is being finished
3989
+ ulw t2, 4(t0)
3990
+ addu.ph t5, t5, t7
3991
+ addu.ph t6, t6, t7
3992
+ usw t3, 0(a2) // workspace row 0
3993
+ usw t4, 4(a2)
3994
+ preceu.ph.qbr t3, t1
3995
+ preceu.ph.qbl t4, t1
3996
+ usw t5, 8(a2)
3997
+ usw t6, 12(a2)
3998
+
3999
+ lw t0, 8(a0) // row 2 pointer
4000
+ preceu.ph.qbr t5, t2
4001
+ preceu.ph.qbl t6, t2
4002
+ addu t0, t0, a1
4003
+ addu.ph t3, t3, t7
4004
+ addu.ph t4, t4, t7
4005
+ ulw t1, 0(t0)
4006
+ ulw t2, 4(t0)
4007
+ addu.ph t5, t5, t7
4008
+ addu.ph t6, t6, t7
4009
+ usw t3, 16(a2) // workspace row 1
4010
+ usw t4, 20(a2)
4011
+ preceu.ph.qbr t3, t1
4012
+ preceu.ph.qbl t4, t1
4013
+ usw t5, 24(a2)
4014
+ usw t6, 28(a2)
4015
+
4016
+ lw t0, 12(a0) // row 3 pointer
4017
+ preceu.ph.qbr t5, t2
4018
+ preceu.ph.qbl t6, t2
4019
+ addu t0, t0, a1
4020
+ addu.ph t3, t3, t7
4021
+ addu.ph t4, t4, t7
4022
+ ulw t1, 0(t0)
4023
+ ulw t2, 4(t0)
4024
+ addu.ph t5, t5, t7
4025
+ addu.ph t6, t6, t7
4026
+ usw t3, 32(a2) // workspace row 2
4027
+ usw t4, 36(a2)
4028
+ preceu.ph.qbr t3, t1
4029
+ preceu.ph.qbl t4, t1
4030
+ usw t5, 40(a2)
4031
+ usw t6, 44(a2)
4032
+
4033
+ lw t0, 16(a0) // row 4 pointer
4034
+ preceu.ph.qbr t5, t2
4035
+ preceu.ph.qbl t6, t2
4036
+ addu t0, t0, a1
4037
+ addu.ph t3, t3, t7
4038
+ addu.ph t4, t4, t7
4039
+ ulw t1, 0(t0)
4040
+ ulw t2, 4(t0)
4041
+ addu.ph t5, t5, t7
4042
+ addu.ph t6, t6, t7
4043
+ usw t3, 48(a2) // workspace row 3
4044
+ usw t4, 52(a2)
4045
+ preceu.ph.qbr t3, t1
4046
+ preceu.ph.qbl t4, t1
4047
+ usw t5, 56(a2)
4048
+ usw t6, 60(a2)
4049
+
4050
+ lw t0, 20(a0) // row 5 pointer
4051
+ preceu.ph.qbr t5, t2
4052
+ preceu.ph.qbl t6, t2
4053
+ addu t0, t0, a1
4054
+ addu.ph t3, t3, t7
4055
+ addu.ph t4, t4, t7
4056
+ ulw t1, 0(t0)
4057
+ ulw t2, 4(t0)
4058
+ addu.ph t5, t5, t7
4059
+ addu.ph t6, t6, t7
4060
+ usw t3, 64(a2) // workspace row 4
4061
+ usw t4, 68(a2)
4062
+ preceu.ph.qbr t3, t1
4063
+ preceu.ph.qbl t4, t1
4064
+ usw t5, 72(a2)
4065
+ usw t6, 76(a2)
4066
+
4067
+ lw t0, 24(a0) // row 6 pointer
4068
+ preceu.ph.qbr t5, t2
4069
+ preceu.ph.qbl t6, t2
4070
+ addu t0, t0, a1
4071
+ addu.ph t3, t3, t7
4072
+ addu.ph t4, t4, t7
4073
+ ulw t1, 0(t0)
4074
+ ulw t2, 4(t0)
4075
+ addu.ph t5, t5, t7
4076
+ addu.ph t6, t6, t7
4077
+ usw t3, 80(a2) // workspace row 5
4078
+ usw t4, 84(a2)
4079
+ preceu.ph.qbr t3, t1
4080
+ preceu.ph.qbl t4, t1
4081
+ usw t5, 88(a2)
4082
+ usw t6, 92(a2)
4083
+
4084
+ lw t0, 28(a0) // row 7 pointer
4085
+ preceu.ph.qbr t5, t2
4086
+ preceu.ph.qbl t6, t2
4087
+ addu t0, t0, a1
4088
+ addu.ph t3, t3, t7
4089
+ addu.ph t4, t4, t7
4090
+ ulw t1, 0(t0)
4091
+ ulw t2, 4(t0)
4092
+ addu.ph t5, t5, t7
4093
+ addu.ph t6, t6, t7
4094
+ usw t3, 96(a2) // workspace row 6
4095
+ usw t4, 100(a2)
4096
+ preceu.ph.qbr t3, t1
4097
+ preceu.ph.qbl t4, t1
4098
+ usw t5, 104(a2)
4099
+ usw t6, 108(a2)
4100
+ preceu.ph.qbr t5, t2 // no further row to load: finish row 7
4101
+ preceu.ph.qbl t6, t2
4102
+ addu.ph t3, t3, t7
4103
+ addu.ph t4, t4, t7
4104
+ addu.ph t5, t5, t7
4105
+ addu.ph t6, t6, t7
4106
+ usw t3, 112(a2) // workspace row 7
4107
+ usw t4, 116(a2)
4108
+ usw t5, 120(a2)
4109
+ usw t6, 124(a2)
4110
+
4111
+ j ra
4112
+ nop
4113
+
4114
+ END(jsimd_convsamp_dspr2)
4115
+
4116
+
4117
+ #ifndef __mips_soft_float
4118
+
4119
+ /*****************************************************************************/
4120
+ LEAF_DSPR2(jsimd_convsamp_float_dspr2)
4121
+ /*
4122
+ * a0 = sample_data
4123
+ * a1 = start_col
4124
+ * a2 = workspace
4125
+ */
4126
+ .set at
4127
+
4128
+ lw t0, 0(a0)
4129
+ addu t0, t0, a1
4130
+ lbu t1, 0(t0)
4131
+ lbu t2, 1(t0)
4132
+ lbu t3, 2(t0)
4133
+ lbu t4, 3(t0)
4134
+ lbu t5, 4(t0)
4135
+ lbu t6, 5(t0)
4136
+ lbu t7, 6(t0)
4137
+ lbu t8, 7(t0)
4138
+ addiu t1, t1, -128
4139
+ addiu t2, t2, -128
4140
+ addiu t3, t3, -128
4141
+ addiu t4, t4, -128
4142
+ addiu t5, t5, -128
4143
+ addiu t6, t6, -128
4144
+ addiu t7, t7, -128
4145
+ addiu t8, t8, -128
4146
+ mtc1 t1, f2
4147
+ mtc1 t2, f4
4148
+ mtc1 t3, f6
4149
+ mtc1 t4, f8
4150
+ mtc1 t5, f10
4151
+ mtc1 t6, f12
4152
+ mtc1 t7, f14
4153
+ mtc1 t8, f16
4154
+ cvt.s.w f2, f2
4155
+ cvt.s.w f4, f4
4156
+ cvt.s.w f6, f6
4157
+ cvt.s.w f8, f8
4158
+ cvt.s.w f10, f10
4159
+ cvt.s.w f12, f12
4160
+ cvt.s.w f14, f14
4161
+ cvt.s.w f16, f16
4162
+ lw t0, 4(a0)
4163
+ swc1 f2, 0(a2)
4164
+ swc1 f4, 4(a2)
4165
+ swc1 f6, 8(a2)
4166
+ addu t0, t0, a1
4167
+ swc1 f8, 12(a2)
4168
+ swc1 f10, 16(a2)
4169
+ swc1 f12, 20(a2)
4170
+ swc1 f14, 24(a2)
4171
+ swc1 f16, 28(a2)
4172
+ // elemr 1
4173
+ lbu t1, 0(t0)
4174
+ lbu t2, 1(t0)
4175
+ lbu t3, 2(t0)
4176
+ lbu t4, 3(t0)
4177
+ lbu t5, 4(t0)
4178
+ lbu t6, 5(t0)
4179
+ lbu t7, 6(t0)
4180
+ lbu t8, 7(t0)
4181
+ addiu t1, t1, -128
4182
+ addiu t2, t2, -128
4183
+ addiu t3, t3, -128
4184
+ addiu t4, t4, -128
4185
+ addiu t5, t5, -128
4186
+ addiu t6, t6, -128
4187
+ addiu t7, t7, -128
4188
+ addiu t8, t8, -128
4189
+ mtc1 t1, f2
4190
+ mtc1 t2, f4
4191
+ mtc1 t3, f6
4192
+ mtc1 t4, f8
4193
+ mtc1 t5, f10
4194
+ mtc1 t6, f12
4195
+ mtc1 t7, f14
4196
+ mtc1 t8, f16
4197
+ cvt.s.w f2, f2
4198
+ cvt.s.w f4, f4
4199
+ cvt.s.w f6, f6
4200
+ cvt.s.w f8, f8
4201
+ cvt.s.w f10, f10
4202
+ cvt.s.w f12, f12
4203
+ cvt.s.w f14, f14
4204
+ cvt.s.w f16, f16
4205
+ lw t0, 8(a0)
4206
+ swc1 f2, 32(a2)
4207
+ swc1 f4, 36(a2)
4208
+ swc1 f6, 40(a2)
4209
+ addu t0, t0, a1
4210
+ swc1 f8, 44(a2)
4211
+ swc1 f10, 48(a2)
4212
+ swc1 f12, 52(a2)
4213
+ swc1 f14, 56(a2)
4214
+ swc1 f16, 60(a2)
4215
+ // elemr 2
4216
+ lbu t1, 0(t0)
4217
+ lbu t2, 1(t0)
4218
+ lbu t3, 2(t0)
4219
+ lbu t4, 3(t0)
4220
+ lbu t5, 4(t0)
4221
+ lbu t6, 5(t0)
4222
+ lbu t7, 6(t0)
4223
+ lbu t8, 7(t0)
4224
+ addiu t1, t1, -128
4225
+ addiu t2, t2, -128
4226
+ addiu t3, t3, -128
4227
+ addiu t4, t4, -128
4228
+ addiu t5, t5, -128
4229
+ addiu t6, t6, -128
4230
+ addiu t7, t7, -128
4231
+ addiu t8, t8, -128
4232
+ mtc1 t1, f2
4233
+ mtc1 t2, f4
4234
+ mtc1 t3, f6
4235
+ mtc1 t4, f8
4236
+ mtc1 t5, f10
4237
+ mtc1 t6, f12
4238
+ mtc1 t7, f14
4239
+ mtc1 t8, f16
4240
+ cvt.s.w f2, f2
4241
+ cvt.s.w f4, f4
4242
+ cvt.s.w f6, f6
4243
+ cvt.s.w f8, f8
4244
+ cvt.s.w f10, f10
4245
+ cvt.s.w f12, f12
4246
+ cvt.s.w f14, f14
4247
+ cvt.s.w f16, f16
4248
+ lw t0, 12(a0)
4249
+ swc1 f2, 64(a2)
4250
+ swc1 f4, 68(a2)
4251
+ swc1 f6, 72(a2)
4252
+ addu t0, t0, a1
4253
+ swc1 f8, 76(a2)
4254
+ swc1 f10, 80(a2)
4255
+ swc1 f12, 84(a2)
4256
+ swc1 f14, 88(a2)
4257
+ swc1 f16, 92(a2)
4258
+ // elemr 3
4259
+ lbu t1, 0(t0)
4260
+ lbu t2, 1(t0)
4261
+ lbu t3, 2(t0)
4262
+ lbu t4, 3(t0)
4263
+ lbu t5, 4(t0)
4264
+ lbu t6, 5(t0)
4265
+ lbu t7, 6(t0)
4266
+ lbu t8, 7(t0)
4267
+ addiu t1, t1, -128
4268
+ addiu t2, t2, -128
4269
+ addiu t3, t3, -128
4270
+ addiu t4, t4, -128
4271
+ addiu t5, t5, -128
4272
+ addiu t6, t6, -128
4273
+ addiu t7, t7, -128
4274
+ addiu t8, t8, -128
4275
+ mtc1 t1, f2
4276
+ mtc1 t2, f4
4277
+ mtc1 t3, f6
4278
+ mtc1 t4, f8
4279
+ mtc1 t5, f10
4280
+ mtc1 t6, f12
4281
+ mtc1 t7, f14
4282
+ mtc1 t8, f16
4283
+ cvt.s.w f2, f2
4284
+ cvt.s.w f4, f4
4285
+ cvt.s.w f6, f6
4286
+ cvt.s.w f8, f8
4287
+ cvt.s.w f10, f10
4288
+ cvt.s.w f12, f12
4289
+ cvt.s.w f14, f14
4290
+ cvt.s.w f16, f16
4291
+ lw t0, 16(a0)
4292
+ swc1 f2, 96(a2)
4293
+ swc1 f4, 100(a2)
4294
+ swc1 f6, 104(a2)
4295
+ addu t0, t0, a1
4296
+ swc1 f8, 108(a2)
4297
+ swc1 f10, 112(a2)
4298
+ swc1 f12, 116(a2)
4299
+ swc1 f14, 120(a2)
4300
+ swc1 f16, 124(a2)
4301
+ // elemr 4
4302
+ lbu t1, 0(t0)
4303
+ lbu t2, 1(t0)
4304
+ lbu t3, 2(t0)
4305
+ lbu t4, 3(t0)
4306
+ lbu t5, 4(t0)
4307
+ lbu t6, 5(t0)
4308
+ lbu t7, 6(t0)
4309
+ lbu t8, 7(t0)
4310
+ addiu t1, t1, -128
4311
+ addiu t2, t2, -128
4312
+ addiu t3, t3, -128
4313
+ addiu t4, t4, -128
4314
+ addiu t5, t5, -128
4315
+ addiu t6, t6, -128
4316
+ addiu t7, t7, -128
4317
+ addiu t8, t8, -128
4318
+ mtc1 t1, f2
4319
+ mtc1 t2, f4
4320
+ mtc1 t3, f6
4321
+ mtc1 t4, f8
4322
+ mtc1 t5, f10
4323
+ mtc1 t6, f12
4324
+ mtc1 t7, f14
4325
+ mtc1 t8, f16
4326
+ cvt.s.w f2, f2
4327
+ cvt.s.w f4, f4
4328
+ cvt.s.w f6, f6
4329
+ cvt.s.w f8, f8
4330
+ cvt.s.w f10, f10
4331
+ cvt.s.w f12, f12
4332
+ cvt.s.w f14, f14
4333
+ cvt.s.w f16, f16
4334
+ lw t0, 20(a0)
4335
+ swc1 f2, 128(a2)
4336
+ swc1 f4, 132(a2)
4337
+ swc1 f6, 136(a2)
4338
+ addu t0, t0, a1
4339
+ swc1 f8, 140(a2)
4340
+ swc1 f10, 144(a2)
4341
+ swc1 f12, 148(a2)
4342
+ swc1 f14, 152(a2)
4343
+ swc1 f16, 156(a2)
4344
+ // elemr 5
4345
+ lbu t1, 0(t0)
4346
+ lbu t2, 1(t0)
4347
+ lbu t3, 2(t0)
4348
+ lbu t4, 3(t0)
4349
+ lbu t5, 4(t0)
4350
+ lbu t6, 5(t0)
4351
+ lbu t7, 6(t0)
4352
+ lbu t8, 7(t0)
4353
+ addiu t1, t1, -128
4354
+ addiu t2, t2, -128
4355
+ addiu t3, t3, -128
4356
+ addiu t4, t4, -128
4357
+ addiu t5, t5, -128
4358
+ addiu t6, t6, -128
4359
+ addiu t7, t7, -128
4360
+ addiu t8, t8, -128
4361
+ mtc1 t1, f2
4362
+ mtc1 t2, f4
4363
+ mtc1 t3, f6
4364
+ mtc1 t4, f8
4365
+ mtc1 t5, f10
4366
+ mtc1 t6, f12
4367
+ mtc1 t7, f14
4368
+ mtc1 t8, f16
4369
+ cvt.s.w f2, f2
4370
+ cvt.s.w f4, f4
4371
+ cvt.s.w f6, f6
4372
+ cvt.s.w f8, f8
4373
+ cvt.s.w f10, f10
4374
+ cvt.s.w f12, f12
4375
+ cvt.s.w f14, f14
4376
+ cvt.s.w f16, f16
4377
+ lw t0, 24(a0)
4378
+ swc1 f2, 160(a2)
4379
+ swc1 f4, 164(a2)
4380
+ swc1 f6, 168(a2)
4381
+ addu t0, t0, a1
4382
+ swc1 f8, 172(a2)
4383
+ swc1 f10, 176(a2)
4384
+ swc1 f12, 180(a2)
4385
+ swc1 f14, 184(a2)
4386
+ swc1 f16, 188(a2)
4387
+ // elemr 6
4388
+ lbu t1, 0(t0)
4389
+ lbu t2, 1(t0)
4390
+ lbu t3, 2(t0)
4391
+ lbu t4, 3(t0)
4392
+ lbu t5, 4(t0)
4393
+ lbu t6, 5(t0)
4394
+ lbu t7, 6(t0)
4395
+ lbu t8, 7(t0)
4396
+ addiu t1, t1, -128
4397
+ addiu t2, t2, -128
4398
+ addiu t3, t3, -128
4399
+ addiu t4, t4, -128
4400
+ addiu t5, t5, -128
4401
+ addiu t6, t6, -128
4402
+ addiu t7, t7, -128
4403
+ addiu t8, t8, -128
4404
+ mtc1 t1, f2
4405
+ mtc1 t2, f4
4406
+ mtc1 t3, f6
4407
+ mtc1 t4, f8
4408
+ mtc1 t5, f10
4409
+ mtc1 t6, f12
4410
+ mtc1 t7, f14
4411
+ mtc1 t8, f16
4412
+ cvt.s.w f2, f2
4413
+ cvt.s.w f4, f4
4414
+ cvt.s.w f6, f6
4415
+ cvt.s.w f8, f8
4416
+ cvt.s.w f10, f10
4417
+ cvt.s.w f12, f12
4418
+ cvt.s.w f14, f14
4419
+ cvt.s.w f16, f16
4420
+ lw t0, 28(a0)
4421
+ swc1 f2, 192(a2)
4422
+ swc1 f4, 196(a2)
4423
+ swc1 f6, 200(a2)
4424
+ addu t0, t0, a1
4425
+ swc1 f8, 204(a2)
4426
+ swc1 f10, 208(a2)
4427
+ swc1 f12, 212(a2)
4428
+ swc1 f14, 216(a2)
4429
+ swc1 f16, 220(a2)
4430
+ // elemr 7
4431
+ lbu t1, 0(t0)
4432
+ lbu t2, 1(t0)
4433
+ lbu t3, 2(t0)
4434
+ lbu t4, 3(t0)
4435
+ lbu t5, 4(t0)
4436
+ lbu t6, 5(t0)
4437
+ lbu t7, 6(t0)
4438
+ lbu t8, 7(t0)
4439
+ addiu t1, t1, -128
4440
+ addiu t2, t2, -128
4441
+ addiu t3, t3, -128
4442
+ addiu t4, t4, -128
4443
+ addiu t5, t5, -128
4444
+ addiu t6, t6, -128
4445
+ addiu t7, t7, -128
4446
+ addiu t8, t8, -128
4447
+ mtc1 t1, f2
4448
+ mtc1 t2, f4
4449
+ mtc1 t3, f6
4450
+ mtc1 t4, f8
4451
+ mtc1 t5, f10
4452
+ mtc1 t6, f12
4453
+ mtc1 t7, f14
4454
+ mtc1 t8, f16
4455
+ cvt.s.w f2, f2
4456
+ cvt.s.w f4, f4
4457
+ cvt.s.w f6, f6
4458
+ cvt.s.w f8, f8
4459
+ cvt.s.w f10, f10
4460
+ cvt.s.w f12, f12
4461
+ cvt.s.w f14, f14
4462
+ cvt.s.w f16, f16
4463
+ swc1 f2, 224(a2)
4464
+ swc1 f4, 228(a2)
4465
+ swc1 f6, 232(a2)
4466
+ swc1 f8, 236(a2)
4467
+ swc1 f10, 240(a2)
4468
+ swc1 f12, 244(a2)
4469
+ swc1 f14, 248(a2)
4470
+ swc1 f16, 252(a2)
4471
+
4472
+ j ra
4473
+ nop
4474
+
4475
+ END(jsimd_convsamp_float_dspr2)
4476
+
4477
+ #endif
4478
+
4479
+ /*****************************************************************************/