epeg 1.0

Files changed (504)
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/MANIFEST +5 -0
  4. data/TODO +1 -0
  5. data/epeg/.dockerignore +4 -0
  6. data/epeg/.gitignore +5 -0
  7. data/epeg/CMakeLists.txt +30 -0
  8. data/epeg/Dockerfile +23 -0
  9. data/epeg/Epeg.h +90 -0
  10. data/epeg/README.md +42 -0
  11. data/epeg/epeg_main.c +1642 -0
  12. data/epeg/epeg_private.h +85 -0
  13. data/epeg/example/.gitignore +1 -0
  14. data/epeg/example/CMakeLists.txt +20 -0
  15. data/epeg/example/example.jpg +0 -0
  16. data/epeg/example/rotatetest.c +29 -0
  17. data/epeg/example/scaletest.c +48 -0
  18. data/epeg/vendor/libjpeg-turbo-2.0.4/BUILDING.md +828 -0
  19. data/epeg/vendor/libjpeg-turbo-2.0.4/CMakeLists.txt +1420 -0
  20. data/epeg/vendor/libjpeg-turbo-2.0.4/ChangeLog.md +1494 -0
  21. data/epeg/vendor/libjpeg-turbo-2.0.4/LICENSE.md +132 -0
  22. data/epeg/vendor/libjpeg-turbo-2.0.4/README.ijg +277 -0
  23. data/epeg/vendor/libjpeg-turbo-2.0.4/README.md +356 -0
  24. data/epeg/vendor/libjpeg-turbo-2.0.4/cderror.h +137 -0
  25. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.c +145 -0
  26. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.h +157 -0
  27. data/epeg/vendor/libjpeg-turbo-2.0.4/change.log +315 -0
  28. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.1 +354 -0
  29. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.c +695 -0
  30. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/BuildPackages.cmake +182 -0
  31. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/GNUInstallDirs.cmake +416 -0
  32. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/cmake_uninstall.cmake.in +24 -0
  33. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/testclean.cmake +41 -0
  34. data/epeg/vendor/libjpeg-turbo-2.0.4/cmyk.h +61 -0
  35. data/epeg/vendor/libjpeg-turbo-2.0.4/coderules.txt +78 -0
  36. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.1 +296 -0
  37. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.c +822 -0
  38. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/annotated.html +104 -0
  39. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bc_s.png +0 -0
  40. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bdwn.png +0 -0
  41. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/classes.html +106 -0
  42. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/closed.png +0 -0
  43. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen-extra.css +3 -0
  44. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.css +1184 -0
  45. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.png +0 -0
  46. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/dynsections.js +97 -0
  47. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2blank.png +0 -0
  48. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2cl.png +0 -0
  49. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2doc.png +0 -0
  50. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderclosed.png +0 -0
  51. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderopen.png +0 -0
  52. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2lastnode.png +0 -0
  53. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2link.png +0 -0
  54. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mlastnode.png +0 -0
  55. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mnode.png +0 -0
  56. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mo.png +0 -0
  57. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2node.png +0 -0
  58. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2ns.png +0 -0
  59. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2plastnode.png +0 -0
  60. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2pnode.png +0 -0
  61. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2splitbar.png +0 -0
  62. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2vertline.png +0 -0
  63. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions.html +134 -0
  64. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions_vars.html +134 -0
  65. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/group___turbo_j_p_e_g.html +2775 -0
  66. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/index.html +90 -0
  67. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/jquery.js +8 -0
  68. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/modules.html +95 -0
  69. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_f.png +0 -0
  70. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_g.png +0 -0
  71. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_h.png +0 -0
  72. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/open.png +0 -0
  73. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.html +26 -0
  74. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.js +4 -0
  75. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.html +26 -0
  76. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.js +5 -0
  77. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.html +26 -0
  78. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.js +4 -0
  79. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.html +26 -0
  80. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.js +4 -0
  81. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.html +26 -0
  82. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.js +5 -0
  83. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.html +26 -0
  84. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.js +4 -0
  85. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.html +26 -0
  86. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.js +102 -0
  87. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.html +26 -0
  88. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.js +4 -0
  89. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.html +26 -0
  90. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.js +4 -0
  91. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.html +26 -0
  92. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.js +4 -0
  93. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.html +26 -0
  94. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.js +6 -0
  95. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/close.png +0 -0
  96. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.html +26 -0
  97. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.js +8 -0
  98. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.html +26 -0
  99. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.js +37 -0
  100. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.html +26 -0
  101. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.js +31 -0
  102. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.html +26 -0
  103. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.js +4 -0
  104. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/mag_sel.png +0 -0
  105. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/nomatches.html +12 -0
  106. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.css +271 -0
  107. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.js +809 -0
  108. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_l.png +0 -0
  109. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_m.png +0 -0
  110. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_r.png +0 -0
  111. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.html +26 -0
  112. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.js +5 -0
  113. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.html +26 -0
  114. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.js +4 -0
  115. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.html +26 -0
  116. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.js +5 -0
  117. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.html +26 -0
  118. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.js +4 -0
  119. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.html +26 -0
  120. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.js +4 -0
  121. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.html +26 -0
  122. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.js +5 -0
  123. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.html +26 -0
  124. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.js +4 -0
  125. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.html +26 -0
  126. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.js +10 -0
  127. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.html +26 -0
  128. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.js +4 -0
  129. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.html +26 -0
  130. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.js +4 -0
  131. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.html +26 -0
  132. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.js +4 -0
  133. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjregion.html +186 -0
  134. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjscalingfactor.html +148 -0
  135. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjtransform.html +212 -0
  136. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_off.png +0 -0
  137. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_on.png +0 -0
  138. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_a.png +0 -0
  139. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_b.png +0 -0
  140. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_h.png +0 -0
  141. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_s.png +0 -0
  142. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tabs.css +60 -0
  143. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen-extra.css +3 -0
  144. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen.config +16 -0
  145. data/epeg/vendor/libjpeg-turbo-2.0.4/example.txt +464 -0
  146. data/epeg/vendor/libjpeg-turbo-2.0.4/jaricom.c +157 -0
  147. data/epeg/vendor/libjpeg-turbo-2.0.4/java/CMakeLists.txt +88 -0
  148. data/epeg/vendor/libjpeg-turbo-2.0.4/java/MANIFEST.MF +2 -0
  149. data/epeg/vendor/libjpeg-turbo-2.0.4/java/README +52 -0
  150. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJBench.java +1021 -0
  151. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJExample.java +405 -0
  152. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJUnitTest.java +960 -0
  153. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-frame.html +24 -0
  154. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-noframe.html +24 -0
  155. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/constant-values.html +532 -0
  156. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/deprecated-list.html +252 -0
  157. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/help-doc.html +210 -0
  158. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index-all.html +1029 -0
  159. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index.html +71 -0
  160. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJ.html +1356 -0
  161. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html +926 -0
  162. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html +241 -0
  163. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html +1255 -0
  164. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJException.html +340 -0
  165. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html +343 -0
  166. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html +751 -0
  167. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html +421 -0
  168. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html +765 -0
  169. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-frame.html +31 -0
  170. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-summary.html +202 -0
  171. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-tree.html +160 -0
  172. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/overview-tree.html +164 -0
  173. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/package-list +1 -0
  174. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/background.gif +0 -0
  175. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/tab.gif +0 -0
  176. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar.gif +0 -0
  177. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar_end.gif +0 -0
  178. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/script.js +30 -0
  179. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/serialized-form.html +176 -0
  180. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/stylesheet.css +474 -0
  181. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJ.java +584 -0
  182. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCompressor.java +677 -0
  183. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java +76 -0
  184. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJDecompressor.java +931 -0
  185. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJException.java +78 -0
  186. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in +59 -0
  187. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-win.java.in +35 -0
  188. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java +115 -0
  189. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransform.java +227 -0
  190. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransformer.java +163 -0
  191. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/YUVImage.java +445 -0
  192. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJ.h +129 -0
  193. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJCompressor.h +101 -0
  194. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJDecompressor.h +101 -0
  195. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJTransformer.h +29 -0
  196. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapimin.c +295 -0
  197. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapistd.c +162 -0
  198. data/epeg/vendor/libjpeg-turbo-2.0.4/jcarith.c +932 -0
  199. data/epeg/vendor/libjpeg-turbo-2.0.4/jccoefct.c +449 -0
  200. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolext.c +144 -0
  201. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolor.c +710 -0
  202. data/epeg/vendor/libjpeg-turbo-2.0.4/jcdctmgr.c +721 -0
  203. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.c +1096 -0
  204. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.h +42 -0
  205. data/epeg/vendor/libjpeg-turbo-2.0.4/jcicc.c +105 -0
  206. data/epeg/vendor/libjpeg-turbo-2.0.4/jcinit.c +77 -0
  207. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmainct.c +162 -0
  208. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmarker.c +664 -0
  209. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmaster.c +640 -0
  210. data/epeg/vendor/libjpeg-turbo-2.0.4/jcomapi.c +109 -0
  211. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.h.in +73 -0
  212. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.txt +143 -0
  213. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfigint.h.in +31 -0
  214. data/epeg/vendor/libjpeg-turbo-2.0.4/jcparam.c +541 -0
  215. data/epeg/vendor/libjpeg-turbo-2.0.4/jcphuff.c +1105 -0
  216. data/epeg/vendor/libjpeg-turbo-2.0.4/jcprepct.c +351 -0
  217. data/epeg/vendor/libjpeg-turbo-2.0.4/jcsample.c +539 -0
  218. data/epeg/vendor/libjpeg-turbo-2.0.4/jcstest.c +126 -0
  219. data/epeg/vendor/libjpeg-turbo-2.0.4/jctrans.c +400 -0
  220. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapimin.c +407 -0
  221. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapistd.c +639 -0
  222. data/epeg/vendor/libjpeg-turbo-2.0.4/jdarith.c +773 -0
  223. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst-tj.c +203 -0
  224. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst.c +293 -0
  225. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc-tj.c +194 -0
  226. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc.c +295 -0
  227. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.c +692 -0
  228. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.h +82 -0
  229. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcol565.c +384 -0
  230. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolext.c +143 -0
  231. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolor.c +883 -0
  232. data/epeg/vendor/libjpeg-turbo-2.0.4/jdct.h +208 -0
  233. data/epeg/vendor/libjpeg-turbo-2.0.4/jddctmgr.c +352 -0
  234. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.c +831 -0
  235. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.h +238 -0
  236. data/epeg/vendor/libjpeg-turbo-2.0.4/jdicc.c +171 -0
  237. data/epeg/vendor/libjpeg-turbo-2.0.4/jdinput.c +408 -0
  238. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.c +460 -0
  239. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.h +71 -0
  240. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmarker.c +1377 -0
  241. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.c +737 -0
  242. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.h +28 -0
  243. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmerge.c +617 -0
  244. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrg565.c +354 -0
  245. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrgext.c +184 -0
  246. data/epeg/vendor/libjpeg-turbo-2.0.4/jdphuff.c +687 -0
  247. data/epeg/vendor/libjpeg-turbo-2.0.4/jdpostct.c +294 -0
  248. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.c +518 -0
  249. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.h +50 -0
  250. data/epeg/vendor/libjpeg-turbo-2.0.4/jdtrans.c +155 -0
  251. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.c +251 -0
  252. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.h +316 -0
  253. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctflt.c +169 -0
  254. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctfst.c +227 -0
  255. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctint.c +288 -0
  256. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctflt.c +240 -0
  257. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctfst.c +371 -0
  258. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctint.c +2627 -0
  259. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctred.c +409 -0
  260. data/epeg/vendor/libjpeg-turbo-2.0.4/jinclude.h +88 -0
  261. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemmgr.c +1179 -0
  262. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemnobs.c +115 -0
  263. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemsys.h +178 -0
  264. data/epeg/vendor/libjpeg-turbo-2.0.4/jmorecfg.h +421 -0
  265. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeg_nbits_table.h +4098 -0
  266. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegcomp.h +31 -0
  267. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegint.h +368 -0
  268. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeglib.h +1132 -0
  269. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.1 +295 -0
  270. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.c +601 -0
  271. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant1.c +859 -0
  272. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant2.c +1285 -0
  273. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd.h +117 -0
  274. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd_none.c +418 -0
  275. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimddct.h +70 -0
  276. data/epeg/vendor/libjpeg-turbo-2.0.4/jstdhuff.c +143 -0
  277. data/epeg/vendor/libjpeg-turbo-2.0.4/jutils.c +133 -0
  278. data/epeg/vendor/libjpeg-turbo-2.0.4/jversion.h +52 -0
  279. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.map.in +11 -0
  280. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.txt +3144 -0
  281. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/CMakeLists.txt +1 -0
  282. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.c +275 -0
  283. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.h +57 -0
  284. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5cmp.c +59 -0
  285. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5hl.c +125 -0
  286. data/epeg/vendor/libjpeg-turbo-2.0.4/rdbmp.c +689 -0
  287. data/epeg/vendor/libjpeg-turbo-2.0.4/rdcolmap.c +254 -0
  288. data/epeg/vendor/libjpeg-turbo-2.0.4/rdgif.c +39 -0
  289. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.1 +63 -0
  290. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.c +510 -0
  291. data/epeg/vendor/libjpeg-turbo-2.0.4/rdppm.c +766 -0
  292. data/epeg/vendor/libjpeg-turbo-2.0.4/rdrle.c +389 -0
  293. data/epeg/vendor/libjpeg-turbo-2.0.4/rdswitch.c +424 -0
  294. data/epeg/vendor/libjpeg-turbo-2.0.4/rdtarga.c +509 -0
  295. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Distribution.xml.in +24 -0
  296. data/epeg/vendor/libjpeg-turbo-2.0.4/release/License.rtf +20 -0
  297. data/epeg/vendor/libjpeg-turbo-2.0.4/release/ReadMe.txt +5 -0
  298. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Welcome.rtf +17 -0
  299. data/epeg/vendor/libjpeg-turbo-2.0.4/release/deb-control.in +31 -0
  300. data/epeg/vendor/libjpeg-turbo-2.0.4/release/installer.nsi.in +191 -0
  301. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libjpeg.pc.in +10 -0
  302. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libturbojpeg.pc.in +10 -0
  303. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makecygwinpkg.in +66 -0
  304. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makedpkg.in +115 -0
  305. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makemacpkg.in +284 -0
  306. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makerpm.in +30 -0
  307. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makesrpm.in +48 -0
  308. data/epeg/vendor/libjpeg-turbo-2.0.4/release/maketarball.in +51 -0
  309. data/epeg/vendor/libjpeg-turbo-2.0.4/release/rpm.spec.in +221 -0
  310. data/epeg/vendor/libjpeg-turbo-2.0.4/release/uninstall.in +113 -0
  311. data/epeg/vendor/libjpeg-turbo-2.0.4/sharedlib/CMakeLists.txt +99 -0
  312. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/CMakeLists.txt +385 -0
  313. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd.c +721 -0
  314. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd_neon.S +2878 -0
  315. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd.c +798 -0
  316. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd_neon.S +3433 -0
  317. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/gas-preprocessor.in +1 -0
  318. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-avx2.asm +578 -0
  319. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-mmx.asm +476 -0
  320. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-sse2.asm +503 -0
  321. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-avx2.asm +121 -0
  322. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-mmx.asm +121 -0
  323. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-sse2.asm +120 -0
  324. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-avx2.asm +113 -0
  325. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-mmx.asm +113 -0
  326. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-sse2.asm +112 -0
  327. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-avx2.asm +457 -0
  328. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-mmx.asm +355 -0
  329. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-sse2.asm +382 -0
  330. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jchuff-sse2.asm +424 -0
  331. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcphuff-sse2.asm +660 -0
  332. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-avx2.asm +388 -0
  333. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-mmx.asm +324 -0
  334. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-sse2.asm +351 -0
  335. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-avx2.asm +515 -0
  336. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-mmx.asm +404 -0
  337. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-sse2.asm +458 -0
  338. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-avx2.asm +118 -0
  339. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-mmx.asm +117 -0
  340. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-sse2.asm +117 -0
  341. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-avx2.asm +136 -0
  342. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-mmx.asm +123 -0
  343. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-sse2.asm +135 -0
  344. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-avx2.asm +575 -0
  345. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-mmx.asm +460 -0
  346. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-sse2.asm +517 -0
  347. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-avx2.asm +760 -0
  348. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-mmx.asm +731 -0
  349. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-sse2.asm +724 -0
  350. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-3dn.asm +318 -0
  351. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-sse.asm +369 -0
  352. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-mmx.asm +395 -0
  353. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-sse2.asm +403 -0
  354. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-avx2.asm +331 -0
  355. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-mmx.asm +620 -0
  356. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-sse2.asm +633 -0
  357. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-3dn.asm +451 -0
  358. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse.asm +571 -0
  359. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse2.asm +497 -0
  360. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-mmx.asm +499 -0
  361. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-sse2.asm +501 -0
  362. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-avx2.asm +453 -0
  363. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-mmx.asm +851 -0
  364. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-sse2.asm +858 -0
  365. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-mmx.asm +704 -0
  366. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-sse2.asm +592 -0
  367. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-3dn.asm +230 -0
  368. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-mmx.asm +276 -0
  369. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-sse.asm +208 -0
  370. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquantf-sse2.asm +168 -0
  371. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-avx2.asm +188 -0
  372. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-sse2.asm +201 -0
  373. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimd.c +1253 -0
  374. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimdcpu.asm +135 -0
  375. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/jsimd.h +1083 -0
  376. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolext-mmi.c +483 -0
  377. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolor-mmi.c +148 -0
  378. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample-mmi.c +100 -0
  379. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample.h +28 -0
  380. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolext-mmi.c +424 -0
  381. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolor-mmi.c +139 -0
  382. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdsample-mmi.c +245 -0
  383. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jfdctint-mmi.c +398 -0
  384. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jidctint-mmi.c +571 -0
  385. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jquanti-mmi.c +130 -0
  386. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd.c +610 -0
  387. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd_mmi.h +57 -0
  388. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/loongson-mmintrin.h +1324 -0
  389. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd.c +1123 -0
  390. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2.S +4479 -0
  391. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2_asm.h +292 -0
  392. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jcolsamp.inc +135 -0
  393. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jdct.inc +31 -0
  394. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jpeg_nbits_table.inc +4097 -0
  395. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc +93 -0
  396. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc.h +131 -0
  397. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdext.inc +479 -0
  398. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolext-altivec.c +269 -0
  399. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolor-altivec.c +116 -0
  400. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgray-altivec.c +111 -0
  401. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgryext-altivec.c +228 -0
  402. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample-altivec.c +159 -0
  403. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample.h +28 -0
  404. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolext-altivec.c +276 -0
  405. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolor-altivec.c +106 -0
  406. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmerge-altivec.c +130 -0
  407. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmrgext-altivec.c +329 -0
  408. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdsample-altivec.c +400 -0
  409. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctfst-altivec.c +154 -0
  410. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctint-altivec.c +258 -0
  411. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctfst-altivec.c +255 -0
  412. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctint-altivec.c +357 -0
  413. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jquanti-altivec.c +250 -0
  414. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd.c +872 -0
  415. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd_altivec.h +98 -0
  416. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-avx2.asm +558 -0
  417. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-sse2.asm +483 -0
  418. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-avx2.asm +121 -0
  419. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-sse2.asm +120 -0
  420. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-avx2.asm +113 -0
  421. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-sse2.asm +112 -0
  422. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-avx2.asm +437 -0
  423. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-sse2.asm +362 -0
  424. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jchuff-sse2.asm +346 -0
  425. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcphuff-sse2.asm +637 -0
  426. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-avx2.asm +366 -0
  427. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-sse2.asm +329 -0
  428. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-avx2.asm +495 -0
  429. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-sse2.asm +438 -0
  430. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-avx2.asm +118 -0
  431. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-sse2.asm +117 -0
  432. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-avx2.asm +136 -0
  433. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-sse2.asm +135 -0
  434. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-avx2.asm +593 -0
  435. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-sse2.asm +535 -0
  436. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-avx2.asm +695 -0
  437. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-sse2.asm +664 -0
  438. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctflt-sse.asm +355 -0
  439. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctfst-sse2.asm +389 -0
  440. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-avx2.asm +320 -0
  441. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-sse2.asm +619 -0
  442. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctflt-sse2.asm +481 -0
  443. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctfst-sse2.asm +490 -0
  444. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-avx2.asm +417 -0
  445. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-sse2.asm +846 -0
  446. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctred-sse2.asm +573 -0
  447. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquantf-sse2.asm +154 -0
  448. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-avx2.asm +162 -0
  449. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-sse2.asm +187 -0
  450. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimd.c +1076 -0
  451. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimdcpu.asm +86 -0
  452. data/epeg/vendor/libjpeg-turbo-2.0.4/structure.txt +904 -0
  453. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.bmp +0 -0
  454. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.txt +25 -0
  455. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test.scan +5 -0
  456. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc +0 -0
  457. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc.txt +20 -0
  458. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc +0 -0
  459. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc.txt +20 -0
  460. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgari.jpg +0 -0
  461. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgint.jpg +0 -0
  462. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.jpg +0 -0
  463. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.ppm +4 -0
  464. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig12.jpg +0 -0
  465. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_5674_0098.bmp +0 -0
  466. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6434_0018a.bmp +0 -0
  467. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6548_0026a.bmp +0 -0
  468. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbench.c +1031 -0
  469. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.in +256 -0
  470. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.java.in +215 -0
  471. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexample.c +396 -0
  472. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.in +149 -0
  473. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.java.in +151 -0
  474. data/epeg/vendor/libjpeg-turbo-2.0.4/tjunittest.c +931 -0
  475. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.c +70 -0
  476. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.h +47 -0
  477. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.c +1628 -0
  478. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.h +210 -0
  479. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-jni.c +1246 -0
  480. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile +65 -0
  481. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile.jni +101 -0
  482. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.c +2152 -0
  483. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.h +1744 -0
  484. data/epeg/vendor/libjpeg-turbo-2.0.4/usage.txt +635 -0
  485. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jconfig.h.in +34 -0
  486. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62-memsrcdst.def +108 -0
  487. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62.def +106 -0
  488. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7-memsrcdst.def +110 -0
  489. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7.def +108 -0
  490. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg8.def +111 -0
  491. data/epeg/vendor/libjpeg-turbo-2.0.4/wizard.txt +212 -0
  492. data/epeg/vendor/libjpeg-turbo-2.0.4/wrbmp.c +558 -0
  493. data/epeg/vendor/libjpeg-turbo-2.0.4/wrgif.c +413 -0
  494. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.1 +103 -0
  495. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.c +591 -0
  496. data/epeg/vendor/libjpeg-turbo-2.0.4/wrppm.c +365 -0
  497. data/epeg/vendor/libjpeg-turbo-2.0.4/wrrle.c +309 -0
  498. data/epeg/vendor/libjpeg-turbo-2.0.4/wrtarga.c +261 -0
  499. data/epeg.c +131 -0
  500. data/epeg.gemspec +18 -0
  501. data/extconf.rb +80 -0
  502. data/test.jpg +0 -0
  503. data/test.rb +42 -0
  504. metadata +546 -0
data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd_neon.S
@@ -0,0 +1,2878 @@
+ /*
+ * ARMv7 NEON optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ * All Rights Reserved.
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+ #if defined(__linux__) && defined(__ELF__)
+ .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
+ #endif
+
+ .text
+ .fpu neon
+ .arch armv7a
+ .object_arch armv4
+ .arm
+ .syntax unified
+
+
+ #define RESPECT_STRICT_ALIGNMENT 1
+
+
+ /*****************************************************************************/
+
+ /* Supplementary macro for setting function attributes */
+ .macro asm_function fname
+ #ifdef __APPLE__
+ .private_extern _\fname
+ .globl _\fname
+ _\fname:
+ #else
+ .global \fname
+ #ifdef __ELF__
+ .hidden \fname
+ .type \fname, %function
+ #endif
+ \fname:
+ #endif
+ .endm
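For reference, on a Linux/ELF build (where __APPLE__ is undefined and __ELF__ is defined) the `asm_function jsimd_idct_islow_neon` invocation further down expands to the following, which gives the symbol hidden visibility (so it is not exported from the shared library) and marks it as a function for the linker:

    .global jsimd_idct_islow_neon
    .hidden jsimd_idct_islow_neon
    .type jsimd_idct_islow_neon, %function
    jsimd_idct_islow_neon: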
+
+ /* Transpose a block of 4x4 coefficients in four 64-bit registers */
+ .macro transpose_4x4 x0, x1, x2, x3
+ vtrn.16 \x0, \x1
+ vtrn.16 \x2, \x3
+ vtrn.32 \x0, \x2
+ vtrn.32 \x1, \x3
+ .endm
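The macro above is the standard NEON in-register 4x4 transpose: the two vtrn.16 steps swap elements within each 2x2 sub-block, and the two vtrn.32 steps swap the sub-blocks themselves. A minimal C intrinsics sketch of the same sequence (illustrative only, not part of this file):

    #include <arm_neon.h>

    /* Transpose a 4x4 block of int16 values held in four 64-bit vectors,
     * mirroring transpose_4x4: vtrn.16 within 2x2 sub-blocks, then
     * vtrn.32 across them. */
    static inline void transpose_4x4_c(int16x4_t *x0, int16x4_t *x1,
                                       int16x4_t *x2, int16x4_t *x3)
    {
      int16x4x2_t t01 = vtrn_s16(*x0, *x1);                        /* vtrn.16 x0, x1 */
      int16x4x2_t t23 = vtrn_s16(*x2, *x3);                        /* vtrn.16 x2, x3 */
      int32x2x2_t t02 = vtrn_s32(vreinterpret_s32_s16(t01.val[0]),
                                 vreinterpret_s32_s16(t23.val[0])); /* vtrn.32 x0, x2 */
      int32x2x2_t t13 = vtrn_s32(vreinterpret_s32_s16(t01.val[1]),
                                 vreinterpret_s32_s16(t23.val[1])); /* vtrn.32 x1, x3 */
      *x0 = vreinterpret_s16_s32(t02.val[0]);
      *x1 = vreinterpret_s16_s32(t13.val[0]);
      *x2 = vreinterpret_s16_s32(t02.val[1]);
      *x3 = vreinterpret_s16_s32(t13.val[1]);
    }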
+
+
+ #define CENTERJSAMPLE 128
+
+ /*****************************************************************************/
+
+ /*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+ #define FIX_0_298631336 (2446)
+ #define FIX_0_390180644 (3196)
+ #define FIX_0_541196100 (4433)
+ #define FIX_0_765366865 (6270)
+ #define FIX_0_899976223 (7373)
+ #define FIX_1_175875602 (9633)
+ #define FIX_1_501321110 (12299)
+ #define FIX_1_847759065 (15137)
+ #define FIX_1_961570560 (16069)
+ #define FIX_2_053119869 (16819)
+ #define FIX_2_562915447 (20995)
+ #define FIX_3_072711026 (25172)
+
+ #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
+ #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
+ #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
+ #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
+ #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
+ #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
+ #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
+ #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
+
+ /*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
+ */
+ #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+ JLONG q1, q2, q3, q4, q5, q6, q7; \
+ JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
+ \
+ /* 1-D iDCT input data */ \
+ row0 = xrow0; \
+ row1 = xrow1; \
+ row2 = xrow2; \
+ row3 = xrow3; \
+ row4 = xrow4; \
+ row5 = xrow5; \
+ row6 = xrow6; \
+ row7 = xrow7; \
+ \
+ q5 = row7 + row3; \
+ q4 = row5 + row1; \
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+ MULTIPLY(q4, FIX_1_175875602); \
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+ q4 = q6; \
+ q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+ /* now we can use q1 (reloadable constants have been used up) */ \
+ q1 = q3 + q2; \
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+ MULTIPLY(row1, -FIX_0_899976223); \
+ q5 = q7; \
+ q1 = q1 + q6; \
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+ \
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+ tmp11_plus_tmp2 = q1; \
+ row1 = 0; \
+ \
+ q1 = q1 - q6; \
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+ MULTIPLY(row3, -FIX_2_562915447); \
+ q1 = q1 - q6; \
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+ MULTIPLY(row6, FIX_0_541196100); \
+ q3 = q3 - q2; \
+ \
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+ tmp11_minus_tmp2 = q1; \
+ \
+ q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+ q2 = q1 + q6; \
+ q1 = q1 - q6; \
+ \
+ /* pick up the results */ \
+ tmp0 = q4; \
+ tmp1 = q5; \
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+ tmp3 = q7; \
+ tmp10 = q2; \
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+ tmp12 = q3; \
+ tmp13 = q1; \
+ }
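Each FIX_a_bcd constant above is the multiplier a.bcd in 13-bit fixed point, i.e. round(a.bcd * 2^13); 13 is CONST_BITS in libjpeg's ISLOW iDCT, and the *_MINUS_* / *_PLUS_* combinations are precomputed so that shared terms of the reference algorithm fold into single multiply-accumulate steps against the preloaded scalar lanes. A quick standalone C check of the scaling (illustrative, not part of the file):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
      /* round(x * 2^13) reproduces the FIX_* values defined above */
      printf("%ld\n", lround(0.298631336 * 8192));   /* 2446  == FIX_0_298631336 */
      printf("%ld\n", lround(0.541196100 * 8192));   /* 4433  == FIX_0_541196100 */
      printf("%ld\n", lround(1.175875602 * 8192));   /* 9633  == FIX_1_175875602 */
      printf("%ld\n", lround(3.072711026 * 8192));   /* 25172 == FIX_3_072711026 */
      return 0;
    }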
+
+ #define XFIX_0_899976223 d0[0]
+ #define XFIX_0_541196100 d0[1]
+ #define XFIX_2_562915447 d0[2]
+ #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
+ #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
+ #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
+ #define XFIX_0_541196100_PLUS_0_765366865 d1[2]
+ #define XFIX_1_175875602 d1[3]
+ #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
+ #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
+ #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
+ #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
+
+ .balign 16
+ jsimd_idct_islow_neon_consts:
+ .short FIX_0_899976223 /* d0[0] */
+ .short FIX_0_541196100 /* d0[1] */
+ .short FIX_2_562915447 /* d0[2] */
+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
+ .short FIX_1_175875602 /* d1[3] */
+ /* reloadable constants */
+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
+
+ asm_function jsimd_idct_islow_neon
+
+ DCT_TABLE .req r0
+ COEF_BLOCK .req r1
+ OUTPUT_BUF .req r2
+ OUTPUT_COL .req r3
+ TMP1 .req r0
+ TMP2 .req r1
+ TMP3 .req r2
+ TMP4 .req ip
+
+ ROW0L .req d16
+ ROW0R .req d17
+ ROW1L .req d18
+ ROW1R .req d19
+ ROW2L .req d20
+ ROW2R .req d21
+ ROW3L .req d22
+ ROW3R .req d23
+ ROW4L .req d24
+ ROW4R .req d25
+ ROW5L .req d26
+ ROW5R .req d27
+ ROW6L .req d28
+ ROW6R .req d29
+ ROW7L .req d30
+ ROW7R .req d31
+
+ /* Load and dequantize coefficients into NEON registers
+ * with the following allocation:
+ * 0 1 2 3 | 4 5 6 7
+ * ---------+--------
+ * 0 | d16 | d17 ( q8 )
+ * 1 | d18 | d19 ( q9 )
+ * 2 | d20 | d21 ( q10 )
+ * 3 | d22 | d23 ( q11 )
+ * 4 | d24 | d25 ( q12 )
+ * 5 | d26 | d27 ( q13 )
+ * 6 | d28 | d29 ( q14 )
+ * 7 | d30 | d31 ( q15 )
+ */
+ adr ip, jsimd_idct_islow_neon_consts
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+ vmul.s16 q8, q8, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q9, q9, q1
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+ vmul.s16 q10, q10, q2
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+ vmul.s16 q11, q11, q3
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+ vmul.s16 q12, q12, q0
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+ vmul.s16 q14, q14, q2
+ vmul.s16 q13, q13, q1
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
+ add ip, ip, #16
+ vmul.s16 q15, q15, q3
+ vpush {d8-d15} /* save NEON registers */
+ /* 1-D IDCT, pass 1, left 4x8 half */
+ vadd.s16 d4, ROW7L, ROW3L
+ vadd.s16 d5, ROW5L, ROW1L
+ vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d5, XFIX_1_175875602
+ vmull.s16 q7, d4, XFIX_1_175875602
+ /* Check for the zero coefficients in the right 4x8 half */
+ push {r4, r5}
+ vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW4L
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+ orr r0, r4, r5
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5L, XFIX_2_562915447
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ orr r0, r0, r4
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q2
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ orr r0, r0, r4
+ vmlsl.s16 q7, ROW7L, XFIX_0_899976223
+ orr r0, r0, r5
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1L, q1, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+ orr r0, r0, r4
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ orr r0, r0, r5
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+ vmlal.s16 q6, ROW6L, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ orr r0, r0, r4
+ vrshrn.s32 ROW6L, q1, #11
+ orr r0, r0, r5
+ vadd.s32 q1, q3, q5
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW4L
+ orr r0, r0, r4
+ vrshrn.s32 ROW2L, q1, #11
+ orr r0, r0, r5
+ vrshrn.s32 ROW5L, q3, #11
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+ orr r0, r0, r4
+ vadd.s32 q2, q5, q6
+ orrs r0, r0, r5
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ orr r0, r4, r5
+ vsub.s32 q3, q1, q4
+ pop {r4, r5}
+ vrshrn.s32 ROW7L, q2, #11
+ vrshrn.s32 ROW3L, q5, #11
+ vrshrn.s32 ROW0L, q6, #11
+ vrshrn.s32 ROW4L, q3, #11
+
+ beq 3f /* Go to do some special handling for the sparse
+ right 4x8 half */
+
+ /* 1-D IDCT, pass 1, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vadd.s16 d10, ROW7R, ROW3R
+ vadd.s16 d8, ROW5R, ROW1R
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, d8, XFIX_1_175875602
+ vtrn.16 ROW2L, ROW3L
+ vmull.s16 q7, d10, XFIX_1_175875602
+ vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
+ vtrn.16 ROW0L, ROW1L
+ vsubl.s16 q3, ROW0R, ROW4R
+ vmull.s16 q2, ROW2R, XFIX_0_541196100
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vtrn.16 ROW4L, ROW5L
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+ vtrn.32 ROW1L, ROW3L
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1R, XFIX_0_899976223
+ vtrn.32 ROW4L, ROW6L
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vtrn.32 ROW0L, ROW2L
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+ vrshrn.s32 ROW1R, q1, #11
+ vtrn.32 ROW5L, ROW7L
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW3R, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vrshrn.s32 ROW6R, q1, #11
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0R, ROW4R
+ vrshrn.s32 ROW2R, q1, #11
+ vrshrn.s32 ROW5R, q3, #11
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vrshrn.s32 ROW7R, q2, #11
+ vrshrn.s32 ROW3R, q5, #11
+ vrshrn.s32 ROW0R, q6, #11
+ vrshrn.s32 ROW4R, q3, #11
+ /* Transpose right 4x8 half */
+ vtrn.16 ROW6R, ROW7R
+ vtrn.16 ROW2R, ROW3R
+ vtrn.16 ROW0R, ROW1R
+ vtrn.16 ROW4R, ROW5R
+ vtrn.32 ROW1R, ROW3R
+ vtrn.32 ROW4R, ROW6R
+ vtrn.32 ROW0R, ROW2R
+ vtrn.32 ROW5R, ROW7R
+
+ 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
+ vmov q4, q6
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2, right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5R, XFIX_1_175875602
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
+ vmull.s16 q7, ROW7R, XFIX_1_175875602
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+ vmov q4, q6
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
+ vshl.s32 q3, q3, #13
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vadd.s32 q1, q1, q6
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vshl.s32 q5, q5, #13
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW7R, q2, #16
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
+ vshrn.s32 ROW4R, q3, #16
+
+ 2: /* Descale to 8-bit and range limit */
+ vqrshrn.s16 d16, q8, #2
+ vqrshrn.s16 d17, q9, #2
+ vqrshrn.s16 d18, q10, #2
+ vqrshrn.s16 d19, q11, #2
+ vpop {d8-d15} /* restore NEON registers */
+ vqrshrn.s16 d20, q12, #2
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+ vtrn.16 q8, q9
+ vqrshrn.s16 d21, q13, #2
+ vqrshrn.s16 d22, q14, #2
+ vmov.u8 q0, #(CENTERJSAMPLE)
+ vqrshrn.s16 d23, q15, #2
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vadd.u8 q8, q8, q0
+ vadd.u8 q9, q9, q0
+ vtrn.16 q10, q11
+ /* Store results to the output buffer */
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d16}, [TMP1]
+ vtrn.8 d20, d21
+ vst1.8 {d17}, [TMP2]
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ vst1.8 {d18}, [TMP1]
+ vadd.u8 q10, q10, q0
+ vst1.8 {d19}, [TMP2]
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+ add TMP1, TMP1, OUTPUT_COL
+ add TMP2, TMP2, OUTPUT_COL
+ add TMP3, TMP3, OUTPUT_COL
+ add TMP4, TMP4, OUTPUT_COL
+ vtrn.8 d22, d23
+ vst1.8 {d20}, [TMP1]
+ vadd.u8 q11, q11, q0
+ vst1.8 {d21}, [TMP2]
+ vst1.8 {d22}, [TMP3]
+ vst1.8 {d23}, [TMP4]
+ bx lr
+
+ 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+ /* Transpose left 4x8 half */
+ vtrn.16 ROW6L, ROW7L
+ vtrn.16 ROW2L, ROW3L
+ vtrn.16 ROW0L, ROW1L
+ vtrn.16 ROW4L, ROW5L
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
+ vtrn.32 ROW1L, ROW3L
+ vtrn.32 ROW4L, ROW6L
+ vtrn.32 ROW0L, ROW2L
+ vtrn.32 ROW5L, ROW7L
+
+ cmp r0, #0
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
+ pass */
+
+ /* Only row 0 is non-zero for the right 4x8 half */
+ vdup.s16 ROW1R, ROW0R[1]
+ vdup.s16 ROW2R, ROW0R[2]
+ vdup.s16 ROW3R, ROW0R[3]
+ vdup.s16 ROW4R, ROW0R[0]
+ vdup.s16 ROW5R, ROW0R[1]
+ vdup.s16 ROW6R, ROW0R[2]
+ vdup.s16 ROW7R, ROW0R[3]
+ vdup.s16 ROW0R, ROW0R[0]
+ b 1b /* Go to 'normal' second pass */
+
+ 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW1L, XFIX_1_175875602
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW3L, XFIX_1_175875602
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
+ vshll.s16 q3, ROW0L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
+ vshrn.s32 ROW1L, q1, #16
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW0L, #13
+ vshrn.s32 ROW2L, q1, #16
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
+ vadd.s32 q6, q2, q7
+ vsub.s32 q2, q2, q7
+ vadd.s32 q5, q1, q4
+ vsub.s32 q3, q1, q4
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
+ vshrn.s32 ROW3L, q5, #16
+ vshrn.s32 ROW0L, q6, #16
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
+ vmull.s16 q6, ROW5L, XFIX_1_175875602
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+ vmull.s16 q7, ROW7L, XFIX_1_175875602
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+ vmull.s16 q2, ROW6L, XFIX_0_541196100
+ vshll.s16 q3, ROW4L, #13
+ vmov q4, q6
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223
+ vadd.s32 q1, q3, q2
+ vmov q5, q7
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+ vadd.s32 q1, q1, q6
+ vadd.s32 q6, q6, q6
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
+ vsub.s32 q1, q1, q6
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+ vsub.s32 q3, q3, q2
+ vshrn.s32 ROW6R, q1, #16
+ vadd.s32 q1, q3, q5
+ vsub.s32 q3, q3, q5
+ vshll.s16 q5, ROW4L, #13
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
+ vshrn.s32 ROW5R, q3, #16
+ vadd.s32 q2, q5, q6
+ vsub.s32 q1, q5, q6
643
+ vadd.s32 q6, q2, q7
644
+ vsub.s32 q2, q2, q7
645
+ vadd.s32 q5, q1, q4
646
+ vsub.s32 q3, q1, q4
647
+ vshrn.s32 ROW7R, q2, #16
648
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
649
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
650
+ vshrn.s32 ROW4R, q3, #16
651
+ b 2b /* Go to epilogue */
652
+
653
+ .unreq DCT_TABLE
654
+ .unreq COEF_BLOCK
655
+ .unreq OUTPUT_BUF
656
+ .unreq OUTPUT_COL
657
+ .unreq TMP1
658
+ .unreq TMP2
659
+ .unreq TMP3
660
+ .unreq TMP4
661
+
662
+ .unreq ROW0L
663
+ .unreq ROW0R
664
+ .unreq ROW1L
665
+ .unreq ROW1R
666
+ .unreq ROW2L
667
+ .unreq ROW2R
668
+ .unreq ROW3L
669
+ .unreq ROW3R
670
+ .unreq ROW4L
671
+ .unreq ROW4R
672
+ .unreq ROW5L
673
+ .unreq ROW5R
674
+ .unreq ROW6L
675
+ .unreq ROW6R
676
+ .unreq ROW7L
677
+ .unreq ROW7R
678
+
679
+
680
+ /*****************************************************************************/
681
+
682
+ /*
683
+ * jsimd_idct_ifast_neon
684
+ *
685
+ * This function contains a fast, but less accurate, integer implementation of
686
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
687
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
688
+ * function from jidctfst.c.
689
+ *
690
+ * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
691
+ * But in the ARM NEON case some extra additions are required because the VQDMULH
692
+ * instruction can't handle constants larger than 1. So expressions
693
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
694
+ * which introduces an extra addition. Overall, there are 6 extra additions
695
+ * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
696
+ */
697
+
698
+ #define XFIX_1_082392200 d0[0]
699
+ #define XFIX_1_414213562 d0[1]
700
+ #define XFIX_1_847759065 d0[2]
701
+ #define XFIX_2_613125930 d0[3]
702
+
703
+ .balign 16
704
+ jsimd_idct_ifast_neon_consts:
705
+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
706
+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
707
+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
708
+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
709
+
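+ /* A scalar sketch of the VQDMULH constant trick described above (an
+ assumption based on jidctfst.c's 8-bit FIX() constants, not part of the
+ original source). VQDMULH computes (a * b * 2) >> 16, i.e. a Q15
+ multiply, so a multiplier like 1.082392200 (277/256 in jidctfst.c) is
+ split into 1 + 21/256, with the fractional part encoded in Q15 as
+ (277 - 256) * 128 = 2688:
+
+ #include <stdint.h>
+
+ static int16_t vqdmulh_s16(int16_t a, int16_t b) {
+   // arithmetic shift assumed; saturation at a = b = INT16_MIN omitted
+   return (int16_t)(((int32_t)a * b * 2) >> 16);
+ }
+
+ static int16_t mul_1_082392200(int16_t x) {
+   // x * 1.082392200 ~= x + x * 21/256
+   return (int16_t)(x + vqdmulh_s16(x, 277 * 128 - 256 * 128));
+ }
+ */
+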
710
+ asm_function jsimd_idct_ifast_neon
711
+
712
+ DCT_TABLE .req r0
713
+ COEF_BLOCK .req r1
714
+ OUTPUT_BUF .req r2
715
+ OUTPUT_COL .req r3
716
+ TMP1 .req r0
717
+ TMP2 .req r1
718
+ TMP3 .req r2
719
+ TMP4 .req ip
720
+
721
+ /* Load and dequantize coefficients into NEON registers
722
+ * with the following allocation:
723
+ * 0 1 2 3 | 4 5 6 7
724
+ * ---------+--------
725
+ * 0 | d16 | d17 ( q8 )
726
+ * 1 | d18 | d19 ( q9 )
727
+ * 2 | d20 | d21 ( q10 )
728
+ * 3 | d22 | d23 ( q11 )
729
+ * 4 | d24 | d25 ( q12 )
730
+ * 5 | d26 | d27 ( q13 )
731
+ * 6 | d28 | d29 ( q14 )
732
+ * 7 | d30 | d31 ( q15 )
733
+ */
734
+ adr ip, jsimd_idct_ifast_neon_consts
735
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
736
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
737
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
738
+ vmul.s16 q8, q8, q0
739
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
740
+ vmul.s16 q9, q9, q1
741
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
742
+ vmul.s16 q10, q10, q2
743
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
744
+ vmul.s16 q11, q11, q3
745
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
746
+ vmul.s16 q12, q12, q0
747
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
748
+ vmul.s16 q14, q14, q2
749
+ vmul.s16 q13, q13, q1
750
+ vld1.16 {d0}, [ip, :64] /* load constants */
751
+ vmul.s16 q15, q15, q3
752
+ vpush {d8-d13} /* save NEON registers */
753
+ /* 1-D IDCT, pass 1 */
754
+ vsub.s16 q2, q10, q14
755
+ vadd.s16 q14, q10, q14
756
+ vsub.s16 q1, q11, q13
757
+ vadd.s16 q13, q11, q13
758
+ vsub.s16 q5, q9, q15
759
+ vadd.s16 q15, q9, q15
760
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
761
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
762
+ vadd.s16 q3, q1, q1
763
+ vsub.s16 q1, q5, q1
764
+ vadd.s16 q10, q2, q4
765
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
766
+ vsub.s16 q2, q15, q13
767
+ vadd.s16 q3, q3, q6
768
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
769
+ vadd.s16 q1, q1, q4
770
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
771
+ vsub.s16 q10, q10, q14
772
+ vadd.s16 q2, q2, q6
773
+ vsub.s16 q6, q8, q12
774
+ vadd.s16 q12, q8, q12
775
+ vadd.s16 q9, q5, q4
776
+ vadd.s16 q5, q6, q10
777
+ vsub.s16 q10, q6, q10
778
+ vadd.s16 q6, q15, q13
779
+ vadd.s16 q8, q12, q14
780
+ vsub.s16 q3, q6, q3
781
+ vsub.s16 q12, q12, q14
782
+ vsub.s16 q3, q3, q1
783
+ vsub.s16 q1, q9, q1
784
+ vadd.s16 q2, q3, q2
785
+ vsub.s16 q15, q8, q6
786
+ vadd.s16 q1, q1, q2
787
+ vadd.s16 q8, q8, q6
788
+ vadd.s16 q14, q5, q3
789
+ vsub.s16 q9, q5, q3
790
+ vsub.s16 q13, q10, q2
791
+ vadd.s16 q10, q10, q2
792
+ /* Transpose */
793
+ vtrn.16 q8, q9
794
+ vsub.s16 q11, q12, q1
795
+ vtrn.16 q14, q15
796
+ vadd.s16 q12, q12, q1
797
+ vtrn.16 q10, q11
798
+ vtrn.16 q12, q13
799
+ vtrn.32 q9, q11
800
+ vtrn.32 q12, q14
801
+ vtrn.32 q8, q10
802
+ vtrn.32 q13, q15
803
+ vswp d28, d21
804
+ vswp d26, d19
805
+ /* 1-D IDCT, pass 2 */
806
+ vsub.s16 q2, q10, q14
807
+ vswp d30, d23
808
+ vadd.s16 q14, q10, q14
809
+ vswp d24, d17
810
+ vsub.s16 q1, q11, q13
811
+ vadd.s16 q13, q11, q13
812
+ vsub.s16 q5, q9, q15
813
+ vadd.s16 q15, q9, q15
814
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
815
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
816
+ vadd.s16 q3, q1, q1
817
+ vsub.s16 q1, q5, q1
818
+ vadd.s16 q10, q2, q4
819
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
820
+ vsub.s16 q2, q15, q13
821
+ vadd.s16 q3, q3, q6
822
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
823
+ vadd.s16 q1, q1, q4
824
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
825
+ vsub.s16 q10, q10, q14
826
+ vadd.s16 q2, q2, q6
827
+ vsub.s16 q6, q8, q12
828
+ vadd.s16 q12, q8, q12
829
+ vadd.s16 q9, q5, q4
830
+ vadd.s16 q5, q6, q10
831
+ vsub.s16 q10, q6, q10
832
+ vadd.s16 q6, q15, q13
833
+ vadd.s16 q8, q12, q14
834
+ vsub.s16 q3, q6, q3
835
+ vsub.s16 q12, q12, q14
836
+ vsub.s16 q3, q3, q1
837
+ vsub.s16 q1, q9, q1
838
+ vadd.s16 q2, q3, q2
839
+ vsub.s16 q15, q8, q6
840
+ vadd.s16 q1, q1, q2
841
+ vadd.s16 q8, q8, q6
842
+ vadd.s16 q14, q5, q3
843
+ vsub.s16 q9, q5, q3
844
+ vsub.s16 q13, q10, q2
845
+ vpop {d8-d13} /* restore NEON registers */
846
+ vadd.s16 q10, q10, q2
847
+ vsub.s16 q11, q12, q1
848
+ vadd.s16 q12, q12, q1
849
+ /* Descale to 8-bit and range limit */
850
+ vmov.u8 q0, #0x80
851
+ vqshrn.s16 d16, q8, #5
852
+ vqshrn.s16 d17, q9, #5
853
+ vqshrn.s16 d18, q10, #5
854
+ vqshrn.s16 d19, q11, #5
855
+ vqshrn.s16 d20, q12, #5
856
+ vqshrn.s16 d21, q13, #5
857
+ vqshrn.s16 d22, q14, #5
858
+ vqshrn.s16 d23, q15, #5
859
+ vadd.u8 q8, q8, q0
860
+ vadd.u8 q9, q9, q0
861
+ vadd.u8 q10, q10, q0
862
+ vadd.u8 q11, q11, q0
863
+ /* Transpose the final 8-bit samples */
864
+ vtrn.16 q8, q9
865
+ vtrn.16 q10, q11
866
+ vtrn.32 q8, q10
867
+ vtrn.32 q9, q11
868
+ vtrn.8 d16, d17
869
+ vtrn.8 d18, d19
870
+ /* Store results to the output buffer */
871
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
872
+ add TMP1, TMP1, OUTPUT_COL
873
+ add TMP2, TMP2, OUTPUT_COL
874
+ vst1.8 {d16}, [TMP1]
875
+ vst1.8 {d17}, [TMP2]
876
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
877
+ add TMP1, TMP1, OUTPUT_COL
878
+ add TMP2, TMP2, OUTPUT_COL
879
+ vst1.8 {d18}, [TMP1]
880
+ vtrn.8 d20, d21
881
+ vst1.8 {d19}, [TMP2]
882
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
883
+ add TMP1, TMP1, OUTPUT_COL
884
+ add TMP2, TMP2, OUTPUT_COL
885
+ add TMP3, TMP3, OUTPUT_COL
886
+ add TMP4, TMP4, OUTPUT_COL
887
+ vst1.8 {d20}, [TMP1]
888
+ vtrn.8 d22, d23
889
+ vst1.8 {d21}, [TMP2]
890
+ vst1.8 {d22}, [TMP3]
891
+ vst1.8 {d23}, [TMP4]
892
+ bx lr
893
+
894
+ .unreq DCT_TABLE
895
+ .unreq COEF_BLOCK
896
+ .unreq OUTPUT_BUF
897
+ .unreq OUTPUT_COL
898
+ .unreq TMP1
899
+ .unreq TMP2
900
+ .unreq TMP3
901
+ .unreq TMP4
902
+
903
+
904
+ /*****************************************************************************/
905
+
906
+ /*
907
+ * jsimd_idct_4x4_neon
908
+ *
909
+ * This function contains inverse-DCT code for getting reduced-size
910
+ * 4x4-pixel output from an 8x8 DCT block. It uses the same calculations
911
+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
912
+ * function from jpeg-6b (jidctred.c).
913
+ *
914
+ * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
915
+ * requires far fewer arithmetic operations and hence should be faster.
916
+ * The primary purpose of this particular NEON optimized function is
917
+ * bit-exact compatibility with jpeg-6b.
918
+ *
919
+ * TODO: slightly better instruction scheduling could be achieved by expanding
920
+ * the idct_helper/transpose_4x4 macros and reordering instructions,
921
+ * but readability would suffer somewhat.
922
+ */
923
+
924
+ #define CONST_BITS 13
925
+
926
+ #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
927
+ #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
928
+ #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
929
+ #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
930
+ #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
931
+ #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
932
+ #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
933
+ #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
934
+ #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
935
+ #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
936
+ #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
937
+ #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
938
+ #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
939
+ #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
940
+
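+ /* These values presumably come from libjpeg's FIX() macro, i.e.
+ round(x * 2^CONST_BITS) with CONST_BITS = 13. A quick sanity check
+ (a sketch, not part of the original source):
+
+ #include <stdio.h>
+
+ #define FIX(x) ((int)((x) * (1 << 13) + 0.5))
+
+ int main(void) {
+   printf("%d %d\n", FIX(1.847759065), FIX(0.211164243)); // 15137 1730
+   return 0;
+ }
+ */
+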
941
+ .balign 16
942
+ jsimd_idct_4x4_neon_consts:
943
+ .short FIX_1_847759065 /* d0[0] */
944
+ .short -FIX_0_765366865 /* d0[1] */
945
+ .short -FIX_0_211164243 /* d0[2] */
946
+ .short FIX_1_451774981 /* d0[3] */
947
+ .short -FIX_2_172734803 /* d1[0] */
948
+ .short FIX_1_061594337 /* d1[1] */
949
+ .short -FIX_0_509795579 /* d1[2] */
950
+ .short -FIX_0_601344887 /* d1[3] */
951
+ .short FIX_0_899976223 /* d2[0] */
952
+ .short FIX_2_562915447 /* d2[1] */
953
+ .short 1 << (CONST_BITS + 1) /* d2[2] */
954
+ .short 0 /* d2[3] */
955
+
956
+ .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
957
+ vmull.s16 q14, \x4, d2[2]
958
+ vmlal.s16 q14, \x8, d0[0]
959
+ vmlal.s16 q14, \x14, d0[1]
960
+
961
+ vmull.s16 q13, \x16, d1[2]
962
+ vmlal.s16 q13, \x12, d1[3]
963
+ vmlal.s16 q13, \x10, d2[0]
964
+ vmlal.s16 q13, \x6, d2[1]
965
+
966
+ vmull.s16 q15, \x4, d2[2]
967
+ vmlsl.s16 q15, \x8, d0[0]
968
+ vmlsl.s16 q15, \x14, d0[1]
969
+
970
+ vmull.s16 q12, \x16, d0[2]
971
+ vmlal.s16 q12, \x12, d0[3]
972
+ vmlal.s16 q12, \x10, d1[0]
973
+ vmlal.s16 q12, \x6, d1[1]
974
+
975
+ vadd.s32 q10, q14, q13
976
+ vsub.s32 q14, q14, q13
977
+
978
+ .if \shift > 16
979
+ vrshr.s32 q10, q10, #\shift
980
+ vrshr.s32 q14, q14, #\shift
981
+ vmovn.s32 \y26, q10
982
+ vmovn.s32 \y29, q14
983
+ .else
984
+ vrshrn.s32 \y26, q10, #\shift
985
+ vrshrn.s32 \y29, q14, #\shift
986
+ .endif
987
+
988
+ vadd.s32 q10, q15, q12
989
+ vsub.s32 q15, q15, q12
990
+
991
+ .if \shift > 16
992
+ vrshr.s32 q10, q10, #\shift
993
+ vrshr.s32 q15, q15, #\shift
994
+ vmovn.s32 \y27, q10
995
+ vmovn.s32 \y28, q15
996
+ .else
997
+ vrshrn.s32 \y27, q10, #\shift
998
+ vrshrn.s32 \y28, q15, #\shift
999
+ .endif
1000
+ .endm
1001
+
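+ /* Scalar view of the descale step in idct_helper above (a sketch; the
+ "\shift > 16" split presumably exists because VRSHRN's immediate only
+ covers shifts of 1..16 when narrowing 32-bit to 16-bit lanes, so the
+ larger pass-2 shifts use a separate VRSHR before a truncating VMOVN):
+
+ #include <stdint.h>
+
+ static int16_t descale(int32_t v, int shift) {
+   // rounding right shift, then truncate to 16 bits
+   return (int16_t)((v + ((int32_t)1 << (shift - 1))) >> shift);
+ }
+ */
+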
1002
+ asm_function jsimd_idct_4x4_neon
1003
+
1004
+ DCT_TABLE .req r0
1005
+ COEF_BLOCK .req r1
1006
+ OUTPUT_BUF .req r2
1007
+ OUTPUT_COL .req r3
1008
+ TMP1 .req r0
1009
+ TMP2 .req r1
1010
+ TMP3 .req r2
1011
+ TMP4 .req ip
1012
+
1013
+ vpush {d8-d15}
1014
+
1015
+ /* Load constants (d3 is just used for padding) */
1016
+ adr TMP4, jsimd_idct_4x4_neon_consts
1017
+ vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
1018
+
1019
+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
1020
+ * 0 1 2 3 | 4 5 6 7
1021
+ * ---------+--------
1022
+ * 0 | d4 | d5
1023
+ * 1 | d6 | d7
1024
+ * 2 | d8 | d9
1025
+ * 3 | d10 | d11
1026
+ * 4 | - | -
1027
+ * 5 | d12 | d13
1028
+ * 6 | d14 | d15
1029
+ * 7 | d16 | d17
1030
+ */
1031
+ vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1032
+ vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
1033
+ add COEF_BLOCK, COEF_BLOCK, #16
1034
+ vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
1035
+ vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
1036
+ /* dequantize */
1037
+ vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1038
+ vmul.s16 q2, q2, q9
1039
+ vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
1040
+ vmul.s16 q3, q3, q10
1041
+ vmul.s16 q4, q4, q11
1042
+ add DCT_TABLE, DCT_TABLE, #16
1043
+ vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
1044
+ vmul.s16 q5, q5, q12
1045
+ vmul.s16 q6, q6, q13
1046
+ vld1.16 {d30, d31}, [DCT_TABLE, :128]!
1047
+ vmul.s16 q7, q7, q14
1048
+ vmul.s16 q8, q8, q15
1049
+
1050
+ /* Pass 1 */
1051
+ idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
1052
+ transpose_4x4 d4, d6, d8, d10
1053
+ idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
1054
+ transpose_4x4 d5, d7, d9, d11
1055
+
1056
+ /* Pass 2 */
1057
+ idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
1058
+ transpose_4x4 d26, d27, d28, d29
1059
+
1060
+ /* Range limit */
1061
+ vmov.u16 q15, #0x80
1062
+ vadd.s16 q13, q13, q15
1063
+ vadd.s16 q14, q14, q15
1064
+ vqmovun.s16 d26, q13
1065
+ vqmovun.s16 d27, q14
1066
+
1067
+ /* Store results to the output buffer */
1068
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
1069
+ add TMP1, TMP1, OUTPUT_COL
1070
+ add TMP2, TMP2, OUTPUT_COL
1071
+ add TMP3, TMP3, OUTPUT_COL
1072
+ add TMP4, TMP4, OUTPUT_COL
1073
+
1074
+ #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1075
+ /* We can use far fewer instructions on little-endian systems if the
1076
+ * OS kernel is not configured to trap unaligned memory accesses
1077
+ */
1078
+ vst1.32 {d26[0]}, [TMP1]!
1079
+ vst1.32 {d27[0]}, [TMP3]!
1080
+ vst1.32 {d26[1]}, [TMP2]!
1081
+ vst1.32 {d27[1]}, [TMP4]!
1082
+ #else
1083
+ vst1.8 {d26[0]}, [TMP1]!
1084
+ vst1.8 {d27[0]}, [TMP3]!
1085
+ vst1.8 {d26[1]}, [TMP1]!
1086
+ vst1.8 {d27[1]}, [TMP3]!
1087
+ vst1.8 {d26[2]}, [TMP1]!
1088
+ vst1.8 {d27[2]}, [TMP3]!
1089
+ vst1.8 {d26[3]}, [TMP1]!
1090
+ vst1.8 {d27[3]}, [TMP3]!
1091
+
1092
+ vst1.8 {d26[4]}, [TMP2]!
1093
+ vst1.8 {d27[4]}, [TMP4]!
1094
+ vst1.8 {d26[5]}, [TMP2]!
1095
+ vst1.8 {d27[5]}, [TMP4]!
1096
+ vst1.8 {d26[6]}, [TMP2]!
1097
+ vst1.8 {d27[6]}, [TMP4]!
1098
+ vst1.8 {d26[7]}, [TMP2]!
1099
+ vst1.8 {d27[7]}, [TMP4]!
1100
+ #endif
1101
+
1102
+ vpop {d8-d15}
1103
+ bx lr
1104
+
1105
+ .unreq DCT_TABLE
1106
+ .unreq COEF_BLOCK
1107
+ .unreq OUTPUT_BUF
1108
+ .unreq OUTPUT_COL
1109
+ .unreq TMP1
1110
+ .unreq TMP2
1111
+ .unreq TMP3
1112
+ .unreq TMP4
1113
+
1114
+ .purgem idct_helper
1115
+
1116
+
1117
+ /*****************************************************************************/
1118
+
1119
+ /*
1120
+ * jsimd_idct_2x2_neon
1121
+ *
1122
+ * This function contains inverse-DCT code for getting reduced-size
1123
+ * 2x2-pixel output from an 8x8 DCT block. It uses the same calculations
1124
+ * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1125
+ * function from jpeg-6b (jidctred.c).
1126
+ *
1127
+ * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1128
+ * requires far fewer arithmetic operations and hence should be faster.
1129
+ * The primary purpose of this particular NEON optimized function is
1130
+ * bit-exact compatibility with jpeg-6b.
1131
+ */
1132
+
1133
+ .balign 8
1134
+ jsimd_idct_2x2_neon_consts:
1135
+ .short -FIX_0_720959822 /* d0[0] */
1136
+ .short FIX_0_850430095 /* d0[1] */
1137
+ .short -FIX_1_272758580 /* d0[2] */
1138
+ .short FIX_3_624509785 /* d0[3] */
1139
+
1140
+ .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1141
+ vshll.s16 q14, \x4, #15
1142
+ vmull.s16 q13, \x6, d0[3]
1143
+ vmlal.s16 q13, \x10, d0[2]
1144
+ vmlal.s16 q13, \x12, d0[1]
1145
+ vmlal.s16 q13, \x16, d0[0]
1146
+
1147
+ vadd.s32 q10, q14, q13
1148
+ vsub.s32 q14, q14, q13
1149
+
1150
+ .if \shift > 16
1151
+ vrshr.s32 q10, q10, #\shift
1152
+ vrshr.s32 q14, q14, #\shift
1153
+ vmovn.s32 \y26, q10
1154
+ vmovn.s32 \y27, q14
1155
+ .else
1156
+ vrshrn.s32 \y26, q10, #\shift
1157
+ vrshrn.s32 \y27, q14, #\shift
1158
+ .endif
1159
+ .endm
1160
+
1161
+ asm_function jsimd_idct_2x2_neon
1162
+
1163
+ DCT_TABLE .req r0
1164
+ COEF_BLOCK .req r1
1165
+ OUTPUT_BUF .req r2
1166
+ OUTPUT_COL .req r3
1167
+ TMP1 .req r0
1168
+ TMP2 .req ip
1169
+
1170
+ vpush {d8-d15}
1171
+
1172
+ /* Load constants */
1173
+ adr TMP2, jsimd_idct_2x2_neon_consts
1174
+ vld1.16 {d0}, [TMP2, :64]
1175
+
1176
+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
1177
+ * 0 1 2 3 | 4 5 6 7
1178
+ * ---------+--------
1179
+ * 0 | d4 | d5
1180
+ * 1 | d6 | d7
1181
+ * 2 | - | -
1182
+ * 3 | d10 | d11
1183
+ * 4 | - | -
1184
+ * 5 | d12 | d13
1185
+ * 6 | - | -
1186
+ * 7 | d16 | d17
1187
+ */
1188
+ vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1189
+ add COEF_BLOCK, COEF_BLOCK, #16
1190
+ vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
1191
+ add COEF_BLOCK, COEF_BLOCK, #16
1192
+ vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
1193
+ add COEF_BLOCK, COEF_BLOCK, #16
1194
+ vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
1195
+ /* Dequantize */
1196
+ vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1197
+ vmul.s16 q2, q2, q9
1198
+ vmul.s16 q3, q3, q10
1199
+ add DCT_TABLE, DCT_TABLE, #16
1200
+ vld1.16 {d24, d25}, [DCT_TABLE, :128]!
1201
+ vmul.s16 q5, q5, q12
1202
+ add DCT_TABLE, DCT_TABLE, #16
1203
+ vld1.16 {d26, d27}, [DCT_TABLE, :128]!
1204
+ vmul.s16 q6, q6, q13
1205
+ add DCT_TABLE, DCT_TABLE, #16
1206
+ vld1.16 {d30, d31}, [DCT_TABLE, :128]!
1207
+ vmul.s16 q8, q8, q15
1208
+
1209
+ /* Pass 1 */
1210
+ #if 0
1211
+ idct_helper d4, d6, d10, d12, d16, 13, d4, d6
1212
+ transpose_4x4 d4, d6, d8, d10
1213
+ idct_helper d5, d7, d11, d13, d17, 13, d5, d7
1214
+ transpose_4x4 d5, d7, d9, d11
1215
+ #else
1216
+ vmull.s16 q13, d6, d0[3]
1217
+ vmlal.s16 q13, d10, d0[2]
1218
+ vmlal.s16 q13, d12, d0[1]
1219
+ vmlal.s16 q13, d16, d0[0]
1220
+ vmull.s16 q12, d7, d0[3]
1221
+ vmlal.s16 q12, d11, d0[2]
1222
+ vmlal.s16 q12, d13, d0[1]
1223
+ vmlal.s16 q12, d17, d0[0]
1224
+ vshll.s16 q14, d4, #15
1225
+ vshll.s16 q15, d5, #15
1226
+ vadd.s32 q10, q14, q13
1227
+ vsub.s32 q14, q14, q13
1228
+ vrshrn.s32 d4, q10, #13
1229
+ vrshrn.s32 d6, q14, #13
1230
+ vadd.s32 q10, q15, q12
1231
+ vsub.s32 q14, q15, q12
1232
+ vrshrn.s32 d5, q10, #13
1233
+ vrshrn.s32 d7, q14, #13
1234
+ vtrn.16 q2, q3
1235
+ vtrn.32 q3, q5
1236
+ #endif
1237
+
1238
+ /* Pass 2 */
1239
+ idct_helper d4, d6, d10, d7, d11, 20, d26, d27
1240
+
1241
+ /* Range limit */
1242
+ vmov.u16 q15, #0x80
1243
+ vadd.s16 q13, q13, q15
1244
+ vqmovun.s16 d26, q13
1245
+ vqmovun.s16 d27, q13
1246
+
1247
+ /* Store results to the output buffer */
1248
+ ldmia OUTPUT_BUF, {TMP1, TMP2}
1249
+ add TMP1, TMP1, OUTPUT_COL
1250
+ add TMP2, TMP2, OUTPUT_COL
1251
+
1252
+ vst1.8 {d26[0]}, [TMP1]!
1253
+ vst1.8 {d27[4]}, [TMP1]!
1254
+ vst1.8 {d26[1]}, [TMP2]!
1255
+ vst1.8 {d27[5]}, [TMP2]!
1256
+
1257
+ vpop {d8-d15}
1258
+ bx lr
1259
+
1260
+ .unreq DCT_TABLE
1261
+ .unreq COEF_BLOCK
1262
+ .unreq OUTPUT_BUF
1263
+ .unreq OUTPUT_COL
1264
+ .unreq TMP1
1265
+ .unreq TMP2
1266
+
1267
+ .purgem idct_helper
1268
+
1269
+
1270
+ /*****************************************************************************/
1271
+
1272
+ /*
1273
+ * jsimd_ycc_extrgb_convert_neon
1274
+ * jsimd_ycc_extbgr_convert_neon
1275
+ * jsimd_ycc_extrgbx_convert_neon
1276
+ * jsimd_ycc_extbgrx_convert_neon
1277
+ * jsimd_ycc_extxbgr_convert_neon
1278
+ * jsimd_ycc_extxrgb_convert_neon
1279
+ *
1280
+ * Colorspace conversion YCbCr -> RGB
1281
+ */
1282
+
1283
+
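+ /* A rough scalar reference for the fixed-point YCbCr->RGB math used by
+ the macros below (a sketch, not part of the original source). The
+ per-function constant tables hold 22971, -11277, -23401 and 29033,
+ which approximately encode 1.402 and 1.772 in Q14 and -0.34414 and
+ -0.71414 in Q15; arithmetic right shift is assumed:
+
+ #include <stdint.h>
+
+ static uint8_t clamp_u8(int v) {
+   return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);  // vqmovun behaviour
+ }
+
+ static void ycc_to_rgb_pixel(uint8_t y, uint8_t cb, uint8_t cr,
+                              uint8_t *r, uint8_t *g, uint8_t *b) {
+   int u = cb - 128, v = cr - 128;
+   *r = clamp_u8(y + ((22971 * v + (1 << 13)) >> 14));
+   *g = clamp_u8(y + ((-11277 * u - 23401 * v + (1 << 14)) >> 15));
+   *b = clamp_u8(y + ((29033 * u + (1 << 13)) >> 14));
+ }
+ */
+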
1284
+ .macro do_load size
1285
+ .if \size == 8
1286
+ vld1.8 {d4}, [U, :64]!
1287
+ vld1.8 {d5}, [V, :64]!
1288
+ vld1.8 {d0}, [Y, :64]!
1289
+ pld [U, #64]
1290
+ pld [V, #64]
1291
+ pld [Y, #64]
1292
+ .elseif \size == 4
1293
+ vld1.8 {d4[0]}, [U]!
1294
+ vld1.8 {d4[1]}, [U]!
1295
+ vld1.8 {d4[2]}, [U]!
1296
+ vld1.8 {d4[3]}, [U]!
1297
+ vld1.8 {d5[0]}, [V]!
1298
+ vld1.8 {d5[1]}, [V]!
1299
+ vld1.8 {d5[2]}, [V]!
1300
+ vld1.8 {d5[3]}, [V]!
1301
+ vld1.8 {d0[0]}, [Y]!
1302
+ vld1.8 {d0[1]}, [Y]!
1303
+ vld1.8 {d0[2]}, [Y]!
1304
+ vld1.8 {d0[3]}, [Y]!
1305
+ .elseif \size == 2
1306
+ vld1.8 {d4[4]}, [U]!
1307
+ vld1.8 {d4[5]}, [U]!
1308
+ vld1.8 {d5[4]}, [V]!
1309
+ vld1.8 {d5[5]}, [V]!
1310
+ vld1.8 {d0[4]}, [Y]!
1311
+ vld1.8 {d0[5]}, [Y]!
1312
+ .elseif \size == 1
1313
+ vld1.8 {d4[6]}, [U]!
1314
+ vld1.8 {d5[6]}, [V]!
1315
+ vld1.8 {d0[6]}, [Y]!
1316
+ .else
1317
+ .error unsupported macroblock size
1318
+ .endif
1319
+ .endm
1320
+
1321
+ .macro do_store bpp, size
1322
+ .if \bpp == 24
1323
+ .if \size == 8
1324
+ vst3.8 {d10, d11, d12}, [RGB]!
1325
+ .elseif \size == 4
1326
+ vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1327
+ vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1328
+ vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1329
+ vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1330
+ .elseif \size == 2
1331
+ vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1332
+ vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1333
+ .elseif \size == 1
1334
+ vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1335
+ .else
1336
+ .error unsupported macroblock size
1337
+ .endif
1338
+ .elseif \bpp == 32
1339
+ .if \size == 8
1340
+ vst4.8 {d10, d11, d12, d13}, [RGB]!
1341
+ .elseif \size == 4
1342
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1343
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1344
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1345
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1346
+ .elseif \size == 2
1347
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1348
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1349
+ .elseif \size == 1
1350
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1351
+ .else
1352
+ .error unsupported macroblock size
1353
+ .endif
1354
+ .elseif \bpp == 16
1355
+ .if \size == 8
1356
+ vst1.16 {q15}, [RGB]!
1357
+ .elseif \size == 4
1358
+ vst1.16 {d30}, [RGB]!
1359
+ .elseif \size == 2
1360
+ vst1.16 {d31[0]}, [RGB]!
1361
+ vst1.16 {d31[1]}, [RGB]!
1362
+ .elseif \size == 1
1363
+ vst1.16 {d31[2]}, [RGB]!
1364
+ .else
1365
+ .error unsupported macroblock size
1366
+ .endif
1367
+ .else
1368
+ .error unsupported bpp
1369
+ .endif
1370
+ .endm
1371
+
1372
+ .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1373
+
1374
+ /*
1375
+ * 2-stage pipelined YCbCr->RGB conversion
1376
+ */
1377
+
1378
+ .macro do_yuv_to_rgb_stage1
1379
+ vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1380
+ vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
1381
+ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1382
+ vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1383
+ vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1384
+ vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1385
+ vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1386
+ vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1387
+ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1388
+ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
1389
+ .endm
1390
+
1391
+ .macro do_yuv_to_rgb_stage2
1392
+ vrshrn.s32 d20, q10, #15
1393
+ vrshrn.s32 d21, q11, #15
1394
+ vrshrn.s32 d24, q12, #14
1395
+ vrshrn.s32 d25, q13, #14
1396
+ vrshrn.s32 d28, q14, #14
1397
+ vrshrn.s32 d29, q15, #14
1398
+ vaddw.u8 q11, q10, d0
1399
+ vaddw.u8 q12, q12, d0
1400
+ vaddw.u8 q14, q14, d0
1401
+ .if \bpp != 16
1402
+ vqmovun.s16 d1\g_offs, q11
1403
+ vqmovun.s16 d1\r_offs, q12
1404
+ vqmovun.s16 d1\b_offs, q14
1405
+ .else /* rgb565 */
1406
+ vqshlu.s16 q13, q11, #8
1407
+ vqshlu.s16 q15, q12, #8
1408
+ vqshlu.s16 q14, q14, #8
1409
+ vsri.u16 q15, q13, #5
1410
+ vsri.u16 q15, q14, #11
1411
+ .endif
1412
+ .endm
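+
+ /* The rgb565 branch above packs with saturating left shifts (VQSHLU)
+ and shift-right-insert (VSRI). A plain C equivalent of the resulting
+ bit layout (a sketch, not part of the original source):
+
+ #include <stdint.h>
+
+ static uint16_t pack_rgb565(uint8_t r, uint8_t g, uint8_t b) {
+   // r -> bits 15-11, g -> bits 10-5, b -> bits 4-0
+   return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
+ }
+ */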
1413
+
1414
+ .macro do_yuv_to_rgb_stage2_store_load_stage1
1415
+ /* "do_yuv_to_rgb_stage2" and "store" */
1416
+ vrshrn.s32 d20, q10, #15
1417
+ /* "load" and "do_yuv_to_rgb_stage1" */
1418
+ pld [U, #64]
1419
+ vrshrn.s32 d21, q11, #15
1420
+ pld [V, #64]
1421
+ vrshrn.s32 d24, q12, #14
1422
+ vrshrn.s32 d25, q13, #14
1423
+ vld1.8 {d4}, [U, :64]!
1424
+ vrshrn.s32 d28, q14, #14
1425
+ vld1.8 {d5}, [V, :64]!
1426
+ vrshrn.s32 d29, q15, #14
1427
+ vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1428
+ vaddw.u8 q4, q1, d5 /* q4 = v - 128 */
1429
+ vaddw.u8 q11, q10, d0
1430
+ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1431
+ vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1432
+ vaddw.u8 q12, q12, d0
1433
+ vaddw.u8 q14, q14, d0
1434
+ .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
1435
+ vqmovun.s16 d1\g_offs, q11
1436
+ pld [Y, #64]
1437
+ vqmovun.s16 d1\r_offs, q12
1438
+ vld1.8 {d0}, [Y, :64]!
1439
+ vqmovun.s16 d1\b_offs, q14
1440
+ vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1441
+ vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1442
+ do_store \bpp, 8
1443
+ vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1444
+ vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1445
+ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1446
+ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
1447
+ .else /**************************** rgb565 ********************************/
1448
+ vqshlu.s16 q13, q11, #8
1449
+ pld [Y, #64]
1450
+ vqshlu.s16 q15, q12, #8
1451
+ vqshlu.s16 q14, q14, #8
1452
+ vld1.8 {d0}, [Y, :64]!
1453
+ vmull.s16 q11, d7, d1[1]
1454
+ vmlal.s16 q11, d9, d1[2]
1455
+ vsri.u16 q15, q13, #5
1456
+ vmull.s16 q12, d8, d1[0]
1457
+ vsri.u16 q15, q14, #11
1458
+ vmull.s16 q13, d9, d1[0]
1459
+ vmull.s16 q14, d6, d1[3]
1460
+ do_store \bpp, 8
1461
+ vmull.s16 q15, d7, d1[3]
1462
+ .endif
1463
+ .endm
1464
+
1465
+ .macro do_yuv_to_rgb
1466
+ do_yuv_to_rgb_stage1
1467
+ do_yuv_to_rgb_stage2
1468
+ .endm
1469
+
1470
+ /* Apple's gas crashes on adrl; work around that by using adr.
1471
+ * But this requires a copy of these constants for each function.
1472
+ */
1473
+
1474
+ .balign 16
1475
+ jsimd_ycc_\colorid\()_neon_consts:
1476
+ .short 0, 0, 0, 0
1477
+ .short 22971, -11277, -23401, 29033
1478
+ .short -128, -128, -128, -128
1479
+ .short -128, -128, -128, -128
1480
+
1481
+ asm_function jsimd_ycc_\colorid\()_convert_neon
1482
+ OUTPUT_WIDTH .req r0
1483
+ INPUT_BUF .req r1
1484
+ INPUT_ROW .req r2
1485
+ OUTPUT_BUF .req r3
1486
+ NUM_ROWS .req r4
1487
+
1488
+ INPUT_BUF0 .req r5
1489
+ INPUT_BUF1 .req r6
1490
+ INPUT_BUF2 .req INPUT_BUF
1491
+
1492
+ RGB .req r7
1493
+ Y .req r8
1494
+ U .req r9
1495
+ V .req r10
1496
+ N .req ip
1497
+
1498
+ /* Load constants to d1, d2, d3 (d0 is just used for padding) */
1499
+ adr ip, jsimd_ycc_\colorid\()_neon_consts
1500
+ vld1.16 {d0, d1, d2, d3}, [ip, :128]
1501
+
1502
+ /* Save ARM registers and handle input arguments */
1503
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
1504
+ ldr NUM_ROWS, [sp, #(4 * 8)]
1505
+ ldr INPUT_BUF0, [INPUT_BUF]
1506
+ ldr INPUT_BUF1, [INPUT_BUF, #4]
1507
+ ldr INPUT_BUF2, [INPUT_BUF, #8]
1508
+ .unreq INPUT_BUF
1509
+
1510
+ /* Save NEON registers */
1511
+ vpush {d8-d15}
1512
+
1513
+ /* Initially set d10, d11, d12, d13 to 0xFF */
1514
+ vmov.u8 q5, #255
1515
+ vmov.u8 q6, #255
1516
+
1517
+ /* Outer loop over scanlines */
1518
+ cmp NUM_ROWS, #1
1519
+ blt 9f
1520
+ 0:
1521
+ ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
1522
+ ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
1523
+ mov N, OUTPUT_WIDTH
1524
+ ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
1525
+ add INPUT_ROW, INPUT_ROW, #1
1526
+ ldr RGB, [OUTPUT_BUF], #4
1527
+
1528
+ /* Inner loop over pixels */
1529
+ subs N, N, #8
1530
+ blt 3f
1531
+ do_load 8
1532
+ do_yuv_to_rgb_stage1
1533
+ subs N, N, #8
1534
+ blt 2f
1535
+ 1:
1536
+ do_yuv_to_rgb_stage2_store_load_stage1
1537
+ subs N, N, #8
1538
+ bge 1b
1539
+ 2:
1540
+ do_yuv_to_rgb_stage2
1541
+ do_store \bpp, 8
1542
+ tst N, #7
1543
+ beq 8f
1544
+ 3:
1545
+ tst N, #4
1546
+ beq 3f
1547
+ do_load 4
1548
+ 3:
1549
+ tst N, #2
1550
+ beq 4f
1551
+ do_load 2
1552
+ 4:
1553
+ tst N, #1
1554
+ beq 5f
1555
+ do_load 1
1556
+ 5:
1557
+ do_yuv_to_rgb
1558
+ tst N, #4
1559
+ beq 6f
1560
+ do_store \bpp, 4
1561
+ 6:
1562
+ tst N, #2
1563
+ beq 7f
1564
+ do_store \bpp, 2
1565
+ 7:
1566
+ tst N, #1
1567
+ beq 8f
1568
+ do_store \bpp, 1
1569
+ 8:
1570
+ subs NUM_ROWS, NUM_ROWS, #1
1571
+ bgt 0b
1572
+ 9:
1573
+ /* Restore all registers and return */
1574
+ vpop {d8-d15}
1575
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
1576
+
1577
+ .unreq OUTPUT_WIDTH
1578
+ .unreq INPUT_ROW
1579
+ .unreq OUTPUT_BUF
1580
+ .unreq NUM_ROWS
1581
+ .unreq INPUT_BUF0
1582
+ .unreq INPUT_BUF1
1583
+ .unreq INPUT_BUF2
1584
+ .unreq RGB
1585
+ .unreq Y
1586
+ .unreq U
1587
+ .unreq V
1588
+ .unreq N
1589
+
1590
+ .purgem do_yuv_to_rgb
1591
+ .purgem do_yuv_to_rgb_stage1
1592
+ .purgem do_yuv_to_rgb_stage2
1593
+ .purgem do_yuv_to_rgb_stage2_store_load_stage1
1594
+
1595
+ .endm
1596
+
1597
+ /*--------------------------------- id ----- bpp R G B */
1598
+ generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
1599
+ generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
1600
+ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
1601
+ generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
1602
+ generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
1603
+ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
1604
+ generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
1605
+
1606
+ .purgem do_load
1607
+ .purgem do_store
1608
+
1609
+
1610
+ /*****************************************************************************/
1611
+
1612
+ /*
1613
+ * jsimd_extrgb_ycc_convert_neon
1614
+ * jsimd_extbgr_ycc_convert_neon
1615
+ * jsimd_extrgbx_ycc_convert_neon
1616
+ * jsimd_extbgrx_ycc_convert_neon
1617
+ * jsimd_extxbgr_ycc_convert_neon
1618
+ * jsimd_extxrgb_ycc_convert_neon
1619
+ *
1620
+ * Colorspace conversion RGB -> YCbCr
1621
+ */
1622
+
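+ /* A rough scalar reference for the RGB->YCbCr math used by the macros
+ below (a sketch, not part of the original source). The Q16 constants in
+ the per-function tables approximate the JFIF weights (e.g. 19595 for
+ 0.29900 and 32768 for 0.50000), and the Cb/Cr accumulators are seeded
+ with a +128 offset plus 32767 so the truncating final shift still
+ centers the result:
+
+ #include <stdint.h>
+
+ static void rgb_to_ycc_pixel(uint8_t r, uint8_t g, uint8_t b,
+                              uint8_t *y, uint8_t *cb, uint8_t *cr) {
+   *y  = (uint8_t)((19595 * r + 38470 * g + 7471 * b + 32768) >> 16);
+   *cb = (uint8_t)(((128 << 16) + 32767 - 11059 * r - 21709 * g + 32768 * b) >> 16);
+   *cr = (uint8_t)(((128 << 16) + 32767 + 32768 * r - 27439 * g - 5329 * b) >> 16);
+ }
+ */
+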
1623
+ .macro do_store size
1624
+ .if \size == 8
1625
+ vst1.8 {d20}, [Y]!
1626
+ vst1.8 {d21}, [U]!
1627
+ vst1.8 {d22}, [V]!
1628
+ .elseif \size == 4
1629
+ vst1.8 {d20[0]}, [Y]!
1630
+ vst1.8 {d20[1]}, [Y]!
1631
+ vst1.8 {d20[2]}, [Y]!
1632
+ vst1.8 {d20[3]}, [Y]!
1633
+ vst1.8 {d21[0]}, [U]!
1634
+ vst1.8 {d21[1]}, [U]!
1635
+ vst1.8 {d21[2]}, [U]!
1636
+ vst1.8 {d21[3]}, [U]!
1637
+ vst1.8 {d22[0]}, [V]!
1638
+ vst1.8 {d22[1]}, [V]!
1639
+ vst1.8 {d22[2]}, [V]!
1640
+ vst1.8 {d22[3]}, [V]!
1641
+ .elseif \size == 2
1642
+ vst1.8 {d20[4]}, [Y]!
1643
+ vst1.8 {d20[5]}, [Y]!
1644
+ vst1.8 {d21[4]}, [U]!
1645
+ vst1.8 {d21[5]}, [U]!
1646
+ vst1.8 {d22[4]}, [V]!
1647
+ vst1.8 {d22[5]}, [V]!
1648
+ .elseif \size == 1
1649
+ vst1.8 {d20[6]}, [Y]!
1650
+ vst1.8 {d21[6]}, [U]!
1651
+ vst1.8 {d22[6]}, [V]!
1652
+ .else
1653
+ .error unsupported macroblock size
1654
+ .endif
1655
+ .endm
1656
+
1657
+ .macro do_load bpp, size
1658
+ .if \bpp == 24
1659
+ .if \size == 8
1660
+ vld3.8 {d10, d11, d12}, [RGB]!
1661
+ pld [RGB, #128]
1662
+ .elseif \size == 4
1663
+ vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1664
+ vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1665
+ vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1666
+ vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1667
+ .elseif \size == 2
1668
+ vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1669
+ vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1670
+ .elseif \size == 1
1671
+ vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1672
+ .else
1673
+ .error unsupported macroblock size
1674
+ .endif
1675
+ .elseif \bpp == 32
1676
+ .if \size == 8
1677
+ vld4.8 {d10, d11, d12, d13}, [RGB]!
1678
+ pld [RGB, #128]
1679
+ .elseif \size == 4
1680
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1681
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1682
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1683
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1684
+ .elseif \size == 2
1685
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1686
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1687
+ .elseif \size == 1
1688
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1689
+ .else
1690
+ .error unsupported macroblock size
1691
+ .endif
1692
+ .else
1693
+ .error unsupported bpp
1694
+ .endif
1695
+ .endm
1696
+
1697
+ .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1698
+
1699
+ /*
1700
+ * 2-stage pipelined RGB->YCbCr conversion
1701
+ */
1702
+
1703
+ .macro do_rgb_to_yuv_stage1
1704
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1705
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1706
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1707
+ vmull.u16 q7, d4, d0[0]
1708
+ vmlal.u16 q7, d6, d0[1]
1709
+ vmlal.u16 q7, d8, d0[2]
1710
+ vmull.u16 q8, d5, d0[0]
1711
+ vmlal.u16 q8, d7, d0[1]
1712
+ vmlal.u16 q8, d9, d0[2]
1713
+ vrev64.32 q9, q1
1714
+ vrev64.32 q13, q1
1715
+ vmlsl.u16 q9, d4, d0[3]
1716
+ vmlsl.u16 q9, d6, d1[0]
1717
+ vmlal.u16 q9, d8, d1[1]
1718
+ vmlsl.u16 q13, d5, d0[3]
1719
+ vmlsl.u16 q13, d7, d1[0]
1720
+ vmlal.u16 q13, d9, d1[1]
1721
+ vrev64.32 q14, q1
1722
+ vrev64.32 q15, q1
1723
+ vmlal.u16 q14, d4, d1[1]
1724
+ vmlsl.u16 q14, d6, d1[2]
1725
+ vmlsl.u16 q14, d8, d1[3]
1726
+ vmlal.u16 q15, d5, d1[1]
1727
+ vmlsl.u16 q15, d7, d1[2]
1728
+ vmlsl.u16 q15, d9, d1[3]
1729
+ .endm
1730
+
1731
+ .macro do_rgb_to_yuv_stage2
1732
+ vrshrn.u32 d20, q7, #16
1733
+ vrshrn.u32 d21, q8, #16
1734
+ vshrn.u32 d22, q9, #16
1735
+ vshrn.u32 d23, q13, #16
1736
+ vshrn.u32 d24, q14, #16
1737
+ vshrn.u32 d25, q15, #16
1738
+ vmovn.u16 d20, q10 /* d20 = y */
1739
+ vmovn.u16 d21, q11 /* d21 = u */
1740
+ vmovn.u16 d22, q12 /* d22 = v */
1741
+ .endm
1742
+
1743
+ .macro do_rgb_to_yuv
1744
+ do_rgb_to_yuv_stage1
1745
+ do_rgb_to_yuv_stage2
1746
+ .endm
1747
+
1748
+ .macro do_rgb_to_yuv_stage2_store_load_stage1
1749
+ vrshrn.u32 d20, q7, #16
1750
+ vrshrn.u32 d21, q8, #16
1751
+ vshrn.u32 d22, q9, #16
1752
+ vrev64.32 q9, q1
1753
+ vshrn.u32 d23, q13, #16
1754
+ vrev64.32 q13, q1
1755
+ vshrn.u32 d24, q14, #16
1756
+ vshrn.u32 d25, q15, #16
1757
+ do_load \bpp, 8
1758
+ vmovn.u16 d20, q10 /* d20 = y */
1759
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1760
+ vmovn.u16 d21, q11 /* d21 = u */
1761
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1762
+ vmovn.u16 d22, q12 /* d22 = v */
1763
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1764
+ vmull.u16 q7, d4, d0[0]
1765
+ vmlal.u16 q7, d6, d0[1]
1766
+ vmlal.u16 q7, d8, d0[2]
1767
+ vst1.8 {d20}, [Y]!
1768
+ vmull.u16 q8, d5, d0[0]
1769
+ vmlal.u16 q8, d7, d0[1]
1770
+ vmlal.u16 q8, d9, d0[2]
1771
+ vmlsl.u16 q9, d4, d0[3]
1772
+ vmlsl.u16 q9, d6, d1[0]
1773
+ vmlal.u16 q9, d8, d1[1]
1774
+ vst1.8 {d21}, [U]!
1775
+ vmlsl.u16 q13, d5, d0[3]
1776
+ vmlsl.u16 q13, d7, d1[0]
1777
+ vmlal.u16 q13, d9, d1[1]
1778
+ vrev64.32 q14, q1
1779
+ vrev64.32 q15, q1
1780
+ vmlal.u16 q14, d4, d1[1]
1781
+ vmlsl.u16 q14, d6, d1[2]
1782
+ vmlsl.u16 q14, d8, d1[3]
1783
+ vst1.8 {d22}, [V]!
1784
+ vmlal.u16 q15, d5, d1[1]
1785
+ vmlsl.u16 q15, d7, d1[2]
1786
+ vmlsl.u16 q15, d9, d1[3]
1787
+ .endm
1788
+
1789
+ .balign 16
1790
+ jsimd_\colorid\()_ycc_neon_consts:
1791
+ .short 19595, 38470, 7471, 11059
1792
+ .short 21709, 32768, 27439, 5329
1793
+ .short 32767, 128, 32767, 128
1794
+ .short 32767, 128, 32767, 128
1795
+
1796
+ asm_function jsimd_\colorid\()_ycc_convert_neon
1797
+ OUTPUT_WIDTH .req r0
1798
+ INPUT_BUF .req r1
1799
+ OUTPUT_BUF .req r2
1800
+ OUTPUT_ROW .req r3
1801
+ NUM_ROWS .req r4
1802
+
1803
+ OUTPUT_BUF0 .req r5
1804
+ OUTPUT_BUF1 .req r6
1805
+ OUTPUT_BUF2 .req OUTPUT_BUF
1806
+
1807
+ RGB .req r7
1808
+ Y .req r8
1809
+ U .req r9
1810
+ V .req r10
1811
+ N .req ip
1812
+
1813
+ /* Load constants to d0, d1, d2, d3 */
1814
+ adr ip, jsimd_\colorid\()_ycc_neon_consts
1815
+ vld1.16 {d0, d1, d2, d3}, [ip, :128]
1816
+
1817
+ /* Save ARM registers and handle input arguments */
1818
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
1819
+ ldr NUM_ROWS, [sp, #(4 * 8)]
1820
+ ldr OUTPUT_BUF0, [OUTPUT_BUF]
1821
+ ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]
1822
+ ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]
1823
+ .unreq OUTPUT_BUF
1824
+
1825
+ /* Save NEON registers */
1826
+ vpush {d8-d15}
1827
+
1828
+ /* Outer loop over scanlines */
1829
+ cmp NUM_ROWS, #1
1830
+ blt 9f
1831
+ 0:
1832
+ ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
1833
+ ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
1834
+ mov N, OUTPUT_WIDTH
1835
+ ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
1836
+ add OUTPUT_ROW, OUTPUT_ROW, #1
1837
+ ldr RGB, [INPUT_BUF], #4
1838
+
1839
+ /* Inner loop over pixels */
1840
+ subs N, N, #8
1841
+ blt 3f
1842
+ do_load \bpp, 8
1843
+ do_rgb_to_yuv_stage1
1844
+ subs N, N, #8
1845
+ blt 2f
1846
+ 1:
1847
+ do_rgb_to_yuv_stage2_store_load_stage1
1848
+ subs N, N, #8
1849
+ bge 1b
1850
+ 2:
1851
+ do_rgb_to_yuv_stage2
1852
+ do_store 8
1853
+ tst N, #7
1854
+ beq 8f
1855
+ 3:
1856
+ tst N, #4
1857
+ beq 3f
1858
+ do_load \bpp, 4
1859
+ 3:
1860
+ tst N, #2
1861
+ beq 4f
1862
+ do_load \bpp, 2
1863
+ 4:
1864
+ tst N, #1
1865
+ beq 5f
1866
+ do_load \bpp, 1
1867
+ 5:
1868
+ do_rgb_to_yuv
1869
+ tst N, #4
1870
+ beq 6f
1871
+ do_store 4
1872
+ 6:
1873
+ tst N, #2
1874
+ beq 7f
1875
+ do_store 2
1876
+ 7:
1877
+ tst N, #1
1878
+ beq 8f
1879
+ do_store 1
1880
+ 8:
1881
+ subs NUM_ROWS, NUM_ROWS, #1
1882
+ bgt 0b
1883
+ 9:
1884
+ /* Restore all registers and return */
1885
+ vpop {d8-d15}
1886
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
1887
+
1888
+ .unreq OUTPUT_WIDTH
1889
+ .unreq OUTPUT_ROW
1890
+ .unreq INPUT_BUF
1891
+ .unreq NUM_ROWS
1892
+ .unreq OUTPUT_BUF0
1893
+ .unreq OUTPUT_BUF1
1894
+ .unreq OUTPUT_BUF2
1895
+ .unreq RGB
1896
+ .unreq Y
1897
+ .unreq U
1898
+ .unreq V
1899
+ .unreq N
1900
+
1901
+ .purgem do_rgb_to_yuv
1902
+ .purgem do_rgb_to_yuv_stage1
1903
+ .purgem do_rgb_to_yuv_stage2
1904
+ .purgem do_rgb_to_yuv_stage2_store_load_stage1
1905
+
1906
+ .endm
1907
+
1908
+ /*--------------------------------- id ----- bpp R G B */
1909
+ generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
1910
+ generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
1911
+ generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
1912
+ generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
1913
+ generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
1914
+ generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
1915
+
1916
+ .purgem do_load
1917
+ .purgem do_store
1918
+
1919
+
1920
+ /*****************************************************************************/
1921
+
1922
+ /*
1923
+ * Load data into workspace, applying unsigned->signed conversion
1924
+ *
1925
+ * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
1926
+ * rid of VST1.16 instructions
1927
+ */
1928
+
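+ /* Scalar equivalent of the load-and-center step below (a sketch, not
+ part of the original source; DCTELEM is assumed to be 16-bit on this
+ path):
+
+ #include <stdint.h>
+
+ static void convsamp_ref(const uint8_t *rows[8], unsigned start_col,
+                          int16_t workspace[64]) {
+   for (int r = 0; r < 8; r++)
+     for (int c = 0; c < 8; c++)  // CENTERJSAMPLE = 128
+       workspace[r * 8 + c] = (int16_t)(rows[r][start_col + c] - 128);
+ }
+ */
+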
1929
+ asm_function jsimd_convsamp_neon
1930
+ SAMPLE_DATA .req r0
1931
+ START_COL .req r1
1932
+ WORKSPACE .req r2
1933
+ TMP1 .req r3
1934
+ TMP2 .req r4
1935
+ TMP3 .req r5
1936
+ TMP4 .req ip
1937
+
1938
+ push {r4, r5}
1939
+ vmov.u8 d0, #128
1940
+
1941
+ ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1942
+ add TMP1, TMP1, START_COL
1943
+ add TMP2, TMP2, START_COL
1944
+ add TMP3, TMP3, START_COL
1945
+ add TMP4, TMP4, START_COL
1946
+ vld1.8 {d16}, [TMP1]
1947
+ vsubl.u8 q8, d16, d0
1948
+ vld1.8 {d18}, [TMP2]
1949
+ vsubl.u8 q9, d18, d0
1950
+ vld1.8 {d20}, [TMP3]
1951
+ vsubl.u8 q10, d20, d0
1952
+ vld1.8 {d22}, [TMP4]
1953
+ ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
1954
+ vsubl.u8 q11, d22, d0
1955
+ vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
1956
+ add TMP1, TMP1, START_COL
1957
+ add TMP2, TMP2, START_COL
1958
+ vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
1959
+ add TMP3, TMP3, START_COL
1960
+ add TMP4, TMP4, START_COL
1961
+ vld1.8 {d24}, [TMP1]
1962
+ vsubl.u8 q12, d24, d0
1963
+ vld1.8 {d26}, [TMP2]
1964
+ vsubl.u8 q13, d26, d0
1965
+ vld1.8 {d28}, [TMP3]
1966
+ vsubl.u8 q14, d28, d0
1967
+ vld1.8 {d30}, [TMP4]
1968
+ vsubl.u8 q15, d30, d0
1969
+ vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
1970
+ vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
1971
+ pop {r4, r5}
1972
+ bx lr
1973
+
1974
+ .unreq SAMPLE_DATA
1975
+ .unreq START_COL
1976
+ .unreq WORKSPACE
1977
+ .unreq TMP1
1978
+ .unreq TMP2
1979
+ .unreq TMP3
1980
+ .unreq TMP4
1981
+
1982
+
1983
+ /*****************************************************************************/
1984
+
1985
+ /*
1986
+ * jsimd_fdct_ifast_neon
1987
+ *
1988
+ * This function contains a fast, but less accurate, integer implementation of
1989
+ * the forward DCT (Discrete Cosine Transform). It uses the same calculations
1990
+ * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
1991
+ * function from jfdctfst.c.
1992
+ *
1993
+ * TODO: can be combined with 'jsimd_convsamp_neon' to get
1994
+ * rid of a bunch of VLD1.16 instructions
1995
+ */
1996
+
1997
+ #define XFIX_0_382683433 d0[0]
1998
+ #define XFIX_0_541196100 d0[1]
1999
+ #define XFIX_0_707106781 d0[2]
2000
+ #define XFIX_1_306562965 d0[3]
2001
+
2002
+ .balign 16
2003
+ jsimd_fdct_ifast_neon_consts:
2004
+ .short (98 * 128) /* XFIX_0_382683433 */
2005
+ .short (139 * 128) /* XFIX_0_541196100 */
2006
+ .short (181 * 128) /* XFIX_0_707106781 */
2007
+ .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
2008
+
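+ /* These use the same Q15 encoding as the IDCT constants earlier (a
+ sketch/assumption, not part of the original source): jfdctfst.c works
+ with 8-bit FIX() values, so each .short here is FIX * 128, and the one
+ multiplier above 1.0 keeps only its fractional part for VQDMULH:
+
+ #include <stdint.h>
+
+ static int16_t vqdmulh_s16(int16_t a, int16_t b) {
+   return (int16_t)(((int32_t)a * b * 2) >> 16);  // saturation omitted
+ }
+
+ static int16_t mul_1_306562965(int16_t x) {
+   // x * 1.306562965 ~= x + x * (334 - 256)/256
+   return (int16_t)(x + vqdmulh_s16(x, 334 * 128 - 256 * 128));
+ }
+ */
+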
2009
+ asm_function jsimd_fdct_ifast_neon
2010
+
2011
+ DATA .req r0
2012
+ TMP .req ip
2013
+
2014
+ vpush {d8-d15}
2015
+
2016
+ /* Load constants */
2017
+ adr TMP, jsimd_fdct_ifast_neon_consts
2018
+ vld1.16 {d0}, [TMP, :64]
2019
+
2020
+ /* Load all DATA into NEON registers with the following allocation:
2021
+ * 0 1 2 3 | 4 5 6 7
2022
+ * ---------+--------
2023
+ * 0 | d16 | d17 | q8
2024
+ * 1 | d18 | d19 | q9
2025
+ * 2 | d20 | d21 | q10
2026
+ * 3 | d22 | d23 | q11
2027
+ * 4 | d24 | d25 | q12
2028
+ * 5 | d26 | d27 | q13
2029
+ * 6 | d28 | d29 | q14
2030
+ * 7 | d30 | d31 | q15
2031
+ */
2032
+
2033
+ vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
2034
+ vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
2035
+ vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
2036
+ vld1.16 {d28, d29, d30, d31}, [DATA, :128]
2037
+ sub DATA, DATA, #(128 - 32)
2038
+
2039
+ mov TMP, #2
2040
+ 1:
2041
+ /* Transpose */
2042
+ vtrn.16 q12, q13
2043
+ vtrn.16 q10, q11
2044
+ vtrn.16 q8, q9
2045
+ vtrn.16 q14, q15
2046
+ vtrn.32 q9, q11
2047
+ vtrn.32 q13, q15
2048
+ vtrn.32 q8, q10
2049
+ vtrn.32 q12, q14
2050
+ vswp d30, d23
2051
+ vswp d24, d17
2052
+ vswp d26, d19
2053
+ /* 1-D FDCT */
2054
+ vadd.s16 q2, q11, q12
2055
+ vswp d28, d21
2056
+ vsub.s16 q12, q11, q12
2057
+ vsub.s16 q6, q10, q13
2058
+ vadd.s16 q10, q10, q13
2059
+ vsub.s16 q7, q9, q14
2060
+ vadd.s16 q9, q9, q14
2061
+ vsub.s16 q1, q8, q15
2062
+ vadd.s16 q8, q8, q15
2063
+ vsub.s16 q4, q9, q10
2064
+ vsub.s16 q5, q8, q2
2065
+ vadd.s16 q3, q9, q10
2066
+ vadd.s16 q4, q4, q5
2067
+ vadd.s16 q2, q8, q2
2068
+ vqdmulh.s16 q4, q4, XFIX_0_707106781
2069
+ vadd.s16 q11, q12, q6
2070
+ vadd.s16 q8, q2, q3
2071
+ vsub.s16 q12, q2, q3
2072
+ vadd.s16 q3, q6, q7
2073
+ vadd.s16 q7, q7, q1
2074
+ vqdmulh.s16 q3, q3, XFIX_0_707106781
2075
+ vsub.s16 q6, q11, q7
2076
+ vadd.s16 q10, q5, q4
2077
+ vqdmulh.s16 q6, q6, XFIX_0_382683433
2078
+ vsub.s16 q14, q5, q4
2079
+ vqdmulh.s16 q11, q11, XFIX_0_541196100
2080
+ vqdmulh.s16 q5, q7, XFIX_1_306562965
2081
+ vadd.s16 q4, q1, q3
2082
+ vsub.s16 q3, q1, q3
2083
+ vadd.s16 q7, q7, q6
2084
+ vadd.s16 q11, q11, q6
2085
+ vadd.s16 q7, q7, q5
2086
+ vadd.s16 q13, q3, q11
2087
+ vsub.s16 q11, q3, q11
2088
+ vadd.s16 q9, q4, q7
2089
+ vsub.s16 q15, q4, q7
2090
+ subs TMP, TMP, #1
2091
+ bne 1b
2092
+
2093
+ /* store results */
2094
+ vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
2095
+ vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
2096
+ vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
2097
+ vst1.16 {d28, d29, d30, d31}, [DATA, :128]
2098
+
2099
+ vpop {d8-d15}
2100
+ bx lr
2101
+
2102
+ .unreq DATA
2103
+ .unreq TMP
2104
+
2105
+
2106
+ /*****************************************************************************/
2107
+
2108
+ /*
2109
+ * GLOBAL(void)
2110
+ * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
2111
+ * DCTELEM *workspace);
2112
+ *
2113
+ * Note: the code uses 2-stage pipelining to improve instruction
2114
+ * scheduling and eliminate stalls (this provides ~15% better
2115
+ * performance for this function on both ARM Cortex-A8 and
2116
+ * ARM Cortex-A9 when compared to the non-pipelined variant).
2117
+ * The instructions which belong to the second stage use different
2118
+ * indentation for better readability.
2119
+ */
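+
+ /* Scalar reference for one coefficient (a sketch, not part of the
+ original source). The divisor table packs reciprocal, correction and
+ shift sub-tables at 64-element offsets, and the division is done as a
+ sign-magnitude multiply-by-reciprocal followed by a variable right
+ shift (VSHL by a negated shift count):
+
+ #include <stdint.h>
+
+ static int16_t quantize_one(int16_t coef, uint16_t recip,
+                             uint16_t corr, uint16_t shift) {
+   uint16_t mag = (uint16_t)((coef < 0 ? -coef : coef) + corr); // add correction
+   uint16_t q = (uint16_t)((((uint32_t)mag * recip) >> 16) >> shift);
+   return coef < 0 ? (int16_t)-q : (int16_t)q;                  // restore sign
+ }
+ */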
2120
+ asm_function jsimd_quantize_neon
2121
+
2122
+ COEF_BLOCK .req r0
2123
+ DIVISORS .req r1
2124
+ WORKSPACE .req r2
2125
+
2126
+ RECIPROCAL .req DIVISORS
2127
+ CORRECTION .req r3
2128
+ SHIFT .req ip
2129
+ LOOP_COUNT .req r4
2130
+
2131
+ vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2132
+ vabs.s16 q12, q0
2133
+ add CORRECTION, DIVISORS, #(64 * 2)
2134
+ add SHIFT, DIVISORS, #(64 * 6)
2135
+ vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2136
+ vabs.s16 q13, q1
2137
+ vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2138
+ vadd.u16 q12, q12, q10 /* add correction */
2139
+ vadd.u16 q13, q13, q11
2140
+ vmull.u16 q10, d24, d16 /* multiply by reciprocal */
2141
+ vmull.u16 q11, d25, d17
2142
+ vmull.u16 q8, d26, d18
2143
+ vmull.u16 q9, d27, d19
2144
+ vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2145
+ vshrn.u32 d20, q10, #16
2146
+ vshrn.u32 d21, q11, #16
2147
+ vshrn.u32 d22, q8, #16
2148
+ vshrn.u32 d23, q9, #16
2149
+ vneg.s16 q12, q12
2150
+ vneg.s16 q13, q13
2151
+ vshr.s16 q2, q0, #15 /* extract sign */
2152
+ vshr.s16 q3, q1, #15
2153
+ vshl.u16 q14, q10, q12 /* shift */
2154
+ vshl.u16 q15, q11, q13
2155
+
2156
+ push {r4, r5}
2157
+ mov LOOP_COUNT, #3
2158
+ 1:
2159
+ vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
2160
+ veor.u16 q14, q14, q2 /* restore sign */
2161
+ vabs.s16 q12, q0
2162
+ vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
2163
+ vabs.s16 q13, q1
2164
+ veor.u16 q15, q15, q3
2165
+ vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
2166
+ vadd.u16 q12, q12, q10 /* add correction */
2167
+ vadd.u16 q13, q13, q11
2168
+ vmull.u16 q10, d24, d16 /* multiply by reciprocal */
2169
+ vmull.u16 q11, d25, d17
2170
+ vmull.u16 q8, d26, d18
2171
+ vmull.u16 q9, d27, d19
2172
+ vsub.u16 q14, q14, q2
2173
+ vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
2174
+ vsub.u16 q15, q15, q3
2175
+ vshrn.u32 d20, q10, #16
2176
+ vshrn.u32 d21, q11, #16
2177
+ vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2178
+ vshrn.u32 d22, q8, #16
2179
+ vshrn.u32 d23, q9, #16
2180
+ vneg.s16 q12, q12
2181
+ vneg.s16 q13, q13
2182
+ vshr.s16 q2, q0, #15 /* extract sign */
2183
+ vshr.s16 q3, q1, #15
2184
+ vshl.u16 q14, q10, q12 /* shift */
2185
+ vshl.u16 q15, q11, q13
2186
+ subs LOOP_COUNT, LOOP_COUNT, #1
2187
+ bne 1b
2188
+ pop {r4, r5}
2189
+
2190
+ veor.u16 q14, q14, q2 /* restore sign */
2191
+ veor.u16 q15, q15, q3
2192
+ vsub.u16 q14, q14, q2
2193
+ vsub.u16 q15, q15, q3
2194
+ vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
2195
+
2196
+ bx lr /* return */
2197
+
2198
+ .unreq COEF_BLOCK
2199
+ .unreq DIVISORS
2200
+ .unreq WORKSPACE
2201
+ .unreq RECIPROCAL
2202
+ .unreq CORRECTION
2203
+ .unreq SHIFT
2204
+ .unreq LOOP_COUNT
2205
+
2206
+
2207
+ /*****************************************************************************/
2208
+
2209
+ /*
2210
+ * GLOBAL(void)
2211
+ * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
2212
+ * JDIMENSION downsampled_width,
2213
+ * JSAMPARRAY input_data,
2214
+ * JSAMPARRAY *output_data_ptr);
2215
+ *
2216
+ * Note: the use of unaligned writes is the main remaining bottleneck in
2217
+ * this code; addressing it could potentially yield up to tens
2219
+ * of percent performance improvement on Cortex-A8/Cortex-A9.
2219
+ */
2220
+
2221
+ /*
2222
+ * Upsample 16 source pixels to 32 destination pixels. The new 16 source
2223
+ * pixels are loaded to q0. The previous 16 source pixels are in q1. The
2224
+ * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
2225
+ * Register d28 is used for multiplication by 3. Register q15 is used
2226
+ * for adding +1 bias.
2227
+ */
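+
+ /* Scalar reference for the fancy h2v1 upsampling implemented by the
+ macros below (a sketch based on jdsample.c's triangle filter; the edge
+ and rounding details are assumptions): each source pixel yields two
+ outputs weighted 3:1 toward the nearer neighbour, with alternating
+ +1/+2 rounding biases, and the first and last outputs copy the edge
+ pixels:
+
+ #include <stdint.h>
+
+ static void h2v1_fancy_upsample_ref(const uint8_t *in, int w,
+                                     uint8_t *out) {  // assumes w >= 2
+   out[0] = in[0];                            // left edge
+   out[2 * w - 1] = in[w - 1];                // right edge
+   for (int i = 0; i < w; i++) {
+     int v = in[i] * 3;
+     if (i > 0)
+       out[2 * i] = (uint8_t)((v + in[i - 1] + 1) >> 2);
+     if (i < w - 1)
+       out[2 * i + 1] = (uint8_t)((v + in[i + 1] + 2) >> 2);
+   }
+ }
+ */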
2228
+ .macro upsample16 OUTPTR, INPTR
2229
+ vld1.8 {q0}, [\INPTR]!
2230
+ vmovl.u8 q8, d0
2231
+ vext.8 q2, q1, q0, #15
2232
+ vmovl.u8 q9, d1
2233
+ vaddw.u8 q10, q15, d4
2234
+ vaddw.u8 q11, q15, d5
2235
+ vmlal.u8 q8, d4, d28
2236
+ vmlal.u8 q9, d5, d28
2237
+ vmlal.u8 q10, d0, d28
2238
+ vmlal.u8 q11, d1, d28
2239
+ vmov q1, q0 /* backup source pixels to q1 */
2240
+ vrshrn.u16 d6, q8, #2
2241
+ vrshrn.u16 d7, q9, #2
2242
+ vshrn.u16 d8, q10, #2
2243
+ vshrn.u16 d9, q11, #2
2244
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
2245
+ .endm
2246
+
2247
+ /*
2248
+ * Upsample 32 source pixels to 64 destination pixels. Compared to the 'upsample16'
2249
+ * macro, the roles of the q0 and q1 registers are reversed for even and odd
2250
+ * groups of 16 pixels, which is why the "vmov q1, q0" instruction is not needed.
2251
+ * This unrolling also allows loads and stores to be reordered, hiding
2252
+ * multiplication latency and reducing stalls.
2253
+ */
2254
+ .macro upsample32 OUTPTR, INPTR
2255
+ /* even 16 pixels group */
2256
+ vld1.8 {q0}, [\INPTR]!
2257
+ vmovl.u8 q8, d0
2258
+ vext.8 q2, q1, q0, #15
2259
+ vmovl.u8 q9, d1
2260
+ vaddw.u8 q10, q15, d4
2261
+ vaddw.u8 q11, q15, d5
2262
+ vmlal.u8 q8, d4, d28
2263
+ vmlal.u8 q9, d5, d28
2264
+ vmlal.u8 q10, d0, d28
2265
+ vmlal.u8 q11, d1, d28
2266
+ /* odd 16 pixels group */
2267
+ vld1.8 {q1}, [\INPTR]!
2268
+ vrshrn.u16 d6, q8, #2
2269
+ vrshrn.u16 d7, q9, #2
2270
+ vshrn.u16 d8, q10, #2
2271
+ vshrn.u16 d9, q11, #2
2272
+ vmovl.u8 q8, d2
2273
+ vext.8 q2, q0, q1, #15
2274
+ vmovl.u8 q9, d3
2275
+ vaddw.u8 q10, q15, d4
2276
+ vaddw.u8 q11, q15, d5
2277
+ vmlal.u8 q8, d4, d28
2278
+ vmlal.u8 q9, d5, d28
2279
+ vmlal.u8 q10, d2, d28
2280
+ vmlal.u8 q11, d3, d28
2281
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
2282
+ vrshrn.u16 d6, q8, #2
2283
+ vrshrn.u16 d7, q9, #2
2284
+ vshrn.u16 d8, q10, #2
2285
+ vshrn.u16 d9, q11, #2
2286
+ vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
2287
+ .endm
2288
+
2289
+ /*
2290
+ * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
2291
+ */
2292
+ .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
2293
+ /* special case for the first and last pixels */
2294
+ sub \WIDTH, \WIDTH, #1
2295
+ add \OUTPTR, \OUTPTR, #1
2296
+ ldrb \TMP1, [\INPTR, \WIDTH]
2297
+ strb \TMP1, [\OUTPTR, \WIDTH, asl #1]
2298
+ ldrb \TMP1, [\INPTR], #1
2299
+ strb \TMP1, [\OUTPTR, #-1]
2300
+ vmov.8 d3[7], \TMP1
2301
+
2302
+ subs \WIDTH, \WIDTH, #32
2303
+ blt 5f
2304
+ 0: /* process 32 pixels per iteration */
2305
+ upsample32 \OUTPTR, \INPTR
2306
+ subs \WIDTH, \WIDTH, #32
2307
+ bge 0b
2308
+ 5:
2309
+ adds \WIDTH, \WIDTH, #16
2310
+ blt 1f
2311
+ 0: /* process 16 pixels if needed */
2312
+ upsample16 \OUTPTR, \INPTR
2313
+ subs \WIDTH, \WIDTH, #16
2314
+ 1:
2315
+ adds \WIDTH, \WIDTH, #16
2316
+ beq 9f
2317
+
2318
+ /* load the remaining 1-15 pixels */
2319
+ add \INPTR, \INPTR, \WIDTH
2320
+ tst \WIDTH, #1
2321
+ beq 2f
2322
+ sub \INPTR, \INPTR, #1
2323
+ vld1.8 {d0[0]}, [\INPTR]
2324
+ 2:
2325
+ tst \WIDTH, #2
2326
+ beq 2f
2327
+ vext.8 d0, d0, d0, #6
2328
+ sub \INPTR, \INPTR, #1
2329
+ vld1.8 {d0[1]}, [\INPTR]
2330
+ sub \INPTR, \INPTR, #1
2331
+ vld1.8 {d0[0]}, [\INPTR]
2332
+ 2:
2333
+ tst \WIDTH, #4
2334
+ beq 2f
2335
+ vrev64.32 d0, d0
2336
+ sub \INPTR, \INPTR, #1
2337
+ vld1.8 {d0[3]}, [\INPTR]
2338
+ sub \INPTR, \INPTR, #1
2339
+ vld1.8 {d0[2]}, [\INPTR]
2340
+ sub \INPTR, \INPTR, #1
2341
+ vld1.8 {d0[1]}, [\INPTR]
2342
+ sub \INPTR, \INPTR, #1
2343
+ vld1.8 {d0[0]}, [\INPTR]
2344
+ 2:
2345
+ tst \WIDTH, #8
2346
+ beq 2f
2347
+ vmov d1, d0
2348
+ sub \INPTR, \INPTR, #8
2349
+ vld1.8 {d0}, [\INPTR]
2350
+ 2: /* upsample the remaining pixels */
2351
+ vmovl.u8 q8, d0
2352
+ vext.8 q2, q1, q0, #15
2353
+ vmovl.u8 q9, d1
2354
+ vaddw.u8 q10, q15, d4
2355
+ vaddw.u8 q11, q15, d5
2356
+ vmlal.u8 q8, d4, d28
2357
+ vmlal.u8 q9, d5, d28
2358
+ vmlal.u8 q10, d0, d28
2359
+ vmlal.u8 q11, d1, d28
2360
+ vrshrn.u16 d10, q8, #2
2361
+ vrshrn.u16 d12, q9, #2
2362
+ vshrn.u16 d11, q10, #2
2363
+ vshrn.u16 d13, q11, #2
2364
+ vzip.8 d10, d11
2365
+ vzip.8 d12, d13
2366
+ /* store the remaining pixels */
2367
+ tst \WIDTH, #8
2368
+ beq 2f
2369
+ vst1.8 {d10, d11}, [\OUTPTR]!
2370
+ vmov q5, q6
2371
+ 2:
2372
+ tst \WIDTH, #4
2373
+ beq 2f
2374
+ vst1.8 {d10}, [\OUTPTR]!
2375
+ vmov d10, d11
2376
+ 2:
2377
+ tst \WIDTH, #2
2378
+ beq 2f
2379
+ vst1.8 {d10[0]}, [\OUTPTR]!
2380
+ vst1.8 {d10[1]}, [\OUTPTR]!
2381
+ vst1.8 {d10[2]}, [\OUTPTR]!
2382
+ vst1.8 {d10[3]}, [\OUTPTR]!
2383
+ vext.8 d10, d10, d10, #4
2384
+ 2:
2385
+ tst \WIDTH, #1
2386
+ beq 2f
2387
+ vst1.8 {d10[0]}, [\OUTPTR]!
2388
+ vst1.8 {d10[1]}, [\OUTPTR]!
2389
+ 2:
2390
+ 9:
2391
+ .endm
2392
+
+ asm_function jsimd_h2v1_fancy_upsample_neon
+
+ MAX_V_SAMP_FACTOR .req r0
+ DOWNSAMPLED_WIDTH .req r1
+ INPUT_DATA .req r2
+ OUTPUT_DATA_PTR .req r3
+ OUTPUT_DATA .req OUTPUT_DATA_PTR
+
+ OUTPTR .req r4
+ INPTR .req r5
+ WIDTH .req ip
+ TMP .req lr
+
+ push {r4, r5, r6, lr}
+ vpush {d8-d15}
+
+ ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
+ cmp MAX_V_SAMP_FACTOR, #0
+ ble 99f
+
+ /* initialize constants */
+ vmov.u8 d28, #3
+ vmov.u16 q15, #1
+ 11:
+ ldr INPTR, [INPUT_DATA], #4
+ ldr OUTPTR, [OUTPUT_DATA], #4
+ mov WIDTH, DOWNSAMPLED_WIDTH
+ upsample_row OUTPTR, INPTR, WIDTH, TMP
+ subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
+ bgt 11b
+
+ 99:
+ vpop {d8-d15}
+ pop {r4, r5, r6, pc}
+
+ .unreq MAX_V_SAMP_FACTOR
+ .unreq DOWNSAMPLED_WIDTH
+ .unreq INPUT_DATA
+ .unreq OUTPUT_DATA_PTR
+ .unreq OUTPUT_DATA
+
+ .unreq OUTPTR
+ .unreq INPTR
+ .unreq WIDTH
+ .unreq TMP
+
+ .purgem upsample16
+ .purgem upsample32
+ .purgem upsample_row
+
+
+ /*****************************************************************************/
+
+ /*
+ * GLOBAL(JOCTET *)
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
+ * JCOEFPTR block, int last_dc_val,
+ * c_derived_tbl *dctbl, c_derived_tbl *actbl)
+ *
+ */
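+
+ /*
+ * A rough scalar sketch of the algorithm, following encode_one_block()
+ * in jchuff.c (error handling omitted, names abbreviated; put_bits(code,
+ * size) stands for the bit-emitting primitive defined below):
+ *
+ *   temp = block[0] - last_dc_val;          (DC is coded as a difference)
+ *   nbits = bit length of abs(temp);
+ *   put_bits(dctbl->ehufco[nbits], dctbl->ehufsi[nbits]);
+ *   put_bits(temp < 0 ? temp - 1 : temp, nbits);
+ *   r = 0;                                  (run of zero AC coefficients)
+ *   for (k = 1; k < 64; k++) {
+ *     if ((temp = block[jpeg_natural_order[k]]) == 0) { r++; continue; }
+ *     while (r > 15) {                      (ZRL code per 16-zero run)
+ *       put_bits(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]); r -= 16;
+ *     }
+ *     nbits = bit length of abs(temp);
+ *     put_bits(actbl->ehufco[(r << 4) + nbits],
+ *              actbl->ehufsi[(r << 4) + nbits]);
+ *     put_bits(temp < 0 ? temp - 1 : temp, nbits);
+ *     r = 0;
+ *   }
+ *   if (r > 0)                              (end-of-block code)
+ *     put_bits(actbl->ehufco[0], actbl->ehufsi[0]);
+ *
+ * The NEON version below precomputes nbits and the coded coefficient
+ * bits for all 64 positions up front, then walks a 64-bit bitmap of
+ * nonzero coefficients with clz instead of testing every k.
+ */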
+
+ .macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+ sub \PUT_BITS, \PUT_BITS, #0x8
+ lsr \TMP, \PUT_BUFFER, \PUT_BITS
+ uxtb \TMP, \TMP
+ strb \TMP, [\BUFFER, #1]!
+ cmp \TMP, #0xff
+ /*it eq*/
+ strbeq \ZERO, [\BUFFER, #1]!
+ .endm
+
+ .macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
+ /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
+ add \PUT_BITS, \SIZE
+ /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
+ orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
+ .endm
+
+ .macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
+ cmp \PUT_BITS, #0x10
+ blt 15f
+ eor \ZERO, \ZERO, \ZERO
+ emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+ emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
+ 15:
+ .endm
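+
+ /*
+ * Roughly, the three macros above implement the following C logic,
+ * keeping the pending bits in a 32-bit register. emit_byte also performs
+ * JPEG byte stuffing: a 0xff data byte must be followed by 0x00 so it
+ * cannot be mistaken for a marker.
+ *
+ *   put_buffer = (put_buffer << size) | code;     (put_bits macro)
+ *   put_bits += size;
+ *   if (put_bits >= 16) {                         (checkbuf15)
+ *     for (i = 0; i < 2; i++) {                   (emit_byte, twice)
+ *       put_bits -= 8;
+ *       c = (put_buffer >> put_bits) & 0xff;
+ *       *buffer++ = c;
+ *       if (c == 0xff) *buffer++ = 0;             (stuff a zero byte)
+ *     }
+ *   }
+ */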
+
+ .balign 16
+ jsimd_huff_encode_one_block_neon_consts:
+ .byte 0x01
+ .byte 0x02
+ .byte 0x04
+ .byte 0x08
+ .byte 0x10
+ .byte 0x20
+ .byte 0x40
+ .byte 0x80
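+ /* one bit per byte lane: vand-ed with the vceq.i16 results and folded
+ * with vpadd.i8, these masks pack the 64 "coefficient is zero" bytes
+ * into a 64-bit bitmap that the encoding loop scans with clz */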
+
+ asm_function jsimd_huff_encode_one_block_neon
+ push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+ add r7, sp, #0x1c
+ sub r4, sp, #0x40
+ bfc r4, #0, #5
+ mov sp, r4 /* align sp on 32 bytes */
+ vst1.64 {d8, d9, d10, d11}, [r4, :128]!
+ vst1.64 {d12, d13, d14, d15}, [r4, :128]
+ sub sp, #0x140 /* reserve 320 bytes */
+ str r0, [sp, #0x18] /* working state > sp + 0x18 */
+ add r4, sp, #0x20 /* r4 = t1 */
+ ldr lr, [r7, #0x8] /* lr = dctbl */
+ sub r10, r1, #0x1 /* r10=buffer-- */
+ ldrsh r1, [r2]
+ mov r9, #0x10
+ mov r8, #0x1
+ adr r5, jsimd_huff_encode_one_block_neon_consts
+ /* prepare data */
+ vld1.8 {d26}, [r5, :64]
+ veor q8, q8, q8
+ veor q9, q9, q9
+ vdup.16 q14, r9
+ vdup.16 q15, r8
+ veor q10, q10, q10
+ veor q11, q11, q11
+ sub r1, r1, r3
+ add r9, r2, #0x22
+ add r8, r2, #0x18
+ add r3, r2, #0x36
+ vmov.16 d0[0], r1
+ vld1.16 {d2[0]}, [r9, :16]
+ vld1.16 {d4[0]}, [r8, :16]
+ vld1.16 {d6[0]}, [r3, :16]
+ add r1, r2, #0x2
+ add r9, r2, #0x30
+ add r8, r2, #0x26
+ add r3, r2, #0x28
+ vld1.16 {d0[1]}, [r1, :16]
+ vld1.16 {d2[1]}, [r9, :16]
+ vld1.16 {d4[1]}, [r8, :16]
+ vld1.16 {d6[1]}, [r3, :16]
+ add r1, r2, #0x10
+ add r9, r2, #0x40
+ add r8, r2, #0x34
+ add r3, r2, #0x1a
+ vld1.16 {d0[2]}, [r1, :16]
+ vld1.16 {d2[2]}, [r9, :16]
+ vld1.16 {d4[2]}, [r8, :16]
+ vld1.16 {d6[2]}, [r3, :16]
+ add r1, r2, #0x20
+ add r9, r2, #0x32
+ add r8, r2, #0x42
+ add r3, r2, #0xc
+ vld1.16 {d0[3]}, [r1, :16]
+ vld1.16 {d2[3]}, [r9, :16]
+ vld1.16 {d4[3]}, [r8, :16]
+ vld1.16 {d6[3]}, [r3, :16]
+ add r1, r2, #0x12
+ add r9, r2, #0x24
+ add r8, r2, #0x50
+ add r3, r2, #0xe
+ vld1.16 {d1[0]}, [r1, :16]
+ vld1.16 {d3[0]}, [r9, :16]
+ vld1.16 {d5[0]}, [r8, :16]
+ vld1.16 {d7[0]}, [r3, :16]
+ add r1, r2, #0x4
+ add r9, r2, #0x16
+ add r8, r2, #0x60
+ add r3, r2, #0x1c
+ vld1.16 {d1[1]}, [r1, :16]
+ vld1.16 {d3[1]}, [r9, :16]
+ vld1.16 {d5[1]}, [r8, :16]
+ vld1.16 {d7[1]}, [r3, :16]
+ add r1, r2, #0x6
+ add r9, r2, #0x8
+ add r8, r2, #0x52
+ add r3, r2, #0x2a
+ vld1.16 {d1[2]}, [r1, :16]
+ vld1.16 {d3[2]}, [r9, :16]
+ vld1.16 {d5[2]}, [r8, :16]
+ vld1.16 {d7[2]}, [r3, :16]
+ add r1, r2, #0x14
+ add r9, r2, #0xa
+ add r8, r2, #0x44
+ add r3, r2, #0x38
+ vld1.16 {d1[3]}, [r1, :16]
+ vld1.16 {d3[3]}, [r9, :16]
+ vld1.16 {d5[3]}, [r8, :16]
+ vld1.16 {d7[3]}, [r3, :16]
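+ /* the interleaved loads above gather coefficients in zigzag order:
+ * each byte offset is 2 * jpeg_natural_order[k], so q0-q3 now hold
+ * zigzag positions 0-31 (with the DC difference in lane d0[0]) */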
+ vcgt.s16 q8, q8, q0
+ vcgt.s16 q9, q9, q1
+ vcgt.s16 q10, q10, q2
+ vcgt.s16 q11, q11, q3
+ vabs.s16 q0, q0
+ vabs.s16 q1, q1
+ vabs.s16 q2, q2
+ vabs.s16 q3, q3
+ veor q8, q8, q0
+ veor q9, q9, q1
+ veor q10, q10, q2
+ veor q11, q11, q3
+ add r9, r4, #0x20
+ add r8, r4, #0x80
+ add r3, r4, #0xa0
+ vclz.i16 q0, q0
+ vclz.i16 q1, q1
+ vclz.i16 q2, q2
+ vclz.i16 q3, q3
+ vsub.i16 q0, q14, q0
+ vsub.i16 q1, q14, q1
+ vsub.i16 q2, q14, q2
+ vsub.i16 q3, q14, q3
+ vst1.16 {d0, d1, d2, d3}, [r4, :256]
+ vst1.16 {d4, d5, d6, d7}, [r9, :256]
+ vshl.s16 q0, q15, q0
+ vshl.s16 q1, q15, q1
+ vshl.s16 q2, q15, q2
+ vshl.s16 q3, q15, q3
+ vsub.i16 q0, q0, q15
+ vsub.i16 q1, q1, q15
+ vsub.i16 q2, q2, q15
+ vsub.i16 q3, q3, q15
+ vand q8, q8, q0
+ vand q9, q9, q1
+ vand q10, q10, q2
+ vand q11, q11, q3
+ vst1.16 {d16, d17, d18, d19}, [r8, :256]
+ vst1.16 {d20, d21, d22, d23}, [r3, :256]
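+ /* at this point t1 holds nbits = 16 - clz16(abs(v)) for each of the
+ * first 32 coefficients, and t2 holds the nbits-wide coded value
+ * (one's complement for negative v), i.e. the scalar encoder's
+ * temp/temp2 pair; the same is computed for positions 32-63 below */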
2619
+ add r1, r2, #0x46
2620
+ add r9, r2, #0x3a
2621
+ add r8, r2, #0x74
2622
+ add r3, r2, #0x6a
2623
+ vld1.16 {d8[0]}, [r1, :16]
2624
+ vld1.16 {d10[0]}, [r9, :16]
2625
+ vld1.16 {d12[0]}, [r8, :16]
2626
+ vld1.16 {d14[0]}, [r3, :16]
2627
+ veor q8, q8, q8
2628
+ veor q9, q9, q9
2629
+ veor q10, q10, q10
2630
+ veor q11, q11, q11
2631
+ add r1, r2, #0x54
2632
+ add r9, r2, #0x2c
2633
+ add r8, r2, #0x76
2634
+ add r3, r2, #0x78
2635
+ vld1.16 {d8[1]}, [r1, :16]
2636
+ vld1.16 {d10[1]}, [r9, :16]
2637
+ vld1.16 {d12[1]}, [r8, :16]
2638
+ vld1.16 {d14[1]}, [r3, :16]
2639
+ add r1, r2, #0x62
2640
+ add r9, r2, #0x1e
2641
+ add r8, r2, #0x68
2642
+ add r3, r2, #0x7a
2643
+ vld1.16 {d8[2]}, [r1, :16]
2644
+ vld1.16 {d10[2]}, [r9, :16]
2645
+ vld1.16 {d12[2]}, [r8, :16]
2646
+ vld1.16 {d14[2]}, [r3, :16]
2647
+ add r1, r2, #0x70
2648
+ add r9, r2, #0x2e
2649
+ add r8, r2, #0x5a
2650
+ add r3, r2, #0x6c
2651
+ vld1.16 {d8[3]}, [r1, :16]
2652
+ vld1.16 {d10[3]}, [r9, :16]
2653
+ vld1.16 {d12[3]}, [r8, :16]
2654
+ vld1.16 {d14[3]}, [r3, :16]
2655
+ add r1, r2, #0x72
2656
+ add r9, r2, #0x3c
2657
+ add r8, r2, #0x4c
2658
+ add r3, r2, #0x5e
2659
+ vld1.16 {d9[0]}, [r1, :16]
2660
+ vld1.16 {d11[0]}, [r9, :16]
2661
+ vld1.16 {d13[0]}, [r8, :16]
2662
+ vld1.16 {d15[0]}, [r3, :16]
2663
+ add r1, r2, #0x64
2664
+ add r9, r2, #0x4a
2665
+ add r8, r2, #0x3e
2666
+ add r3, r2, #0x6e
2667
+ vld1.16 {d9[1]}, [r1, :16]
2668
+ vld1.16 {d11[1]}, [r9, :16]
2669
+ vld1.16 {d13[1]}, [r8, :16]
2670
+ vld1.16 {d15[1]}, [r3, :16]
2671
+ add r1, r2, #0x56
2672
+ add r9, r2, #0x58
2673
+ add r8, r2, #0x4e
2674
+ add r3, r2, #0x7c
2675
+ vld1.16 {d9[2]}, [r1, :16]
2676
+ vld1.16 {d11[2]}, [r9, :16]
2677
+ vld1.16 {d13[2]}, [r8, :16]
2678
+ vld1.16 {d15[2]}, [r3, :16]
2679
+ add r1, r2, #0x48
2680
+ add r9, r2, #0x66
2681
+ add r8, r2, #0x5c
2682
+ add r3, r2, #0x7e
2683
+ vld1.16 {d9[3]}, [r1, :16]
2684
+ vld1.16 {d11[3]}, [r9, :16]
2685
+ vld1.16 {d13[3]}, [r8, :16]
2686
+ vld1.16 {d15[3]}, [r3, :16]
2687
+ vcgt.s16 q8, q8, q4
2688
+ vcgt.s16 q9, q9, q5
2689
+ vcgt.s16 q10, q10, q6
2690
+ vcgt.s16 q11, q11, q7
2691
+ vabs.s16 q4, q4
2692
+ vabs.s16 q5, q5
2693
+ vabs.s16 q6, q6
2694
+ vabs.s16 q7, q7
2695
+ veor q8, q8, q4
2696
+ veor q9, q9, q5
2697
+ veor q10, q10, q6
2698
+ veor q11, q11, q7
2699
+ add r1, r4, #0x40
2700
+ add r9, r4, #0x60
2701
+ add r8, r4, #0xc0
2702
+ add r3, r4, #0xe0
2703
+ vclz.i16 q4, q4
2704
+ vclz.i16 q5, q5
2705
+ vclz.i16 q6, q6
2706
+ vclz.i16 q7, q7
2707
+ vsub.i16 q4, q14, q4
2708
+ vsub.i16 q5, q14, q5
2709
+ vsub.i16 q6, q14, q6
2710
+ vsub.i16 q7, q14, q7
2711
+ vst1.16 {d8, d9, d10, d11}, [r1, :256]
2712
+ vst1.16 {d12, d13, d14, d15}, [r9, :256]
2713
+ vshl.s16 q4, q15, q4
2714
+ vshl.s16 q5, q15, q5
2715
+ vshl.s16 q6, q15, q6
2716
+ vshl.s16 q7, q15, q7
2717
+ vsub.i16 q4, q4, q15
2718
+ vsub.i16 q5, q5, q15
2719
+ vsub.i16 q6, q6, q15
2720
+ vsub.i16 q7, q7, q15
2721
+ vand q8, q8, q4
2722
+ vand q9, q9, q5
2723
+ vand q10, q10, q6
2724
+ vand q11, q11, q7
2725
+ vst1.16 {d16, d17, d18, d19}, [r8, :256]
2726
+ vst1.16 {d20, d21, d22, d23}, [r3, :256]
2727
+ ldr r12, [r7, #0xc] /* r12 = actbl */
2728
+ add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
2729
+ mov r9, r12 /* r9 = actbl */
2730
+ add r6, r4, #0x80 /* r6 = t2 */
2731
+ ldr r11, [r0, #0x8] /* r11 = put_buffer */
2732
+ ldr r4, [r0, #0xc] /* r4 = put_bits */
2733
+ ldrh r2, [r6, #-128] /* r2 = nbits */
2734
+ ldrh r3, [r6] /* r3 = temp2 & (((JLONG)1)<<nbits) - 1; */
2735
+ ldr r0, [lr, r2, lsl #2]
2736
+ ldrb r5, [r1, r2]
2737
+ put_bits r11, r4, r0, r5
2738
+ checkbuf15 r10, r11, r4, r5, r0
2739
+ put_bits r11, r4, r3, r2
2740
+ checkbuf15 r10, r11, r4, r5, r0
2741
+ mov lr, r6 /* lr = t2 */
2742
+ add r5, r9, #0x400 /* r5 = actbl->ehufsi */
2743
+ ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
2744
+ veor q8, q8, q8
2745
+ vceq.i16 q0, q0, q8
2746
+ vceq.i16 q1, q1, q8
2747
+ vceq.i16 q2, q2, q8
2748
+ vceq.i16 q3, q3, q8
2749
+ vceq.i16 q4, q4, q8
2750
+ vceq.i16 q5, q5, q8
2751
+ vceq.i16 q6, q6, q8
2752
+ vceq.i16 q7, q7, q8
2753
+ vmovn.i16 d0, q0
2754
+ vmovn.i16 d2, q1
2755
+ vmovn.i16 d4, q2
2756
+ vmovn.i16 d6, q3
2757
+ vmovn.i16 d8, q4
2758
+ vmovn.i16 d10, q5
2759
+ vmovn.i16 d12, q6
2760
+ vmovn.i16 d14, q7
2761
+ vand d0, d0, d26
2762
+ vand d2, d2, d26
2763
+ vand d4, d4, d26
2764
+ vand d6, d6, d26
2765
+ vand d8, d8, d26
2766
+ vand d10, d10, d26
2767
+ vand d12, d12, d26
2768
+ vand d14, d14, d26
2769
+ vpadd.i8 d0, d0, d2
2770
+ vpadd.i8 d4, d4, d6
2771
+ vpadd.i8 d8, d8, d10
2772
+ vpadd.i8 d12, d12, d14
2773
+ vpadd.i8 d0, d0, d4
2774
+ vpadd.i8 d8, d8, d12
2775
+ vpadd.i8 d0, d0, d8
2776
+ vmov.32 r1, d0[1]
2777
+ vmov.32 r8, d0[0]
2778
+ mvn r1, r1
2779
+ mvn r8, r8
2780
+ lsrs r1, r1, #0x1
2781
+ rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
2782
+ rbit r1, r1 /* r1 = index1 */
2783
+ rbit r8, r8 /* r8 = index0 */
2784
+ ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
2785
+ str r1, [sp, #0x14] /* index1 > sp + 0x14 */
2786
+ cmp r8, #0x0
2787
+ beq 6f
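+ /* first half of the block: clz on the bit-reversed nonzero bitmap
+ * gives the distance to the next nonzero coefficient, i.e. the zero
+ * run length fed into the run/size Huffman lookup */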
+ 1:
+ clz r2, r8
+ add lr, lr, r2, lsl #1
+ lsl r8, r8, r2
+ ldrh r1, [lr, #-126]
+ 2:
+ cmp r2, #0x10
+ blt 3f
+ sub r2, r2, #0x10
+ put_bits r11, r4, r0, r6
+ cmp r4, #0x10
+ blt 2b
+ eor r3, r3, r3
+ emit_byte r10, r11, r4, r3, r12
+ emit_byte r10, r11, r4, r3, r12
+ b 2b
+ 3:
+ add r2, r1, r2, lsl #4
+ ldrh r3, [lr, #2]!
+ ldr r12, [r9, r2, lsl #2]
+ ldrb r2, [r5, r2]
+ put_bits r11, r4, r12, r2
+ checkbuf15 r10, r11, r4, r2, r12
+ put_bits r11, r4, r3, r1
+ checkbuf15 r10, r11, r4, r2, r12
+ lsls r8, r8, #0x1
+ bne 1b
+ 6:
+ add r12, sp, #0x20 /* r12 = t1 */
+ ldr r8, [sp, #0x14] /* r8 = index1 */
+ adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
+ cmp r8, #0x0
+ beq 6f
+ clz r2, r8
+ sub r12, r12, lr
+ lsl r8, r8, r2
+ add r2, r2, r12, lsr #1
+ add lr, lr, r2, lsl #1
+ b 7f
+ 1:
+ clz r2, r8
+ add lr, lr, r2, lsl #1
+ lsl r8, r8, r2
+ 7:
+ ldrh r1, [lr, #-126]
+ 2:
+ cmp r2, #0x10
+ blt 3f
+ sub r2, r2, #0x10
+ put_bits r11, r4, r0, r6
+ cmp r4, #0x10
+ blt 2b
+ eor r3, r3, r3
+ emit_byte r10, r11, r4, r3, r12
+ emit_byte r10, r11, r4, r3, r12
+ b 2b
+ 3:
+ add r2, r1, r2, lsl #4
+ ldrh r3, [lr, #2]!
+ ldr r12, [r9, r2, lsl #2]
+ ldrb r2, [r5, r2]
+ put_bits r11, r4, r12, r2
+ checkbuf15 r10, r11, r4, r2, r12
+ put_bits r11, r4, r3, r1
+ checkbuf15 r10, r11, r4, r2, r12
+ lsls r8, r8, #0x1
+ bne 1b
+ 6:
+ add r0, sp, #0x20
+ add r0, #0xfe
+ cmp lr, r0
+ bhs 1f
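+ /* the last nonzero coefficient was not #63: emit the end-of-block
+ * code, actbl->ehufco[0] */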
2860
+ ldr r1, [r9]
2861
+ ldrb r0, [r5]
2862
+ put_bits r11, r4, r1, r0
2863
+ checkbuf15 r10, r11, r4, r0, r1
2864
+ 1:
2865
+ ldr r12, [sp, #0x18]
2866
+ str r11, [r12, #0x8]
2867
+ str r4, [r12, #0xc]
2868
+ add r0, r10, #0x1
2869
+ add r4, sp, #0x140
2870
+ vld1.64 {d8, d9, d10, d11}, [r4, :128]!
2871
+ vld1.64 {d12, d13, d14, d15}, [r4, :128]
2872
+ sub r4, r7, #0x1c
2873
+ mov sp, r4
2874
+ pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}
2875
+
2876
+ .purgem emit_byte
2877
+ .purgem put_bits
2878
+ .purgem checkbuf15