epeg 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (504):
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/MANIFEST +5 -0
  4. data/TODO +1 -0
  5. data/epeg/.dockerignore +4 -0
  6. data/epeg/.gitignore +5 -0
  7. data/epeg/CMakeLists.txt +30 -0
  8. data/epeg/Dockerfile +23 -0
  9. data/epeg/Epeg.h +90 -0
  10. data/epeg/README.md +42 -0
  11. data/epeg/epeg_main.c +1642 -0
  12. data/epeg/epeg_private.h +85 -0
  13. data/epeg/example/.gitignore +1 -0
  14. data/epeg/example/CMakeLists.txt +20 -0
  15. data/epeg/example/example.jpg +0 -0
  16. data/epeg/example/rotatetest.c +29 -0
  17. data/epeg/example/scaletest.c +48 -0
  18. data/epeg/vendor/libjpeg-turbo-2.0.4/BUILDING.md +828 -0
  19. data/epeg/vendor/libjpeg-turbo-2.0.4/CMakeLists.txt +1420 -0
  20. data/epeg/vendor/libjpeg-turbo-2.0.4/ChangeLog.md +1494 -0
  21. data/epeg/vendor/libjpeg-turbo-2.0.4/LICENSE.md +132 -0
  22. data/epeg/vendor/libjpeg-turbo-2.0.4/README.ijg +277 -0
  23. data/epeg/vendor/libjpeg-turbo-2.0.4/README.md +356 -0
  24. data/epeg/vendor/libjpeg-turbo-2.0.4/cderror.h +137 -0
  25. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.c +145 -0
  26. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.h +157 -0
  27. data/epeg/vendor/libjpeg-turbo-2.0.4/change.log +315 -0
  28. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.1 +354 -0
  29. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.c +695 -0
  30. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/BuildPackages.cmake +182 -0
  31. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/GNUInstallDirs.cmake +416 -0
  32. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/cmake_uninstall.cmake.in +24 -0
  33. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/testclean.cmake +41 -0
  34. data/epeg/vendor/libjpeg-turbo-2.0.4/cmyk.h +61 -0
  35. data/epeg/vendor/libjpeg-turbo-2.0.4/coderules.txt +78 -0
  36. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.1 +296 -0
  37. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.c +822 -0
  38. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/annotated.html +104 -0
  39. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bc_s.png +0 -0
  40. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bdwn.png +0 -0
  41. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/classes.html +106 -0
  42. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/closed.png +0 -0
  43. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen-extra.css +3 -0
  44. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.css +1184 -0
  45. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.png +0 -0
  46. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/dynsections.js +97 -0
  47. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2blank.png +0 -0
  48. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2cl.png +0 -0
  49. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2doc.png +0 -0
  50. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderclosed.png +0 -0
  51. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderopen.png +0 -0
  52. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2lastnode.png +0 -0
  53. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2link.png +0 -0
  54. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mlastnode.png +0 -0
  55. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mnode.png +0 -0
  56. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mo.png +0 -0
  57. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2node.png +0 -0
  58. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2ns.png +0 -0
  59. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2plastnode.png +0 -0
  60. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2pnode.png +0 -0
  61. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2splitbar.png +0 -0
  62. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2vertline.png +0 -0
  63. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions.html +134 -0
  64. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions_vars.html +134 -0
  65. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/group___turbo_j_p_e_g.html +2775 -0
  66. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/index.html +90 -0
  67. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/jquery.js +8 -0
  68. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/modules.html +95 -0
  69. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_f.png +0 -0
  70. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_g.png +0 -0
  71. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_h.png +0 -0
  72. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/open.png +0 -0
  73. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.html +26 -0
  74. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.js +4 -0
  75. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.html +26 -0
  76. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.js +5 -0
  77. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.html +26 -0
  78. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.js +4 -0
  79. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.html +26 -0
  80. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.js +4 -0
  81. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.html +26 -0
  82. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.js +5 -0
  83. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.html +26 -0
  84. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.js +4 -0
  85. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.html +26 -0
  86. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.js +102 -0
  87. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.html +26 -0
  88. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.js +4 -0
  89. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.html +26 -0
  90. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.js +4 -0
  91. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.html +26 -0
  92. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.js +4 -0
  93. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.html +26 -0
  94. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.js +6 -0
  95. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/close.png +0 -0
  96. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.html +26 -0
  97. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.js +8 -0
  98. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.html +26 -0
  99. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.js +37 -0
  100. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.html +26 -0
  101. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.js +31 -0
  102. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.html +26 -0
  103. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.js +4 -0
  104. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/mag_sel.png +0 -0
  105. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/nomatches.html +12 -0
  106. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.css +271 -0
  107. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.js +809 -0
  108. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_l.png +0 -0
  109. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_m.png +0 -0
  110. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_r.png +0 -0
  111. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.html +26 -0
  112. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.js +5 -0
  113. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.html +26 -0
  114. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.js +4 -0
  115. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.html +26 -0
  116. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.js +5 -0
  117. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.html +26 -0
  118. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.js +4 -0
  119. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.html +26 -0
  120. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.js +4 -0
  121. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.html +26 -0
  122. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.js +5 -0
  123. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.html +26 -0
  124. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.js +4 -0
  125. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.html +26 -0
  126. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.js +10 -0
  127. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.html +26 -0
  128. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.js +4 -0
  129. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.html +26 -0
  130. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.js +4 -0
  131. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.html +26 -0
  132. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.js +4 -0
  133. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjregion.html +186 -0
  134. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjscalingfactor.html +148 -0
  135. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjtransform.html +212 -0
  136. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_off.png +0 -0
  137. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_on.png +0 -0
  138. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_a.png +0 -0
  139. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_b.png +0 -0
  140. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_h.png +0 -0
  141. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_s.png +0 -0
  142. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tabs.css +60 -0
  143. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen-extra.css +3 -0
  144. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen.config +16 -0
  145. data/epeg/vendor/libjpeg-turbo-2.0.4/example.txt +464 -0
  146. data/epeg/vendor/libjpeg-turbo-2.0.4/jaricom.c +157 -0
  147. data/epeg/vendor/libjpeg-turbo-2.0.4/java/CMakeLists.txt +88 -0
  148. data/epeg/vendor/libjpeg-turbo-2.0.4/java/MANIFEST.MF +2 -0
  149. data/epeg/vendor/libjpeg-turbo-2.0.4/java/README +52 -0
  150. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJBench.java +1021 -0
  151. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJExample.java +405 -0
  152. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJUnitTest.java +960 -0
  153. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-frame.html +24 -0
  154. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-noframe.html +24 -0
  155. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/constant-values.html +532 -0
  156. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/deprecated-list.html +252 -0
  157. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/help-doc.html +210 -0
  158. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index-all.html +1029 -0
  159. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index.html +71 -0
  160. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJ.html +1356 -0
  161. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html +926 -0
  162. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html +241 -0
  163. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html +1255 -0
  164. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJException.html +340 -0
  165. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html +343 -0
  166. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html +751 -0
  167. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html +421 -0
  168. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html +765 -0
  169. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-frame.html +31 -0
  170. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-summary.html +202 -0
  171. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-tree.html +160 -0
  172. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/overview-tree.html +164 -0
  173. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/package-list +1 -0
  174. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/background.gif +0 -0
  175. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/tab.gif +0 -0
  176. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar.gif +0 -0
  177. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar_end.gif +0 -0
  178. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/script.js +30 -0
  179. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/serialized-form.html +176 -0
  180. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/stylesheet.css +474 -0
  181. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJ.java +584 -0
  182. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCompressor.java +677 -0
  183. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java +76 -0
  184. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJDecompressor.java +931 -0
  185. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJException.java +78 -0
  186. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in +59 -0
  187. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-win.java.in +35 -0
  188. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java +115 -0
  189. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransform.java +227 -0
  190. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransformer.java +163 -0
  191. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/YUVImage.java +445 -0
  192. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJ.h +129 -0
  193. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJCompressor.h +101 -0
  194. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJDecompressor.h +101 -0
  195. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJTransformer.h +29 -0
  196. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapimin.c +295 -0
  197. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapistd.c +162 -0
  198. data/epeg/vendor/libjpeg-turbo-2.0.4/jcarith.c +932 -0
  199. data/epeg/vendor/libjpeg-turbo-2.0.4/jccoefct.c +449 -0
  200. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolext.c +144 -0
  201. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolor.c +710 -0
  202. data/epeg/vendor/libjpeg-turbo-2.0.4/jcdctmgr.c +721 -0
  203. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.c +1096 -0
  204. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.h +42 -0
  205. data/epeg/vendor/libjpeg-turbo-2.0.4/jcicc.c +105 -0
  206. data/epeg/vendor/libjpeg-turbo-2.0.4/jcinit.c +77 -0
  207. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmainct.c +162 -0
  208. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmarker.c +664 -0
  209. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmaster.c +640 -0
  210. data/epeg/vendor/libjpeg-turbo-2.0.4/jcomapi.c +109 -0
  211. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.h.in +73 -0
  212. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.txt +143 -0
  213. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfigint.h.in +31 -0
  214. data/epeg/vendor/libjpeg-turbo-2.0.4/jcparam.c +541 -0
  215. data/epeg/vendor/libjpeg-turbo-2.0.4/jcphuff.c +1105 -0
  216. data/epeg/vendor/libjpeg-turbo-2.0.4/jcprepct.c +351 -0
  217. data/epeg/vendor/libjpeg-turbo-2.0.4/jcsample.c +539 -0
  218. data/epeg/vendor/libjpeg-turbo-2.0.4/jcstest.c +126 -0
  219. data/epeg/vendor/libjpeg-turbo-2.0.4/jctrans.c +400 -0
  220. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapimin.c +407 -0
  221. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapistd.c +639 -0
  222. data/epeg/vendor/libjpeg-turbo-2.0.4/jdarith.c +773 -0
  223. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst-tj.c +203 -0
  224. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst.c +293 -0
  225. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc-tj.c +194 -0
  226. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc.c +295 -0
  227. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.c +692 -0
  228. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.h +82 -0
  229. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcol565.c +384 -0
  230. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolext.c +143 -0
  231. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolor.c +883 -0
  232. data/epeg/vendor/libjpeg-turbo-2.0.4/jdct.h +208 -0
  233. data/epeg/vendor/libjpeg-turbo-2.0.4/jddctmgr.c +352 -0
  234. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.c +831 -0
  235. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.h +238 -0
  236. data/epeg/vendor/libjpeg-turbo-2.0.4/jdicc.c +171 -0
  237. data/epeg/vendor/libjpeg-turbo-2.0.4/jdinput.c +408 -0
  238. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.c +460 -0
  239. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.h +71 -0
  240. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmarker.c +1377 -0
  241. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.c +737 -0
  242. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.h +28 -0
  243. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmerge.c +617 -0
  244. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrg565.c +354 -0
  245. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrgext.c +184 -0
  246. data/epeg/vendor/libjpeg-turbo-2.0.4/jdphuff.c +687 -0
  247. data/epeg/vendor/libjpeg-turbo-2.0.4/jdpostct.c +294 -0
  248. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.c +518 -0
  249. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.h +50 -0
  250. data/epeg/vendor/libjpeg-turbo-2.0.4/jdtrans.c +155 -0
  251. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.c +251 -0
  252. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.h +316 -0
  253. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctflt.c +169 -0
  254. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctfst.c +227 -0
  255. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctint.c +288 -0
  256. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctflt.c +240 -0
  257. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctfst.c +371 -0
  258. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctint.c +2627 -0
  259. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctred.c +409 -0
  260. data/epeg/vendor/libjpeg-turbo-2.0.4/jinclude.h +88 -0
  261. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemmgr.c +1179 -0
  262. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemnobs.c +115 -0
  263. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemsys.h +178 -0
  264. data/epeg/vendor/libjpeg-turbo-2.0.4/jmorecfg.h +421 -0
  265. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeg_nbits_table.h +4098 -0
  266. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegcomp.h +31 -0
  267. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegint.h +368 -0
  268. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeglib.h +1132 -0
  269. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.1 +295 -0
  270. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.c +601 -0
  271. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant1.c +859 -0
  272. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant2.c +1285 -0
  273. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd.h +117 -0
  274. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd_none.c +418 -0
  275. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimddct.h +70 -0
  276. data/epeg/vendor/libjpeg-turbo-2.0.4/jstdhuff.c +143 -0
  277. data/epeg/vendor/libjpeg-turbo-2.0.4/jutils.c +133 -0
  278. data/epeg/vendor/libjpeg-turbo-2.0.4/jversion.h +52 -0
  279. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.map.in +11 -0
  280. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.txt +3144 -0
  281. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/CMakeLists.txt +1 -0
  282. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.c +275 -0
  283. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.h +57 -0
  284. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5cmp.c +59 -0
  285. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5hl.c +125 -0
  286. data/epeg/vendor/libjpeg-turbo-2.0.4/rdbmp.c +689 -0
  287. data/epeg/vendor/libjpeg-turbo-2.0.4/rdcolmap.c +254 -0
  288. data/epeg/vendor/libjpeg-turbo-2.0.4/rdgif.c +39 -0
  289. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.1 +63 -0
  290. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.c +510 -0
  291. data/epeg/vendor/libjpeg-turbo-2.0.4/rdppm.c +766 -0
  292. data/epeg/vendor/libjpeg-turbo-2.0.4/rdrle.c +389 -0
  293. data/epeg/vendor/libjpeg-turbo-2.0.4/rdswitch.c +424 -0
  294. data/epeg/vendor/libjpeg-turbo-2.0.4/rdtarga.c +509 -0
  295. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Distribution.xml.in +24 -0
  296. data/epeg/vendor/libjpeg-turbo-2.0.4/release/License.rtf +20 -0
  297. data/epeg/vendor/libjpeg-turbo-2.0.4/release/ReadMe.txt +5 -0
  298. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Welcome.rtf +17 -0
  299. data/epeg/vendor/libjpeg-turbo-2.0.4/release/deb-control.in +31 -0
  300. data/epeg/vendor/libjpeg-turbo-2.0.4/release/installer.nsi.in +191 -0
  301. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libjpeg.pc.in +10 -0
  302. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libturbojpeg.pc.in +10 -0
  303. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makecygwinpkg.in +66 -0
  304. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makedpkg.in +115 -0
  305. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makemacpkg.in +284 -0
  306. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makerpm.in +30 -0
  307. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makesrpm.in +48 -0
  308. data/epeg/vendor/libjpeg-turbo-2.0.4/release/maketarball.in +51 -0
  309. data/epeg/vendor/libjpeg-turbo-2.0.4/release/rpm.spec.in +221 -0
  310. data/epeg/vendor/libjpeg-turbo-2.0.4/release/uninstall.in +113 -0
  311. data/epeg/vendor/libjpeg-turbo-2.0.4/sharedlib/CMakeLists.txt +99 -0
  312. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/CMakeLists.txt +385 -0
  313. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd.c +721 -0
  314. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd_neon.S +2878 -0
  315. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd.c +798 -0
  316. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd_neon.S +3433 -0
  317. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/gas-preprocessor.in +1 -0
  318. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-avx2.asm +578 -0
  319. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-mmx.asm +476 -0
  320. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-sse2.asm +503 -0
  321. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-avx2.asm +121 -0
  322. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-mmx.asm +121 -0
  323. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-sse2.asm +120 -0
  324. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-avx2.asm +113 -0
  325. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-mmx.asm +113 -0
  326. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-sse2.asm +112 -0
  327. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-avx2.asm +457 -0
  328. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-mmx.asm +355 -0
  329. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-sse2.asm +382 -0
  330. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jchuff-sse2.asm +424 -0
  331. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcphuff-sse2.asm +660 -0
  332. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-avx2.asm +388 -0
  333. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-mmx.asm +324 -0
  334. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-sse2.asm +351 -0
  335. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-avx2.asm +515 -0
  336. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-mmx.asm +404 -0
  337. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-sse2.asm +458 -0
  338. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-avx2.asm +118 -0
  339. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-mmx.asm +117 -0
  340. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-sse2.asm +117 -0
  341. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-avx2.asm +136 -0
  342. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-mmx.asm +123 -0
  343. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-sse2.asm +135 -0
  344. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-avx2.asm +575 -0
  345. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-mmx.asm +460 -0
  346. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-sse2.asm +517 -0
  347. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-avx2.asm +760 -0
  348. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-mmx.asm +731 -0
  349. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-sse2.asm +724 -0
  350. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-3dn.asm +318 -0
  351. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-sse.asm +369 -0
  352. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-mmx.asm +395 -0
  353. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-sse2.asm +403 -0
  354. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-avx2.asm +331 -0
  355. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-mmx.asm +620 -0
  356. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-sse2.asm +633 -0
  357. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-3dn.asm +451 -0
  358. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse.asm +571 -0
  359. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse2.asm +497 -0
  360. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-mmx.asm +499 -0
  361. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-sse2.asm +501 -0
  362. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-avx2.asm +453 -0
  363. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-mmx.asm +851 -0
  364. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-sse2.asm +858 -0
  365. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-mmx.asm +704 -0
  366. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-sse2.asm +592 -0
  367. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-3dn.asm +230 -0
  368. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-mmx.asm +276 -0
  369. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-sse.asm +208 -0
  370. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquantf-sse2.asm +168 -0
  371. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-avx2.asm +188 -0
  372. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-sse2.asm +201 -0
  373. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimd.c +1253 -0
  374. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimdcpu.asm +135 -0
  375. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/jsimd.h +1083 -0
  376. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolext-mmi.c +483 -0
  377. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolor-mmi.c +148 -0
  378. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample-mmi.c +100 -0
  379. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample.h +28 -0
  380. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolext-mmi.c +424 -0
  381. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolor-mmi.c +139 -0
  382. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdsample-mmi.c +245 -0
  383. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jfdctint-mmi.c +398 -0
  384. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jidctint-mmi.c +571 -0
  385. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jquanti-mmi.c +130 -0
  386. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd.c +610 -0
  387. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd_mmi.h +57 -0
  388. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/loongson-mmintrin.h +1324 -0
  389. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd.c +1123 -0
  390. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2.S +4479 -0
  391. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2_asm.h +292 -0
  392. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jcolsamp.inc +135 -0
  393. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jdct.inc +31 -0
  394. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jpeg_nbits_table.inc +4097 -0
  395. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc +93 -0
  396. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc.h +131 -0
  397. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdext.inc +479 -0
  398. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolext-altivec.c +269 -0
  399. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolor-altivec.c +116 -0
  400. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgray-altivec.c +111 -0
  401. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgryext-altivec.c +228 -0
  402. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample-altivec.c +159 -0
  403. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample.h +28 -0
  404. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolext-altivec.c +276 -0
  405. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolor-altivec.c +106 -0
  406. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmerge-altivec.c +130 -0
  407. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmrgext-altivec.c +329 -0
  408. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdsample-altivec.c +400 -0
  409. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctfst-altivec.c +154 -0
  410. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctint-altivec.c +258 -0
  411. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctfst-altivec.c +255 -0
  412. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctint-altivec.c +357 -0
  413. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jquanti-altivec.c +250 -0
  414. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd.c +872 -0
  415. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd_altivec.h +98 -0
  416. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-avx2.asm +558 -0
  417. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-sse2.asm +483 -0
  418. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-avx2.asm +121 -0
  419. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-sse2.asm +120 -0
  420. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-avx2.asm +113 -0
  421. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-sse2.asm +112 -0
  422. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-avx2.asm +437 -0
  423. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-sse2.asm +362 -0
  424. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jchuff-sse2.asm +346 -0
  425. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcphuff-sse2.asm +637 -0
  426. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-avx2.asm +366 -0
  427. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-sse2.asm +329 -0
  428. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-avx2.asm +495 -0
  429. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-sse2.asm +438 -0
  430. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-avx2.asm +118 -0
  431. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-sse2.asm +117 -0
  432. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-avx2.asm +136 -0
  433. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-sse2.asm +135 -0
  434. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-avx2.asm +593 -0
  435. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-sse2.asm +535 -0
  436. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-avx2.asm +695 -0
  437. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-sse2.asm +664 -0
  438. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctflt-sse.asm +355 -0
  439. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctfst-sse2.asm +389 -0
  440. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-avx2.asm +320 -0
  441. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-sse2.asm +619 -0
  442. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctflt-sse2.asm +481 -0
  443. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctfst-sse2.asm +490 -0
  444. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-avx2.asm +417 -0
  445. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-sse2.asm +846 -0
  446. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctred-sse2.asm +573 -0
  447. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquantf-sse2.asm +154 -0
  448. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-avx2.asm +162 -0
  449. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-sse2.asm +187 -0
  450. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimd.c +1076 -0
  451. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimdcpu.asm +86 -0
  452. data/epeg/vendor/libjpeg-turbo-2.0.4/structure.txt +904 -0
  453. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.bmp +0 -0
  454. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.txt +25 -0
  455. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test.scan +5 -0
  456. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc +0 -0
  457. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc.txt +20 -0
  458. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc +0 -0
  459. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc.txt +20 -0
  460. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgari.jpg +0 -0
  461. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgint.jpg +0 -0
  462. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.jpg +0 -0
  463. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.ppm +4 -0
  464. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig12.jpg +0 -0
  465. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_5674_0098.bmp +0 -0
  466. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6434_0018a.bmp +0 -0
  467. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6548_0026a.bmp +0 -0
  468. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbench.c +1031 -0
  469. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.in +256 -0
  470. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.java.in +215 -0
  471. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexample.c +396 -0
  472. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.in +149 -0
  473. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.java.in +151 -0
  474. data/epeg/vendor/libjpeg-turbo-2.0.4/tjunittest.c +931 -0
  475. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.c +70 -0
  476. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.h +47 -0
  477. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.c +1628 -0
  478. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.h +210 -0
  479. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-jni.c +1246 -0
  480. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile +65 -0
  481. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile.jni +101 -0
  482. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.c +2152 -0
  483. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.h +1744 -0
  484. data/epeg/vendor/libjpeg-turbo-2.0.4/usage.txt +635 -0
  485. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jconfig.h.in +34 -0
  486. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62-memsrcdst.def +108 -0
  487. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62.def +106 -0
  488. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7-memsrcdst.def +110 -0
  489. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7.def +108 -0
  490. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg8.def +111 -0
  491. data/epeg/vendor/libjpeg-turbo-2.0.4/wizard.txt +212 -0
  492. data/epeg/vendor/libjpeg-turbo-2.0.4/wrbmp.c +558 -0
  493. data/epeg/vendor/libjpeg-turbo-2.0.4/wrgif.c +413 -0
  494. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.1 +103 -0
  495. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.c +591 -0
  496. data/epeg/vendor/libjpeg-turbo-2.0.4/wrppm.c +365 -0
  497. data/epeg/vendor/libjpeg-turbo-2.0.4/wrrle.c +309 -0
  498. data/epeg/vendor/libjpeg-turbo-2.0.4/wrtarga.c +261 -0
  499. data/epeg.c +131 -0
  500. data/epeg.gemspec +18 -0
  501. data/extconf.rb +80 -0
  502. data/test.jpg +0 -0
  503. data/test.rb +42 -0
  504. metadata +546 -0
@@ -0,0 +1,3433 @@
1
+ /*
2
+ * ARMv8 NEON optimizations for libjpeg-turbo
3
+ *
4
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
5
+ * All Rights Reserved.
6
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7
+ * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
8
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9
+ * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
10
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
11
+ * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
12
+ *
13
+ * This software is provided 'as-is', without any express or implied
14
+ * warranty. In no event will the authors be held liable for any damages
15
+ * arising from the use of this software.
16
+ *
17
+ * Permission is granted to anyone to use this software for any purpose,
18
+ * including commercial applications, and to alter it and redistribute it
19
+ * freely, subject to the following restrictions:
20
+ *
21
+ * 1. The origin of this software must not be misrepresented; you must not
22
+ * claim that you wrote the original software. If you use this software
23
+ * in a product, an acknowledgment in the product documentation would be
24
+ * appreciated but is not required.
25
+ * 2. Altered source versions must be plainly marked as such, and must not be
26
+ * misrepresented as being the original software.
27
+ * 3. This notice may not be removed or altered from any source distribution.
28
+ */
29
+
30
+ #if defined(__linux__) && defined(__ELF__)
31
+ .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
32
+ #endif
33
+
34
+ #if defined(__APPLE__)
35
+ .section __DATA, __const
36
+ #else
37
+ .section .rodata, "a", %progbits
38
+ #endif
39
+
40
+ /* Constants for jsimd_idct_islow_neon() */
41
+
42
+ #define F_0_298 2446 /* FIX(0.298631336) */
43
+ #define F_0_390 3196 /* FIX(0.390180644) */
44
+ #define F_0_541 4433 /* FIX(0.541196100) */
45
+ #define F_0_765 6270 /* FIX(0.765366865) */
46
+ #define F_0_899 7373 /* FIX(0.899976223) */
47
+ #define F_1_175 9633 /* FIX(1.175875602) */
48
+ #define F_1_501 12299 /* FIX(1.501321110) */
49
+ #define F_1_847 15137 /* FIX(1.847759065) */
50
+ #define F_1_961 16069 /* FIX(1.961570560) */
51
+ #define F_2_053 16819 /* FIX(2.053119869) */
52
+ #define F_2_562 20995 /* FIX(2.562915447) */
53
+ #define F_3_072 25172 /* FIX(3.072711026) */
54
+
55
+ .balign 16
56
+ Ljsimd_idct_islow_neon_consts:
57
+ .short F_0_298
58
+ .short -F_0_390
59
+ .short F_0_541
60
+ .short F_0_765
61
+ .short - F_0_899
62
+ .short F_1_175
63
+ .short F_1_501
64
+ .short - F_1_847
65
+ .short - F_1_961
66
+ .short F_2_053
67
+ .short - F_2_562
68
+ .short F_3_072
69
+ .short 0 /* padding */
70
+ .short 0
71
+ .short 0
72
+ .short 0
73
+
74
+ #undef F_0_298
75
+ #undef F_0_390
76
+ #undef F_0_541
77
+ #undef F_0_765
78
+ #undef F_0_899
79
+ #undef F_1_175
80
+ #undef F_1_501
81
+ #undef F_1_847
82
+ #undef F_1_961
83
+ #undef F_2_053
84
+ #undef F_2_562
85
+ #undef F_3_072
86
+
87
+ /* Constants for jsimd_idct_ifast_neon() */
88
+
89
+ .balign 16
90
+ Ljsimd_idct_ifast_neon_consts:
91
+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
92
+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
93
+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
94
+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
95
+
96
+ /* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
97
+
98
+ #define CONST_BITS 13
99
+
100
+ #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
101
+ #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
102
+ #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
103
+ #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
104
+ #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
105
+ #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
106
+ #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
107
+ #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
108
+ #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
109
+ #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
110
+ #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
111
+ #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
112
+ #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
113
+ #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
114
+
115
+ .balign 16
116
+ Ljsimd_idct_4x4_neon_consts:
117
+ .short FIX_1_847759065 /* v0.h[0] */
118
+ .short -FIX_0_765366865 /* v0.h[1] */
119
+ .short -FIX_0_211164243 /* v0.h[2] */
120
+ .short FIX_1_451774981 /* v0.h[3] */
121
+ .short -FIX_2_172734803 /* d1[0] */
122
+ .short FIX_1_061594337 /* d1[1] */
123
+ .short -FIX_0_509795579 /* d1[2] */
124
+ .short -FIX_0_601344887 /* d1[3] */
125
+ .short FIX_0_899976223 /* v2.h[0] */
126
+ .short FIX_2_562915447 /* v2.h[1] */
127
+ .short 1 << (CONST_BITS + 1) /* v2.h[2] */
128
+ .short 0 /* v2.h[3] */
129
+
130
+ .balign 8
131
+ Ljsimd_idct_2x2_neon_consts:
132
+ .short -FIX_0_720959822 /* v14[0] */
133
+ .short FIX_0_850430095 /* v14[1] */
134
+ .short -FIX_1_272758580 /* v14[2] */
135
+ .short FIX_3_624509785 /* v14[3] */
136
+
137
+ /* Constants for jsimd_ycc_*_neon() */
138
+
139
+ .balign 16
140
+ Ljsimd_ycc_rgb_neon_consts:
141
+ .short 0, 0, 0, 0
142
+ .short 22971, -11277, -23401, 29033
143
+ .short -128, -128, -128, -128
144
+ .short -128, -128, -128, -128
145
+
146
+ /* Constants for jsimd_*_ycc_neon() */
147
+
148
+ .balign 16
149
+ Ljsimd_rgb_ycc_neon_consts:
150
+ .short 19595, 38470, 7471, 11059
151
+ .short 21709, 32768, 27439, 5329
152
+ .short 32767, 128, 32767, 128
153
+ .short 32767, 128, 32767, 128
154
+
155
+ /* Constants for jsimd_fdct_islow_neon() */
156
+
157
+ #define F_0_298 2446 /* FIX(0.298631336) */
158
+ #define F_0_390 3196 /* FIX(0.390180644) */
159
+ #define F_0_541 4433 /* FIX(0.541196100) */
160
+ #define F_0_765 6270 /* FIX(0.765366865) */
161
+ #define F_0_899 7373 /* FIX(0.899976223) */
162
+ #define F_1_175 9633 /* FIX(1.175875602) */
163
+ #define F_1_501 12299 /* FIX(1.501321110) */
164
+ #define F_1_847 15137 /* FIX(1.847759065) */
165
+ #define F_1_961 16069 /* FIX(1.961570560) */
166
+ #define F_2_053 16819 /* FIX(2.053119869) */
167
+ #define F_2_562 20995 /* FIX(2.562915447) */
168
+ #define F_3_072 25172 /* FIX(3.072711026) */
169
+
170
+ .balign 16
171
+ Ljsimd_fdct_islow_neon_consts:
172
+ .short F_0_298
173
+ .short -F_0_390
174
+ .short F_0_541
175
+ .short F_0_765
176
+ .short - F_0_899
177
+ .short F_1_175
178
+ .short F_1_501
179
+ .short - F_1_847
180
+ .short - F_1_961
181
+ .short F_2_053
182
+ .short - F_2_562
183
+ .short F_3_072
184
+ .short 0 /* padding */
185
+ .short 0
186
+ .short 0
187
+ .short 0
188
+
189
+ #undef F_0_298
190
+ #undef F_0_390
191
+ #undef F_0_541
192
+ #undef F_0_765
193
+ #undef F_0_899
194
+ #undef F_1_175
195
+ #undef F_1_501
196
+ #undef F_1_847
197
+ #undef F_1_961
198
+ #undef F_2_053
199
+ #undef F_2_562
200
+ #undef F_3_072
201
+
202
+ /* Constants for jsimd_fdct_ifast_neon() */
203
+
204
+ .balign 16
205
+ Ljsimd_fdct_ifast_neon_consts:
206
+ .short (98 * 128) /* XFIX_0_382683433 */
207
+ .short (139 * 128) /* XFIX_0_541196100 */
208
+ .short (181 * 128) /* XFIX_0_707106781 */
209
+ .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
210
+
211
+ /* Constants for jsimd_h2*_downsample_neon() */
212
+
213
+ .balign 16
214
+ Ljsimd_h2_downsample_neon_consts:
215
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
216
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F /* diff 0 */
217
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
218
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E /* diff 1 */
219
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
220
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D /* diff 2 */
221
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
222
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C /* diff 3 */
223
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
224
+ 0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B /* diff 4 */
225
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
226
+ 0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A /* diff 5 */
227
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
228
+ 0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09 /* diff 6 */
229
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
230
+ 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08 /* diff 7 */
231
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
232
+ 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07 /* diff 8 */
233
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
234
+ 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06 /* diff 9 */
235
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
236
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05 /* diff 10 */
237
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
238
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04 /* diff 11 */
239
+ .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
240
+ 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 /* diff 12 */
241
+ .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
242
+ 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 /* diff 13 */
243
+ .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
244
+ 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 /* diff 14 */
245
+ .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
246
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* diff 15 */
247
+
248
+ /* Constants for jsimd_huff_encode_one_block_neon() */
249
+
250
+ .balign 16
251
+ Ljsimd_huff_encode_one_block_neon_consts:
252
+ .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
253
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
254
+ .byte 0, 1, 2, 3, 16, 17, 32, 33, \
255
+ 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
256
+ .byte 34, 35, 48, 49, 255, 255, 50, 51, \
257
+ 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
258
+ .byte 8, 9, 22, 23, 36, 37, 50, 51, \
259
+ 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
260
+ .byte 54, 55, 40, 41, 26, 27, 12, 13, \
261
+ 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
262
+ .byte 6, 7, 20, 21, 34, 35, 48, 49, \
263
+ 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
264
+ .byte 42, 43, 28, 29, 14, 15, 30, 31, \
265
+ 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
266
+ .byte 255, 255, 255, 255, 56, 57, 42, 43, \
267
+ 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
268
+ .byte 26, 27, 40, 41, 42, 43, 28, 29, \
269
+ 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
270
+ .byte 255, 255, 255, 255, 0, 1, 255, 255, \
271
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */
272
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
273
+ 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
274
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
275
+ 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
276
+ .byte 4, 5, 6, 7, 255, 255, 255, 255, \
277
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
278
+
279
+ .text
280
+
281
+
282
+ #define RESPECT_STRICT_ALIGNMENT 1
283
+
284
+
285
+ /*****************************************************************************/
286
+
287
+ /* Supplementary macro for setting function attributes */
288
+ .macro asm_function fname
289
+ #ifdef __APPLE__
290
+ .private_extern _\fname
291
+ .globl _\fname
292
+ _\fname:
293
+ #else
294
+ .global \fname
295
+ #ifdef __ELF__
296
+ .hidden \fname
297
+ .type \fname, %function
298
+ #endif
299
+ \fname:
300
+ #endif
301
+ .endm
302
+
303
+ /* Get symbol location */
304
+ .macro get_symbol_loc reg, symbol
305
+ #ifdef __APPLE__
306
+ adrp \reg, \symbol@PAGE
307
+ add \reg, \reg, \symbol@PAGEOFF
308
+ #else
309
+ adrp \reg, \symbol
310
+ add \reg, \reg, :lo12:\symbol
311
+ #endif
312
+ .endm
313
+
314
+ /* Transpose elements of single 128 bit registers */
315
+ .macro transpose_single x0, x1, xi, xilen, literal
316
+ ins \xi\xilen[0], \x0\xilen[0]
317
+ ins \x1\xilen[0], \x0\xilen[1]
318
+ trn1 \x0\literal, \x0\literal, \x1\literal
319
+ trn2 \x1\literal, \xi\literal, \x1\literal
320
+ .endm
321
+
322
+ /* Transpose elements of 2 different registers */
323
+ .macro transpose x0, x1, xi, xilen, literal
324
+ mov \xi\xilen, \x0\xilen
325
+ trn1 \x0\literal, \x0\literal, \x1\literal
326
+ trn2 \x1\literal, \xi\literal, \x1\literal
327
+ .endm
328
+
329
+ /* Transpose a block of 4x4 coefficients in four 64-bit registers */
330
+ .macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
331
+ mov \xi\xilen, \x0\xilen
332
+ trn1 \x0\x0len, \x0\x0len, \x2\x2len
333
+ trn2 \x2\x2len, \xi\x0len, \x2\x2len
334
+ mov \xi\xilen, \x1\xilen
335
+ trn1 \x1\x1len, \x1\x1len, \x3\x3len
336
+ trn2 \x3\x3len, \xi\x1len, \x3\x3len
337
+ .endm
338
+
339
+ .macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
340
+ mov \xi\xilen, \x0\xilen
341
+ trn1 \x0\x0len, \x0\x0len, \x1\x1len
342
+ trn2 \x1\x2len, \xi\x0len, \x1\x2len
343
+ mov \xi\xilen, \x2\xilen
344
+ trn1 \x2\x2len, \x2\x2len, \x3\x3len
345
+ trn2 \x3\x2len, \xi\x1len, \x3\x3len
346
+ .endm
347
+
348
+ .macro transpose_4x4 x0, x1, x2, x3, x5
349
+ transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
350
+ transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
351
+ .endm
352
+
353
+ .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
354
+ trn1 \t0\().8h, \l0\().8h, \l1\().8h
355
+ trn1 \t1\().8h, \l2\().8h, \l3\().8h
356
+ trn1 \t2\().8h, \l4\().8h, \l5\().8h
357
+ trn1 \t3\().8h, \l6\().8h, \l7\().8h
358
+ trn2 \l1\().8h, \l0\().8h, \l1\().8h
359
+ trn2 \l3\().8h, \l2\().8h, \l3\().8h
360
+ trn2 \l5\().8h, \l4\().8h, \l5\().8h
361
+ trn2 \l7\().8h, \l6\().8h, \l7\().8h
362
+
363
+ trn1 \l4\().4s, \t2\().4s, \t3\().4s
364
+ trn2 \t3\().4s, \t2\().4s, \t3\().4s
365
+ trn1 \t2\().4s, \t0\().4s, \t1\().4s
366
+ trn2 \l2\().4s, \t0\().4s, \t1\().4s
367
+ trn1 \t0\().4s, \l1\().4s, \l3\().4s
368
+ trn2 \l3\().4s, \l1\().4s, \l3\().4s
369
+ trn2 \t1\().4s, \l5\().4s, \l7\().4s
370
+ trn1 \l5\().4s, \l5\().4s, \l7\().4s
371
+
372
+ trn2 \l6\().2d, \l2\().2d, \t3\().2d
373
+ trn1 \l0\().2d, \t2\().2d, \l4\().2d
374
+ trn1 \l1\().2d, \t0\().2d, \l5\().2d
375
+ trn2 \l7\().2d, \l3\().2d, \t1\().2d
376
+ trn1 \l2\().2d, \l2\().2d, \t3\().2d
377
+ trn2 \l4\().2d, \t2\().2d, \l4\().2d
378
+ trn1 \l3\().2d, \l3\().2d, \t1\().2d
379
+ trn2 \l5\().2d, \t0\().2d, \l5\().2d
380
+ .endm
381
+
382
+
383
+ #define CENTERJSAMPLE 128
384
+
385
+ /*****************************************************************************/
386
+
387
+ /*
388
+ * Perform dequantization and inverse DCT on one block of coefficients.
389
+ *
390
+ * GLOBAL(void)
391
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
392
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
393
+ */
394
+
395
+ #define CONST_BITS 13
396
+ #define PASS1_BITS 2
397
+
398
+ #define XFIX_P_0_298 v0.h[0]
399
+ #define XFIX_N_0_390 v0.h[1]
400
+ #define XFIX_P_0_541 v0.h[2]
401
+ #define XFIX_P_0_765 v0.h[3]
402
+ #define XFIX_N_0_899 v0.h[4]
403
+ #define XFIX_P_1_175 v0.h[5]
404
+ #define XFIX_P_1_501 v0.h[6]
405
+ #define XFIX_N_1_847 v0.h[7]
406
+ #define XFIX_N_1_961 v1.h[0]
407
+ #define XFIX_P_2_053 v1.h[1]
408
+ #define XFIX_N_2_562 v1.h[2]
409
+ #define XFIX_P_3_072 v1.h[3]
410
+
411
+ asm_function jsimd_idct_islow_neon
412
+ DCT_TABLE .req x0
413
+ COEF_BLOCK .req x1
414
+ OUTPUT_BUF .req x2
415
+ OUTPUT_COL .req x3
416
+ TMP1 .req x0
417
+ TMP2 .req x1
418
+ TMP3 .req x9
419
+ TMP4 .req x10
420
+ TMP5 .req x11
421
+ TMP6 .req x12
422
+ TMP7 .req x13
423
+ TMP8 .req x14
424
+
425
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
426
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
427
+ instruction ensures that those bits are set to zero. */
428
+ uxtw x3, w3
429
+
430
+ sub sp, sp, #64
431
+ get_symbol_loc x15, Ljsimd_idct_islow_neon_consts
432
+ mov x10, sp
433
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
434
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
435
+ ld1 {v0.8h, v1.8h}, [x15]
436
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
437
+ ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
438
+ ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
439
+ ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
440
+
441
+ cmeq v16.8h, v3.8h, #0
442
+ cmeq v26.8h, v4.8h, #0
443
+ cmeq v27.8h, v5.8h, #0
444
+ cmeq v28.8h, v6.8h, #0
445
+ cmeq v29.8h, v7.8h, #0
446
+ cmeq v30.8h, v8.8h, #0
447
+ cmeq v31.8h, v9.8h, #0
448
+
449
+ and v10.16b, v16.16b, v26.16b
450
+ and v11.16b, v27.16b, v28.16b
451
+ and v12.16b, v29.16b, v30.16b
452
+ and v13.16b, v31.16b, v10.16b
453
+ and v14.16b, v11.16b, v12.16b
454
+ mul v2.8h, v2.8h, v18.8h
455
+ and v15.16b, v13.16b, v14.16b
456
+ shl v10.8h, v2.8h, #(PASS1_BITS)
457
+ sqxtn v16.8b, v15.8h
458
+ mov TMP1, v16.d[0]
459
+ mvn TMP2, TMP1
460
+
461
+ cbnz TMP2, 2f
462
+ /* case all AC coeffs are zeros */
463
+ dup v2.2d, v10.d[0]
464
+ dup v6.2d, v10.d[1]
465
+ mov v3.16b, v2.16b
466
+ mov v7.16b, v6.16b
467
+ mov v4.16b, v2.16b
468
+ mov v8.16b, v6.16b
469
+ mov v5.16b, v2.16b
470
+ mov v9.16b, v6.16b
471
+ 1:
472
+ /* for this transpose, we should organise data like this:
473
+ * 00, 01, 02, 03, 40, 41, 42, 43
474
+ * 10, 11, 12, 13, 50, 51, 52, 53
475
+ * 20, 21, 22, 23, 60, 61, 62, 63
476
+ * 30, 31, 32, 33, 70, 71, 72, 73
477
+ * 04, 05, 06, 07, 44, 45, 46, 47
478
+ * 14, 15, 16, 17, 54, 55, 56, 57
479
+ * 24, 25, 26, 27, 64, 65, 66, 67
480
+ * 34, 35, 36, 37, 74, 75, 76, 77
481
+ */
482
+ trn1 v28.8h, v2.8h, v3.8h
483
+ trn1 v29.8h, v4.8h, v5.8h
484
+ trn1 v30.8h, v6.8h, v7.8h
485
+ trn1 v31.8h, v8.8h, v9.8h
486
+ trn2 v16.8h, v2.8h, v3.8h
487
+ trn2 v17.8h, v4.8h, v5.8h
488
+ trn2 v18.8h, v6.8h, v7.8h
489
+ trn2 v19.8h, v8.8h, v9.8h
490
+ trn1 v2.4s, v28.4s, v29.4s
491
+ trn1 v6.4s, v30.4s, v31.4s
492
+ trn1 v3.4s, v16.4s, v17.4s
493
+ trn1 v7.4s, v18.4s, v19.4s
494
+ trn2 v4.4s, v28.4s, v29.4s
495
+ trn2 v8.4s, v30.4s, v31.4s
496
+ trn2 v5.4s, v16.4s, v17.4s
497
+ trn2 v9.4s, v18.4s, v19.4s
498
+ /* Even part: reverse the even part of the forward DCT. */
499
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
500
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
501
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
502
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
503
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
504
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
505
+ mov v21.16b, v19.16b /* tmp3 = z1 */
506
+ mov v20.16b, v18.16b /* tmp3 = z1 */
507
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
508
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
509
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
510
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
511
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
512
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
513
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
514
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
515
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
516
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
517
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
518
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
519
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
520
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
521
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
522
+
523
+ /* Odd part per figure 8; the matrix is unitary and hence its
524
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
525
+ */
526
+
527
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
528
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
529
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
530
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
531
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
532
+
533
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
534
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
535
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
536
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
537
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
538
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
539
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
540
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
541
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
542
+
543
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
544
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
545
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
546
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
547
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
548
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
549
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
550
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
551
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
552
+
553
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
554
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
555
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
556
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
557
+
558
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
559
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
560
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
561
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
562
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
563
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
564
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
565
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
566
+
567
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
568
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
569
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
570
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
571
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
572
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
573
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
574
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
575
+
576
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
577
+
578
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
579
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
580
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
581
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
582
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
583
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
584
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
585
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
586
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
587
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
588
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
589
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
590
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
591
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
592
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
593
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
594
+
595
+ shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
596
+ shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
597
+ shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
598
+ shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
599
+ shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
600
+ shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
601
+ shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
602
+ shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
603
+ shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
604
+ shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
605
+ shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
606
+ shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
607
+ shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
608
+ shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
609
+ shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
610
+ shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
611
+ movi v0.16b, #(CENTERJSAMPLE)
612
+ /* Prepare pointers (dual-issue with NEON instructions) */
613
+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
614
+ sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
615
+ ldp TMP3, TMP4, [OUTPUT_BUF], 16
616
+ sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
617
+ add TMP1, TMP1, OUTPUT_COL
618
+ sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
619
+ add TMP2, TMP2, OUTPUT_COL
620
+ sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
621
+ add TMP3, TMP3, OUTPUT_COL
622
+ sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
623
+ add TMP4, TMP4, OUTPUT_COL
624
+ sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
625
+ ldp TMP5, TMP6, [OUTPUT_BUF], 16
626
+ sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
627
+ ldp TMP7, TMP8, [OUTPUT_BUF], 16
628
+ sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
629
+ add TMP5, TMP5, OUTPUT_COL
630
+ add v16.16b, v28.16b, v0.16b
631
+ add TMP6, TMP6, OUTPUT_COL
632
+ add v18.16b, v29.16b, v0.16b
633
+ add TMP7, TMP7, OUTPUT_COL
634
+ add v20.16b, v30.16b, v0.16b
635
+ add TMP8, TMP8, OUTPUT_COL
636
+ add v22.16b, v31.16b, v0.16b
637
+
638
+ /* Transpose the final 8-bit samples */
639
+ trn1 v28.16b, v16.16b, v18.16b
640
+ trn1 v30.16b, v20.16b, v22.16b
641
+ trn2 v29.16b, v16.16b, v18.16b
642
+ trn2 v31.16b, v20.16b, v22.16b
643
+
644
+ trn1 v16.8h, v28.8h, v30.8h
645
+ trn2 v18.8h, v28.8h, v30.8h
646
+ trn1 v20.8h, v29.8h, v31.8h
647
+ trn2 v22.8h, v29.8h, v31.8h
648
+
649
+ uzp1 v28.4s, v16.4s, v18.4s
650
+ uzp2 v30.4s, v16.4s, v18.4s
651
+ uzp1 v29.4s, v20.4s, v22.4s
652
+ uzp2 v31.4s, v20.4s, v22.4s
653
+
654
+ /* Store results to the output buffer */
655
+ st1 {v28.d}[0], [TMP1]
656
+ st1 {v29.d}[0], [TMP2]
657
+ st1 {v28.d}[1], [TMP3]
658
+ st1 {v29.d}[1], [TMP4]
659
+ st1 {v30.d}[0], [TMP5]
660
+ st1 {v31.d}[0], [TMP6]
661
+ st1 {v30.d}[1], [TMP7]
662
+ st1 {v31.d}[1], [TMP8]
663
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
664
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
665
+ blr x30
666
+
667
+ .balign 16
668
+ 2:
669
+ mul v3.8h, v3.8h, v19.8h
670
+ mul v4.8h, v4.8h, v20.8h
671
+ mul v5.8h, v5.8h, v21.8h
672
+ add TMP4, xzr, TMP2, LSL #32
673
+ mul v6.8h, v6.8h, v22.8h
674
+ mul v7.8h, v7.8h, v23.8h
675
+ adds TMP3, xzr, TMP2, LSR #32
676
+ mul v8.8h, v8.8h, v24.8h
677
+ mul v9.8h, v9.8h, v25.8h
678
+ b.ne 3f
679
+ /* Right AC coef is zero */
680
+ dup v15.2d, v10.d[1]
681
+ /* Even part: reverse the even part of the forward DCT. */
682
+ add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
683
+ add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
684
+ sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
685
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
686
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
687
+ mov v20.16b, v18.16b /* tmp3 = z1 */
688
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
689
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
690
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
691
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
692
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
693
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
694
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
695
+
696
+ /* Odd part per figure 8; the matrix is unitary and hence its
697
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
698
+ */
699
+
700
+ add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
701
+ add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
702
+ add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
703
+ add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
704
+ add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
705
+
706
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
707
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
708
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
709
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
710
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
711
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
712
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
713
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
714
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
715
+
716
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
717
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
718
+
719
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
720
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
721
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
722
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
723
+
724
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
725
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
726
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
727
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
728
+
729
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
730
+
731
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
732
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
733
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
734
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
735
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
736
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
737
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
738
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
739
+
740
+ rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
741
+ rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
742
+ rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
743
+ rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
744
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
745
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
746
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
747
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
748
+ mov v6.16b, v15.16b
749
+ mov v7.16b, v15.16b
750
+ mov v8.16b, v15.16b
751
+ mov v9.16b, v15.16b
752
+ b 1b
753
+
754
+ .balign 16
755
+ 3:
756
+ cbnz TMP4, 4f
757
+ /* Left AC coef is zero */
758
+ dup v14.2d, v10.d[0]
759
+ /* Even part: reverse the even part of the forward DCT. */
760
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
761
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
762
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
763
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
764
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
765
+ mov v21.16b, v19.16b /* tmp3 = z1 */
766
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
767
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
768
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
769
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
770
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
771
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
772
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
773
+
774
+ /* Odd part per figure 8; the matrix is unitary and hence its
775
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
776
+ */
777
+
778
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
779
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
780
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
781
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
782
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
783
+
784
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
785
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
786
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
787
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
788
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
789
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
790
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
791
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
792
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
793
+
794
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
795
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
796
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
797
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
798
+
799
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
800
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
801
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
802
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
803
+
804
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
805
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
806
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
807
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
808
+
809
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
810
+
811
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
812
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
813
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
814
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
815
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
816
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
817
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
818
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
819
+
820
+ mov v2.16b, v14.16b
821
+ mov v3.16b, v14.16b
822
+ mov v4.16b, v14.16b
823
+ mov v5.16b, v14.16b
824
+ rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
825
+ rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
826
+ rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
827
+ rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
828
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
829
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
830
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
831
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
832
+ b 1b
833
+
834
+ .balign 16
835
+ 4:
836
+ /* "No" AC coef is zero */
837
+ /* Even part: reverse the even part of the forward DCT. */
838
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
839
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
840
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
841
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
842
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
843
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
844
+ mov v21.16b, v19.16b /* tmp3 = z1 */
845
+ mov v20.16b, v18.16b /* tmp3 = z1 */
846
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
847
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
848
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
849
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
850
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
851
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
852
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
853
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
854
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
855
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
856
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
857
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
858
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
859
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
860
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
861
+
862
+ /* Odd part per figure 8; the matrix is unitary and hence its
863
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
864
+ */
865
+
866
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
867
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
868
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
869
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
870
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
871
+
872
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
873
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
874
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
875
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
876
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
877
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
878
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
879
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
880
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
881
+
882
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
883
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
884
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
885
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
886
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
887
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
888
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
889
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
890
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
891
+
892
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
893
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
894
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
895
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
896
+
897
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
898
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
899
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
900
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
901
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
902
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
903
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
904
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
905
+
906
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
907
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
908
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
909
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
910
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
911
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
912
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
913
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
914
+
915
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
916
+
917
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
918
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
919
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
920
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
921
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
922
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
923
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
924
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
925
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
926
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
927
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
928
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
929
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
930
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
931
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
932
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
933
+
934
+ rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
935
+ rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
936
+ rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
937
+ rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
938
+ rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
939
+ rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
940
+ rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
941
+ rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
942
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
943
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
944
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
945
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
946
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
947
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
948
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
949
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
950
+ b 1b
951
+
952
+ .unreq DCT_TABLE
953
+ .unreq COEF_BLOCK
954
+ .unreq OUTPUT_BUF
955
+ .unreq OUTPUT_COL
956
+ .unreq TMP1
957
+ .unreq TMP2
958
+ .unreq TMP3
959
+ .unreq TMP4
960
+ .unreq TMP5
961
+ .unreq TMP6
962
+ .unreq TMP7
963
+ .unreq TMP8
964
+
965
+ #undef CENTERJSAMPLE
966
+ #undef CONST_BITS
967
+ #undef PASS1_BITS
968
+ #undef XFIX_P_0_298
969
+ #undef XFIX_N_0_390
970
+ #undef XFIX_P_0_541
971
+ #undef XFIX_P_0_765
972
+ #undef XFIX_N_0_899
973
+ #undef XFIX_P_1_175
974
+ #undef XFIX_P_1_501
975
+ #undef XFIX_N_1_847
976
+ #undef XFIX_N_1_961
977
+ #undef XFIX_P_2_053
978
+ #undef XFIX_N_2_562
979
+ #undef XFIX_P_3_072
980
+
981
+
982
+ /*****************************************************************************/
983
+
984
+ /*
985
+ * jsimd_idct_ifast_neon
986
+ *
987
+ * This function contains a fast, not so accurate integer implementation of
988
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
989
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
990
+ * function from jidctfst.c
991
+ *
992
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
993
+ * But in ARM NEON case some extra additions are required because VQDMULH
994
+ * instruction can't handle the constants larger than 1. So the expressions
995
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
996
+ * which introduces an extra addition. Overall, there are 6 extra additions
997
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
998
+ */
999
+
1000
/* AAN scaling constants, preloaded as four halfwords into v0 (see the
 * constant-load below); accessed as scalar lanes by sqdmulh. */
#define XFIX_1_082392200 v0.h[0]
#define XFIX_1_414213562 v0.h[1]
#define XFIX_1_847759065 v0.h[2]
#define XFIX_2_613125930 v0.h[3]

/*
 * void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
 *                            JSAMPARRAY output_buf, JDIMENSION output_col)
 *
 * In:   x0 = dct_table   (per-component dequantization table)
 *       x1 = coef_block  (8x8 block of DCT coefficients)
 *       x2 = output_buf  (array of 8 output row pointers)
 *       x3 = output_col  (column offset added to each row pointer)
 * Out:  8x8 block of range-limited 8-bit samples written via output_buf.
 * Note: uses only caller-saved NEON registers (v0-v6, v16-v23, v28-v31),
 *       so no NEON registers need to be saved/restored here.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0   1   2   3 | 4   5   6   7
     *      ---------------+---------------
     *   0 | d16           | d17           ( v16.8h )
     *   1 | d18           | d19           ( v17.8h )
     *   2 | d20           | d21           ( v18.8h )
     *   3 | d22           | d23           ( v19.8h )
     *   4 | d24           | d25           ( v20.8h )
     *   5 | d26           | d27           ( v21.8h )
     *   6 | d28           | d29           ( v22.8h )
     *   7 | d30           | d31           ( v23.8h )
     */
    /* Resolve the address of the constants table; the coefficient and
     * quant-table loads below are interleaved with the dequantizing
     * multiplies so the loads can dual-issue with the arithmetic. */
    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul             v16.8h, v16.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v17.8h, v17.8h, v1.8h
    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul             v18.8h, v18.8h, v2.8h
    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul             v19.8h, v19.8h, v3.8h
    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul             v20.8h, v20.8h, v0.8h
    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul             v22.8h, v22.8h, v2.8h
    mul             v21.8h, v21.8h, v1.8h
    ld1             {v0.4h}, [TMP5]      /* load constants */
    mul             v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 (columns) */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 (rows; identical butterfly sequence as pass 1) */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit */
    movi            v0.16b, #0x80        /* 0x80 = CENTERJSAMPLE bias added after the saturating shift */
    /* Prepare pointers (dual-issue with NEON instructions) */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn          v28.8b, v16.8h, #5
    ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn          v29.8b, v17.8h, #5
    add             TMP1, TMP1, OUTPUT_COL
    sqshrn          v30.8b, v18.8h, #5
    add             TMP2, TMP2, OUTPUT_COL
    sqshrn          v31.8b, v19.8h, #5
    add             TMP3, TMP3, OUTPUT_COL
    sqshrn2         v28.16b, v20.8h, #5
    add             TMP4, TMP4, OUTPUT_COL
    sqshrn2         v29.16b, v21.8h, #5
    ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2         v30.16b, v22.8h, #5
    ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2         v31.16b, v23.8h, #5
    add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
    add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
    add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
    add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer: one 8-byte row per pointer */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    blr             x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
1208
+
1209
+
1210
+ /*****************************************************************************/
1211
+
1212
+ /*
1213
+ * jsimd_idct_4x4_neon
1214
+ *
1215
+ * This function contains inverse-DCT code for getting reduced-size
1216
+ * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
1217
+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1218
+ * function from jpeg-6b (jidctred.c).
1219
+ *
1220
+ * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1221
+ * requires much less arithmetic operations and hence should be faster.
1222
+ * The primary purpose of this particular NEON optimized function is
1223
+ * bit exact compatibility with jpeg-6b.
1224
+ *
1225
+ * TODO: a bit better instructions scheduling can be achieved by expanding
1226
+ * idct_helper/transpose_4x4 macros and reordering instructions,
1227
+ * but readability will suffer somewhat.
1228
+ */
1229
+
1230
/* One 1-D pass of the reduced 4x4 inverse DCT.
 *
 * Inputs  \x4 \x6 \x8 \x10 \x12 \x14 \x16: the seven 4x16-bit coefficient
 *   vectors for rows/columns 0,1,2,3,5,6,7 (row 4 is unused by the 4x4 IDCT).
 * \shift:  descaling shift applied to the 32-bit accumulators.
 * Outputs \y26 \y27 \y28 \y29: the four 4x16-bit result vectors.
 *
 * Fixed-point multipliers are taken from lanes of v0-v2 (loaded from
 * Ljsimd_idct_4x4_neon_consts by the caller).
 * Clobbers v20, v24, v26, v28, v30.
 *
 * NOTE(review): rshrn only encodes shift amounts 1..16, so for \shift > 16
 * the descale is done as srshr (32-bit rounding shift) followed by xtn
 * (narrow) instead of a single rshrn.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull           v28.4s, \x4, v2.h[2]
    smlal           v28.4s, \x8, v0.h[0]
    smlal           v28.4s, \x14, v0.h[1]

    smull           v26.4s, \x16, v1.h[2]
    smlal           v26.4s, \x12, v1.h[3]
    smlal           v26.4s, \x10, v2.h[0]
    smlal           v26.4s, \x6, v2.h[1]

    smull           v30.4s, \x4, v2.h[2]
    smlsl           v30.4s, \x8, v0.h[0]
    smlsl           v30.4s, \x14, v0.h[1]

    smull           v24.4s, \x16, v0.h[2]
    smlal           v24.4s, \x12, v0.h[3]
    smlal           v24.4s, \x10, v1.h[0]
    smlal           v24.4s, \x6, v1.h[1]

    /* Butterfly: even part +/- odd part, then descale and narrow */
    add             v20.4s, v28.4s, v26.4s
    sub             v28.4s, v28.4s, v26.4s

.if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v28.4s, v28.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y29, v28.4s
.else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y29, v28.4s, #\shift
.endif

    add             v20.4s, v30.4s, v24.4s
    sub             v30.4s, v30.4s, v24.4s

.if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v30.4s, v30.4s, #\shift
    xtn             \y27, v20.4s
    xtn             \y28, v30.4s
.else
    rshrn           \y27, v20.4s, #\shift
    rshrn           \y28, v30.4s, #\shift
.endif
.endm
1275
+
1276
/*-----------------------------------------------------------------------------
 * void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col)
 * ABI:   AAPCS64
 * In:    x0 = dct_table, x1 = coef_block, x2 = output_buf, x3 = output_col
 * Out:   4x4 pixel block written through output_buf rows at output_col
 * Saves: v8-v15 (callee-saved low halves) on a 64-byte stack frame
 *
 * Fix: the original returned with "blr x30", which also writes the link
 * register and corrupts the CPU's return-address predictor; "ret" is the
 * architectural return and is used instead.
 *---------------------------------------------------------------------------*/
asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Save all used NEON registers */
    sub             sp, sp, 64
    mov             x9, sp
    /* Load constants (v3.4h is just used for padding) */
    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   | v4.4h  | v5.4h
     *   1   | v6.4h  | v7.4h
     *   2   | v8.4h  | v9.4h
     *   3   | v10.4h | v11.4h
     *   4   | -      | -
     *   5   | v12.4h | v13.4h
     *   6   | v14.4h | v15.4h
     *   7   | v16.4h | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16    /* skip row 4 (unused) */
    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]               /* 128 bit q4 */
    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]               /* 128 bit q6 */
    mul             v8.4h, v8.4h, v22.4h
    mul             v9.4h, v9.4h, v23.4h
    ins             v8.d[1], v9.d[0]               /* 128 bit q8 */
    add             DCT_TABLE, DCT_TABLE, #16      /* skip row 4 of table */
    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]             /* 128 bit q10 */
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]             /* 128 bit q12 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v14.4h, v14.4h, v28.4h
    mul             v15.4h, v15.4h, v29.4h
    ins             v14.d[1], v15.d[0]             /* 128 bit q14 */
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]             /* 128 bit q16 */

    /* Pass 1 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                    v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4   v4, v6, v8, v10, v3
    ins             v10.d[1], v11.d[0]
    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                    v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4   v5, v7, v9, v11, v3
    ins             v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                    v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4   v26, v27, v28, v29, v3

    /* Range limit: recenter to unsigned and saturate to 8 bits */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    ins             v28.d[1], v29.d[0]
    add             v26.8h, v26.8h, v30.8h
    add             v28.8h, v28.8h, v30.8h
    sqxtun          v26.8b, v26.8h
    sqxtun          v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1             {v26.s}[0], [TMP1], 4
    st1             {v27.s}[0], [TMP3], 4
    st1             {v26.s}[1], [TMP2], 4
    st1             {v27.s}[1], [TMP4], 4
#else
    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[0], [TMP3], 1
    st1             {v26.b}[1], [TMP1], 1
    st1             {v27.b}[1], [TMP3], 1
    st1             {v26.b}[2], [TMP1], 1
    st1             {v27.b}[2], [TMP3], 1
    st1             {v26.b}[3], [TMP1], 1
    st1             {v27.b}[3], [TMP3], 1

    st1             {v26.b}[4], [TMP2], 1
    st1             {v27.b}[4], [TMP4], 1
    st1             {v26.b}[5], [TMP2], 1
    st1             {v27.b}[5], [TMP4], 1
    st1             {v26.b}[6], [TMP2], 1
    st1             {v27.b}[6], [TMP4], 1
    st1             {v26.b}[7], [TMP2], 1
    st1             {v27.b}[7], [TMP4], 1
#endif

    /* vpop {v8.4h - v15.4h}  ;not available */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ret                                           /* was "blr x30" */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper
1421
+
1422
+
1423
+ /*****************************************************************************/
1424
+
1425
+ /*
1426
+ * jsimd_idct_2x2_neon
1427
+ *
1428
+ * This function contains inverse-DCT code for getting reduced-size
1429
+ * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1430
+ * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1431
+ * function from jpeg-6b (jidctred.c).
1432
+ *
1433
+ * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1434
+ * requires much less arithmetic operations and hence should be faster.
1435
+ * The primary purpose of this particular NEON optimized function is
1436
+ * bit exact compatibility with jpeg-6b.
1437
+ */
1438
+
1439
/* 1-D reduced IDCT kernel for the 2x2 case.
 * x4/x6/x10/x12/x16 name the input row registers (DCT rows 0,1,3,5,7);
 * v14 holds the fixed-point multiplier constants.  The two results are
 * descaled by \shift and narrowed into \y26 and \y27.  Clobbers
 * v15, v20, v26. */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll           v15.4s, \x4, #15          /* tmp = row0 << 15 */
    smull           v26.4s, \x6, v14.h[3]     /* accumulate odd rows ... */
    smlal           v26.4s, \x10, v14.h[2]
    smlal           v26.4s, \x12, v14.h[1]
    smlal           v26.4s, \x16, v14.h[0]    /* ... into v26 */

    add             v20.4s, v15.4s, v26.4s    /* out0 = tmp + odd */
    sub             v15.4s, v15.4s, v26.4s    /* out1 = tmp - odd */

  .if \shift > 16
    /* rshrn's immediate is limited to the element width (16), so for
       larger descale factors shift in place first, then narrow. */
    srshr           v20.4s, v20.4s, #\shift
    srshr           v15.4s, v15.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y27, v15.4s
  .else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y27, v15.4s, #\shift
  .endif
.endm
1459
+
1460
/*-----------------------------------------------------------------------------
 * void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
 *                          JSAMPARRAY output_buf, JDIMENSION output_col)
 * ABI:   AAPCS64
 * In:    x0 = dct_table, x1 = coef_block, x2 = output_buf, x3 = output_col
 * Out:   2x2 pixel block written through output_buf rows at output_col
 * Saves: v8-v15 (callee-saved low halves) on a 64-byte stack frame
 *
 * Fix: the original returned with "blr x30", which also writes the link
 * register and corrupts the CPU's return-address predictor; "ret" is the
 * architectural return and is used instead.
 *---------------------------------------------------------------------------*/
asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* vpush {v8.4h - v15.4h}  ; not available */
    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants */
    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   | v4.4h  | v5.4h
     *   1   | v6.4h  | v7.4h
     *   2   | -      | -
     *   3   | v10.4h | v11.4h
     *   4   | -      | -
     *   5   | v12.4h | v13.4h
     *   6   | -      | -
     *   7   | v16.4h | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16    /* skip row 2 */
    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16    /* skip row 4 */
    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16    /* skip row 6 */
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]
    add             DCT_TABLE, DCT_TABLE, #16      /* skip table row 2 */
    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]
    add             DCT_TABLE, DCT_TABLE, #16      /* skip table row 4 */
    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]
    add             DCT_TABLE, DCT_TABLE, #16      /* skip table row 6 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]

    /* Pass 1 */
#if 0
    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
#else
    /* Hand-interleaved expansion of the two idct_helper calls above,
       processing both column halves at once. */
    smull           v26.4s, v6.4h, v14.h[3]
    smlal           v26.4s, v10.4h, v14.h[2]
    smlal           v26.4s, v12.4h, v14.h[1]
    smlal           v26.4s, v16.4h, v14.h[0]
    smull           v24.4s, v7.4h, v14.h[3]
    smlal           v24.4s, v11.4h, v14.h[2]
    smlal           v24.4s, v13.4h, v14.h[1]
    smlal           v24.4s, v17.4h, v14.h[0]
    sshll           v15.4s, v4.4h, #15
    sshll           v30.4s, v5.4h, #15
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s
    rshrn           v4.4h, v20.4s, #13
    rshrn           v6.4h, v15.4s, #13
    add             v20.4s, v30.4s, v24.4s
    sub             v15.4s, v30.4s, v24.4s
    rshrn           v5.4h, v20.4s, #13
    rshrn           v7.4h, v15.4s, #13
    ins             v4.d[1], v5.d[0]
    ins             v6.d[1], v7.d[0]
    transpose       v4, v6, v3, .16b, .8h
    transpose       v6, v10, v3, .16b, .4s
    ins             v11.d[0], v10.d[1]
    ins             v7.d[0], v6.d[1]
#endif

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit: recenter to unsigned and saturate to 8 bits */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    add             v26.8h, v26.8h, v30.8h
    sqxtun          v30.8b, v26.8h
    ins             v26.d[0], v30.d[0]
    sqxtun          v27.8b, v26.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[4], [TMP1], 1
    st1             {v26.b}[1], [TMP2], 1
    st1             {v27.b}[5], [TMP2], 1

    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ret                                           /* was "blr x30" */

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper
1593
+
1594
+
1595
+ /*****************************************************************************/
1596
+
1597
+ /*
1598
+ * jsimd_ycc_extrgb_convert_neon
1599
+ * jsimd_ycc_extbgr_convert_neon
1600
+ * jsimd_ycc_extrgbx_convert_neon
1601
+ * jsimd_ycc_extbgrx_convert_neon
1602
+ * jsimd_ycc_extxbgr_convert_neon
1603
+ * jsimd_ycc_extxrgb_convert_neon
1604
+ *
1605
+ * Colorspace conversion YCbCr -> RGB
1606
+ */
1607
+
1608
/* Load \size Y, Cb (U) and Cr (V) samples into v0, v4 and v5.
 * Sizes 8/4/2/1 fill successive lane groups of the same registers so that
 * a partial macroblock can be assembled by invoking the macro repeatedly
 * (4 fills lanes 0-3, then 2 fills lanes 4-5, then 1 fills lane 6).
 * Advances the Y/U/V pointers past the consumed samples. */
.macro do_load size
  .if \size == 8
    ld1             {v4.8b}, [U], 8
    ld1             {v5.8b}, [V], 8
    ld1             {v0.8b}, [Y], 8
    /* prefetch the next chunk of each plane */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1             {v4.b}[0], [U], 1
    ld1             {v4.b}[1], [U], 1
    ld1             {v4.b}[2], [U], 1
    ld1             {v4.b}[3], [U], 1
    ld1             {v5.b}[0], [V], 1
    ld1             {v5.b}[1], [V], 1
    ld1             {v5.b}[2], [V], 1
    ld1             {v5.b}[3], [V], 1
    ld1             {v0.b}[0], [Y], 1
    ld1             {v0.b}[1], [Y], 1
    ld1             {v0.b}[2], [Y], 1
    ld1             {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1             {v4.b}[4], [U], 1
    ld1             {v4.b}[5], [U], 1
    ld1             {v5.b}[4], [V], 1
    ld1             {v5.b}[5], [V], 1
    ld1             {v0.b}[4], [Y], 1
    ld1             {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1             {v4.b}[6], [U], 1
    ld1             {v5.b}[6], [V], 1
    ld1             {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm
1644
+
1645
/* Store \size converted pixels from v10/v11/v12(/v13 or v25) to [RGB].
 * \bpp selects the pixel layout: 24 (3-byte RGB), 32 (4-byte RGBX) or
 * 16 (RGB565 packed in v25).  \fast_st3 == 0 avoids the interleaving
 * st3 instruction on cores where it is slow, emitting byte stores instead.
 * Advances RGB past the stored pixels. */
.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
    st3             {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
    st1             {v10.b}[0], [RGB], #1
    st1             {v11.b}[0], [RGB], #1
    st1             {v12.b}[0], [RGB], #1

    st1             {v10.b}[1], [RGB], #1
    st1             {v11.b}[1], [RGB], #1
    st1             {v12.b}[1], [RGB], #1

    st1             {v10.b}[2], [RGB], #1
    st1             {v11.b}[2], [RGB], #1
    st1             {v12.b}[2], [RGB], #1

    st1             {v10.b}[3], [RGB], #1
    st1             {v11.b}[3], [RGB], #1
    st1             {v12.b}[3], [RGB], #1

    st1             {v10.b}[4], [RGB], #1
    st1             {v11.b}[4], [RGB], #1
    st1             {v12.b}[4], [RGB], #1

    st1             {v10.b}[5], [RGB], #1
    st1             {v11.b}[5], [RGB], #1
    st1             {v12.b}[5], [RGB], #1

    st1             {v10.b}[6], [RGB], #1
    st1             {v11.b}[6], [RGB], #1
    st1             {v12.b}[6], [RGB], #1

    st1             {v10.b}[7], [RGB], #1
    st1             {v11.b}[7], [RGB], #1
    st1             {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
    st3             {v10.b, v11.b, v12.b}[0], [RGB], 3
    st3             {v10.b, v11.b, v12.b}[1], [RGB], 3
    st3             {v10.b, v11.b, v12.b}[2], [RGB], 3
    st3             {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
    st3             {v10.b, v11.b, v12.b}[4], [RGB], 3
    st3             {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
    st3             {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
    .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
    st4             {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
    st4             {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
    st4             {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
    st4             {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
    st4             {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
    st4             {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
    st4             {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
    st4             {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
    .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
    st1             {v25.8h}, [RGB], 16
    .elseif \size == 4
    st1             {v25.4h}, [RGB], 8
    .elseif \size == 2
    st1             {v25.h}[4], [RGB], 2
    st1             {v25.h}[5], [RGB], 2
    .elseif \size == 1
    st1             {v25.h}[6], [RGB], 2
    .else
    .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
1729
+
1730
/* Emit one YCbCr->RGB conversion function for pixel layout \colorid.
 *   \bpp                      : 24, 32 or 16 bits per output pixel
 *   \r_offs/\g_offs/\b_offs   : destination register digit for each channel
 *   \rsize/\gsize/\bsize, \defsize : register arrangement suffixes
 *   \fast_st3                 : 1 = use st3, 0 = byte-store fallback
 *
 * Generated signature:
 *   void jsimd_ycc_<colorid>_convert_neon[_slowst3](
 *       JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
 *       JSAMPARRAY output_buf, int num_rows)
 *
 * Fix: the original returned with "br x30", an indirect branch that is not
 * predicted as a return; replaced with the architectural "ret". */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    uaddw           v6.8h, v2.8h, v4.8b     /* q3 = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    rshrn           v20.4h, v20.4s, #15
    rshrn2          v20.8h, v22.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn2          v24.8h, v26.4s, #14
    rshrn           v28.4h, v28.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun          v1\g_offs\defsize, v20.8h
    sqxtun          v1\r_offs\defsize, v24.8h
    sqxtun          v1\b_offs\defsize, v28.8h
  .else
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    sri             v25.8h, v21.8h, #5
    sri             v25.8h, v29.8h, #11
  .endif
.endm

/* Software pipeline stage: finish+store the current 8 pixels while
 * loading and starting the next 8.  Keeps loads/stores overlapped with
 * the multiply chains. */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.16b = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* q2 = v - 128 */
    ld1             {v0.8b}, [Y], 8
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w15

    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    get_symbol_loc  x15, Ljsimd_ycc_rgb_neon_consts

    /* Save NEON registers */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
3:
    /* Handle the tail (1-7 remaining pixels) in 4/2/1 chunks */
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ret                                           /* was "br x30" */
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm
1952
+
1953
/* Instantiate the YCbCr->RGB converters for every supported pixel layout.
 *------------------------------------ id ------ bpp R  rsize G  gsize B  bsize defsize fast_st3 */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1

/* "slow st3" variants for the 24-bit layouts, used on cores where st3 is slow */
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0

.purgem do_load
.purgem do_store
1967
+
1968
+
1969
+ /*****************************************************************************/
1970
+
1971
+ /*
1972
+ * jsimd_extrgb_ycc_convert_neon
1973
+ * jsimd_extbgr_ycc_convert_neon
1974
+ * jsimd_extrgbx_ycc_convert_neon
1975
+ * jsimd_extbgrx_ycc_convert_neon
1976
+ * jsimd_extxbgr_ycc_convert_neon
1977
+ * jsimd_extxrgb_ycc_convert_neon
1978
+ *
1979
+ * Colorspace conversion RGB -> YCbCr
1980
+ */
1981
+
1982
/* Store \size converted samples from v20 (Y), v21 (Cb/U) and v22 (Cr/V)
 * to their respective planes.  Sizes 8/4/2/1 consume successive lane
 * groups, mirroring the do_load lane layout, and advance the Y/U/V
 * pointers past the stored samples. */
.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    st1             {v20.b}[0], [Y], #1
    st1             {v20.b}[1], [Y], #1
    st1             {v20.b}[2], [Y], #1
    st1             {v20.b}[3], [Y], #1
    st1             {v21.b}[0], [U], #1
    st1             {v21.b}[1], [U], #1
    st1             {v21.b}[2], [U], #1
    st1             {v21.b}[3], [U], #1
    st1             {v22.b}[0], [V], #1
    st1             {v22.b}[1], [V], #1
    st1             {v22.b}[2], [V], #1
    st1             {v22.b}[3], [V], #1
  .elseif \size == 2
    st1             {v20.b}[4], [Y], #1
    st1             {v20.b}[5], [Y], #1
    st1             {v21.b}[4], [U], #1
    st1             {v21.b}[5], [U], #1
    st1             {v22.b}[4], [V], #1
    st1             {v22.b}[5], [V], #1
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm
2015
+
2016
/* Load \size pixels from [RGB] into channel registers v10/v11/v12(/v13).
 * \bpp selects 3-byte (24) or 4-byte (32) pixels; \fast_ld3 == 0 replaces
 * the deinterleaving ld3 with byte loads on cores where ld3 is slow.
 * Sizes 8/4/2/1 fill successive lane groups, matching do_store's layout.
 * Advances RGB past the consumed pixels. */
.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
    ld3             {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
    ld1             {v10.b}[0], [RGB], #1
    ld1             {v11.b}[0], [RGB], #1
    ld1             {v12.b}[0], [RGB], #1

    ld1             {v10.b}[1], [RGB], #1
    ld1             {v11.b}[1], [RGB], #1
    ld1             {v12.b}[1], [RGB], #1

    ld1             {v10.b}[2], [RGB], #1
    ld1             {v11.b}[2], [RGB], #1
    ld1             {v12.b}[2], [RGB], #1

    ld1             {v10.b}[3], [RGB], #1
    ld1             {v11.b}[3], [RGB], #1
    ld1             {v12.b}[3], [RGB], #1

    ld1             {v10.b}[4], [RGB], #1
    ld1             {v11.b}[4], [RGB], #1
    ld1             {v12.b}[4], [RGB], #1

    ld1             {v10.b}[5], [RGB], #1
    ld1             {v11.b}[5], [RGB], #1
    ld1             {v12.b}[5], [RGB], #1

    ld1             {v10.b}[6], [RGB], #1
    ld1             {v11.b}[6], [RGB], #1
    ld1             {v12.b}[6], [RGB], #1

    ld1             {v10.b}[7], [RGB], #1
    ld1             {v11.b}[7], [RGB], #1
    ld1             {v12.b}[7], [RGB], #1
      .endif
    prfm            pldl1keep, [RGB, #128]
    .elseif \size == 4
    ld3             {v10.b, v11.b, v12.b}[0], [RGB], #3
    ld3             {v10.b, v11.b, v12.b}[1], [RGB], #3
    ld3             {v10.b, v11.b, v12.b}[2], [RGB], #3
    ld3             {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
    ld3             {v10.b, v11.b, v12.b}[4], [RGB], #3
    ld3             {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
    ld3             {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
    .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
    ld4             {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
    prfm            pldl1keep, [RGB, #128]
    .elseif \size == 4
    ld4             {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
    ld4             {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
    ld4             {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
    ld4             {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
    ld4             {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
    ld4             {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
    ld4             {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
    .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
2089
+
2090
/* Emit one RGB->YCbCr conversion function for pixel layout \colorid.
 *   \bpp                    : 24 or 32 bits per input pixel
 *   \r_offs/\g_offs/\b_offs : source register digit for each channel
 *   \fast_ld3               : 1 = use ld3, 0 = byte-load fallback
 *
 * Generated signature:
 *   void jsimd_<colorid>_ycc_convert_neon[_slowld3](
 *       JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
 *       JDIMENSION output_row, int num_rows)
 *
 * Fix: the original returned with "br x30", an indirect branch that is not
 * predicted as a return; replaced with the architectural "ret". */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s               /* seed accumulators with */
    rev64           v26.4s, v1.4s               /* the rounding constants */
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h       /* v20 = y */
    xtn             v21.8b, v22.8h       /* v21 = u */
    xtn             v22.8b, v24.8h       /* v22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 * ARM64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants to d0, d1, d2, d3 */
    get_symbol_loc  x13, Ljsimd_rgb_ycc_neon_consts
    ld1             {v0.8h, v1.8h}, [x13]

    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save NEON registers */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:
    /* Handle the tail (1-7 remaining pixels) in 4/2/1 chunks */
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    ret                                           /* was "br x30" */

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm
2263
+
2264
/* Instantiate the RGB->YCbCr converters for every supported pixel layout.
 *------------------------------------ id ------ bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

/* "slow ld3" variants for the 24-bit layouts, used on cores where ld3 is slow */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

.purgem do_load
.purgem do_store
2277
+
2278
+
2279
+ /*****************************************************************************/
2280
+
2281
+ /*
2282
+ * Load data into workspace, applying unsigned->signed conversion
2283
+ *
2284
+ * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
2285
+ * rid of VST1.16 instructions
2286
+ */
2287
+
2288
/*-----------------------------------------------------------------------------
 * void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
 *                          DCTELEM *workspace)
 * ABI:   AAPCS64
 * In:    x0 = sample_data (array of 8 row pointers), x1 = start_col,
 *        x2 = workspace (8x8 int16 output)
 * Out:   128 bytes written to workspace: each unsigned 8-bit sample minus 128,
 *        widened to signed 16-bit (centers samples for the FDCT)
 * Clobb: x0-x4, x9-x15, v0, v16-v23; uses only caller-saved NEON regs, so no
 *        stack frame is needed
 *
 * Fix: the original returned with "br x30", an indirect branch that is not
 * predicted as a return; replaced with the architectural "ret".
 *---------------------------------------------------------------------------*/
asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x1, w1

    /* Interleave pointer arithmetic with the row loads to hide latency. */
    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
    dup             v0.8b, TMPDUP               /* v0 = 8 x 128 (bias) */
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b       /* row - 128, widened */
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    ret                                         /* was "br x30" */

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP
2354
+
2355
+ /*****************************************************************************/
2356
+
2357
/*
 * jsimd_fdct_islow_neon
 *
 * This file contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).  The following code is based
 * directly on the IJG's original jfdctint.c; see jfdctint.c for more details.
 *
 * In:    x0 = data (64 x DCTELEM, transformed in place)
 * Saves/restores v8-v15 low halves (callee-saved per AAPCS64); all other
 * registers used are caller-saved.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#define CONST_BITS 13
#define PASS1_BITS 2

#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS)

/* Fixed-point multipliers, preloaded into lanes of v0/v1 from the constants
 * table.  P_/N_ encodes the sign, the digits the magnitude (e.g. XFIX_P_0_298
 * = +FIX(0.298631336)). */
#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]

asm_function jsimd_fdct_islow_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
    ld1             {v0.8h, v1.8h}, [TMP]

    /* Save NEON registers (AAPCS64 only requires the low 64 bits of
       v8-v15 to be preserved, hence the .8b stores). */
    sub             sp, sp, #64
    mov             x10, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | v16.8h
     *   1  | d18    | d19    | v17.8h
     *   2  | d20    | d21    | v18.8h
     *   3  | d22    | d23    | v19.8h
     *   4  | d24    | d25    | v20.8h
     *   5  | d26    | d27    | v21.8h
     *   6  | d28    | d29    | v22.8h
     *   7  | d30    | d31    | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub             DATA, DATA, #64

    /* Transpose so that pass 1 operates on columns of the original block. */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
    /* 1-D FDCT, pass 1 (output scaled up by PASS1_BITS) */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */

    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b
    mov             v25.16b, v24.16b

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P1
    rshrn           v22.4h, v22.4s, #DESCALE_P1
    rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s   /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s   /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P1
    rshrn           v21.4h, v29.4s, #DESCALE_P1
    rshrn           v19.4h, v30.4s, #DESCALE_P1
    rshrn           v17.4h, v31.4s, #DESCALE_P1
    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* Transpose back so that pass 2 operates on the rows. */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT, pass 2 (removes the PASS1_BITS scaling) */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* even part */
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b
    mov             v25.16b, v24.16b

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P2
    rshrn           v22.4h, v22.4s, #DESCALE_P2
    rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P2
    rshrn           v21.4h, v29.4s, #DESCALE_P2
    rshrn           v19.4h, v30.4s, #DESCALE_P2
    rshrn           v17.4h, v31.4s, #DESCALE_P2
    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore NEON registers */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    /* Use RET (not BR x30) so the return-address predictor is engaged. */
    ret

    .unreq DATA
    .unreq TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072
2651
+
2652
+
2653
+ /*****************************************************************************/
2654
+
2655
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform).  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * In:    x0 = data (64 x DCTELEM, transformed in place)
 * Uses only caller-saved registers (x9, v0-v7, v16-v31); no stack frame.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#undef XFIX_0_541196100
/* Fixed-point multipliers for the ifast kernel, loaded into v0 lanes. */
#define XFIX_0_382683433  v0.h[0]
#define XFIX_0_541196100  v0.h[1]
#define XFIX_0_707106781  v0.h[2]
#define XFIX_1_306562965  v0.h[3]

asm_function jsimd_fdct_ifast_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
    ld1             {v0.4h}, [TMP]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    | v16.8h
     *   1  | d18    | d19    | v17.8h
     *   2  | d20    | d21    | v18.8h
     *   3  | d22    | d23    | v19.8h
     *   4  | d24    | d25    | v20.8h
     *   5  | d26    | d27    | v21.8h
     *   6  | d28    | d29    | v22.8h
     *   7  | d30    | d31    | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov             TMP, #2                 /* two identical passes: columns, then rows */
    sub             DATA, DATA, #64
1:
    /* Transpose so that each pass processes the other dimension. */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    subs            TMP, TMP, #1
    /* 1-D FDCT (butterflies as in jpeg_fdct_ifast) */
    add             v4.8h, v19.8h, v20.8h   /* tmp3 = d3 + d4 */
    sub             v20.8h, v19.8h, v20.8h  /* tmp4 = d3 - d4 */
    sub             v28.8h, v18.8h, v21.8h  /* tmp5 = d2 - d5 */
    add             v18.8h, v18.8h, v21.8h  /* tmp2 = d2 + d5 */
    sub             v29.8h, v17.8h, v22.8h  /* tmp6 = d1 - d6 */
    add             v17.8h, v17.8h, v22.8h  /* tmp1 = d1 + d6 */
    sub             v21.8h, v16.8h, v23.8h  /* tmp7 = d0 - d7 */
    add             v16.8h, v16.8h, v23.8h  /* tmp0 = d0 + d7 */
    sub             v6.8h, v17.8h, v18.8h   /* tmp12 = tmp1 - tmp2 */
    sub             v7.8h, v16.8h, v4.8h    /* tmp13 = tmp0 - tmp3 */
    add             v5.8h, v17.8h, v18.8h   /* tmp11 = tmp1 + tmp2 */
    add             v6.8h, v6.8h, v7.8h     /* tmp12 + tmp13 */
    add             v4.8h, v16.8h, v4.8h    /* tmp10 = tmp0 + tmp3 */
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781  /* z1 = (tmp12 + tmp13) * 0.707 */
    add             v19.8h, v20.8h, v28.8h  /* tmp10' = tmp4 + tmp5 */
    add             v16.8h, v4.8h, v5.8h    /* out0 = tmp10 + tmp11 */
    sub             v20.8h, v4.8h, v5.8h    /* out4 = tmp10 - tmp11 */
    add             v5.8h, v28.8h, v29.8h   /* tmp11' = tmp5 + tmp6 */
    add             v29.8h, v29.8h, v21.8h  /* tmp12' = tmp6 + tmp7 */
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781   /* z3 = tmp11' * 0.707 */
    sub             v28.8h, v19.8h, v29.8h  /* tmp10' - tmp12' */
    add             v18.8h, v7.8h, v6.8h    /* out2 = tmp13 + z1 */
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433 /* z5 = (tmp10' - tmp12') * 0.382 */
    sub             v22.8h, v7.8h, v6.8h    /* out6 = tmp13 - z1 */
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100 /* z2' = tmp10' * 0.541 */
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965  /* z4' = tmp12' * 1.306 */
    add             v6.8h, v21.8h, v5.8h    /* z11 = tmp7 + z3 */
    sub             v5.8h, v21.8h, v5.8h    /* z13 = tmp7 - z3 */
    add             v29.8h, v29.8h, v28.8h  /* tmp12' + z5 */
    add             v19.8h, v19.8h, v28.8h  /* z2 = z2' + z5 */
    add             v29.8h, v29.8h, v7.8h   /* z4 = tmp12' + z4' + z5 */
    add             v21.8h, v5.8h, v19.8h   /* out5 = z13 + z2 */
    sub             v19.8h, v5.8h, v19.8h   /* out3 = z13 - z2 */
    add             v17.8h, v6.8h, v29.8h   /* out1 = z11 + z4 */
    sub             v23.8h, v6.8h, v29.8h   /* out7 = z11 - z4 */

    b.ne            1b

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Use RET (not BR x30) so the return-address predictor is engaged. */
    ret

    .unreq DATA
    .unreq TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965
2754
+
2755
+
2756
+ /*****************************************************************************/
2757
+
2758
/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 *
 * Quantize the 64 DCT coefficients in workspace using the reciprocal-multiply
 * method and write the results to coef_block.  The divisors table is laid out
 * as four consecutive 64-element sub-tables; this code reads the reciprocals
 * at +0, the corrections at +64 elements (#(64 * 2) bytes) and the shift
 * counts at +192 elements (#(64 * 6) bytes).
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req x0
    DIVISORS        .req x1
    WORKSPACE       .req x2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req x9
    SHIFT           .req x10
    LOOP_COUNT      .req x11

    mov             LOOP_COUNT, #2          /* 2 iterations x 32 coefficients */
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
1:
    subs            LOOP_COUNT, LOOP_COUNT, #1
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
    /* Work on magnitudes; the sign is re-applied at the end. */
    abs             v20.8h, v0.8h
    abs             v21.8h, v1.8h
    abs             v22.8h, v2.8h
    abs             v23.8h, v3.8h
    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
    add             v20.8h, v20.8h, v4.8h   /* add correction */
    add             v21.8h, v21.8h, v5.8h
    add             v22.8h, v22.8h, v6.8h
    add             v23.8h, v23.8h, v7.8h
    umull           v4.4s, v20.4h, v28.4h   /* multiply by reciprocal */
    umull2          v16.4s, v20.8h, v28.8h
    umull           v5.4s, v21.4h, v29.4h
    umull2          v17.4s, v21.8h, v29.8h
    umull           v6.4s, v22.4h, v30.4h   /* multiply by reciprocal */
    umull2          v18.4s, v22.8h, v30.8h
    umull           v7.4s, v23.4h, v31.4h
    umull2          v19.4s, v23.8h, v31.8h
    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
    /* Keep the high halves of the 32-bit products (>> 16). */
    shrn            v4.4h, v4.4s, #16
    shrn            v5.4h, v5.4s, #16
    shrn            v6.4h, v6.4s, #16
    shrn            v7.4h, v7.4s, #16
    shrn2           v4.8h, v16.4s, #16
    shrn2           v5.8h, v17.4s, #16
    shrn2           v6.8h, v18.4s, #16
    shrn2           v7.8h, v19.4s, #16
    /* ushl with a negative count shifts right, so negate the shift table. */
    neg             v24.8h, v24.8h
    neg             v25.8h, v25.8h
    neg             v26.8h, v26.8h
    neg             v27.8h, v27.8h
    sshr            v0.8h, v0.8h, #15       /* extract sign (0 or -1 per lane) */
    sshr            v1.8h, v1.8h, #15
    sshr            v2.8h, v2.8h, #15
    sshr            v3.8h, v3.8h, #15
    ushl            v4.8h, v4.8h, v24.8h    /* shift */
    ushl            v5.8h, v5.8h, v25.8h
    ushl            v6.8h, v6.8h, v26.8h
    ushl            v7.8h, v7.8h, v27.8h

    /* Restore sign: (x ^ s) - s negates lanes whose sign mask s is -1. */
    eor             v4.16b, v4.16b, v0.16b
    eor             v5.16b, v5.16b, v1.16b
    eor             v6.16b, v6.16b, v2.16b
    eor             v7.16b, v7.16b, v3.16b
    sub             v4.8h, v4.8h, v0.8h
    sub             v5.8h, v5.8h, v1.8h
    sub             v6.8h, v6.8h, v2.8h
    sub             v7.8h, v7.8h, v3.8h
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64

    b.ne            1b

    /* Use RET (not BR x30) so the return-address predictor is engaged. */
    ret

    .unreq COEF_BLOCK
    .unreq DIVISORS
    .unreq WORKSPACE
    .unreq RECIPROCAL
    .unreq CORRECTION
    .unreq SHIFT
    .unreq LOOP_COUNT
2842
+
2843
+
2844
+ /*****************************************************************************/
2845
+
2846
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 *
 * Each output sample is (a + b + bias) >> 1 for a horizontal pair (a, b).
 * The bias alternates 0,1,0,1,... across columns (v16 = 0x00010000 per
 * 32-bit lane = halfword pattern {0,1}).  A tbl-based "expand right" pads
 * the final partial block by replicating the last valid column.
 */

asm_function jsimd_h2v1_downsample_neon
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR           .req x10
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    mov             TMPDUP, #0x10000        /* halfword lanes: {0,1,0,1,...} bias */
    lsl             TMP2, BLOCK_WIDTH, #4
    sub             TMP2, TMP2, IMAGE_WIDTH /* padding columns in the last block */
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
    add             TMP3, TMP3, TMP2, lsl #4  /* select tbl index row for padding */
    dup             v16.4s, TMPDUP
    ld1             {v18.16b}, [TMP3]

1:  /* row loop */
    ldr             INPTR, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f
2:  /* columns */
    ld1             {v0.16b}, [INPTR], #16
    mov             v4.16b, v16.16b         /* start from the rounding bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b           /* accumulate horizontal pairs */
    shrn            v6.8b, v4.8h, #1        /* (a + b + bias) >> 1 */
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR]
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    shrn            v6.8b, v4.8h, #1
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    /* Use RET (not BR x30) so the return-address predictor is engaged. */
    ret

    .unreq IMAGE_WIDTH
    .unreq MAX_V_SAMP
    .unreq V_SAMP
    .unreq BLOCK_WIDTH
    .unreq INPUT_DATA
    .unreq OUTPUT_DATA
    .unreq OUTPTR
    .unreq INPTR
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMPDUP
2918
+
2919
+
2920
+ /*****************************************************************************/
2921
+
2922
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 *
 * Each output sample averages a 2x2 quad: (a + b + c + d + bias) >> 2.
 * The bias alternates 1,2,1,2,... across columns (TMPDUP = (1 << 17) | 1 =
 * 0x00020001 per 32-bit lane = halfword pattern {1,2}).  A tbl-based
 * "expand right" pads the final partial block.
 */

    .balign 16
asm_function jsimd_h2v2_downsample_neon
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR0          .req x10
    INPTR1          .req x14
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    mov             TMPDUP, #1
    lsl             TMP2, BLOCK_WIDTH, #4
    lsl             TMPDUP, TMPDUP, #17
    sub             TMP2, TMP2, IMAGE_WIDTH /* padding columns in the last block */
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
    orr             TMPDUP, TMPDUP, #1      /* TMPDUP = 0x00020001 */
    add             TMP3, TMP3, TMP2, lsl #4  /* select tbl index row for padding */
    dup             v16.4s, TMPDUP
    ld1             {v18.16b}, [TMP3]

1:  /* row loop -- consumes two input rows per output row */
    ldr             INPTR0, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    ldr             INPTR1, [INPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f
2:  /* columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b         /* start from the rounding bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b           /* accumulate pairs from row 0 */
    uadalp          v4.8h, v1.16b           /* accumulate pairs from row 1 */
    shrn            v6.8b, v4.8h, #2        /* (sum of 4 + bias) >> 2 */
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b
    tbl             v3.16b, {v1.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    uadalp          v4.8h, v3.16b
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    /* Use RET (not BR x30) so the return-address predictor is engaged. */
    ret

    .unreq IMAGE_WIDTH
    .unreq MAX_V_SAMP
    .unreq V_SAMP
    .unreq BLOCK_WIDTH
    .unreq INPUT_DATA
    .unreq OUTPUT_DATA
    .unreq OUTPTR
    .unreq INPTR0
    .unreq INPTR1
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMPDUP
3005
+
3006
+
3007
+ /*****************************************************************************/
3008
+
3009
+ /*
3010
+ * GLOBAL(JOCTET *)
3011
+ * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
3012
+ * JCOEFPTR block, int last_dc_val,
3013
+ * c_derived_tbl *dctbl, c_derived_tbl *actbl)
3014
+ *
3015
+ */
3016
+
3017
+ BUFFER .req x1
3018
+ PUT_BUFFER .req x6
3019
+ PUT_BITS .req x7
3020
+ PUT_BITSw .req w7
3021
+
3022
+ .macro emit_byte
3023
+ sub PUT_BITS, PUT_BITS, #0x8
3024
+ lsr x19, PUT_BUFFER, PUT_BITS
3025
+ uxtb w19, w19
3026
+ strb w19, [BUFFER, #1]!
3027
+ cmp w19, #0xff
3028
+ b.ne 14f
3029
+ strb wzr, [BUFFER, #1]!
3030
+ 14:
3031
+ .endm
3032
+ .macro put_bits CODE, SIZE
3033
+ lsl PUT_BUFFER, PUT_BUFFER, \SIZE
3034
+ add PUT_BITS, PUT_BITS, \SIZE
3035
+ orr PUT_BUFFER, PUT_BUFFER, \CODE
3036
+ .endm
3037
+ .macro checkbuf31
3038
+ cmp PUT_BITS, #0x20
3039
+ b.lt 31f
3040
+ emit_byte
3041
+ emit_byte
3042
+ emit_byte
3043
+ emit_byte
3044
+ 31:
3045
+ .endm
3046
+ .macro checkbuf47
3047
+ cmp PUT_BITS, #0x30
3048
+ b.lt 47f
3049
+ emit_byte
3050
+ emit_byte
3051
+ emit_byte
3052
+ emit_byte
3053
+ emit_byte
3054
+ emit_byte
3055
+ 47:
3056
+ .endm
3057
+
3058
+ .macro generate_jsimd_huff_encode_one_block fast_tbl
3059
+
3060
+ .if \fast_tbl == 1
3061
+ asm_function jsimd_huff_encode_one_block_neon
3062
+ .else
3063
+ asm_function jsimd_huff_encode_one_block_neon_slowtbl
3064
+ .endif
3065
+ sub sp, sp, 272
3066
+ sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
3067
+ /* Save ARM registers */
3068
+ stp x19, x20, [sp]
3069
+ get_symbol_loc x15, Ljsimd_huff_encode_one_block_neon_consts
3070
+ ldr PUT_BUFFER, [x0, #0x10]
3071
+ ldr PUT_BITSw, [x0, #0x18]
3072
+ ldrsh w12, [x2] /* load DC coeff in w12 */
3073
+ /* prepare data */
3074
+ .if \fast_tbl == 1
3075
+ ld1 {v23.16b}, [x15], #16
3076
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
3077
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
3078
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
3079
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
3080
+ ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
3081
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
3082
+ /* ZigZag 8x8 */
3083
+ tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
3084
+ tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
3085
+ tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
3086
+ tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
3087
+ tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
3088
+ tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
3089
+ tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
3090
+ tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
3091
+ ins v0.h[0], w12
3092
+ tbx v1.16b, {v28.16b}, v16.16b
3093
+ tbx v2.16b, {v29.16b, v30.16b}, v17.16b
3094
+ tbx v5.16b, {v29.16b, v30.16b}, v18.16b
3095
+ tbx v6.16b, {v31.16b}, v19.16b
3096
+ .else
3097
+ add x13, x2, #0x22
3098
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
3099
+ ld1 {v23.16b}, [x15]
3100
+ add x14, x2, #0x18
3101
+ add x3, x2, #0x36
3102
+ ins v0.h[0], w12
3103
+ add x9, x2, #0x2
3104
+ ld1 {v1.h}[0], [x13]
3105
+ add x15, x2, #0x30
3106
+ ld1 {v2.h}[0], [x14]
3107
+ add x19, x2, #0x26
3108
+ ld1 {v3.h}[0], [x3]
3109
+ add x20, x2, #0x28
3110
+ ld1 {v0.h}[1], [x9]
3111
+ add x12, x2, #0x10
3112
+ ld1 {v1.h}[1], [x15]
3113
+ add x13, x2, #0x40
3114
+ ld1 {v2.h}[1], [x19]
3115
+ add x14, x2, #0x34
3116
+ ld1 {v3.h}[1], [x20]
3117
+ add x3, x2, #0x1a
3118
+ ld1 {v0.h}[2], [x12]
3119
+ add x9, x2, #0x20
3120
+ ld1 {v1.h}[2], [x13]
3121
+ add x15, x2, #0x32
3122
+ ld1 {v2.h}[2], [x14]
3123
+ add x19, x2, #0x42
3124
+ ld1 {v3.h}[2], [x3]
3125
+ add x20, x2, #0xc
3126
+ ld1 {v0.h}[3], [x9]
3127
+ add x12, x2, #0x12
3128
+ ld1 {v1.h}[3], [x15]
3129
+ add x13, x2, #0x24
3130
+ ld1 {v2.h}[3], [x19]
3131
+ add x14, x2, #0x50
3132
+ ld1 {v3.h}[3], [x20]
3133
+ add x3, x2, #0xe
3134
+ ld1 {v0.h}[4], [x12]
3135
+ add x9, x2, #0x4
3136
+ ld1 {v1.h}[4], [x13]
3137
+ add x15, x2, #0x16
3138
+ ld1 {v2.h}[4], [x14]
3139
+ add x19, x2, #0x60
3140
+ ld1 {v3.h}[4], [x3]
3141
+ add x20, x2, #0x1c
3142
+ ld1 {v0.h}[5], [x9]
3143
+ add x12, x2, #0x6
3144
+ ld1 {v1.h}[5], [x15]
3145
+ add x13, x2, #0x8
3146
+ ld1 {v2.h}[5], [x19]
3147
+ add x14, x2, #0x52
3148
+ ld1 {v3.h}[5], [x20]
3149
+ add x3, x2, #0x2a
3150
+ ld1 {v0.h}[6], [x12]
3151
+ add x9, x2, #0x14
3152
+ ld1 {v1.h}[6], [x13]
3153
+ add x15, x2, #0xa
3154
+ ld1 {v2.h}[6], [x14]
3155
+ add x19, x2, #0x44
3156
+ ld1 {v3.h}[6], [x3]
3157
+ add x20, x2, #0x38
3158
+ ld1 {v0.h}[7], [x9]
3159
+ add x12, x2, #0x46
3160
+ ld1 {v1.h}[7], [x15]
3161
+ add x13, x2, #0x3a
3162
+ ld1 {v2.h}[7], [x19]
3163
+ add x14, x2, #0x74
3164
+ ld1 {v3.h}[7], [x20]
3165
+ add x3, x2, #0x6a
3166
+ ld1 {v4.h}[0], [x12]
3167
+ add x9, x2, #0x54
3168
+ ld1 {v5.h}[0], [x13]
3169
+ add x15, x2, #0x2c
3170
+ ld1 {v6.h}[0], [x14]
3171
+ add x19, x2, #0x76
3172
+ ld1 {v7.h}[0], [x3]
3173
+ add x20, x2, #0x78
3174
+ ld1 {v4.h}[1], [x9]
3175
+ add x12, x2, #0x62
3176
+ ld1 {v5.h}[1], [x15]
3177
+ add x13, x2, #0x1e
3178
+ ld1 {v6.h}[1], [x19]
3179
+ add x14, x2, #0x68
3180
+ ld1 {v7.h}[1], [x20]
3181
+ add x3, x2, #0x7a
3182
+ ld1 {v4.h}[2], [x12]
3183
+ add x9, x2, #0x70
3184
+ ld1 {v5.h}[2], [x13]
3185
+ add x15, x2, #0x2e
3186
+ ld1 {v6.h}[2], [x14]
3187
+ add x19, x2, #0x5a
3188
+ ld1 {v7.h}[2], [x3]
3189
+ add x20, x2, #0x6c
3190
+ ld1 {v4.h}[3], [x9]
3191
+ add x12, x2, #0x72
3192
+ ld1 {v5.h}[3], [x15]
3193
+ add x13, x2, #0x3c
3194
+ ld1 {v6.h}[3], [x19]
3195
+ add x14, x2, #0x4c
3196
+ ld1 {v7.h}[3], [x20]
3197
+ add x3, x2, #0x5e
3198
+ ld1 {v4.h}[4], [x12]
3199
+ add x9, x2, #0x64
3200
+ ld1 {v5.h}[4], [x13]
3201
+ add x15, x2, #0x4a
3202
+ ld1 {v6.h}[4], [x14]
3203
+ add x19, x2, #0x3e
3204
+ ld1 {v7.h}[4], [x3]
3205
+ add x20, x2, #0x6e
3206
+ ld1 {v4.h}[5], [x9]
3207
+ add x12, x2, #0x56
3208
+ ld1 {v5.h}[5], [x15]
3209
+ add x13, x2, #0x58
3210
+ ld1 {v6.h}[5], [x19]
3211
+ add x14, x2, #0x4e
3212
+ ld1 {v7.h}[5], [x20]
3213
+ add x3, x2, #0x7c
3214
+ ld1 {v4.h}[6], [x12]
3215
+ add x9, x2, #0x48
3216
+ ld1 {v5.h}[6], [x13]
3217
+ add x15, x2, #0x66
3218
+ ld1 {v6.h}[6], [x14]
3219
+ add x19, x2, #0x5c
3220
+ ld1 {v7.h}[6], [x3]
3221
+ add x20, x2, #0x7e
3222
+ ld1 {v4.h}[7], [x9]
3223
+ ld1 {v5.h}[7], [x15]
3224
+ ld1 {v6.h}[7], [x19]
3225
+ ld1 {v7.h}[7], [x20]
3226
+ .endif
3227
+ cmlt v24.8h, v0.8h, #0
3228
+ cmlt v25.8h, v1.8h, #0
3229
+ cmlt v26.8h, v2.8h, #0
3230
+ cmlt v27.8h, v3.8h, #0
3231
+ cmlt v28.8h, v4.8h, #0
3232
+ cmlt v29.8h, v5.8h, #0
3233
+ cmlt v30.8h, v6.8h, #0
3234
+ cmlt v31.8h, v7.8h, #0
3235
+ abs v0.8h, v0.8h
3236
+ abs v1.8h, v1.8h
3237
+ abs v2.8h, v2.8h
3238
+ abs v3.8h, v3.8h
3239
+ abs v4.8h, v4.8h
3240
+ abs v5.8h, v5.8h
3241
+ abs v6.8h, v6.8h
3242
+ abs v7.8h, v7.8h
3243
+ eor v24.16b, v24.16b, v0.16b
3244
+ eor v25.16b, v25.16b, v1.16b
3245
+ eor v26.16b, v26.16b, v2.16b
3246
+ eor v27.16b, v27.16b, v3.16b
3247
+ eor v28.16b, v28.16b, v4.16b
3248
+ eor v29.16b, v29.16b, v5.16b
3249
+ eor v30.16b, v30.16b, v6.16b
3250
+ eor v31.16b, v31.16b, v7.16b
3251
+ cmeq v16.8h, v0.8h, #0
3252
+ cmeq v17.8h, v1.8h, #0
3253
+ cmeq v18.8h, v2.8h, #0
3254
+ cmeq v19.8h, v3.8h, #0
3255
+ cmeq v20.8h, v4.8h, #0
3256
+ cmeq v21.8h, v5.8h, #0
3257
+ cmeq v22.8h, v6.8h, #0
3258
+ xtn v16.8b, v16.8h
3259
+ xtn v18.8b, v18.8h
3260
+ xtn v20.8b, v20.8h
3261
+ xtn v22.8b, v22.8h
3262
+ umov w14, v0.h[0]
3263
+ xtn2 v16.16b, v17.8h
3264
+ umov w13, v24.h[0]
3265
+ xtn2 v18.16b, v19.8h
3266
+ clz w14, w14
3267
+ xtn2 v20.16b, v21.8h
3268
+ lsl w13, w13, w14
3269
+ cmeq v17.8h, v7.8h, #0
3270
+ sub w12, w14, #32
3271
+ xtn2 v22.16b, v17.8h
3272
+ lsr w13, w13, w14
3273
+ and v16.16b, v16.16b, v23.16b
3274
+ neg w12, w12
3275
+ and v18.16b, v18.16b, v23.16b
3276
+ add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
3277
+ and v20.16b, v20.16b, v23.16b
3278
+ add x15, sp, #0x90 /* x15 = t2 */
3279
+ and v22.16b, v22.16b, v23.16b
3280
+ ldr w10, [x4, x12, lsl #2]
3281
+ addp v16.16b, v16.16b, v18.16b
3282
+ ldrb w11, [x3, x12]
3283
+ addp v20.16b, v20.16b, v22.16b
3284
+ checkbuf47
3285
+ addp v16.16b, v16.16b, v20.16b
3286
+ put_bits x10, x11
3287
+ addp v16.16b, v16.16b, v18.16b
3288
+ checkbuf47
3289
+ umov x9, v16.D[0]
3290
+ put_bits x13, x12
3291
+ cnt v17.8b, v16.8b
3292
+ mvn x9, x9
3293
+ addv B18, v17.8b
3294
+ add x4, x5, #0x400 /* x4 = actbl->ehufsi */
3295
+ umov w12, v18.b[0]
3296
+ lsr x9, x9, #0x1 /* clear AC coeff */
3297
+ ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
3298
+ rbit x9, x9 /* x9 = index0 */
3299
+ ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
3300
+ cmp w12, #(64-8)
3301
+ add x11, sp, #16
3302
+ b.lt 4f
3303
+ cbz x9, 6f
3304
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3305
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3306
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3307
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3308
+ 1:
3309
+ clz x2, x9
3310
+ add x15, x15, x2, lsl #1
3311
+ lsl x9, x9, x2
3312
+ ldrh w20, [x15, #-126]
3313
+ 2:
3314
+ cmp x2, #0x10
3315
+ b.lt 3f
3316
+ sub x2, x2, #0x10
3317
+ checkbuf47
3318
+ put_bits x13, x14
3319
+ b 2b
3320
+ 3:
3321
+ clz w20, w20
3322
+ ldrh w3, [x15, #2]!
3323
+ sub w11, w20, #32
3324
+ lsl w3, w3, w20
3325
+ neg w11, w11
3326
+ lsr w3, w3, w20
3327
+ add x2, x11, x2, lsl #4
3328
+ lsl x9, x9, #0x1
3329
+ ldr w12, [x5, x2, lsl #2]
3330
+ ldrb w10, [x4, x2]
3331
+ checkbuf31
3332
+ put_bits x12, x10
3333
+ put_bits x3, x11
3334
+ cbnz x9, 1b
3335
+ b 6f
3336
+ 4:
3337
+ movi v21.8h, #0x0010
3338
+ clz v0.8h, v0.8h
3339
+ clz v1.8h, v1.8h
3340
+ clz v2.8h, v2.8h
3341
+ clz v3.8h, v3.8h
3342
+ clz v4.8h, v4.8h
3343
+ clz v5.8h, v5.8h
3344
+ clz v6.8h, v6.8h
3345
+ clz v7.8h, v7.8h
3346
+ ushl v24.8h, v24.8h, v0.8h
3347
+ ushl v25.8h, v25.8h, v1.8h
3348
+ ushl v26.8h, v26.8h, v2.8h
3349
+ ushl v27.8h, v27.8h, v3.8h
3350
+ ushl v28.8h, v28.8h, v4.8h
3351
+ ushl v29.8h, v29.8h, v5.8h
3352
+ ushl v30.8h, v30.8h, v6.8h
3353
+ ushl v31.8h, v31.8h, v7.8h
3354
+ neg v0.8h, v0.8h
3355
+ neg v1.8h, v1.8h
3356
+ neg v2.8h, v2.8h
3357
+ neg v3.8h, v3.8h
3358
+ neg v4.8h, v4.8h
3359
+ neg v5.8h, v5.8h
3360
+ neg v6.8h, v6.8h
3361
+ neg v7.8h, v7.8h
3362
+ ushl v24.8h, v24.8h, v0.8h
3363
+ ushl v25.8h, v25.8h, v1.8h
3364
+ ushl v26.8h, v26.8h, v2.8h
3365
+ ushl v27.8h, v27.8h, v3.8h
3366
+ ushl v28.8h, v28.8h, v4.8h
3367
+ ushl v29.8h, v29.8h, v5.8h
3368
+ ushl v30.8h, v30.8h, v6.8h
3369
+ ushl v31.8h, v31.8h, v7.8h
3370
+ add v0.8h, v21.8h, v0.8h
3371
+ add v1.8h, v21.8h, v1.8h
3372
+ add v2.8h, v21.8h, v2.8h
3373
+ add v3.8h, v21.8h, v3.8h
3374
+ add v4.8h, v21.8h, v4.8h
3375
+ add v5.8h, v21.8h, v5.8h
3376
+ add v6.8h, v21.8h, v6.8h
3377
+ add v7.8h, v21.8h, v7.8h
3378
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3379
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3380
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3381
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3382
+ 1:
3383
+ clz x2, x9
3384
+ add x15, x15, x2, lsl #1
3385
+ lsl x9, x9, x2
3386
+ ldrh w11, [x15, #-126]
3387
+ 2:
3388
+ cmp x2, #0x10
3389
+ b.lt 3f
3390
+ sub x2, x2, #0x10
3391
+ checkbuf47
3392
+ put_bits x13, x14
3393
+ b 2b
3394
+ 3:
3395
+ ldrh w3, [x15, #2]!
3396
+ add x2, x11, x2, lsl #4
3397
+ lsl x9, x9, #0x1
3398
+ ldr w12, [x5, x2, lsl #2]
3399
+ ldrb w10, [x4, x2]
3400
+ checkbuf31
3401
+ put_bits x12, x10
3402
+ put_bits x3, x11
3403
+ cbnz x9, 1b
3404
+ 6:
3405
+ add x13, sp, #0x10e
3406
+ cmp x15, x13
3407
+ b.hs 1f
3408
+ ldr w12, [x5]
3409
+ ldrb w14, [x4]
3410
+ checkbuf47
3411
+ put_bits x12, x14
3412
+ 1:
3413
+ str PUT_BUFFER, [x0, #0x10]
3414
+ str PUT_BITSw, [x0, #0x18]
3415
+ ldp x19, x20, [sp], 16
3416
+ add x0, BUFFER, #0x1
3417
+ add sp, sp, 256
3418
+ br x30
3419
+
3420
+ .endm
3421
+
3422
+ generate_jsimd_huff_encode_one_block 1
3423
+ generate_jsimd_huff_encode_one_block 0
3424
+
3425
+ .unreq BUFFER
3426
+ .unreq PUT_BUFFER
3427
+ .unreq PUT_BITS
3428
+ .unreq PUT_BITSw
3429
+
3430
+ .purgem emit_byte
3431
+ .purgem put_bits
3432
+ .purgem checkbuf31
3433
+ .purgem checkbuf47