laag-libjpeg-turbo 1.5.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (444):
  1. checksums.yaml +7 -0
  2. data/.gitignore +57 -0
  3. data/LICENSE.txt +139 -0
  4. data/README.org +34 -0
  5. data/ext/laag/libjpeg-turbo/extconf.rb +16 -0
  6. data/laag-libjpeg-turbo.gemspec +20 -0
  7. data/lib/laag/libjpeg-turbo.rb +29 -0
  8. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/.gitignore +14 -0
  9. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/.travis.yml +131 -0
  10. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/BUILDING.md +964 -0
  11. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/CMakeLists.txt +962 -0
  12. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/ChangeLog.md +1151 -0
  13. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/LICENSE.md +139 -0
  14. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/Makefile.am +794 -0
  15. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/README.ijg +279 -0
  16. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/README.md +341 -0
  17. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/acinclude.m4 +287 -0
  18. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/appveyor.yml +57 -0
  19. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/bmp.c +341 -0
  20. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/bmp.h +42 -0
  21. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/cderror.h +136 -0
  22. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/cdjpeg.c +144 -0
  23. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/cdjpeg.h +153 -0
  24. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/change.log +315 -0
  25. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/ci/keys.enc +0 -0
  26. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/cjpeg.1 +351 -0
  27. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/cjpeg.c +644 -0
  28. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/cmakescripts/cmake_uninstall.cmake.in +24 -0
  29. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/cmakescripts/testclean.cmake +39 -0
  30. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/coderules.txt +78 -0
  31. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/configure.ac +616 -0
  32. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/djpeg.1 +293 -0
  33. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/djpeg.c +782 -0
  34. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/annotated.html +104 -0
  35. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/bc_s.png +0 -0
  36. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/bdwn.png +0 -0
  37. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/classes.html +106 -0
  38. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/closed.png +0 -0
  39. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/doxygen-extra.css +3 -0
  40. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/doxygen.css +1184 -0
  41. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/doxygen.png +0 -0
  42. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/dynsections.js +97 -0
  43. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2blank.png +0 -0
  44. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2cl.png +0 -0
  45. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2doc.png +0 -0
  46. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2folderclosed.png +0 -0
  47. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2folderopen.png +0 -0
  48. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2lastnode.png +0 -0
  49. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2link.png +0 -0
  50. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2mlastnode.png +0 -0
  51. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2mnode.png +0 -0
  52. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2mo.png +0 -0
  53. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2node.png +0 -0
  54. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2ns.png +0 -0
  55. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2plastnode.png +0 -0
  56. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2pnode.png +0 -0
  57. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2splitbar.png +0 -0
  58. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/ftv2vertline.png +0 -0
  59. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/functions.html +134 -0
  60. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/functions_vars.html +134 -0
  61. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/group___turbo_j_p_e_g.html +2446 -0
  62. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/index.html +90 -0
  63. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/jquery.js +8 -0
  64. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/modules.html +95 -0
  65. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/nav_f.png +0 -0
  66. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/nav_g.png +0 -0
  67. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/nav_h.png +0 -0
  68. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/open.png +0 -0
  69. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_63.html +26 -0
  70. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_63.js +4 -0
  71. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_64.html +26 -0
  72. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_64.js +5 -0
  73. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_68.html +26 -0
  74. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_68.js +4 -0
  75. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_6e.html +26 -0
  76. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_6e.js +4 -0
  77. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_6f.html +26 -0
  78. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_6f.js +5 -0
  79. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_72.html +26 -0
  80. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_72.js +4 -0
  81. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_74.html +26 -0
  82. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_74.js +89 -0
  83. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_77.html +26 -0
  84. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_77.js +4 -0
  85. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_78.html +26 -0
  86. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_78.js +4 -0
  87. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_79.html +26 -0
  88. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/all_79.js +4 -0
  89. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/classes_74.html +26 -0
  90. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/classes_74.js +6 -0
  91. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/close.png +0 -0
  92. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/enums_74.html +26 -0
  93. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/enums_74.js +7 -0
  94. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/enumvalues_74.html +26 -0
  95. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/enumvalues_74.js +34 -0
  96. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/functions_74.html +26 -0
  97. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/functions_74.js +28 -0
  98. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/groups_74.html +26 -0
  99. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/groups_74.js +4 -0
  100. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/mag_sel.png +0 -0
  101. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/nomatches.html +12 -0
  102. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/search.css +271 -0
  103. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/search.js +809 -0
  104. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/search_l.png +0 -0
  105. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/search_m.png +0 -0
  106. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/search_r.png +0 -0
  107. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/typedefs_74.html +26 -0
  108. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/typedefs_74.js +5 -0
  109. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_63.html +26 -0
  110. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_63.js +4 -0
  111. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_64.html +26 -0
  112. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_64.js +5 -0
  113. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_68.html +26 -0
  114. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_68.js +4 -0
  115. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_6e.html +26 -0
  116. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_6e.js +4 -0
  117. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_6f.html +26 -0
  118. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_6f.js +5 -0
  119. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_72.html +26 -0
  120. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_72.js +4 -0
  121. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_74.html +26 -0
  122. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_74.js +9 -0
  123. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_77.html +26 -0
  124. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_77.js +4 -0
  125. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_78.html +26 -0
  126. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_78.js +4 -0
  127. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_79.html +26 -0
  128. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/search/variables_79.js +4 -0
  129. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/structtjregion.html +186 -0
  130. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/structtjscalingfactor.html +148 -0
  131. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/structtjtransform.html +212 -0
  132. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/sync_off.png +0 -0
  133. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/sync_on.png +0 -0
  134. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/tab_a.png +0 -0
  135. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/tab_b.png +0 -0
  136. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/tab_h.png +0 -0
  137. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/tab_s.png +0 -0
  138. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doc/html/tabs.css +60 -0
  139. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doxygen-extra.css +3 -0
  140. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/doxygen.config +16 -0
  141. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/example.c +433 -0
  142. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jaricom.c +156 -0
  143. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/CMakeLists.txt +57 -0
  144. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/MANIFEST.MF +2 -0
  145. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/Makefile.am +75 -0
  146. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/README +52 -0
  147. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/TJBench.java +926 -0
  148. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/TJExample.java +362 -0
  149. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/TJUnitTest.java +959 -0
  150. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/allclasses-frame.html +24 -0
  151. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/allclasses-noframe.html +24 -0
  152. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/constant-values.html +479 -0
  153. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/deprecated-list.html +248 -0
  154. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/help-doc.html +206 -0
  155. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/index-all.html +980 -0
  156. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/index.html +70 -0
  157. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/TJ.html +1254 -0
  158. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html +922 -0
  159. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html +237 -0
  160. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html +1235 -0
  161. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/TJException.html +287 -0
  162. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html +333 -0
  163. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html +706 -0
  164. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html +417 -0
  165. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html +761 -0
  166. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/package-frame.html +31 -0
  167. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/package-summary.html +198 -0
  168. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/org/libjpegturbo/turbojpeg/package-tree.html +156 -0
  169. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/overview-tree.html +160 -0
  170. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/package-list +1 -0
  171. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/resources/background.gif +0 -0
  172. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/resources/tab.gif +0 -0
  173. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/resources/titlebar.gif +0 -0
  174. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/resources/titlebar_end.gif +0 -0
  175. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/serialized-form.html +159 -0
  176. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/doc/stylesheet.css +474 -0
  177. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJ.java +513 -0
  178. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJCompressor.java +658 -0
  179. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java +76 -0
  180. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJDecompressor.java +909 -0
  181. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJException.java +53 -0
  182. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJLoader.java.in +35 -0
  183. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJLoader.java.tmpl +59 -0
  184. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java +104 -0
  185. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJTransform.java +208 -0
  186. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/TJTransformer.java +163 -0
  187. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org/libjpegturbo/turbojpeg/YUVImage.java +443 -0
  188. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org_libjpegturbo_turbojpeg_TJ.h +129 -0
  189. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org_libjpegturbo_turbojpeg_TJCompressor.h +101 -0
  190. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org_libjpegturbo_turbojpeg_TJDecompressor.h +101 -0
  191. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/java/org_libjpegturbo_turbojpeg_TJTransformer.h +29 -0
  192. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcapimin.c +295 -0
  193. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcapistd.c +162 -0
  194. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcarith.c +928 -0
  195. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jccoefct.c +449 -0
  196. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jccolext.c +148 -0
  197. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jccolor.c +719 -0
  198. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcdctmgr.c +721 -0
  199. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jchuff.c +1091 -0
  200. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jchuff.h +43 -0
  201. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcinit.c +77 -0
  202. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcmainct.c +162 -0
  203. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcmarker.c +665 -0
  204. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcmaster.c +639 -0
  205. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcomapi.c +109 -0
  206. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jconfig.h.in +73 -0
  207. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jconfig.txt +143 -0
  208. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jconfigint.h.in +17 -0
  209. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcparam.c +542 -0
  210. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcphuff.c +834 -0
  211. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcprepct.c +357 -0
  212. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcsample.c +539 -0
  213. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jcstest.c +126 -0
  214. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jctrans.c +402 -0
  215. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdapimin.c +407 -0
  216. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdapistd.c +637 -0
  217. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdarith.c +769 -0
  218. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdatadst-tj.c +202 -0
  219. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdatadst.c +293 -0
  220. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdatasrc-tj.c +191 -0
  221. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdatasrc.c +295 -0
  222. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdcoefct.c +693 -0
  223. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdcoefct.h +82 -0
  224. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdcol565.c +384 -0
  225. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdcolext.c +143 -0
  226. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdcolor.c +897 -0
  227. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdct.h +208 -0
  228. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jddctmgr.c +352 -0
  229. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdhuff.c +822 -0
  230. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdhuff.h +234 -0
  231. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdinput.c +405 -0
  232. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdmainct.c +456 -0
  233. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdmainct.h +71 -0
  234. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdmarker.c +1377 -0
  235. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdmaster.c +736 -0
  236. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdmaster.h +28 -0
  237. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdmerge.c +627 -0
  238. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdmrg565.c +356 -0
  239. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdmrgext.c +186 -0
  240. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdphuff.c +674 -0
  241. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdpostct.c +290 -0
  242. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdsample.c +517 -0
  243. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdsample.h +50 -0
  244. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jdtrans.c +155 -0
  245. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jerror.c +251 -0
  246. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jerror.h +317 -0
  247. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jfdctflt.c +169 -0
  248. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jfdctfst.c +227 -0
  249. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jfdctint.c +286 -0
  250. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jidctflt.c +240 -0
  251. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jidctfst.c +371 -0
  252. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jidctint.c +2627 -0
  253. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jidctred.c +403 -0
  254. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jinclude.h +84 -0
  255. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jmemmgr.c +1183 -0
  256. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jmemnobs.c +115 -0
  257. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jmemsys.h +178 -0
  258. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jmorecfg.h +421 -0
  259. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jpeg_nbits_table.h +4098 -0
  260. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jpegcomp.h +31 -0
  261. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jpegint.h +368 -0
  262. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jpeglib.h +1122 -0
  263. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jpegtran.1 +290 -0
  264. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jpegtran.c +551 -0
  265. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jquant1.c +857 -0
  266. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jquant2.c +1282 -0
  267. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jsimd.h +93 -0
  268. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jsimd_none.c +404 -0
  269. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jsimddct.h +74 -0
  270. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jstdhuff.c +135 -0
  271. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jutils.c +133 -0
  272. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/jversion.h +49 -0
  273. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/libjpeg.map.in +11 -0
  274. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/libjpeg.txt +3104 -0
  275. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/md5/CMakeLists.txt +1 -0
  276. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/md5/Makefile.am +4 -0
  277. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/md5/md5.c +340 -0
  278. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/md5/md5.h +49 -0
  279. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/md5/md5cmp.c +60 -0
  280. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/md5/md5hl.c +114 -0
  281. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdbmp.c +483 -0
  282. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdcolmap.c +254 -0
  283. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdgif.c +39 -0
  284. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdjpgcom.1 +63 -0
  285. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdjpgcom.c +510 -0
  286. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdppm.c +471 -0
  287. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdrle.c +389 -0
  288. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdswitch.c +424 -0
  289. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/rdtarga.c +503 -0
  290. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/Distribution.xml +24 -0
  291. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/License.rtf +20 -0
  292. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/ReadMe.txt +5 -0
  293. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/Welcome.rtf +17 -0
  294. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/deb-control.tmpl +31 -0
  295. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/libjpeg-turbo.nsi.in +162 -0
  296. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/libjpeg-turbo.spec.in +164 -0
  297. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/libjpeg.pc.in +10 -0
  298. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/libturbojpeg.pc.in +10 -0
  299. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/makecygwinpkg.in +42 -0
  300. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/makedpkg.in +82 -0
  301. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/makemacpkg.in +470 -0
  302. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/release/uninstall.in +112 -0
  303. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/sharedlib/CMakeLists.txt +73 -0
  304. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/CMakeLists.txt +81 -0
  305. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/Makefile.am +102 -0
  306. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jccolext-altivec.c +267 -0
  307. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jccolext-mmx.asm +476 -0
  308. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jccolext-sse2-64.asm +486 -0
  309. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jccolext-sse2.asm +503 -0
  310. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jccolor-altivec.c +104 -0
  311. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jccolor-mmx.asm +122 -0
  312. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jccolor-sse2-64.asm +121 -0
  313. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jccolor-sse2.asm +121 -0
  314. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcgray-altivec.c +99 -0
  315. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcgray-mmx.asm +115 -0
  316. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcgray-sse2-64.asm +114 -0
  317. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcgray-sse2.asm +114 -0
  318. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcgryext-altivec.c +227 -0
  319. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcgryext-mmx.asm +356 -0
  320. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcgryext-sse2-64.asm +365 -0
  321. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcgryext-sse2.asm +384 -0
  322. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jchuff-sse2-64.asm +360 -0
  323. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jchuff-sse2.asm +426 -0
  324. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcolsamp.inc +104 -0
  325. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcsample-altivec.c +158 -0
  326. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcsample-mmx.asm +323 -0
  327. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcsample-sse2-64.asm +329 -0
  328. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcsample-sse2.asm +350 -0
  329. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jcsample.h +28 -0
  330. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdcolext-altivec.c +274 -0
  331. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdcolext-mmx.asm +404 -0
  332. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdcolext-sse2-64.asm +440 -0
  333. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdcolext-sse2.asm +459 -0
  334. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdcolor-altivec.c +96 -0
  335. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdcolor-mmx.asm +119 -0
  336. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdcolor-sse2-64.asm +119 -0
  337. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdcolor-sse2.asm +119 -0
  338. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdct.inc +27 -0
  339. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdmerge-altivec.c +108 -0
  340. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdmerge-mmx.asm +125 -0
  341. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdmerge-sse2-64.asm +125 -0
  342. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdmerge-sse2.asm +125 -0
  343. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdmrgext-altivec.c +323 -0
  344. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdmrgext-mmx.asm +463 -0
  345. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdmrgext-sse2-64.asm +537 -0
  346. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdmrgext-sse2.asm +518 -0
  347. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdsample-altivec.c +392 -0
  348. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdsample-mmx.asm +736 -0
  349. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdsample-sse2-64.asm +670 -0
  350. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jdsample-sse2.asm +728 -0
  351. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctflt-3dn.asm +319 -0
  352. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctflt-sse-64.asm +357 -0
  353. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctflt-sse.asm +369 -0
  354. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctfst-altivec.c +156 -0
  355. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctfst-mmx.asm +396 -0
  356. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctfst-sse2-64.asm +391 -0
  357. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctfst-sse2.asm +403 -0
  358. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctint-altivec.c +262 -0
  359. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctint-mmx.asm +621 -0
  360. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctint-sse2-64.asm +621 -0
  361. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jfdctint-sse2.asm +633 -0
  362. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctflt-3dn.asm +451 -0
  363. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctflt-sse.asm +571 -0
  364. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctflt-sse2-64.asm +482 -0
  365. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctflt-sse2.asm +497 -0
  366. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctfst-altivec.c +257 -0
  367. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctfst-mmx.asm +499 -0
  368. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctfst-sse2-64.asm +491 -0
  369. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctfst-sse2.asm +501 -0
  370. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctint-altivec.c +359 -0
  371. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctint-mmx.asm +851 -0
  372. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctint-sse2-64.asm +847 -0
  373. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctint-sse2.asm +858 -0
  374. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctred-mmx.asm +705 -0
  375. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctred-sse2-64.asm +575 -0
  376. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jidctred-sse2.asm +593 -0
  377. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jpeg_nbits_table.inc +4097 -0
  378. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jquant-3dn.asm +232 -0
  379. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jquant-mmx.asm +273 -0
  380. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jquant-sse.asm +210 -0
  381. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jquantf-sse2-64.asm +157 -0
  382. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jquantf-sse2.asm +170 -0
  383. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jquanti-altivec.c +252 -0
  384. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jquanti-sse2-64.asm +186 -0
  385. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jquanti-sse2.asm +199 -0
  386. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd.h +871 -0
  387. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_altivec.h +99 -0
  388. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_arm.c +728 -0
  389. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_arm64.c +803 -0
  390. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_arm64_neon.S +3425 -0
  391. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_arm_neon.S +2878 -0
  392. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_i386.c +1091 -0
  393. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_mips.c +1140 -0
  394. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_mips_dspr2.S +4486 -0
  395. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_mips_dspr2_asm.h +283 -0
  396. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_powerpc.c +852 -0
  397. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimd_x86_64.c +887 -0
  398. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimdcfg.inc.h +130 -0
  399. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimdcpu.asm +104 -0
  400. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/jsimdext.inc +375 -0
  401. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/simd/nasm_lt.sh +60 -0
  402. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/structure.txt +904 -0
  403. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/nightshot_iso_100.bmp +0 -0
  404. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/nightshot_iso_100.txt +25 -0
  405. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/testimgari.jpg +0 -0
  406. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/testimgint.jpg +0 -0
  407. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/testorig.jpg +0 -0
  408. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/testorig.ppm +4 -0
  409. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/testorig12.jpg +0 -0
  410. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/vgl_5674_0098.bmp +0 -0
  411. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/vgl_6434_0018a.bmp +0 -0
  412. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/testimages/vgl_6548_0026a.bmp +0 -0
  413. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/tjbench.c +1010 -0
  414. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/tjbenchtest.in +252 -0
  415. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/tjbenchtest.java.in +207 -0
  416. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/tjexampletest.in +150 -0
  417. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/tjunittest.c +734 -0
  418. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/tjutil.c +66 -0
  419. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/tjutil.h +47 -0
  420. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/transupp.c +1626 -0
  421. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/transupp.h +207 -0
  422. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/turbojpeg-jni.c +1166 -0
  423. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/turbojpeg-mapfile +56 -0
  424. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/turbojpeg-mapfile.jni +92 -0
  425. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/turbojpeg.c +2175 -0
  426. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/turbojpeg.h +1545 -0
  427. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/usage.txt +635 -0
  428. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/win/jconfig.h.in +51 -0
  429. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/win/jconfigint.h.in +13 -0
  430. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/win/jpeg62-memsrcdst.def +106 -0
  431. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/win/jpeg62.def +104 -0
  432. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/win/jpeg7-memsrcdst.def +108 -0
  433. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/win/jpeg7.def +106 -0
  434. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/win/jpeg8.def +109 -0
  435. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/win/jsimdcfg.inc +94 -0
  436. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/wizard.txt +211 -0
  437. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/wrbmp.c +494 -0
  438. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/wrgif.c +413 -0
  439. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/wrjpgcom.1 +103 -0
  440. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/wrjpgcom.c +592 -0
  441. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/wrppm.c +280 -0
  442. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/wrrle.c +308 -0
  443. data/vendor/github.com/libjpeg-turbo/libjpeg-turbo/wrtarga.c +261 -0
  444. metadata +509 -0
@@ -0,0 +1,3425 @@
1
+ /*
2
+ * ARMv8 NEON optimizations for libjpeg-turbo
3
+ *
4
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
5
+ * All Rights Reserved.
6
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7
+ * Copyright (C) 2013-2014, Linaro Limited. All Rights Reserved.
8
+ * Author: Ragesh Radhakrishnan <ragesh.r@linaro.org>
9
+ * Copyright (C) 2014-2016, D. R. Commander. All Rights Reserved.
10
+ * Copyright (C) 2015-2016, Matthieu Darbois. All Rights Reserved.
11
+ * Copyright (C) 2016, Siarhei Siamashka. All Rights Reserved.
12
+ *
13
+ * This software is provided 'as-is', without any express or implied
14
+ * warranty. In no event will the authors be held liable for any damages
15
+ * arising from the use of this software.
16
+ *
17
+ * Permission is granted to anyone to use this software for any purpose,
18
+ * including commercial applications, and to alter it and redistribute it
19
+ * freely, subject to the following restrictions:
20
+ *
21
+ * 1. The origin of this software must not be misrepresented; you must not
22
+ * claim that you wrote the original software. If you use this software
23
+ * in a product, an acknowledgment in the product documentation would be
24
+ * appreciated but is not required.
25
+ * 2. Altered source versions must be plainly marked as such, and must not be
26
+ * misrepresented as being the original software.
27
+ * 3. This notice may not be removed or altered from any source distribution.
28
+ */
29
+
30
+ #if defined(__linux__) && defined(__ELF__)
31
+ .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
32
+ #endif
33
+
34
+ .text
35
+
36
+
37
+ #define RESPECT_STRICT_ALIGNMENT 1
38
+
39
+
40
+ /*****************************************************************************/
41
+
42
/* Supplementary macro for setting function attributes.
 *
 * Emits the platform-appropriate definition of a global code symbol:
 *   - Mach-O (__APPLE__): C symbols carry a leading underscore.
 *   - ELF: additionally mark the symbol .hidden (not exported from
 *     shared objects) and give it %function type so tools treat the
 *     label as code.
 */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
56
+
57
/* Transpose the elements of a single 128-bit register.
 *
 * x0, x1   : x0 holds the input; results land in x0 (even lanes) and
 *            x1 (odd lanes)
 * xi       : scratch register, accessed through the \xilen suffix
 * literal  : lane-arrangement suffix (e.g. ".8b") applied to trn1/trn2
 */
.macro transpose_single x0, x1, xi, xilen, literal
    ins  \xi\xilen[0], \x0\xilen[0]              /* stash low element of x0 */
    ins  \x1\xilen[0], \x0\xilen[1]              /* x1 gets high element of x0 */
    trn1 \x0\literal, \x0\literal, \x1\literal   /* interleave even lanes */
    trn2 \x1\literal, \xi\literal, \x1\literal   /* interleave odd lanes */
.endm
64
+
65
/* Transpose elements across two different registers.
 *
 * x0, x1   : the two registers being transposed in place
 * xi       : scratch register that preserves the original x0
 * literal  : lane-arrangement suffix applied to trn1/trn2
 */
.macro transpose x0, x1, xi, xilen, literal
    mov  \xi\xilen, \x0\xilen                    /* keep a copy of x0 */
    trn1 \x0\literal, \x0\literal, \x1\literal   /* even lanes -> x0 */
    trn2 \x1\literal, \xi\literal, \x1\literal   /* odd lanes -> x1 */
.endm
71
+
72
/* Transpose a block of 4x4 coefficients in four 64-bit registers:
 * 32-bit-element stage, pairing rows 0/2 and 1/3.
 *
 * x0..x3   : row registers, each used with its own \xNlen suffix
 * xi       : scratch register (original row preserved across trn1/trn2)
 */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov  \xi\xilen, \x0\xilen                    /* save row 0 */
    trn1 \x0\x0len, \x0\x0len, \x2\x2len
    trn2 \x2\x2len, \xi\x0len, \x2\x2len
    mov  \xi\xilen, \x1\xilen                    /* save row 1 */
    trn1 \x1\x1len, \x1\x1len, \x3\x3len
    trn2 \x3\x3len, \xi\x1len, \x3\x3len
.endm
81
+
82
/* Transpose a block of 4x4 coefficients: 16-bit-element stage, pairing
 * rows 0/1 and 2/3.
 *
 * NOTE(review): the trn2 destinations use the \x2len suffix for rows 1
 * and 3 (e.g. "\x1\x2len") — this mirrors the upstream source, and is
 * harmless because callers pass the same ".4h" suffix for every row.
 */
.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov  \xi\xilen, \x0\xilen                    /* save row 0 */
    trn1 \x0\x0len, \x0\x0len, \x1\x1len
    trn2 \x1\x2len, \xi\x0len, \x1\x2len
    mov  \xi\xilen, \x2\xilen                    /* save row 2 */
    trn1 \x2\x2len, \x2\x2len, \x3\x3len
    trn2 \x3\x2len, \xi\x1len, \x3\x3len
.endm
90
+
91
/* Full 4x4 transpose of 16-bit coefficients held in x0..x3:
 * a 16-bit interleave pass followed by a 32-bit interleave pass.
 * x5 is a scratch register, used through its .16b view.
 */
.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm
95
+
96
/* Transpose an 8x8 block of 16-bit elements held in l0..l7.
 *
 * l0..l7   : the eight row registers, transposed in place
 * t0..t3   : four scratch registers
 *
 * Three interleave passes: 16-bit (trn1/trn2 .8h), then 32-bit (.4s),
 * then 64-bit (.2d). The instruction order below is significant —
 * scratch and row registers are reused as soon as their old contents
 * are dead.
 */
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    /* Pass 1: interleave adjacent row pairs on 16-bit lanes. */
    trn1 \t0\().8h, \l0\().8h, \l1\().8h
    trn1 \t1\().8h, \l2\().8h, \l3\().8h
    trn1 \t2\().8h, \l4\().8h, \l5\().8h
    trn1 \t3\().8h, \l6\().8h, \l7\().8h
    trn2 \l1\().8h, \l0\().8h, \l1\().8h
    trn2 \l3\().8h, \l2\().8h, \l3\().8h
    trn2 \l5\().8h, \l4\().8h, \l5\().8h
    trn2 \l7\().8h, \l6\().8h, \l7\().8h

    /* Pass 2: interleave the pass-1 results on 32-bit lanes. */
    trn1 \l4\().4s, \t2\().4s, \t3\().4s
    trn2 \t3\().4s, \t2\().4s, \t3\().4s
    trn1 \t2\().4s, \t0\().4s, \t1\().4s
    trn2 \l2\().4s, \t0\().4s, \t1\().4s
    trn1 \t0\().4s, \l1\().4s, \l3\().4s
    trn2 \l3\().4s, \l1\().4s, \l3\().4s
    trn2 \t1\().4s, \l5\().4s, \l7\().4s
    trn1 \l5\().4s, \l5\().4s, \l7\().4s

    /* Pass 3: interleave on 64-bit lanes, writing final rows. */
    trn2 \l6\().2d, \l2\().2d, \t3\().2d
    trn1 \l0\().2d, \t2\().2d, \l4\().2d
    trn1 \l1\().2d, \t0\().2d, \l5\().2d
    trn2 \l7\().2d, \l3\().2d, \t1\().2d
    trn1 \l2\().2d, \l2\().2d, \t3\().2d
    trn2 \l4\().2d, \t2\().2d, \l4\().2d
    trn1 \l3\().2d, \l3\().2d, \t1\().2d
    trn2 \l5\().2d, \t0\().2d, \l5\().2d
.endm
124
+
125
+
126
+ #define CENTERJSAMPLE 128
127
+
128
+ /*****************************************************************************/
129
+
130
+ /*
131
+ * Perform dequantization and inverse DCT on one block of coefficients.
132
+ *
133
+ * GLOBAL(void)
134
+ * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
135
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
136
+ */
137
+
138
+ #define CONST_BITS 13
139
+ #define PASS1_BITS 2
140
+
141
+ #define F_0_298 2446 /* FIX(0.298631336) */
142
+ #define F_0_390 3196 /* FIX(0.390180644) */
143
+ #define F_0_541 4433 /* FIX(0.541196100) */
144
+ #define F_0_765 6270 /* FIX(0.765366865) */
145
+ #define F_0_899 7373 /* FIX(0.899976223) */
146
+ #define F_1_175 9633 /* FIX(1.175875602) */
147
+ #define F_1_501 12299 /* FIX(1.501321110) */
148
+ #define F_1_847 15137 /* FIX(1.847759065) */
149
+ #define F_1_961 16069 /* FIX(1.961570560) */
150
+ #define F_2_053 16819 /* FIX(2.053119869) */
151
+ #define F_2_562 20995 /* FIX(2.562915447) */
152
+ #define F_3_072 25172 /* FIX(3.072711026) */
153
+
154
+ .balign 16
155
+ Ljsimd_idct_islow_neon_consts:
156
+ .short F_0_298
157
+ .short -F_0_390
158
+ .short F_0_541
159
+ .short F_0_765
160
+ .short - F_0_899
161
+ .short F_1_175
162
+ .short F_1_501
163
+ .short - F_1_847
164
+ .short - F_1_961
165
+ .short F_2_053
166
+ .short - F_2_562
167
+ .short F_3_072
168
+ .short 0 /* padding */
169
+ .short 0
170
+ .short 0
171
+ .short 0
172
+
173
+ #undef F_0_298
174
+ #undef F_0_390
175
+ #undef F_0_541
176
+ #undef F_0_765
177
+ #undef F_0_899
178
+ #undef F_1_175
179
+ #undef F_1_501
180
+ #undef F_1_847
181
+ #undef F_1_961
182
+ #undef F_2_053
183
+ #undef F_2_562
184
+ #undef F_3_072
185
+
186
+ #define XFIX_P_0_298 v0.h[0]
187
+ #define XFIX_N_0_390 v0.h[1]
188
+ #define XFIX_P_0_541 v0.h[2]
189
+ #define XFIX_P_0_765 v0.h[3]
190
+ #define XFIX_N_0_899 v0.h[4]
191
+ #define XFIX_P_1_175 v0.h[5]
192
+ #define XFIX_P_1_501 v0.h[6]
193
+ #define XFIX_N_1_847 v0.h[7]
194
+ #define XFIX_N_1_961 v1.h[0]
195
+ #define XFIX_P_2_053 v1.h[1]
196
+ #define XFIX_N_2_562 v1.h[2]
197
+ #define XFIX_P_3_072 v1.h[3]
198
+
199
+ asm_function jsimd_idct_islow_neon
200
+ DCT_TABLE .req x0
201
+ COEF_BLOCK .req x1
202
+ OUTPUT_BUF .req x2
203
+ OUTPUT_COL .req x3
204
+ TMP1 .req x0
205
+ TMP2 .req x1
206
+ TMP3 .req x9
207
+ TMP4 .req x10
208
+ TMP5 .req x11
209
+ TMP6 .req x12
210
+ TMP7 .req x13
211
+ TMP8 .req x14
212
+
213
+ /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
214
+ guarantee that the upper (unused) 32 bits of x3 are valid. This
215
+ instruction ensures that those bits are set to zero. */
216
+ uxtw x3, w3
217
+
218
+ sub sp, sp, #64
219
+ adr x15, Ljsimd_idct_islow_neon_consts
220
+ mov x10, sp
221
+ st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
222
+ st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
223
+ ld1 {v0.8h, v1.8h}, [x15]
224
+ ld1 {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
225
+ ld1 {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
226
+ ld1 {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
227
+ ld1 {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
228
+
229
+ cmeq v16.8h, v3.8h, #0
230
+ cmeq v26.8h, v4.8h, #0
231
+ cmeq v27.8h, v5.8h, #0
232
+ cmeq v28.8h, v6.8h, #0
233
+ cmeq v29.8h, v7.8h, #0
234
+ cmeq v30.8h, v8.8h, #0
235
+ cmeq v31.8h, v9.8h, #0
236
+
237
+ and v10.16b, v16.16b, v26.16b
238
+ and v11.16b, v27.16b, v28.16b
239
+ and v12.16b, v29.16b, v30.16b
240
+ and v13.16b, v31.16b, v10.16b
241
+ and v14.16b, v11.16b, v12.16b
242
+ mul v2.8h, v2.8h, v18.8h
243
+ and v15.16b, v13.16b, v14.16b
244
+ shl v10.8h, v2.8h, #(PASS1_BITS)
245
+ sqxtn v16.8b, v15.8h
246
+ mov TMP1, v16.d[0]
247
+ mvn TMP2, TMP1
248
+
249
+ cbnz TMP2, 2f
250
+ /* case all AC coeffs are zeros */
251
+ dup v2.2d, v10.d[0]
252
+ dup v6.2d, v10.d[1]
253
+ mov v3.16b, v2.16b
254
+ mov v7.16b, v6.16b
255
+ mov v4.16b, v2.16b
256
+ mov v8.16b, v6.16b
257
+ mov v5.16b, v2.16b
258
+ mov v9.16b, v6.16b
259
+ 1:
260
+ /* for this transpose, we should organise data like this:
261
+ * 00, 01, 02, 03, 40, 41, 42, 43
262
+ * 10, 11, 12, 13, 50, 51, 52, 53
263
+ * 20, 21, 22, 23, 60, 61, 62, 63
264
+ * 30, 31, 32, 33, 70, 71, 72, 73
265
+ * 04, 05, 06, 07, 44, 45, 46, 47
266
+ * 14, 15, 16, 17, 54, 55, 56, 57
267
+ * 24, 25, 26, 27, 64, 65, 66, 67
268
+ * 34, 35, 36, 37, 74, 75, 76, 77
269
+ */
270
+ trn1 v28.8h, v2.8h, v3.8h
271
+ trn1 v29.8h, v4.8h, v5.8h
272
+ trn1 v30.8h, v6.8h, v7.8h
273
+ trn1 v31.8h, v8.8h, v9.8h
274
+ trn2 v16.8h, v2.8h, v3.8h
275
+ trn2 v17.8h, v4.8h, v5.8h
276
+ trn2 v18.8h, v6.8h, v7.8h
277
+ trn2 v19.8h, v8.8h, v9.8h
278
+ trn1 v2.4s, v28.4s, v29.4s
279
+ trn1 v6.4s, v30.4s, v31.4s
280
+ trn1 v3.4s, v16.4s, v17.4s
281
+ trn1 v7.4s, v18.4s, v19.4s
282
+ trn2 v4.4s, v28.4s, v29.4s
283
+ trn2 v8.4s, v30.4s, v31.4s
284
+ trn2 v5.4s, v16.4s, v17.4s
285
+ trn2 v9.4s, v18.4s, v19.4s
286
+ /* Even part: reverse the even part of the forward DCT. */
287
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
288
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
289
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
290
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
291
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
292
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
293
+ mov v21.16b, v19.16b /* tmp3 = z1 */
294
+ mov v20.16b, v18.16b /* tmp3 = z1 */
295
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
296
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
297
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
298
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
299
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
300
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
301
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
302
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
303
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
304
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
305
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
306
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
307
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
308
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
309
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
310
+
311
+ /* Odd part per figure 8; the matrix is unitary and hence its
312
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
313
+ */
314
+
315
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
316
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
317
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
318
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
319
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
320
+
321
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
322
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
323
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
324
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
325
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
326
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
327
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
328
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
329
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
330
+
331
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
332
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
333
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
334
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
335
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
336
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
337
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
338
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
339
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
340
+
341
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
342
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
343
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
344
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
345
+
346
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
347
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
348
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
349
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
350
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
351
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
352
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
353
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
354
+
355
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
356
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
357
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
358
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
359
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
360
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
361
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
362
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
363
+
364
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
365
+
366
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
367
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
368
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
369
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
370
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
371
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
372
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
373
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
374
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
375
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
376
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
377
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
378
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
379
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
380
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
381
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
382
+
383
+ shrn v2.4h, v18.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
384
+ shrn v9.4h, v20.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
385
+ shrn v3.4h, v22.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
386
+ shrn v8.4h, v24.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
387
+ shrn v4.4h, v26.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
388
+ shrn v7.4h, v28.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
389
+ shrn v5.4h, v14.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
390
+ shrn v6.4h, v16.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
391
+ shrn2 v2.8h, v19.4s, #16 /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
392
+ shrn2 v9.8h, v21.4s, #16 /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
393
+ shrn2 v3.8h, v23.4s, #16 /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
394
+ shrn2 v8.8h, v25.4s, #16 /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
395
+ shrn2 v4.8h, v27.4s, #16 /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
396
+ shrn2 v7.8h, v29.4s, #16 /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
397
+ shrn2 v5.8h, v15.4s, #16 /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
398
+ shrn2 v6.8h, v17.4s, #16 /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
399
+ movi v0.16b, #(CENTERJSAMPLE)
400
+ /* Prepare pointers (dual-issue with NEON instructions) */
401
+ ldp TMP1, TMP2, [OUTPUT_BUF], 16
402
+ sqrshrn v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
403
+ ldp TMP3, TMP4, [OUTPUT_BUF], 16
404
+ sqrshrn v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
405
+ add TMP1, TMP1, OUTPUT_COL
406
+ sqrshrn v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
407
+ add TMP2, TMP2, OUTPUT_COL
408
+ sqrshrn v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
409
+ add TMP3, TMP3, OUTPUT_COL
410
+ sqrshrn2 v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
411
+ add TMP4, TMP4, OUTPUT_COL
412
+ sqrshrn2 v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
413
+ ldp TMP5, TMP6, [OUTPUT_BUF], 16
414
+ sqrshrn2 v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
415
+ ldp TMP7, TMP8, [OUTPUT_BUF], 16
416
+ sqrshrn2 v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
417
+ add TMP5, TMP5, OUTPUT_COL
418
+ add v16.16b, v28.16b, v0.16b
419
+ add TMP6, TMP6, OUTPUT_COL
420
+ add v18.16b, v29.16b, v0.16b
421
+ add TMP7, TMP7, OUTPUT_COL
422
+ add v20.16b, v30.16b, v0.16b
423
+ add TMP8, TMP8, OUTPUT_COL
424
+ add v22.16b, v31.16b, v0.16b
425
+
426
+ /* Transpose the final 8-bit samples */
427
+ trn1 v28.16b, v16.16b, v18.16b
428
+ trn1 v30.16b, v20.16b, v22.16b
429
+ trn2 v29.16b, v16.16b, v18.16b
430
+ trn2 v31.16b, v20.16b, v22.16b
431
+
432
+ trn1 v16.8h, v28.8h, v30.8h
433
+ trn2 v18.8h, v28.8h, v30.8h
434
+ trn1 v20.8h, v29.8h, v31.8h
435
+ trn2 v22.8h, v29.8h, v31.8h
436
+
437
+ uzp1 v28.4s, v16.4s, v18.4s
438
+ uzp2 v30.4s, v16.4s, v18.4s
439
+ uzp1 v29.4s, v20.4s, v22.4s
440
+ uzp2 v31.4s, v20.4s, v22.4s
441
+
442
+ /* Store results to the output buffer */
443
+ st1 {v28.d}[0], [TMP1]
444
+ st1 {v29.d}[0], [TMP2]
445
+ st1 {v28.d}[1], [TMP3]
446
+ st1 {v29.d}[1], [TMP4]
447
+ st1 {v30.d}[0], [TMP5]
448
+ st1 {v31.d}[0], [TMP6]
449
+ st1 {v30.d}[1], [TMP7]
450
+ st1 {v31.d}[1], [TMP8]
451
+ ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
452
+ ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
453
+ blr x30
454
+
455
+ .balign 16
456
+ 2:
457
+ mul v3.8h, v3.8h, v19.8h
458
+ mul v4.8h, v4.8h, v20.8h
459
+ mul v5.8h, v5.8h, v21.8h
460
+ add TMP4, xzr, TMP2, LSL #32
461
+ mul v6.8h, v6.8h, v22.8h
462
+ mul v7.8h, v7.8h, v23.8h
463
+ adds TMP3, xzr, TMP2, LSR #32
464
+ mul v8.8h, v8.8h, v24.8h
465
+ mul v9.8h, v9.8h, v25.8h
466
+ b.ne 3f
467
+ /* Right AC coef is zero */
468
+ dup v15.2d, v10.d[1]
469
+ /* Even part: reverse the even part of the forward DCT. */
470
+ add v18.4h, v4.4h, v8.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
471
+ add v22.4h, v2.4h, v6.4h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
472
+ sub v26.4h, v2.4h, v6.4h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
473
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
474
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
475
+ mov v20.16b, v18.16b /* tmp3 = z1 */
476
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
477
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
478
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
479
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
480
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
481
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
482
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
483
+
484
+ /* Odd part per figure 8; the matrix is unitary and hence its
485
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
486
+ */
487
+
488
+ add v22.4h, v9.4h, v5.4h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
489
+ add v24.4h, v7.4h, v3.4h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
490
+ add v18.4h, v9.4h, v3.4h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
491
+ add v20.4h, v7.4h, v5.4h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
492
+ add v26.4h, v22.4h, v24.4h /* z5 = z3 + z4 */
493
+
494
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
495
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
496
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
497
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
498
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
499
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
500
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
501
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
502
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
503
+
504
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
505
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
506
+
507
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
508
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
509
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
510
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
511
+
512
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
513
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
514
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
515
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
516
+
517
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
518
+
519
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
520
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
521
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
522
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
523
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
524
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
525
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
526
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
527
+
528
+ rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
529
+ rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
530
+ rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
531
+ rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
532
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
533
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
534
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
535
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
536
+ mov v6.16b, v15.16b
537
+ mov v7.16b, v15.16b
538
+ mov v8.16b, v15.16b
539
+ mov v9.16b, v15.16b
540
+ b 1b
541
+
542
+ .balign 16
543
+ 3:
544
+ cbnz TMP4, 4f
545
+ /* Left AC coef is zero */
546
+ dup v14.2d, v10.d[0]
547
+ /* Even part: reverse the even part of the forward DCT. */
548
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
549
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
550
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
551
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
552
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
553
+ mov v21.16b, v19.16b /* tmp3 = z1 */
554
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
555
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
556
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
557
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
558
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
559
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
560
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
561
+
562
+ /* Odd part per figure 8; the matrix is unitary and hence its
563
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
564
+ */
565
+
566
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
567
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
568
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
569
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
570
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
571
+
572
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
573
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
574
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
575
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
576
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
577
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
578
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
579
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
580
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
581
+
582
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
583
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
584
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
585
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
586
+
587
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
588
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
589
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
590
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
591
+
592
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
593
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
594
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
595
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
596
+
597
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
598
+
599
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
600
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
601
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
602
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
603
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
604
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
605
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
606
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
607
+
608
+ mov v2.16b, v14.16b
609
+ mov v3.16b, v14.16b
610
+ mov v4.16b, v14.16b
611
+ mov v5.16b, v14.16b
612
+ rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
613
+ rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
614
+ rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
615
+ rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
616
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
617
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
618
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
619
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
620
+ b 1b
621
+
622
+ .balign 16
623
+ 4:
624
+ /* "No" AC coef is zero */
625
+ /* Even part: reverse the even part of the forward DCT. */
626
+ add v18.8h, v4.8h, v8.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
627
+ add v22.8h, v2.8h, v6.8h /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
628
+ smull2 v19.4s, v18.8h, XFIX_P_0_541 /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
629
+ sub v26.8h, v2.8h, v6.8h /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
630
+ smull v18.4s, v18.4h, XFIX_P_0_541 /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
631
+ sshll2 v23.4s, v22.8h, #(CONST_BITS) /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
632
+ mov v21.16b, v19.16b /* tmp3 = z1 */
633
+ mov v20.16b, v18.16b /* tmp3 = z1 */
634
+ smlal2 v19.4s, v8.8h, XFIX_N_1_847 /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
635
+ smlal v18.4s, v8.4h, XFIX_N_1_847 /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
636
+ sshll2 v27.4s, v26.8h, #(CONST_BITS) /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
637
+ smlal2 v21.4s, v4.8h, XFIX_P_0_765 /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
638
+ smlal v20.4s, v4.4h, XFIX_P_0_765 /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
639
+ sshll v22.4s, v22.4h, #(CONST_BITS) /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
640
+ sshll v26.4s, v26.4h, #(CONST_BITS) /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
641
+ add v2.4s, v22.4s, v20.4s /* tmp10l tmp10 = tmp0 + tmp3; */
642
+ sub v6.4s, v22.4s, v20.4s /* tmp13l tmp13 = tmp0 - tmp3; */
643
+ add v8.4s, v26.4s, v18.4s /* tmp11l tmp11 = tmp1 + tmp2; */
644
+ sub v4.4s, v26.4s, v18.4s /* tmp12l tmp12 = tmp1 - tmp2; */
645
+ add v28.4s, v23.4s, v21.4s /* tmp10h tmp10 = tmp0 + tmp3; */
646
+ sub v31.4s, v23.4s, v21.4s /* tmp13h tmp13 = tmp0 - tmp3; */
647
+ add v29.4s, v27.4s, v19.4s /* tmp11h tmp11 = tmp1 + tmp2; */
648
+ sub v30.4s, v27.4s, v19.4s /* tmp12h tmp12 = tmp1 - tmp2; */
649
+
650
+ /* Odd part per figure 8; the matrix is unitary and hence its
651
+ * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
652
+ */
653
+
654
+ add v22.8h, v9.8h, v5.8h /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
655
+ add v24.8h, v7.8h, v3.8h /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
656
+ add v18.8h, v9.8h, v3.8h /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
657
+ add v20.8h, v7.8h, v5.8h /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
658
+ add v26.8h, v22.8h, v24.8h /* z5 = z3 + z4 */
659
+
660
+ smull2 v11.4s, v9.8h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
661
+ smull2 v13.4s, v7.8h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
662
+ smull2 v15.4s, v5.8h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
663
+ smull2 v17.4s, v3.8h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
664
+ smull2 v27.4s, v26.8h, XFIX_P_1_175 /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
665
+ smull2 v23.4s, v22.8h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
666
+ smull2 v25.4s, v24.8h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
667
+ smull2 v19.4s, v18.8h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
668
+ smull2 v21.4s, v20.8h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
669
+
670
+ smull v10.4s, v9.4h, XFIX_P_0_298 /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
671
+ smull v12.4s, v7.4h, XFIX_P_2_053 /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
672
+ smull v14.4s, v5.4h, XFIX_P_3_072 /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
673
+ smull v16.4s, v3.4h, XFIX_P_1_501 /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
674
+ smull v26.4s, v26.4h, XFIX_P_1_175 /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
675
+ smull v22.4s, v22.4h, XFIX_N_1_961 /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
676
+ smull v24.4s, v24.4h, XFIX_N_0_390 /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
677
+ smull v18.4s, v18.4h, XFIX_N_0_899 /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
678
+ smull v20.4s, v20.4h, XFIX_N_2_562 /* z2 = MULTIPLY(z2, - FIX_2_562915447) */
679
+
680
+ add v23.4s, v23.4s, v27.4s /* z3 += z5 */
681
+ add v22.4s, v22.4s, v26.4s /* z3 += z5 */
682
+ add v25.4s, v25.4s, v27.4s /* z4 += z5 */
683
+ add v24.4s, v24.4s, v26.4s /* z4 += z5 */
684
+
685
+ add v11.4s, v11.4s, v19.4s /* tmp0 += z1 */
686
+ add v10.4s, v10.4s, v18.4s /* tmp0 += z1 */
687
+ add v13.4s, v13.4s, v21.4s /* tmp1 += z2 */
688
+ add v12.4s, v12.4s, v20.4s /* tmp1 += z2 */
689
+ add v15.4s, v15.4s, v21.4s /* tmp2 += z2 */
690
+ add v14.4s, v14.4s, v20.4s /* tmp2 += z2 */
691
+ add v17.4s, v17.4s, v19.4s /* tmp3 += z1 */
692
+ add v16.4s, v16.4s, v18.4s /* tmp3 += z1 */
693
+
694
+ add v11.4s, v11.4s, v23.4s /* tmp0 += z3 */
695
+ add v10.4s, v10.4s, v22.4s /* tmp0 += z3 */
696
+ add v13.4s, v13.4s, v25.4s /* tmp1 += z4 */
697
+ add v12.4s, v12.4s, v24.4s /* tmp1 += z4 */
698
+ add v17.4s, v17.4s, v25.4s /* tmp3 += z4 */
699
+ add v16.4s, v16.4s, v24.4s /* tmp3 += z4 */
700
+ add v15.4s, v15.4s, v23.4s /* tmp2 += z3 */
701
+ add v14.4s, v14.4s, v22.4s /* tmp2 += z3 */
702
+
703
+ /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
704
+
705
+ add v18.4s, v2.4s, v16.4s /* tmp10 + tmp3 */
706
+ add v19.4s, v28.4s, v17.4s /* tmp10 + tmp3 */
707
+ sub v20.4s, v2.4s, v16.4s /* tmp10 - tmp3 */
708
+ sub v21.4s, v28.4s, v17.4s /* tmp10 - tmp3 */
709
+ add v22.4s, v8.4s, v14.4s /* tmp11 + tmp2 */
710
+ add v23.4s, v29.4s, v15.4s /* tmp11 + tmp2 */
711
+ sub v24.4s, v8.4s, v14.4s /* tmp11 - tmp2 */
712
+ sub v25.4s, v29.4s, v15.4s /* tmp11 - tmp2 */
713
+ add v26.4s, v4.4s, v12.4s /* tmp12 + tmp1 */
714
+ add v27.4s, v30.4s, v13.4s /* tmp12 + tmp1 */
715
+ sub v28.4s, v4.4s, v12.4s /* tmp12 - tmp1 */
716
+ sub v29.4s, v30.4s, v13.4s /* tmp12 - tmp1 */
717
+ add v14.4s, v6.4s, v10.4s /* tmp13 + tmp0 */
718
+ add v15.4s, v31.4s, v11.4s /* tmp13 + tmp0 */
719
+ sub v16.4s, v6.4s, v10.4s /* tmp13 - tmp0 */
720
+ sub v17.4s, v31.4s, v11.4s /* tmp13 - tmp0 */
721
+
722
+ rshrn v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
723
+ rshrn v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
724
+ rshrn v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
725
+ rshrn v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
726
+ rshrn v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
727
+ rshrn v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
728
+ rshrn v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
729
+ rshrn v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
730
+ rshrn2 v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
731
+ rshrn2 v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
732
+ rshrn2 v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
733
+ rshrn2 v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
734
+ rshrn2 v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
735
+ rshrn2 v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
736
+ rshrn2 v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
737
+ rshrn2 v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS) /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
738
+ b 1b
739
+
740
+ .unreq DCT_TABLE
741
+ .unreq COEF_BLOCK
742
+ .unreq OUTPUT_BUF
743
+ .unreq OUTPUT_COL
744
+ .unreq TMP1
745
+ .unreq TMP2
746
+ .unreq TMP3
747
+ .unreq TMP4
748
+ .unreq TMP5
749
+ .unreq TMP6
750
+ .unreq TMP7
751
+ .unreq TMP8
752
+
753
+ #undef CENTERJSAMPLE
754
+ #undef CONST_BITS
755
+ #undef PASS1_BITS
756
+ #undef XFIX_P_0_298
757
+ #undef XFIX_N_0_390
758
+ #undef XFIX_P_0_541
759
+ #undef XFIX_P_0_765
760
+ #undef XFIX_N_0_899
761
+ #undef XFIX_P_1_175
762
+ #undef XFIX_P_1_501
763
+ #undef XFIX_N_1_847
764
+ #undef XFIX_N_1_961
765
+ #undef XFIX_P_2_053
766
+ #undef XFIX_N_2_562
767
+ #undef XFIX_P_3_072
768
+
769
+
770
+ /*****************************************************************************/
771
+
772
+ /*
773
+ * jsimd_idct_ifast_neon
774
+ *
775
+ * This function contains a fast, not so accurate integer implementation of
776
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
777
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
778
+ * function from jidctfst.c
779
+ *
780
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
781
+ * But in ARM NEON case some extra additions are required because VQDMULH
782
+ * instruction can't handle the constants larger than 1. So the expressions
783
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
784
+ * which introduces an extra addition. Overall, there are 6 extra additions
785
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
786
+ */
787
+
788
#define XFIX_1_082392200 v0.h[0]
#define XFIX_1_414213562 v0.h[1]
#define XFIX_1_847759065 v0.h[2]
#define XFIX_2_613125930 v0.h[3]

/* The constants below are stored with their integer part (1, or 2 for
 * 2.613125930) subtracted out, i.e. roughly (x - 1) in Q15 derived from an
 * 8-fractional-bit value.  SQDMULH therefore produces only the fractional
 * part of the product; the integer multiple of the operand is added back
 * with plain ADDs, as described in the header comment above.
 */
    .balign 16
Ljsimd_idct_ifast_neon_consts:
    .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */

/* In:  x0 = dct_table, x1 = coef_block, x2 = output_buf,
 *      x3 = output_col (JDIMENSION, see uxtw below)
 * Clobbers: x9-x14, v0-v6, v16-v23, v28-v31, flags.
 */
asm_function jsimd_idct_ifast_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Load and dequantize coefficients into NEON registers
     * with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | d16    | d17    ( v16.8h )
     *   1  | d18    | d19    ( v17.8h )
     *   2  | d20    | d21    ( v18.8h )
     *   3  | d22    | d23    ( v19.8h )
     *   4  | d24    | d25    ( v20.8h )
     *   5  | d26    | d27    ( v21.8h )
     *   6  | d28    | d29    ( v22.8h )
     *   7  | d30    | d31    ( v23.8h )
     */
    /* No NEON registers need saving: only caller-saved registers
       (v0-v6, v16-v23, v28-v31) are used; v8-v15 are untouched. */
    adr     TMP5, Ljsimd_idct_ifast_neon_consts
    ld1     {v16.8h, v17.8h}, [COEF_BLOCK], 32
    ld1     {v0.8h, v1.8h}, [DCT_TABLE], 32
    ld1     {v18.8h, v19.8h}, [COEF_BLOCK], 32
    mul     v16.8h, v16.8h, v0.8h
    ld1     {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul     v17.8h, v17.8h, v1.8h
    ld1     {v20.8h, v21.8h}, [COEF_BLOCK], 32
    mul     v18.8h, v18.8h, v2.8h
    ld1     {v0.8h, v1.8h}, [DCT_TABLE], 32
    mul     v19.8h, v19.8h, v3.8h
    ld1     {v22.8h, v23.8h}, [COEF_BLOCK], 32
    mul     v20.8h, v20.8h, v0.8h
    ld1     {v2.8h, v3.8h}, [DCT_TABLE], 32
    mul     v22.8h, v22.8h, v2.8h
    mul     v21.8h, v21.8h, v1.8h
    ld1     {v0.4h}, [TMP5]           /* load the four XFIX constants */
    mul     v23.8h, v23.8h, v3.8h

    /* 1-D IDCT, pass 1 (columns) */
    sub     v2.8h, v18.8h, v22.8h
    add     v22.8h, v18.8h, v22.8h
    sub     v1.8h, v19.8h, v21.8h
    add     v21.8h, v19.8h, v21.8h
    sub     v5.8h, v17.8h, v23.8h
    add     v23.8h, v17.8h, v23.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add     v3.8h, v1.8h, v1.8h
    sub     v1.8h, v5.8h, v1.8h
    add     v18.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub     v2.8h, v23.8h, v21.8h
    add     v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add     v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub     v18.8h, v18.8h, v22.8h
    add     v2.8h, v2.8h, v6.8h
    sub     v6.8h, v16.8h, v20.8h
    add     v20.8h, v16.8h, v20.8h
    add     v17.8h, v5.8h, v4.8h
    add     v5.8h, v6.8h, v18.8h
    sub     v18.8h, v6.8h, v18.8h
    add     v6.8h, v23.8h, v21.8h
    add     v16.8h, v20.8h, v22.8h
    sub     v3.8h, v6.8h, v3.8h
    sub     v20.8h, v20.8h, v22.8h
    sub     v3.8h, v3.8h, v1.8h
    sub     v1.8h, v17.8h, v1.8h
    add     v2.8h, v3.8h, v2.8h
    sub     v23.8h, v16.8h, v6.8h
    add     v1.8h, v1.8h, v2.8h
    add     v16.8h, v16.8h, v6.8h
    add     v22.8h, v5.8h, v3.8h
    sub     v17.8h, v5.8h, v3.8h
    sub     v21.8h, v18.8h, v2.8h
    add     v18.8h, v18.8h, v2.8h
    sub     v19.8h, v20.8h, v1.8h
    add     v20.8h, v20.8h, v1.8h
    transpose_8x8 v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 (rows) — same dataflow as pass 1 */
    sub     v2.8h, v18.8h, v22.8h
    add     v22.8h, v18.8h, v22.8h
    sub     v1.8h, v19.8h, v21.8h
    add     v21.8h, v19.8h, v21.8h
    sub     v5.8h, v17.8h, v23.8h
    add     v23.8h, v17.8h, v23.8h
    sqdmulh v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh v6.8h, v1.8h, XFIX_2_613125930
    add     v3.8h, v1.8h, v1.8h
    sub     v1.8h, v5.8h, v1.8h
    add     v18.8h, v2.8h, v4.8h
    sqdmulh v4.8h, v1.8h, XFIX_1_847759065
    sub     v2.8h, v23.8h, v21.8h
    add     v3.8h, v3.8h, v6.8h
    sqdmulh v6.8h, v2.8h, XFIX_1_414213562
    add     v1.8h, v1.8h, v4.8h
    sqdmulh v4.8h, v5.8h, XFIX_1_082392200
    sub     v18.8h, v18.8h, v22.8h
    add     v2.8h, v2.8h, v6.8h
    sub     v6.8h, v16.8h, v20.8h
    add     v20.8h, v16.8h, v20.8h
    add     v17.8h, v5.8h, v4.8h
    add     v5.8h, v6.8h, v18.8h
    sub     v18.8h, v6.8h, v18.8h
    add     v6.8h, v23.8h, v21.8h
    add     v16.8h, v20.8h, v22.8h
    sub     v3.8h, v6.8h, v3.8h
    sub     v20.8h, v20.8h, v22.8h
    sub     v3.8h, v3.8h, v1.8h
    sub     v1.8h, v17.8h, v1.8h
    add     v2.8h, v3.8h, v2.8h
    sub     v23.8h, v16.8h, v6.8h
    add     v1.8h, v1.8h, v2.8h
    add     v16.8h, v16.8h, v6.8h
    add     v22.8h, v5.8h, v3.8h
    sub     v17.8h, v5.8h, v3.8h
    sub     v21.8h, v18.8h, v2.8h
    add     v18.8h, v18.8h, v2.8h
    sub     v19.8h, v20.8h, v1.8h
    add     v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit */
    movi    v0.16b, #0x80
    /* Prepare pointers (dual-issue with NEON instructions) */
    ldp     TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn  v28.8b, v16.8h, #5
    ldp     TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn  v29.8b, v17.8h, #5
    add     TMP1, TMP1, OUTPUT_COL
    sqshrn  v30.8b, v18.8h, #5
    add     TMP2, TMP2, OUTPUT_COL
    sqshrn  v31.8b, v19.8h, #5
    add     TMP3, TMP3, OUTPUT_COL
    sqshrn2 v28.16b, v20.8h, #5
    add     TMP4, TMP4, OUTPUT_COL
    sqshrn2 v29.16b, v21.8h, #5
    ldp     TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2 v30.16b, v22.8h, #5
    ldp     TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2 v31.16b, v23.8h, #5
    add     TMP5, TMP5, OUTPUT_COL
    add     v16.16b, v28.16b, v0.16b  /* recenter to unsigned samples (+0x80) */
    add     TMP6, TMP6, OUTPUT_COL
    add     v18.16b, v29.16b, v0.16b
    add     TMP7, TMP7, OUTPUT_COL
    add     v20.16b, v30.16b, v0.16b
    add     TMP8, TMP8, OUTPUT_COL
    add     v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1    v28.16b, v16.16b, v18.16b
    trn1    v30.16b, v20.16b, v22.16b
    trn2    v29.16b, v16.16b, v18.16b
    trn2    v31.16b, v20.16b, v22.16b

    trn1    v16.8h, v28.8h, v30.8h
    trn2    v18.8h, v28.8h, v30.8h
    trn1    v20.8h, v29.8h, v31.8h
    trn2    v22.8h, v29.8h, v31.8h

    uzp1    v28.4s, v16.4s, v18.4s
    uzp2    v30.4s, v16.4s, v18.4s
    uzp1    v29.4s, v20.4s, v22.4s
    uzp2    v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1     {v28.d}[0], [TMP1]
    st1     {v29.d}[0], [TMP2]
    st1     {v28.d}[1], [TMP3]
    st1     {v29.d}[1], [TMP4]
    st1     {v30.d}[0], [TMP5]
    st1     {v31.d}[0], [TMP6]
    st1     {v30.d}[1], [TMP7]
    st1     {v31.d}[1], [TMP8]
    /* Was "blr x30": BLR also writes the link register and desyncs the
       return-address predictor; RET is the architectural return. */
    ret

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8
1003
+
1004
+
1005
+ /*****************************************************************************/
1006
+
1007
+ /*
1008
+ * jsimd_idct_4x4_neon
1009
+ *
1010
+ * This function contains inverse-DCT code for getting reduced-size
1011
+ * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
1012
+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
1013
+ * function from jpeg-6b (jidctred.c).
1014
+ *
1015
+ * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
1016
+ * requires much less arithmetic operations and hence should be faster.
1017
+ * The primary purpose of this particular NEON optimized function is
1018
+ * bit exact compatibility with jpeg-6b.
1019
+ *
1020
+ * TODO: Slightly better instruction scheduling could be achieved by
1022
+ * expanding the idct_helper/transpose_4x4 macros and reordering the
1023
+ * instructions, but readability would suffer somewhat.
1023
+ */
1024
+
1025
#define CONST_BITS 13

/* 13-bit fixed-point multipliers, FIX(x) = round(x * 2^13). */
#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */

    .balign 16
Ljsimd_idct_4x4_neon_consts:
    .short FIX_1_847759065       /* v0.h[0] */
    .short -FIX_0_765366865      /* v0.h[1] */
    .short -FIX_0_211164243      /* v0.h[2] */
    .short FIX_1_451774981       /* v0.h[3] */
    .short -FIX_2_172734803      /* v1.h[0] */
    .short FIX_1_061594337       /* v1.h[1] */
    .short -FIX_0_509795579     /* v1.h[2] */
    .short -FIX_0_601344887      /* v1.h[3] */
    .short FIX_0_899976223       /* v2.h[0] */
    .short FIX_2_562915447       /* v2.h[1] */
    .short 1 << (CONST_BITS+1)   /* v2.h[2] — rounding bias pre-added via smull */
    .short 0                     /* v2.h[3] — padding */

/* One 4-point IDCT over four lanes.  Inputs \x4..\x16 are the seven nonzero
 * coefficient rows/columns; results are written to \y26..\y29, rounded and
 * right-shifted by \shift.  rshrn's immediate is limited to 16, hence the
 * srshr+xtn fallback when \shift > 16.  Clobbers v20, v24, v26, v28, v30.
 */
.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull     v28.4s, \x4, v2.h[2]
    smlal     v28.4s, \x8, v0.h[0]
    smlal     v28.4s, \x14, v0.h[1]

    smull     v26.4s, \x16, v1.h[2]
    smlal     v26.4s, \x12, v1.h[3]
    smlal     v26.4s, \x10, v2.h[0]
    smlal     v26.4s, \x6, v2.h[1]

    smull     v30.4s, \x4, v2.h[2]
    smlsl     v30.4s, \x8, v0.h[0]
    smlsl     v30.4s, \x14, v0.h[1]

    smull     v24.4s, \x16, v0.h[2]
    smlal     v24.4s, \x12, v0.h[3]
    smlal     v24.4s, \x10, v1.h[0]
    smlal     v24.4s, \x6, v1.h[1]

    add       v20.4s, v28.4s, v26.4s
    sub       v28.4s, v28.4s, v26.4s

  .if \shift > 16
    srshr     v20.4s, v20.4s, #\shift
    srshr     v28.4s, v28.4s, #\shift
    xtn       \y26, v20.4s
    xtn       \y29, v28.4s
  .else
    rshrn     \y26, v20.4s, #\shift
    rshrn     \y29, v28.4s, #\shift
  .endif

    add       v20.4s, v30.4s, v24.4s
    sub       v30.4s, v30.4s, v24.4s

  .if \shift > 16
    srshr     v20.4s, v20.4s, #\shift
    srshr     v30.4s, v30.4s, #\shift
    xtn       \y27, v20.4s
    xtn       \y28, v30.4s
  .else
    rshrn     \y27, v20.4s, #\shift
    rshrn     \y28, v30.4s, #\shift
  .endif
.endm

/* In:  x0 = dct_table, x1 = coef_block, x2 = output_buf,
 *      x3 = output_col (JDIMENSION, see uxtw below)
 * Clobbers: x9, x15, v0-v31 (v8-v15 low halves saved/restored), flags.
 */
asm_function jsimd_idct_4x4_neon

    DCT_TABLE   .req x0
    COEF_BLOCK  .req x1
    OUTPUT_BUF  .req x2
    OUTPUT_COL  .req x3
    TMP1        .req x0
    TMP2        .req x1
    TMP3        .req x2
    TMP4        .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Save the callee-saved NEON registers used below.  Only the low 64 bits
       of v8-v15 are callee-saved under AAPCS64, hence the .8b stores. */
    sub     sp, sp, 64
    mov     x9, sp
    /* Load constants (v3.4h is just used for padding) */
    adr     TMP4, Ljsimd_idct_4x4_neon_consts
    st1     {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1     {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1     {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  | v4.4h  | v5.4h
     *   1  | v6.4h  | v7.4h
     *   2  | v8.4h  | v9.4h
     *   3  | v10.4h | v11.4h
     *   4  | -      | -        (row 4 is not used by the 4x4 IDCT)
     *   5  | v12.4h | v13.4h
     *   6  | v14.4h | v15.4h
     *   7  | v16.4h | v17.4h
     */
    ld1     {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1     {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add     COEF_BLOCK, COEF_BLOCK, #16       /* skip row 4 */
    ld1     {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1     {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize: multiply each row by the matching quant-table row,
       then fuse the two 4h halves into one 128-bit register. */
    ld1     {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul     v4.4h, v4.4h, v18.4h
    mul     v5.4h, v5.4h, v19.4h
    ins     v4.d[1], v5.d[0]                  /* 128 bit q4 */
    ld1     {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul     v6.4h, v6.4h, v20.4h
    mul     v7.4h, v7.4h, v21.4h
    ins     v6.d[1], v7.d[0]                  /* 128 bit q6 */
    mul     v8.4h, v8.4h, v22.4h
    mul     v9.4h, v9.4h, v23.4h
    ins     v8.d[1], v9.d[0]                  /* 128 bit q8 */
    add     DCT_TABLE, DCT_TABLE, #16         /* skip quant row 4 */
    ld1     {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul     v10.4h, v10.4h, v24.4h
    mul     v11.4h, v11.4h, v25.4h
    ins     v10.d[1], v11.d[0]                /* 128 bit q10 */
    mul     v12.4h, v12.4h, v26.4h
    mul     v13.4h, v13.4h, v27.4h
    ins     v12.d[1], v13.d[0]                /* 128 bit q12 */
    ld1     {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul     v14.4h, v14.4h, v28.4h
    mul     v15.4h, v15.4h, v29.4h
    ins     v14.d[1], v15.d[0]                /* 128 bit q14 */
    mul     v16.4h, v16.4h, v30.4h
    mul     v17.4h, v17.4h, v31.4h
    ins     v16.d[1], v17.d[0]                /* 128 bit q16 */

    /* Pass 1: columns (left 4 then right 4), descale by 12 */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4 v4, v6, v8, v10, v3
    ins     v10.d[1], v11.d[0]
    idct_helper v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4 v5, v7, v9, v11, v3
    ins     v10.d[1], v11.d[0]

    /* Pass 2: rows, descale by 19 (CONST_BITS + PASS1_BITS + 3) */
    idct_helper v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4 v26, v27, v28, v29, v3

    /* Range limit: recenter (+0x80) and saturate to unsigned 8-bit */
    movi    v30.8h, #0x80
    ins     v26.d[1], v27.d[0]
    ins     v28.d[1], v29.d[0]
    add     v26.8h, v26.8h, v30.8h
    add     v28.8h, v28.8h, v30.8h
    sqxtun  v26.8b, v26.8h
    sqxtun  v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp     TMP1, TMP2, [OUTPUT_BUF], 16
    ldp     TMP3, TMP4, [OUTPUT_BUF]
    add     TMP1, TMP1, OUTPUT_COL
    add     TMP2, TMP2, OUTPUT_COL
    add     TMP3, TMP3, OUTPUT_COL
    add     TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use much less instructions on little endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1     {v26.s}[0], [TMP1], 4
    st1     {v27.s}[0], [TMP3], 4
    st1     {v26.s}[1], [TMP2], 4
    st1     {v27.s}[1], [TMP4], 4
#else
    st1     {v26.b}[0], [TMP1], 1
    st1     {v27.b}[0], [TMP3], 1
    st1     {v26.b}[1], [TMP1], 1
    st1     {v27.b}[1], [TMP3], 1
    st1     {v26.b}[2], [TMP1], 1
    st1     {v27.b}[2], [TMP3], 1
    st1     {v26.b}[3], [TMP1], 1
    st1     {v27.b}[3], [TMP3], 1

    st1     {v26.b}[4], [TMP2], 1
    st1     {v27.b}[4], [TMP4], 1
    st1     {v26.b}[5], [TMP2], 1
    st1     {v27.b}[5], [TMP4], 1
    st1     {v26.b}[6], [TMP2], 1
    st1     {v27.b}[6], [TMP4], 1
    st1     {v26.b}[7], [TMP2], 1
    st1     {v27.b}[7], [TMP4], 1
#endif

    /* Restore v8-v15 low halves (vpop {v8.4h - v15.4h} not available) */
    ld1     {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1     {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* Was "blr x30": BLR also writes the link register and desyncs the
       return-address predictor; RET is the architectural return. */
    ret

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4

.purgem idct_helper
1248
+
1249
+
1250
/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixels output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
 * requires far fewer arithmetic operations and hence should be faster.
 * The primary purpose of this particular NEON optimized function is
 * bit exact compatibility with jpeg-6b.
 */

.balign 8
Ljsimd_idct_2x2_neon_consts:
    .short -FIX_0_720959822    /* v14[0] */
    .short FIX_0_850430095     /* v14[1] */
    .short -FIX_1_272758580    /* v14[2] */
    .short FIX_3_624509785     /* v14[3] */

/* 1-D 2-point IDCT kernel (4 lanes at a time):
 *   even part: tmp10 = row0 << 15                       (\x4)
 *   odd part : tmp0  = c3*row1 + c2*row3 + c1*row5 + c0*row7
 *                                                       (\x6 \x10 \x12 \x16)
 *   outputs  : \y26 = round((tmp10 + tmp0) >> \shift)
 *              \y27 = round((tmp10 - tmp0) >> \shift)
 * rshrn can only encode shift amounts 1..16, so larger descale values
 * take the srshr (rounding shift) + xtn (narrow) path instead.
 * Clobbers v15, v20, v26.
 */
.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll v15.4s, \x4, #15
    smull v26.4s, \x6, v14.h[3]
    smlal v26.4s, \x10, v14.h[2]
    smlal v26.4s, \x12, v14.h[1]
    smlal v26.4s, \x16, v14.h[0]

    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s

    .if \shift > 16
      srshr v20.4s, v20.4s, #\shift
      srshr v15.4s, v15.4s, #\shift
      xtn \y26, v20.4s
      xtn \y27, v15.4s
    .else
      rshrn \y26, v20.4s, #\shift
      rshrn \y27, v15.4s, #\shift
    .endif
.endm
1293
+
1294
asm_function jsimd_idct_2x2_neon

    /* C-equivalent signature (AAPCS64):
     *   void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
     *                            JSAMPARRAY output_buf, JDIMENSION output_col)
     * Note: TMP1 aliases DCT_TABLE (x0); DCT_TABLE is dead before TMP1 is
     * first written.
     */
    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* vpush {v8.4h - v15.4h} ; not available */
    /* Spill the callee-saved low halves of v8-v15 (AAPCS64) to the stack */
    sub sp, sp, 64
    mov x9, sp

    /* Load constants */
    adr TMP2, Ljsimd_idct_2x2_neon_consts
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   | v4.4h  | v5.4h
     *   1   | v6.4h  | v7.4h
     *   2   | -      | -
     *   3   | v10.4h | v11.4h
     *   4   | -      | -
     *   5   | v12.4h | v13.4h
     *   6   | -      | -
     *   7   | v16.4h | v17.4h
     * (rows 2, 4 and 6 do not contribute to the 2x2 result and are skipped;
     *  the 'add COEF_BLOCK, #16' instructions step over them)
     */
    ld1 {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add COEF_BLOCK, COEF_BLOCK, #16
    ld1 {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize: multiply each coefficient row by the matching row of the
       quantization table (same layout, same rows skipped) */
    ld1 {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul v4.4h, v4.4h, v18.4h
    mul v5.4h, v5.4h, v19.4h
    ins v4.d[1], v5.d[0]
    mul v6.4h, v6.4h, v20.4h
    mul v7.4h, v7.4h, v21.4h
    ins v6.d[1], v7.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul v10.4h, v10.4h, v24.4h
    mul v11.4h, v11.4h, v25.4h
    ins v10.d[1], v11.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul v12.4h, v12.4h, v26.4h
    mul v13.4h, v13.4h, v27.4h
    ins v12.d[1], v13.d[0]
    add DCT_TABLE, DCT_TABLE, #16
    ld1 {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul v16.4h, v16.4h, v30.4h
    mul v17.4h, v17.4h, v31.4h
    ins v16.d[1], v17.d[0]

    /* Pass 1 (columns).  The '#if 0' branch is the straightforward
       macro-based version kept for reference; the '#else' branch performs
       the same arithmetic with the two column halves manually interleaved
       for better scheduling. */
#if 0
    idct_helper v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4 v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4 v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull v26.4s, v6.4h, v14.h[3]
    smlal v26.4s, v10.4h, v14.h[2]
    smlal v26.4s, v12.4h, v14.h[1]
    smlal v26.4s, v16.4h, v14.h[0]
    smull v24.4s, v7.4h, v14.h[3]
    smlal v24.4s, v11.4h, v14.h[2]
    smlal v24.4s, v13.4h, v14.h[1]
    smlal v24.4s, v17.4h, v14.h[0]
    sshll v15.4s, v4.4h, #15
    sshll v30.4s, v5.4h, #15
    add v20.4s, v15.4s, v26.4s
    sub v15.4s, v15.4s, v26.4s
    rshrn v4.4h, v20.4s, #13
    rshrn v6.4h, v15.4s, #13
    add v20.4s, v30.4s, v24.4s
    sub v15.4s, v30.4s, v24.4s
    rshrn v5.4h, v20.4s, #13
    rshrn v7.4h, v15.4s, #13
    ins v4.d[1], v5.d[0]
    ins v6.d[1], v7.d[0]
    transpose v4, v6, v3, .16b, .8h
    transpose v6, v10, v3, .16b, .4s
    ins v11.d[0], v10.d[1]
    ins v7.d[0], v6.d[1]
#endif

    /* Pass 2 (rows); descale by CONST_BITS+PASS1_BITS+3+2 = 20 */
    idct_helper v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit: re-center by adding 128 (CENTERJSAMPLE) and saturate to
       unsigned 8-bit */
    movi v30.8h, #0x80
    ins v26.d[1], v27.d[0]
    add v26.8h, v26.8h, v30.8h
    sqxtun v30.8b, v26.8h
    ins v26.d[0], v30.d[0]
    sqxtun v27.8b, v26.8h

    /* Store results to the output buffer (2 rows x 2 pixels) */
    ldp TMP1, TMP2, [OUTPUT_BUF]
    add TMP1, TMP1, OUTPUT_COL
    add TMP2, TMP2, OUTPUT_COL

    st1 {v26.b}[0], [TMP1], 1
    st1 {v27.b}[4], [TMP1], 1
    st1 {v26.b}[1], [TMP2], 1
    st1 {v27.b}[5], [TMP2], 1

    /* Restore callee-saved NEON registers and return */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    /* NOTE(review): 'br x30' (or 'ret') is the conventional return; 'blr x30'
       also returns correctly but needlessly overwrites x30 and can desync the
       return-address predictor — confirm against upstream before changing. */
    blr x30

    .unreq DCT_TABLE
    .unreq COEF_BLOCK
    .unreq OUTPUT_BUF
    .unreq OUTPUT_COL
    .unreq TMP1
    .unreq TMP2

    .purgem idct_helper
1428
+
1429
/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */

/* Load \size Y/Cb/Cr samples into v0/v4/v5, advancing the Y/U/V pointers.
 * The partial sizes fill successive lanes (4 -> lanes 0..3, 2 -> lanes 4..5,
 * 1 -> lane 6), so chaining '4 + 2 + 1' loads accumulates up to 7 trailing
 * pixels in lanes [0..6] of the same registers.  Uses the Y/U/V register
 * aliases defined by the enclosing generator macro.
 */
.macro do_load size
    .if \size == 8
      ld1 {v4.8b}, [U], 8
      ld1 {v5.8b}, [V], 8
      ld1 {v0.8b}, [Y], 8
      prfm pldl1keep, [U, #64]    /* prefetch next chunk of each plane */
      prfm pldl1keep, [V, #64]
      prfm pldl1keep, [Y, #64]
    .elseif \size == 4
      ld1 {v4.b}[0], [U], 1
      ld1 {v4.b}[1], [U], 1
      ld1 {v4.b}[2], [U], 1
      ld1 {v4.b}[3], [U], 1
      ld1 {v5.b}[0], [V], 1
      ld1 {v5.b}[1], [V], 1
      ld1 {v5.b}[2], [V], 1
      ld1 {v5.b}[3], [V], 1
      ld1 {v0.b}[0], [Y], 1
      ld1 {v0.b}[1], [Y], 1
      ld1 {v0.b}[2], [Y], 1
      ld1 {v0.b}[3], [Y], 1
    .elseif \size == 2
      ld1 {v4.b}[4], [U], 1
      ld1 {v4.b}[5], [U], 1
      ld1 {v5.b}[4], [V], 1
      ld1 {v5.b}[5], [V], 1
      ld1 {v0.b}[4], [Y], 1
      ld1 {v0.b}[5], [Y], 1
    .elseif \size == 1
      ld1 {v4.b}[6], [U], 1
      ld1 {v5.b}[6], [V], 1
      ld1 {v0.b}[6], [Y], 1
    .else
      .error unsupported macroblock size
    .endif
.endm
1478
+
1479
/* Store \size converted pixels to [RGB], advancing the pointer.
 *   bpp 24: interleaved 3-byte pixels from v10/v11/v12; \fast_st3 == 0
 *           selects a byte-by-byte fallback for cores where st3 is slow.
 *   bpp 32: interleaved 4-byte pixels from v10..v13 via st4.
 *   bpp 16: pre-packed RGB565 halfwords from v25.
 * Lane numbering mirrors do_load: partial sizes 4/2/1 use lanes 0..3,
 * 4..5 and 6 respectively.
 */
.macro do_store bpp, size, fast_st3
    .if \bpp == 24
      .if \size == 8
        .if \fast_st3 == 1
          st3 {v10.8b, v11.8b, v12.8b}, [RGB], 24
        .else
          st1 {v10.b}[0], [RGB], #1
          st1 {v11.b}[0], [RGB], #1
          st1 {v12.b}[0], [RGB], #1

          st1 {v10.b}[1], [RGB], #1
          st1 {v11.b}[1], [RGB], #1
          st1 {v12.b}[1], [RGB], #1

          st1 {v10.b}[2], [RGB], #1
          st1 {v11.b}[2], [RGB], #1
          st1 {v12.b}[2], [RGB], #1

          st1 {v10.b}[3], [RGB], #1
          st1 {v11.b}[3], [RGB], #1
          st1 {v12.b}[3], [RGB], #1

          st1 {v10.b}[4], [RGB], #1
          st1 {v11.b}[4], [RGB], #1
          st1 {v12.b}[4], [RGB], #1

          st1 {v10.b}[5], [RGB], #1
          st1 {v11.b}[5], [RGB], #1
          st1 {v12.b}[5], [RGB], #1

          st1 {v10.b}[6], [RGB], #1
          st1 {v11.b}[6], [RGB], #1
          st1 {v12.b}[6], [RGB], #1

          st1 {v10.b}[7], [RGB], #1
          st1 {v11.b}[7], [RGB], #1
          st1 {v12.b}[7], [RGB], #1
        .endif
      .elseif \size == 4
        st3 {v10.b, v11.b, v12.b}[0], [RGB], 3
        st3 {v10.b, v11.b, v12.b}[1], [RGB], 3
        st3 {v10.b, v11.b, v12.b}[2], [RGB], 3
        st3 {v10.b, v11.b, v12.b}[3], [RGB], 3
      .elseif \size == 2
        st3 {v10.b, v11.b, v12.b}[4], [RGB], 3
        st3 {v10.b, v11.b, v12.b}[5], [RGB], 3
      .elseif \size == 1
        st3 {v10.b, v11.b, v12.b}[6], [RGB], 3
      .else
        .error unsupported macroblock size
      .endif
    .elseif \bpp == 32
      .if \size == 8
        st4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
      .elseif \size == 4
        st4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
        st4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
        st4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
        st4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
      .elseif \size == 2
        st4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
        st4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
      .elseif \size == 1
        st4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
      .else
        .error unsupported macroblock size
      .endif
    .elseif \bpp == 16
      .if \size == 8
        st1 {v25.8h}, [RGB], 16
      .elseif \size == 4
        st1 {v25.4h}, [RGB], 8
      .elseif \size == 2
        st1 {v25.h}[4], [RGB], 2
        st1 {v25.h}[5], [RGB], 2
      .elseif \size == 1
        st1 {v25.h}[6], [RGB], 2
      .else
        .error unsupported macroblock size
      .endif
    .else
      .error unsupported bpp
    .endif
.endm
1563
+
1564
/* Generator for one YCbCr -> RGB conversion function.
 *   colorid          name fragment for labels/symbols (extrgb, extbgr, ...)
 *   bpp              output bytes-per-pixel * 8 (24, 32 or 16 for RGB565)
 *   r/g/b_offs       destination register digit for each channel (v1<offs>)
 *   r/g/bsize        lane arrangement suffixes (currently always .4h)
 *   defsize          default narrowing arrangement (.8b)
 *   fast_st3         1 = use st3 for 24-bpp stores, 0 = byte-store fallback
 *                    (also selects the '_slowst3' symbol suffix)
 * C-equivalent signature of each generated function:
 *   void jsimd_ycc_<colorid>_convert_neon(JDIMENSION out_width,
 *                                         JSAMPIMAGE input_buf,
 *                                         JDIMENSION input_row,
 *                                         JSAMPARRAY output_buf,
 *                                         int num_rows)
 */
.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

/* Stage 1: widen Cb/Cr (biased by -128 via v2) and start the four
 * fixed-point chroma products (Q15/Q14 coefficients live in v1) */
.macro do_yuv_to_rgb_stage1
    uaddw v6.8h, v2.8h, v4.8b          /* q3 = u - 128 */
    uaddw v8.8h, v2.8h, v5.8b          /* q2 = v - 128 */
    smull v20.4s, v6.4h, v1.h[1]       /* multiply by -11277 */
    smlal v20.4s, v8.4h, v1.h[2]       /* multiply by -23401 */
    smull2 v22.4s, v6.8h, v1.h[1]      /* multiply by -11277 */
    smlal2 v22.4s, v8.8h, v1.h[2]      /* multiply by -23401 */
    smull v24.4s, v8.4h, v1.h[0]       /* multiply by 22971 */
    smull2 v26.4s, v8.8h, v1.h[0]      /* multiply by 22971 */
    smull v28.4s, v6.4h, v1.h[3]       /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.h[3]      /* multiply by 29033 */
.endm

/* Stage 2: descale the chroma terms, add Y, then either saturate-narrow
 * into the per-channel byte registers (24/32 bpp) or pack RGB565 into v25 */
.macro do_yuv_to_rgb_stage2
    rshrn v20.4h, v20.4s, #15
    rshrn2 v20.8h, v22.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn2 v24.8h, v26.4s, #14
    rshrn v28.4h, v28.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    uaddw v20.8h, v20.8h, v0.8b        /* G = chroma term + Y */
    uaddw v24.8h, v24.8h, v0.8b        /* R = chroma term + Y */
    uaddw v28.8h, v28.8h, v0.8b        /* B = chroma term + Y */
    .if \bpp != 16
      sqxtun v1\g_offs\defsize, v20.8h
      sqxtun v1\r_offs\defsize, v24.8h
      sqxtun v1\b_offs\defsize, v28.8h
    .else
      sqshlu v21.8h, v20.8h, #8
      sqshlu v25.8h, v24.8h, #8
      sqshlu v29.8h, v28.8h, #8
      sri v25.8h, v21.8h, #5           /* insert G above B */
      sri v25.8h, v29.8h, #11          /* insert B in low bits: RGB565 */
    .endif
.endm

/* Software-pipelined steady-state iteration: stage 2 of pixel group N is
 * interleaved with the store of group N and the loads + stage 1 of group
 * N+1 so loads/stores overlap the multiplies */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn v20.4h, v20.4s, #15
    rshrn v24.4h, v24.4s, #14
    rshrn v28.4h, v28.4s, #14
    ld1 {v4.8b}, [U], 8
    rshrn2 v20.8h, v22.4s, #15
    rshrn2 v24.8h, v26.4s, #14
    rshrn2 v28.8h, v30.4s, #14
    ld1 {v5.8b}, [V], 8
    uaddw v20.8h, v20.8h, v0.8b
    uaddw v24.8h, v24.8h, v0.8b
    uaddw v28.8h, v28.8h, v0.8b
    .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
      sqxtun v1\g_offs\defsize, v20.8h
      ld1 {v0.8b}, [Y], 8
      sqxtun v1\r_offs\defsize, v24.8h
      prfm pldl1keep, [U, #64]
      prfm pldl1keep, [V, #64]
      prfm pldl1keep, [Y, #64]
      sqxtun v1\b_offs\defsize, v28.8h
      uaddw v6.8h, v2.8h, v4.8b        /* v6.16b = u - 128 */
      uaddw v8.8h, v2.8h, v5.8b        /* q2 = v - 128 */
      smull v20.4s, v6.4h, v1.h[1]     /* multiply by -11277 */
      smlal v20.4s, v8.4h, v1.h[2]     /* multiply by -23401 */
      smull2 v22.4s, v6.8h, v1.h[1]    /* multiply by -11277 */
      smlal2 v22.4s, v8.8h, v1.h[2]    /* multiply by -23401 */
      smull v24.4s, v8.4h, v1.h[0]     /* multiply by 22971 */
      smull2 v26.4s, v8.8h, v1.h[0]    /* multiply by 22971 */
    .else  /**************************** rgb565 ********************************/
      sqshlu v21.8h, v20.8h, #8
      sqshlu v25.8h, v24.8h, #8
      sqshlu v29.8h, v28.8h, #8
      uaddw v6.8h, v2.8h, v4.8b        /* v6.16b = u - 128 */
      uaddw v8.8h, v2.8h, v5.8b        /* q2 = v - 128 */
      ld1 {v0.8b}, [Y], 8
      smull v20.4s, v6.4h, v1.h[1]     /* multiply by -11277 */
      smlal v20.4s, v8.4h, v1.h[2]     /* multiply by -23401 */
      smull2 v22.4s, v6.8h, v1.h[1]    /* multiply by -11277 */
      smlal2 v22.4s, v8.8h, v1.h[2]    /* multiply by -23401 */
      sri v25.8h, v21.8h, #5
      smull v24.4s, v8.4h, v1.h[0]     /* multiply by 22971 */
      smull2 v26.4s, v8.8h, v1.h[0]    /* multiply by 22971 */
      prfm pldl1keep, [U, #64]
      prfm pldl1keep, [V, #64]
      prfm pldl1keep, [Y, #64]
      sri v25.8h, v29.8h, #11
    .endif
    do_store \bpp, 8, \fast_st3
    smull v28.4s, v6.4h, v1.h[3]       /* multiply by 29033 */
    smull2 v30.4s, v6.8h, v1.h[3]      /* multiply by 29033 */
.endm

/* Non-pipelined conversion for the trailing (< 8 pixel) groups */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, work around that by using adr.
 * But this requires a copy of these constants for each function.
 */

.balign 16
.if \fast_st3 == 1
Ljsimd_ycc_\colorid\()_neon_consts:
.else
Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
.endif
    .short 0, 0, 0, 0                  /* v0.4h: padding (overwritten by Y) */
    .short 22971, -11277, -23401, 29033  /* v1.4h: chroma coefficients */
    .short -128, -128, -128, -128      /* v2.8h: Cb/Cr bias */
    .short -128, -128, -128, -128

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1            /* reuses INPUT_BUF after unreq */

    RGB             .req x7
    Y               .req x9            /* x9 doubles as spill pointer below */
    U               .req x10
    V               .req x11
    N               .req w15

    sub sp, sp, 64
    mov x9, sp

    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    .if \fast_st3 == 1
      adr x15, Ljsimd_ycc_\colorid\()_neon_consts
    .else
      adr x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
    .endif

    /* Save NEON registers (callee-saved low halves of v8-v15) */
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1 {v0.4h, v1.4h}, [x15], 16
    ld1 {v2.8h}, [x15]

    ldr INPUT_BUF0, [INPUT_BUF]        /* Y plane row-pointer array */
    ldr INPUT_BUF1, [INPUT_BUF, #8]    /* Cb plane */
    ldr INPUT_BUF2, [INPUT_BUF, #16]   /* Cr plane */
    .unreq INPUT_BUF

    /* Initially set v10, v11.4h, v12.8b, d13 to 0xFF (alpha channel for the
       ext*x/extx* 32-bpp formats) */
    movi v10.16b, #255
    movi v13.16b, #255

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    b.lt 9f
0:
    ldr Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov N, OUTPUT_WIDTH
    ldr V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add INPUT_ROW, INPUT_ROW, #1
    ldr RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels: pipelined 8-pixel groups, then a 4/2/1
       lane-accumulating tail */
    subs N, N, #8
    b.lt 3f
    do_load 8
    do_yuv_to_rgb_stage1
    subs N, N, #8
    b.lt 2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs N, N, #8
    b.ge 1b
2:
    do_yuv_to_rgb_stage2
    do_store \bpp, 8, \fast_st3
    tst N, #7
    b.eq 8f
3:
    tst N, #4
    b.eq 3f
    do_load 4
3:
    tst N, #2
    b.eq 4f
    do_load 2
4:
    tst N, #1
    b.eq 5f
    do_load 1
5:
    do_yuv_to_rgb
    tst N, #4
    b.eq 6f
    do_store \bpp, 4, \fast_st3
6:
    tst N, #2
    b.eq 7f
    do_store \bpp, 2, \fast_st3
7:
    tst N, #1
    b.eq 8f
    do_store \bpp, 1, \fast_st3
8:
    subs NUM_ROWS, NUM_ROWS, #1
    b.gt 0b
9:
    /* Restore all registers and return */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br x30
    .unreq OUTPUT_WIDTH
    .unreq INPUT_ROW
    .unreq OUTPUT_BUF
    .unreq NUM_ROWS
    .unreq INPUT_BUF0
    .unreq INPUT_BUF1
    .unreq INPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

    .purgem do_yuv_to_rgb
    .purgem do_yuv_to_rgb_stage1
    .purgem do_yuv_to_rgb_stage2
    .purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm
1805
+
1806
/* Instantiate the YCbCr->RGB converters.  The first group uses fast st3
 * stores; the extra extrgb/extbgr '_slowst3' variants are selected at
 * runtime on cores where st3 is slow. */
/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1

generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0

.purgem do_load
.purgem do_store
1820
+
1821
+
1822
/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */

/* Store \size converted samples from v20/v21/v22 to the Y/U/V planes,
 * advancing the pointers.  Lane numbering matches the RGB->YCC do_load
 * below: partial sizes 4/2/1 use lanes 0..3, 4..5 and 6. */
.macro do_store size
    .if \size == 8
      st1 {v20.8b}, [Y], #8
      st1 {v21.8b}, [U], #8
      st1 {v22.8b}, [V], #8
    .elseif \size == 4
      st1 {v20.b}[0], [Y], #1
      st1 {v20.b}[1], [Y], #1
      st1 {v20.b}[2], [Y], #1
      st1 {v20.b}[3], [Y], #1
      st1 {v21.b}[0], [U], #1
      st1 {v21.b}[1], [U], #1
      st1 {v21.b}[2], [U], #1
      st1 {v21.b}[3], [U], #1
      st1 {v22.b}[0], [V], #1
      st1 {v22.b}[1], [V], #1
      st1 {v22.b}[2], [V], #1
      st1 {v22.b}[3], [V], #1
    .elseif \size == 2
      st1 {v20.b}[4], [Y], #1
      st1 {v20.b}[5], [Y], #1
      st1 {v21.b}[4], [U], #1
      st1 {v21.b}[5], [U], #1
      st1 {v22.b}[4], [V], #1
      st1 {v22.b}[5], [V], #1
    .elseif \size == 1
      st1 {v20.b}[6], [Y], #1
      st1 {v21.b}[6], [U], #1
      st1 {v22.b}[6], [V], #1
    .else
      .error unsupported macroblock size
    .endif
.endm
1868
+
1869
/* Load \size pixels from [RGB] into the per-channel byte registers
 * v10/v11/v12(/v13 for 32 bpp), de-interleaving with ld3/ld4.
 *   bpp 24: \fast_ld3 == 0 selects a byte-by-byte fallback for cores where
 *           ld3 is slow (8-pixel case only).
 * Partial sizes 4/2/1 accumulate into lanes 0..3, 4..5 and 6 so a 4+2+1
 * sequence gathers up to 7 trailing pixels.
 */
.macro do_load bpp, size, fast_ld3
    .if \bpp == 24
      .if \size == 8
        .if \fast_ld3 == 1
          ld3 {v10.8b, v11.8b, v12.8b}, [RGB], #24
        .else
          ld1 {v10.b}[0], [RGB], #1
          ld1 {v11.b}[0], [RGB], #1
          ld1 {v12.b}[0], [RGB], #1

          ld1 {v10.b}[1], [RGB], #1
          ld1 {v11.b}[1], [RGB], #1
          ld1 {v12.b}[1], [RGB], #1

          ld1 {v10.b}[2], [RGB], #1
          ld1 {v11.b}[2], [RGB], #1
          ld1 {v12.b}[2], [RGB], #1

          ld1 {v10.b}[3], [RGB], #1
          ld1 {v11.b}[3], [RGB], #1
          ld1 {v12.b}[3], [RGB], #1

          ld1 {v10.b}[4], [RGB], #1
          ld1 {v11.b}[4], [RGB], #1
          ld1 {v12.b}[4], [RGB], #1

          ld1 {v10.b}[5], [RGB], #1
          ld1 {v11.b}[5], [RGB], #1
          ld1 {v12.b}[5], [RGB], #1

          ld1 {v10.b}[6], [RGB], #1
          ld1 {v11.b}[6], [RGB], #1
          ld1 {v12.b}[6], [RGB], #1

          ld1 {v10.b}[7], [RGB], #1
          ld1 {v11.b}[7], [RGB], #1
          ld1 {v12.b}[7], [RGB], #1
        .endif
        prfm pldl1keep, [RGB, #128]
      .elseif \size == 4
        ld3 {v10.b, v11.b, v12.b}[0], [RGB], #3
        ld3 {v10.b, v11.b, v12.b}[1], [RGB], #3
        ld3 {v10.b, v11.b, v12.b}[2], [RGB], #3
        ld3 {v10.b, v11.b, v12.b}[3], [RGB], #3
      .elseif \size == 2
        ld3 {v10.b, v11.b, v12.b}[4], [RGB], #3
        ld3 {v10.b, v11.b, v12.b}[5], [RGB], #3
      .elseif \size == 1
        ld3 {v10.b, v11.b, v12.b}[6], [RGB], #3
      .else
        .error unsupported macroblock size
      .endif
    .elseif \bpp == 32
      .if \size == 8
        ld4 {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
        prfm pldl1keep, [RGB, #128]
      .elseif \size == 4
        ld4 {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
        ld4 {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
        ld4 {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
        ld4 {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
      .elseif \size == 2
        ld4 {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
        ld4 {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
      .elseif \size == 1
        ld4 {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
      .else
        .error unsupported macroblock size
      .endif
    .else
      .error unsupported bpp
    .endif
.endm
1942
+
1943
/* Generator for one RGB -> YCbCr conversion function.
 *   colorid        name fragment for labels/symbols (extrgb, extbgr, ...)
 *   bpp            input bytes-per-pixel * 8 (24 or 32)
 *   r/g/b_offs     source register digit for each channel (v1<offs>)
 *   fast_ld3       1 = use ld3 for 24-bpp loads, 0 = byte-load fallback
 *                  (also selects the '_slowld3' symbol suffix)
 * C-equivalent signature of each generated function:
 *   void jsimd_<colorid>_ycc_convert_neon(JDIMENSION img_width,
 *                                         JSAMPARRAY input_buf,
 *                                         JSAMPIMAGE output_buf,
 *                                         JDIMENSION output_row,
 *                                         int num_rows)
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/* Stage 1: widen R/G/B to 16 bits and accumulate the fixed-point products.
 * v14/v16 (Y) start at zero product; v18/v26 (Cb) and v28/v30 (Cr) are
 * pre-loaded from v1 with the bias+rounding constant (128 << 16) + 32767
 * (all four 32-bit lanes of v1 are equal, so rev64 just replicates it). */
.macro do_rgb_to_yuv_stage1
    ushll v4.8h, v1\r_offs\().8b, #0   /* r = v4 */
    ushll v6.8h, v1\g_offs\().8b, #0   /* g = v6 */
    ushll v8.8h, v1\b_offs\().8b, #0   /* b = v8 */
    rev64 v18.4s, v1.4s
    rev64 v26.4s, v1.4s
    rev64 v28.4s, v1.4s
    rev64 v30.4s, v1.4s
    umull v14.4s, v4.4h, v0.h[0]       /* Y  += 19595 * r */
    umull2 v16.4s, v4.8h, v0.h[0]
    umlsl v18.4s, v4.4h, v0.h[3]       /* Cb -= 11059 * r */
    umlsl2 v26.4s, v4.8h, v0.h[3]
    umlal v28.4s, v4.4h, v0.h[5]       /* Cr += 32768 * r */
    umlal2 v30.4s, v4.8h, v0.h[5]
    umlal v14.4s, v6.4h, v0.h[1]       /* Y  += 38470 * g */
    umlal2 v16.4s, v6.8h, v0.h[1]
    umlsl v18.4s, v6.4h, v0.h[4]       /* Cb -= 21709 * g */
    umlsl2 v26.4s, v6.8h, v0.h[4]
    umlsl v28.4s, v6.4h, v0.h[6]       /* Cr -= 27439 * g */
    umlsl2 v30.4s, v6.8h, v0.h[6]
    umlal v14.4s, v8.4h, v0.h[2]       /* Y  +=  7471 * b */
    umlal2 v16.4s, v8.8h, v0.h[2]
    umlal v18.4s, v8.4h, v0.h[5]       /* Cb += 32768 * b */
    umlal2 v26.4s, v8.8h, v0.h[5]
    umlsl v28.4s, v8.4h, v0.h[7]       /* Cr -=  5329 * b */
    umlsl2 v30.4s, v8.8h, v0.h[7]
.endm

/* Stage 2: descale by 16 and narrow to bytes.  Y uses rounding rshrn;
 * Cb/Cr use plain shrn because the 32767 rounding constant was already
 * folded into their accumulator initialization in stage 1. */
.macro do_rgb_to_yuv_stage2
    rshrn v20.4h, v14.4s, #16
    shrn v22.4h, v18.4s, #16
    shrn v24.4h, v28.4s, #16
    rshrn2 v20.8h, v16.4s, #16
    shrn2 v22.8h, v26.4s, #16
    shrn2 v24.8h, v30.4s, #16
    xtn v20.8b, v20.8h                 /* v20 = y */
    xtn v21.8b, v22.8h                 /* v21 = u */
    xtn v22.8b, v24.8h                 /* v22 = v */
.endm

/* Non-pipelined conversion for the trailing (< 8 pixel) groups */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 * ARM64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load \bpp, 8, \fast_ld3
    st1 {v20.8b}, [Y], #8
    st1 {v21.8b}, [U], #8
    st1 {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.balign 16
.if \fast_ld3 == 1
Ljsimd_\colorid\()_ycc_neon_consts:
.else
Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
.endif
    .short 19595, 38470, 7471, 11059   /* v0: Y/Cb coefficients */
    .short 21709, 32768, 27439, 5329   /* (negations applied via umlsl) */
    .short 32767, 128, 32767, 128      /* v1: per-32-bit-lane (128<<16)+32767 */
    .short 32767, 128, 32767, 128      /*     = Cb/Cr bias + rounding */

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2            /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9            /* x9 doubles as spill pointer below */
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants to d0, d1, d2, d3 */
    .if \fast_ld3 == 1
      adr x13, Ljsimd_\colorid\()_ycc_neon_consts
    .else
      adr x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
    .endif
    ld1 {v0.8h, v1.8h}, [x13]

    ldr OUTPUT_BUF0, [OUTPUT_BUF]      /* Y plane row-pointer array */
    ldr OUTPUT_BUF1, [OUTPUT_BUF, #8]  /* Cb plane */
    ldr OUTPUT_BUF2, [OUTPUT_BUF, #16] /* Cr plane */
    .unreq OUTPUT_BUF

    /* Save NEON registers (callee-saved low halves of v8-v15) */
    sub sp, sp, #64
    mov x9, sp
    st1 {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1 {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    b.lt 9f
0:
    ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov N, OUTPUT_WIDTH
    ldr V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add OUTPUT_ROW, OUTPUT_ROW, #1
    ldr RGB, [INPUT_BUF], #8

    /* Inner loop over pixels: pipelined 8-pixel groups, then a 4/2/1
       lane-accumulating tail */
    subs N, N, #8
    b.lt 3f
    do_load \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs N, N, #8
    b.lt 2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs N, N, #8
    b.ge 1b
2:
    do_rgb_to_yuv_stage2
    do_store 8
    tst N, #7
    b.eq 8f
3:
    tbz N, #2, 3f
    do_load \bpp, 4, \fast_ld3
3:
    tbz N, #1, 4f
    do_load \bpp, 2, \fast_ld3
4:
    tbz N, #0, 5f
    do_load \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz N, #2, 6f
    do_store 4
6:
    tbz N, #1, 7f
    do_store 2
7:
    tbz N, #0, 8f
    do_store 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    b.gt 0b
9:
    /* Restore all registers and return */
    ld1 {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1 {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br x30

    .unreq OUTPUT_WIDTH
    .unreq OUTPUT_ROW
    .unreq INPUT_BUF
    .unreq NUM_ROWS
    .unreq OUTPUT_BUF0
    .unreq OUTPUT_BUF1
    .unreq OUTPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

    .purgem do_rgb_to_yuv
    .purgem do_rgb_to_yuv_stage1
    .purgem do_rgb_to_yuv_stage2
    .purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm
2131
+
2132
/* Instantiate the RGB->YCbCr converters.  The first group uses fast ld3
 * loads; the extra extrgb/extbgr '_slowld3' variants are selected at
 * runtime on cores where ld3 is slow. */
/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

.purgem do_load
.purgem do_store
2145
+
2146
+
2147
/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * C-equivalent signature (AAPCS64):
 *   void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
 *                            DCTELEM *workspace)
 * Reads 8 rows x 8 bytes starting at sample_data[row] + start_col,
 * subtracts 128 (CENTERJSAMPLE) from each sample, and writes the 8x8 block
 * of signed 16-bit values to workspace.
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of VST1.16 instructions
 */

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x1, w1

    /* Loads, pointer arithmetic and subtractions are interleaved to hide
       load latency; TMP1..TMP8 hold the eight row pointers. */
    mov TMPDUP, #128                   /* v0 = 8 x CENTERJSAMPLE */
    ldp TMP1, TMP2, [SAMPLE_DATA], 16
    ldp TMP3, TMP4, [SAMPLE_DATA], 16
    dup v0.8b, TMPDUP
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    ldp TMP5, TMP6, [SAMPLE_DATA], 16
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    ldp TMP7, TMP8, [SAMPLE_DATA], 16
    add TMP5, TMP5, START_COL
    add TMP6, TMP6, START_COL
    ld1 {v16.8b}, [TMP1]
    add TMP7, TMP7, START_COL
    add TMP8, TMP8, START_COL
    ld1 {v17.8b}, [TMP2]
    usubl v16.8h, v16.8b, v0.8b        /* row - 128, widened to 16 bits */
    ld1 {v18.8b}, [TMP3]
    usubl v17.8h, v17.8b, v0.8b
    ld1 {v19.8b}, [TMP4]
    usubl v18.8h, v18.8b, v0.8b
    ld1 {v20.8b}, [TMP5]
    usubl v19.8h, v19.8b, v0.8b
    ld1 {v21.8b}, [TMP6]
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl v20.8h, v20.8b, v0.8b
    ld1 {v22.8b}, [TMP7]
    usubl v21.8h, v21.8b, v0.8b
    ld1 {v23.8b}, [TMP8]
    usubl v22.8h, v22.8b, v0.8b
    usubl v23.8h, v23.8b, v0.8b
    st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    br x30

    .unreq SAMPLE_DATA
    .unreq START_COL
    .unreq WORKSPACE
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
    .unreq TMP5
    .unreq TMP6
    .unreq TMP7
    .unreq TMP8
    .unreq TMPDUP
2222
+
2223
/*****************************************************************************/

/*
 * jsimd_fdct_islow_neon
 *
 * This file contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).  The following code is based
 * directly on the IJG's original jfdctint.c; see the jfdctint.c for
 * more details.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#define CONST_BITS 13
#define PASS1_BITS 2

/* Descale amounts for pass 1 (columns) and pass 2 (rows) */
#define DESCALE_P1 (CONST_BITS-PASS1_BITS)
#define DESCALE_P2 (CONST_BITS+PASS1_BITS)

/* Fixed-point DCT coefficients: F_x_yyy = round(x.yyy * 2^CONST_BITS) */
#define F_0_298 2446   /* FIX(0.298631336) */
#define F_0_390 3196   /* FIX(0.390180644) */
#define F_0_541 4433   /* FIX(0.541196100) */
#define F_0_765 6270   /* FIX(0.765366865) */
#define F_0_899 7373   /* FIX(0.899976223) */
#define F_1_175 9633   /* FIX(1.175875602) */
#define F_1_501 12299  /* FIX(1.501321110) */
#define F_1_847 15137  /* FIX(1.847759065) */
#define F_1_961 16069  /* FIX(1.961570560) */
#define F_2_053 16819  /* FIX(2.053119869) */
#define F_2_562 20995  /* FIX(2.562915447) */
#define F_3_072 25172  /* FIX(3.072711026) */

/* Constant table loaded into v0/v1; signs are pre-applied so the kernel
 * can use smlal everywhere instead of mixing smlal/smlsl */
.balign 16
Ljsimd_fdct_islow_neon_consts:
    .short F_0_298
    .short -F_0_390
    .short F_0_541
    .short F_0_765
    .short -F_0_899
    .short F_1_175
    .short F_1_501
    .short -F_1_847
    .short -F_1_961
    .short F_2_053
    .short -F_2_562
    .short F_3_072
    .short 0           /* padding */
    .short 0
    .short 0
    .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_2_053
#undef F_1_961
#undef F_2_562
#undef F_3_072

/* Lane aliases for the loaded constants (P = positive, N = negated),
 * matching the table order above */
#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]
2299
+
2300
/*
 * jsimd_fdct_islow_neon (DCTELEM *data)
 *
 * Accurate integer forward DCT: two 1-D islow passes over an 8x8 block of
 * 16-bit coefficients, with a transpose before each pass.  Matches the
 * arithmetic of jpeg_fdct_islow() in jfdctint.c.
 * In:       x0 = data (64 DCTELEMs, transformed in place)
 * Clobbers: x9, x10, v0-v31 (the low 64 bits of v8-v15 are callee-saved
 *           per AAPCS64, so they are spilled to the stack below)
 */
asm_function jsimd_fdct_islow_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    adr             TMP, Ljsimd_fdct_islow_neon_consts
    ld1             {v0.8h, v1.8h}, [TMP]

    /* Save NEON registers (low halves of v8-v15 are callee-saved) */
    sub             sp, sp, #64
    mov             x10, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  |  d16   |  d17   | v16.8h
     *   1  |  d18   |  d19   | v17.8h
     *   2  |  d20   |  d21   | v18.8h
     *   3  |  d22   |  d23   | v19.8h
     *   4  |  d24   |  d25   | v20.8h
     *   5  |  d26   |  d27   | v21.8h
     *   6  |  d28   |  d29   | v22.8h
     *   7  |  d30   |  d31   | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    sub             DATA, DATA, #64

    /* Transpose so pass 1 works on columns of the original block */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
    /* 1-D FDCT (pass 1) */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* Even part */

    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
    shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b
    mov             v25.16b, v24.16b

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P1
    rshrn           v22.4h, v22.4s, #DESCALE_P1
    rshrn2          v18.8h, v24.4s, #DESCALE_P1   /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P1   /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */

    /* Odd part */

    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
    smull           v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s   /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s   /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P1
    rshrn           v21.4h, v29.4s, #DESCALE_P1
    rshrn           v19.4h, v30.4s, #DESCALE_P1
    rshrn           v17.4h, v31.4s, #DESCALE_P1
    rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */

    /* Transpose back so pass 2 works on the rows */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4

    /* 1-D FDCT (pass 2; descales by DESCALE_P2 instead of shifting up) */
    add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
    sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
    add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
    sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
    add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
    sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
    add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
    sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */

    /* Even part */
    add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
    sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
    add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
    sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */

    add             v16.8h, v8.8h, v10.8h   /* tmp10 + tmp11 */
    sub             v20.8h, v8.8h, v10.8h   /* tmp10 - tmp11 */

    add             v18.8h, v11.8h, v9.8h   /* tmp12 + tmp13 */

    srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
    srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */

    smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
    mov             v22.16b, v18.16b
    mov             v25.16b, v24.16b

    smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
    smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
    smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */

    rshrn           v18.4h, v18.4s, #DESCALE_P2
    rshrn           v22.4h, v22.4s, #DESCALE_P2
    rshrn2          v18.8h, v24.4s, #DESCALE_P2   /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
    rshrn2          v22.8h, v25.4s, #DESCALE_P2   /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */

    /* Odd part */
    add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
    add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
    add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
    add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */

    smull           v4.4s, v10.4h, XFIX_P_1_175   /* z5 lo = z3 lo * XFIX_P_1_175 */
    smull2          v5.4s, v10.8h, XFIX_P_1_175
    smlal           v4.4s, v11.4h, XFIX_P_1_175   /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
    smlal2          v5.4s, v11.8h, XFIX_P_1_175

    smull2          v24.4s, v28.8h, XFIX_P_0_298
    smull2          v25.4s, v29.8h, XFIX_P_2_053
    smull2          v26.4s, v30.8h, XFIX_P_3_072
    smull2          v27.4s, v31.8h, XFIX_P_1_501
    smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
    smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
    smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
    smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */

    smull2          v12.4s, v8.8h, XFIX_N_0_899
    smull2          v13.4s, v9.8h, XFIX_N_2_562
    smull2          v14.4s, v10.8h, XFIX_N_1_961
    smull2          v15.4s, v11.8h, XFIX_N_0_390
    smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
    smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
    smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
    smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */

    add             v10.4s, v10.4s, v4.4s   /* z3 += z5 */
    add             v14.4s, v14.4s, v5.4s
    add             v11.4s, v11.4s, v4.4s   /* z4 += z5 */
    add             v15.4s, v15.4s, v5.4s

    add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
    add             v24.4s, v24.4s, v12.4s
    add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
    add             v25.4s, v25.4s, v13.4s
    add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
    add             v26.4s, v26.4s, v14.4s
    add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
    add             v27.4s, v27.4s, v15.4s

    add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
    add             v24.4s, v24.4s, v14.4s
    add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
    add             v25.4s, v25.4s, v15.4s
    add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
    add             v26.4s, v26.4s, v13.4s
    add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
    add             v27.4s, v27.4s, v12.4s

    rshrn           v23.4h, v28.4s, #DESCALE_P2
    rshrn           v21.4h, v29.4s, #DESCALE_P2
    rshrn           v19.4h, v30.4s, #DESCALE_P2
    rshrn           v17.4h, v31.4s, #DESCALE_P2
    rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
    rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
    rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    /* Restore NEON registers */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32

    ret             /* was "br x30": ret is the hinted return, keeping the
                       return-address predictor (and BTI/PAC) happy */

    .unreq          DATA
    .unreq          TMP

#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072
2563
+
2564
+
2565
+ /*****************************************************************************/
2566
+
2567
/*
 * jsimd_fdct_ifast_neon (DCTELEM *data)
 *
 * Fast, less accurate integer forward DCT (Discrete Cosine Transform).
 * Uses the same calculations and produces exactly the same output as IJG's
 * original 'jpeg_fdct_ifast' function from jfdctfst.c.
 * In:       x0 = data (64 DCTELEMs, transformed in place)
 * Clobbers: x9, v0-v7, v16-v31 (all caller-saved under AAPCS64)
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

#undef XFIX_0_541196100
#define XFIX_0_382683433  v0.h[0]
#define XFIX_0_541196100  v0.h[1]
#define XFIX_0_707106781  v0.h[2]
#define XFIX_1_306562965  v0.h[3]

.balign 16
Ljsimd_fdct_ifast_neon_consts:
    .short (98 * 128)               /* XFIX_0_382683433 */
    .short (139 * 128)              /* XFIX_0_541196100 */
    .short (181 * 128)              /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */

asm_function jsimd_fdct_ifast_neon

    DATA            .req x0
    TMP             .req x9

    /* Load constants */
    adr             TMP, Ljsimd_fdct_ifast_neon_consts
    ld1             {v0.4h}, [TMP]

    /* Load all DATA into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0  |  d16   |  d17   | v16.8h
     *   1  |  d18   |  d19   | v17.8h
     *   2  |  d20   |  d21   | v18.8h
     *   3  |  d22   |  d23   | v19.8h
     *   4  |  d24   |  d25   | v20.8h
     *   5  |  d26   |  d27   | v21.8h
     *   6  |  d28   |  d29   | v22.8h
     *   7  |  d30   |  d31   | v23.8h
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov             TMP, #2             /* two 1-D passes (rows, then columns) */
    sub             DATA, DATA, #64
1:
    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    subs            TMP, TMP, #1
    /* 1-D FDCT */
    add             v4.8h, v19.8h, v20.8h
    sub             v20.8h, v19.8h, v20.8h
    sub             v28.8h, v18.8h, v21.8h
    add             v18.8h, v18.8h, v21.8h
    sub             v29.8h, v17.8h, v22.8h
    add             v17.8h, v17.8h, v22.8h
    sub             v21.8h, v16.8h, v23.8h
    add             v16.8h, v16.8h, v23.8h
    sub             v6.8h, v17.8h, v18.8h
    sub             v7.8h, v16.8h, v4.8h
    add             v5.8h, v17.8h, v18.8h
    add             v6.8h, v6.8h, v7.8h
    add             v4.8h, v16.8h, v4.8h
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
    add             v19.8h, v20.8h, v28.8h
    add             v16.8h, v4.8h, v5.8h
    sub             v20.8h, v4.8h, v5.8h
    add             v5.8h, v28.8h, v29.8h
    add             v29.8h, v29.8h, v21.8h
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
    sub             v28.8h, v19.8h, v29.8h
    add             v18.8h, v7.8h, v6.8h
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
    sub             v22.8h, v7.8h, v6.8h
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
    add             v6.8h, v21.8h, v5.8h
    sub             v5.8h, v21.8h, v5.8h
    add             v29.8h, v29.8h, v28.8h
    add             v19.8h, v19.8h, v28.8h
    add             v29.8h, v29.8h, v7.8h
    add             v21.8h, v5.8h, v19.8h
    sub             v19.8h, v5.8h, v19.8h
    add             v17.8h, v6.8h, v29.8h
    sub             v23.8h, v6.8h, v29.8h

    b.ne            1b

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    ret             /* was "br x30": ret is the hinted return, keeping the
                       return-address predictor (and BTI/PAC) happy */

    .unreq          DATA
    .unreq          TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965
2673
+
2674
+
2675
+ /*****************************************************************************/
2676
+
2677
/*
 * GLOBAL(void)
 * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
 *                      DCTELEM *workspace);
 *
 * Quantize the 64 workspace coefficients into coef_block using the
 * precomputed divisor tables (reciprocals at offset 0, corrections at
 * +128 bytes, shifts at +384 bytes; the scale table between them is
 * folded into the reciprocals and not read here).
 * Rounding is toward zero: |x| is corrected, multiplied by the 16-bit
 * reciprocal, shifted, then the original sign is re-applied.
 * Clobbers: x9-x11, v0-v7, v16-v31 (all caller-saved under AAPCS64)
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req x0
    DIVISORS        .req x1
    WORKSPACE       .req x2

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req x9
    SHIFT           .req x10
    LOOP_COUNT      .req x11

    mov             LOOP_COUNT, #2          /* 2 iterations x 32 coefficients */
    add             CORRECTION, DIVISORS, #(64 * 2)
    add             SHIFT, DIVISORS, #(64 * 6)
1:
    subs            LOOP_COUNT, LOOP_COUNT, #1
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
    abs             v20.8h, v0.8h
    abs             v21.8h, v1.8h
    abs             v22.8h, v2.8h
    abs             v23.8h, v3.8h
    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
    add             v20.8h, v20.8h, v4.8h   /* add correction */
    add             v21.8h, v21.8h, v5.8h
    add             v22.8h, v22.8h, v6.8h
    add             v23.8h, v23.8h, v7.8h
    umull           v4.4s, v20.4h, v28.4h   /* multiply by reciprocal */
    umull2          v16.4s, v20.8h, v28.8h
    umull           v5.4s, v21.4h, v29.4h
    umull2          v17.4s, v21.8h, v29.8h
    umull           v6.4s, v22.4h, v30.4h   /* multiply by reciprocal */
    umull2          v18.4s, v22.8h, v30.8h
    umull           v7.4s, v23.4h, v31.4h
    umull2          v19.4s, v23.8h, v31.8h
    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
    shrn            v4.4h, v4.4s, #16       /* keep high half of the product */
    shrn            v5.4h, v5.4s, #16
    shrn            v6.4h, v6.4s, #16
    shrn            v7.4h, v7.4s, #16
    shrn2           v4.8h, v16.4s, #16
    shrn2           v5.8h, v17.4s, #16
    shrn2           v6.8h, v18.4s, #16
    shrn2           v7.8h, v19.4s, #16
    neg             v24.8h, v24.8h          /* negate so ushl shifts right */
    neg             v25.8h, v25.8h
    neg             v26.8h, v26.8h
    neg             v27.8h, v27.8h
    sshr            v0.8h, v0.8h, #15       /* extract sign */
    sshr            v1.8h, v1.8h, #15
    sshr            v2.8h, v2.8h, #15
    sshr            v3.8h, v3.8h, #15
    ushl            v4.8h, v4.8h, v24.8h    /* shift */
    ushl            v5.8h, v5.8h, v25.8h
    ushl            v6.8h, v6.8h, v26.8h
    ushl            v7.8h, v7.8h, v27.8h

    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
    eor             v5.16b, v5.16b, v1.16b
    eor             v6.16b, v6.16b, v2.16b
    eor             v7.16b, v7.16b, v3.16b
    sub             v4.8h, v4.8h, v0.8h
    sub             v5.8h, v5.8h, v1.8h
    sub             v6.8h, v6.8h, v2.8h
    sub             v7.8h, v7.8h, v3.8h
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64

    b.ne            1b

    ret             /* return (was "br x30": ret is the hinted return) */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT
2761
+
2762
+
2763
+ /*****************************************************************************/
2764
+
2765
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
 *                             JDIMENSION v_samp_factor,
 *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
 *                             JSAMPARRAY output_data);
 */

/* TBL permutation vectors used to replicate the last valid input pixel
 * into the padding region of the final 16-byte chunk of each row.
 * Entry "diff N" is used when the padded width exceeds image_width by N. */
.balign 16
Ljsimd_h2_downsample_neon_consts:
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
          0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
          0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
          0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
          0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
    .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
          0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
    .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
          0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
    .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
          0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
    .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
          0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */

asm_function jsimd_h2v1_downsample_neon
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR           .req x10
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    mov             TMPDUP, #0x10000        /* per-pair bias 1 in each 16-bit lane pair */
    lsl             TMP2, BLOCK_WIDTH, #4   /* padded width in pixels */
    sub             TMP2, TMP2, IMAGE_WIDTH /* diff = padded - actual width */
    adr             TMP3, Ljsimd_h2_downsample_neon_consts
    add             TMP3, TMP3, TMP2, lsl #4  /* select the "diff" TBL row */
    dup             v16.4s, TMPDUP
    ld1             {v18.16b}, [TMP3]

1:  /* row loop */
    ldr             INPTR, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f
2:  /* columns */
    ld1             {v0.16b}, [INPTR], #16
    mov             v4.16b, v16.16b         /* start from the rounding bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b           /* pairwise add 2 pixels into 16 bits */
    shrn            v6.8b, v4.8h, #1        /* average = (a + b + bias) >> 1 */
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR]
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right: replicate the last valid pixel into the padding */
    tbl             v2.16b, {v0.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    shrn            v6.8b, v4.8h, #1
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    ret             /* was "br x30": ret is the hinted return */

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP
2872
+
2873
+
2874
+ /*****************************************************************************/
2875
+
2876
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
 *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
 *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
 */

.balign 16
asm_function jsimd_h2v2_downsample_neon
    IMAGE_WIDTH     .req x0
    MAX_V_SAMP      .req x1
    V_SAMP          .req x2
    BLOCK_WIDTH     .req x3
    INPUT_DATA      .req x4
    OUTPUT_DATA     .req x5
    OUTPTR          .req x9
    INPTR0          .req x10
    INPTR1          .req x14
    TMP1            .req x11
    TMP2            .req x12
    TMP3            .req x13
    TMPDUP          .req w15

    mov             TMPDUP, #1
    lsl             TMP2, BLOCK_WIDTH, #4   /* padded width in pixels */
    lsl             TMPDUP, TMPDUP, #17
    sub             TMP2, TMP2, IMAGE_WIDTH /* diff = padded - actual width */
    adr             TMP3, Ljsimd_h2_downsample_neon_consts
    orr             TMPDUP, TMPDUP, #1      /* bias 0x00020001: 2,1 alternating per lane pair */
    add             TMP3, TMP3, TMP2, lsl #4  /* select the "diff" TBL row */
    dup             v16.4s, TMPDUP
    ld1             {v18.16b}, [TMP3]

1:  /* row loop */
    ldr             INPTR0, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    ldr             INPTR1, [INPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f
2:  /* columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b         /* start from the rounding bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b           /* accumulate 2x2 pixel groups */
    uadalp          v4.8h, v1.16b
    shrn            v6.8b, v4.8h, #2        /* average of 4 pixels + bias */
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right: replicate the last valid pixel into the padding */
    tbl             v2.16b, {v0.16b}, v18.16b
    tbl             v3.16b, {v1.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    uadalp          v4.8h, v3.16b
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    ret             /* was "br x30": ret is the hinted return */

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR0
    .unreq          INPTR1
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP
2958
+
2959
+
2960
+ /*****************************************************************************/
2961
+
2962
/*
 * GLOBAL(JOCTET*)
 * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
 *                              JCOEFPTR block, int last_dc_val,
 *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
 *
 */

/* Register aliases shared by the bit-buffer macros below and the
 * Huffman-encoder bodies generated further down in this file. */
BUFFER          .req x1     /* output byte pointer (pre-incremented stores) */
PUT_BUFFER      .req x6     /* 64-bit bit accumulator, bits packed at the bottom */
PUT_BITS        .req x7     /* number of valid bits currently in PUT_BUFFER */
PUT_BITSw       .req w7     /* 32-bit view of PUT_BITS */

/* Emit the top pending byte of PUT_BUFFER to BUFFER.  Implements JPEG
 * byte stuffing: a 0xFF data byte is followed by a 0x00 so it cannot be
 * mistaken for a marker.  Clobbers x19. */
.macro emit_byte
    sub             PUT_BITS, PUT_BITS, #0x8        /* drop 8 bits from the count */
    lsr             x19, PUT_BUFFER, PUT_BITS       /* byte = top 8 pending bits */
    uxtb            w19, w19
    strb            w19, [BUFFER, #1]!
    cmp             w19, #0xff
    b.ne            14f
    strb            wzr, [BUFFER, #1]!              /* stuff 0x00 after 0xFF */
14:
.endm

/* Append SIZE bits of CODE to the bottom of PUT_BUFFER.
 * NOTE(review): assumes CODE has no bits above SIZE — TODO confirm at
 * call sites; a wider CODE would corrupt previously buffered bits. */
.macro put_bits CODE, SIZE
    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
    add             PUT_BITS, PUT_BITS, \SIZE
    orr             PUT_BUFFER, PUT_BUFFER, \CODE
.endm

/* Flush 4 bytes if 32 or more bits are pending (keeps PUT_BITS < 32
 * so the next put_bits cannot overflow the 64-bit accumulator). */
.macro checkbuf31
    cmp             PUT_BITS, #0x20
    b.lt            31f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
31:
.endm

/* Flush 6 bytes if 48 or more bits are pending (headroom for the
 * larger code+size pairs emitted in one step). */
.macro checkbuf47
    cmp             PUT_BITS, #0x30
    b.lt            47f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
47:
.endm
3010
+
3011
+ .macro generate_jsimd_huff_encode_one_block fast_tbl
3012
+
3013
+ .balign 16
3014
+ .if \fast_tbl == 1
3015
+ Ljsimd_huff_encode_one_block_neon_consts:
3016
+ .else
3017
+ Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
3018
+ .endif
3019
+ .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
3020
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
3021
+ .if \fast_tbl == 1
3022
+ .byte 0, 1, 2, 3, 16, 17, 32, 33, \
3023
+ 18, 19, 4, 5, 6, 7, 20, 21 /* L0 => L3 : 4 lines OK */
3024
+ .byte 34, 35, 48, 49, 255, 255, 50, 51, \
3025
+ 36, 37, 22, 23, 8, 9, 10, 11 /* L0 => L3 : 4 lines OK */
3026
+ .byte 8, 9, 22, 23, 36, 37, 50, 51, \
3027
+ 255, 255, 255, 255, 255, 255, 52, 53 /* L1 => L4 : 4 lines OK */
3028
+ .byte 54, 55, 40, 41, 26, 27, 12, 13, \
3029
+ 14, 15, 28, 29, 42, 43, 56, 57 /* L0 => L3 : 4 lines OK */
3030
+ .byte 6, 7, 20, 21, 34, 35, 48, 49, \
3031
+ 50, 51, 36, 37, 22, 23, 8, 9 /* L4 => L7 : 4 lines OK */
3032
+ .byte 42, 43, 28, 29, 14, 15, 30, 31, \
3033
+ 44, 45, 58, 59, 255, 255, 255, 255 /* L1 => L4 : 4 lines OK */
3034
+ .byte 255, 255, 255, 255, 56, 57, 42, 43, \
3035
+ 28, 29, 14, 15, 30, 31, 44, 45 /* L3 => L6 : 4 lines OK */
3036
+ .byte 26, 27, 40, 41, 42, 43, 28, 29, \
3037
+ 14, 15, 30, 31, 44, 45, 46, 47 /* L5 => L7 : 3 lines OK */
3038
+ .byte 255, 255, 255, 255, 0, 1, 255, 255, \
3039
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L4 : 1 lines OK */
3040
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
3041
+ 0, 1, 16, 17, 2, 3, 255, 255 /* L5 => L6 : 2 lines OK */
3042
+ .byte 255, 255, 255, 255, 255, 255, 255, 255, \
3043
+ 255, 255, 255, 255, 8, 9, 22, 23 /* L5 => L6 : 2 lines OK */
3044
+ .byte 4, 5, 6, 7, 255, 255, 255, 255, \
3045
+ 255, 255, 255, 255, 255, 255, 255, 255 /* L7 : 1 line OK */
3046
+ .endif
3047
+
3048
+ .if \fast_tbl == 1
3049
+ asm_function jsimd_huff_encode_one_block_neon
3050
+ .else
3051
+ asm_function jsimd_huff_encode_one_block_neon_slowtbl
3052
+ .endif
3053
+ sub sp, sp, 272
3054
+ sub BUFFER, BUFFER, #0x1 /* BUFFER=buffer-- */
3055
+ /* Save ARM registers */
3056
+ stp x19, x20, [sp]
3057
+ .if \fast_tbl == 1
3058
+ adr x15, Ljsimd_huff_encode_one_block_neon_consts
3059
+ .else
3060
+ adr x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
3061
+ .endif
3062
+ ldr PUT_BUFFER, [x0, #0x10]
3063
+ ldr PUT_BITSw, [x0, #0x18]
3064
+ ldrsh w12, [x2] /* load DC coeff in w12 */
3065
+ /* prepare data */
3066
+ .if \fast_tbl == 1
3067
+ ld1 {v23.16b}, [x15], #16
3068
+ ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
3069
+ ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
3070
+ ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
3071
+ ld1 {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
3072
+ ld1 {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
3073
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
3074
+ /* ZigZag 8x8 */
3075
+ tbl v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
3076
+ tbl v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
3077
+ tbl v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
3078
+ tbl v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
3079
+ tbl v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
3080
+ tbl v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
3081
+ tbl v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
3082
+ tbl v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
3083
+ ins v0.h[0], w12
3084
+ tbx v1.16b, {v28.16b}, v16.16b
3085
+ tbx v2.16b, {v29.16b, v30.16b}, v17.16b
3086
+ tbx v5.16b, {v29.16b, v30.16b}, v18.16b
3087
+ tbx v6.16b, {v31.16b}, v19.16b
3088
+ .else
3089
+ add x13, x2, #0x22
3090
+ sub w12, w12, w3 /* last_dc_val, not used afterwards */
3091
+ ld1 {v23.16b}, [x15]
3092
+ add x14, x2, #0x18
3093
+ add x3, x2, #0x36
3094
+ ins v0.h[0], w12
3095
+ add x9, x2, #0x2
3096
+ ld1 {v1.h}[0], [x13]
3097
+ add x15, x2, #0x30
3098
+ ld1 {v2.h}[0], [x14]
3099
+ add x19, x2, #0x26
3100
+ ld1 {v3.h}[0], [x3]
3101
+ add x20, x2, #0x28
3102
+ ld1 {v0.h}[1], [x9]
3103
+ add x12, x2, #0x10
3104
+ ld1 {v1.h}[1], [x15]
3105
+ add x13, x2, #0x40
3106
+ ld1 {v2.h}[1], [x19]
3107
+ add x14, x2, #0x34
3108
+ ld1 {v3.h}[1], [x20]
3109
+ add x3, x2, #0x1a
3110
+ ld1 {v0.h}[2], [x12]
3111
+ add x9, x2, #0x20
3112
+ ld1 {v1.h}[2], [x13]
3113
+ add x15, x2, #0x32
3114
+ ld1 {v2.h}[2], [x14]
3115
+ add x19, x2, #0x42
3116
+ ld1 {v3.h}[2], [x3]
3117
+ add x20, x2, #0xc
3118
+ ld1 {v0.h}[3], [x9]
3119
+ add x12, x2, #0x12
3120
+ ld1 {v1.h}[3], [x15]
3121
+ add x13, x2, #0x24
3122
+ ld1 {v2.h}[3], [x19]
3123
+ add x14, x2, #0x50
3124
+ ld1 {v3.h}[3], [x20]
3125
+ add x3, x2, #0xe
3126
+ ld1 {v0.h}[4], [x12]
3127
+ add x9, x2, #0x4
3128
+ ld1 {v1.h}[4], [x13]
3129
+ add x15, x2, #0x16
3130
+ ld1 {v2.h}[4], [x14]
3131
+ add x19, x2, #0x60
3132
+ ld1 {v3.h}[4], [x3]
3133
+ add x20, x2, #0x1c
3134
+ ld1 {v0.h}[5], [x9]
3135
+ add x12, x2, #0x6
3136
+ ld1 {v1.h}[5], [x15]
3137
+ add x13, x2, #0x8
3138
+ ld1 {v2.h}[5], [x19]
3139
+ add x14, x2, #0x52
3140
+ ld1 {v3.h}[5], [x20]
3141
+ add x3, x2, #0x2a
3142
+ ld1 {v0.h}[6], [x12]
3143
+ add x9, x2, #0x14
3144
+ ld1 {v1.h}[6], [x13]
3145
+ add x15, x2, #0xa
3146
+ ld1 {v2.h}[6], [x14]
3147
+ add x19, x2, #0x44
3148
+ ld1 {v3.h}[6], [x3]
3149
+ add x20, x2, #0x38
3150
+ ld1 {v0.h}[7], [x9]
3151
+ add x12, x2, #0x46
3152
+ ld1 {v1.h}[7], [x15]
3153
+ add x13, x2, #0x3a
3154
+ ld1 {v2.h}[7], [x19]
3155
+ add x14, x2, #0x74
3156
+ ld1 {v3.h}[7], [x20]
3157
+ add x3, x2, #0x6a
3158
+ ld1 {v4.h}[0], [x12]
3159
+ add x9, x2, #0x54
3160
+ ld1 {v5.h}[0], [x13]
3161
+ add x15, x2, #0x2c
3162
+ ld1 {v6.h}[0], [x14]
3163
+ add x19, x2, #0x76
3164
+ ld1 {v7.h}[0], [x3]
3165
+ add x20, x2, #0x78
3166
+ ld1 {v4.h}[1], [x9]
3167
+ add x12, x2, #0x62
3168
+ ld1 {v5.h}[1], [x15]
3169
+ add x13, x2, #0x1e
3170
+ ld1 {v6.h}[1], [x19]
3171
+ add x14, x2, #0x68
3172
+ ld1 {v7.h}[1], [x20]
3173
+ add x3, x2, #0x7a
3174
+ ld1 {v4.h}[2], [x12]
3175
+ add x9, x2, #0x70
3176
+ ld1 {v5.h}[2], [x13]
3177
+ add x15, x2, #0x2e
3178
+ ld1 {v6.h}[2], [x14]
3179
+ add x19, x2, #0x5a
3180
+ ld1 {v7.h}[2], [x3]
3181
+ add x20, x2, #0x6c
3182
+ ld1 {v4.h}[3], [x9]
3183
+ add x12, x2, #0x72
3184
+ ld1 {v5.h}[3], [x15]
3185
+ add x13, x2, #0x3c
3186
+ ld1 {v6.h}[3], [x19]
3187
+ add x14, x2, #0x4c
3188
+ ld1 {v7.h}[3], [x20]
3189
+ add x3, x2, #0x5e
3190
+ ld1 {v4.h}[4], [x12]
3191
+ add x9, x2, #0x64
3192
+ ld1 {v5.h}[4], [x13]
3193
+ add x15, x2, #0x4a
3194
+ ld1 {v6.h}[4], [x14]
3195
+ add x19, x2, #0x3e
3196
+ ld1 {v7.h}[4], [x3]
3197
+ add x20, x2, #0x6e
3198
+ ld1 {v4.h}[5], [x9]
3199
+ add x12, x2, #0x56
3200
+ ld1 {v5.h}[5], [x15]
3201
+ add x13, x2, #0x58
3202
+ ld1 {v6.h}[5], [x19]
3203
+ add x14, x2, #0x4e
3204
+ ld1 {v7.h}[5], [x20]
3205
+ add x3, x2, #0x7c
3206
+ ld1 {v4.h}[6], [x12]
3207
+ add x9, x2, #0x48
3208
+ ld1 {v5.h}[6], [x13]
3209
+ add x15, x2, #0x66
3210
+ ld1 {v6.h}[6], [x14]
3211
+ add x19, x2, #0x5c
3212
+ ld1 {v7.h}[6], [x3]
3213
+ add x20, x2, #0x7e
3214
+ ld1 {v4.h}[7], [x9]
3215
+ ld1 {v5.h}[7], [x15]
3216
+ ld1 {v6.h}[7], [x19]
3217
+ ld1 {v7.h}[7], [x20]
3218
+ .endif
3219
+ cmlt v24.8h, v0.8h, #0
3220
+ cmlt v25.8h, v1.8h, #0
3221
+ cmlt v26.8h, v2.8h, #0
3222
+ cmlt v27.8h, v3.8h, #0
3223
+ cmlt v28.8h, v4.8h, #0
3224
+ cmlt v29.8h, v5.8h, #0
3225
+ cmlt v30.8h, v6.8h, #0
3226
+ cmlt v31.8h, v7.8h, #0
3227
+ abs v0.8h, v0.8h
3228
+ abs v1.8h, v1.8h
3229
+ abs v2.8h, v2.8h
3230
+ abs v3.8h, v3.8h
3231
+ abs v4.8h, v4.8h
3232
+ abs v5.8h, v5.8h
3233
+ abs v6.8h, v6.8h
3234
+ abs v7.8h, v7.8h
3235
+ eor v24.16b, v24.16b, v0.16b
3236
+ eor v25.16b, v25.16b, v1.16b
3237
+ eor v26.16b, v26.16b, v2.16b
3238
+ eor v27.16b, v27.16b, v3.16b
3239
+ eor v28.16b, v28.16b, v4.16b
3240
+ eor v29.16b, v29.16b, v5.16b
3241
+ eor v30.16b, v30.16b, v6.16b
3242
+ eor v31.16b, v31.16b, v7.16b
3243
+ cmeq v16.8h, v0.8h, #0
3244
+ cmeq v17.8h, v1.8h, #0
3245
+ cmeq v18.8h, v2.8h, #0
3246
+ cmeq v19.8h, v3.8h, #0
3247
+ cmeq v20.8h, v4.8h, #0
3248
+ cmeq v21.8h, v5.8h, #0
3249
+ cmeq v22.8h, v6.8h, #0
3250
+ xtn v16.8b, v16.8h
3251
+ xtn v18.8b, v18.8h
3252
+ xtn v20.8b, v20.8h
3253
+ xtn v22.8b, v22.8h
3254
+ umov w14, v0.h[0]
3255
+ xtn2 v16.16b, v17.8h
3256
+ umov w13, v24.h[0]
3257
+ xtn2 v18.16b, v19.8h
3258
+ clz w14, w14
3259
+ xtn2 v20.16b, v21.8h
3260
+ lsl w13, w13, w14
3261
+ cmeq v17.8h, v7.8h, #0
3262
+ sub w12, w14, #32
3263
+ xtn2 v22.16b, v17.8h
3264
+ lsr w13, w13, w14
3265
+ and v16.16b, v16.16b, v23.16b
3266
+ neg w12, w12
3267
+ and v18.16b, v18.16b, v23.16b
3268
+ add x3, x4, #0x400 /* r1 = dctbl->ehufsi */
3269
+ and v20.16b, v20.16b, v23.16b
3270
+ add x15, sp, #0x90 /* x15 = t2 */
3271
+ and v22.16b, v22.16b, v23.16b
3272
+ ldr w10, [x4, x12, lsl #2]
3273
+ addp v16.16b, v16.16b, v18.16b
3274
+ ldrb w11, [x3, x12]
3275
+ addp v20.16b, v20.16b, v22.16b
3276
+ checkbuf47
3277
+ addp v16.16b, v16.16b, v20.16b
3278
+ put_bits x10, x11
3279
+ addp v16.16b, v16.16b, v18.16b
3280
+ checkbuf47
3281
+ umov x9,v16.D[0]
3282
+ put_bits x13, x12
3283
+ cnt v17.8b, v16.8b
3284
+ mvn x9, x9
3285
+ addv B18, v17.8b
3286
+ add x4, x5, #0x400 /* x4 = actbl->ehufsi */
3287
+ umov w12, v18.b[0]
3288
+ lsr x9, x9, #0x1 /* clear AC coeff */
3289
+ ldr w13, [x5, #0x3c0] /* x13 = actbl->ehufco[0xf0] */
3290
+ rbit x9, x9 /* x9 = index0 */
3291
+ ldrb w14, [x4, #0xf0] /* x14 = actbl->ehufsi[0xf0] */
3292
+ cmp w12, #(64-8)
3293
+ add x11, sp, #16
3294
+ b.lt 4f
3295
+ cbz x9, 6f
3296
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3297
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3298
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3299
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3300
+ 1:
3301
+ clz x2, x9
3302
+ add x15, x15, x2, lsl #1
3303
+ lsl x9, x9, x2
3304
+ ldrh w20, [x15, #-126]
3305
+ 2:
3306
+ cmp x2, #0x10
3307
+ b.lt 3f
3308
+ sub x2, x2, #0x10
3309
+ checkbuf47
3310
+ put_bits x13, x14
3311
+ b 2b
3312
+ 3:
3313
+ clz w20, w20
3314
+ ldrh w3, [x15, #2]!
3315
+ sub w11, w20, #32
3316
+ lsl w3, w3, w20
3317
+ neg w11, w11
3318
+ lsr w3, w3, w20
3319
+ add x2, x11, x2, lsl #4
3320
+ lsl x9, x9, #0x1
3321
+ ldr w12, [x5, x2, lsl #2]
3322
+ ldrb w10, [x4, x2]
3323
+ checkbuf31
3324
+ put_bits x12, x10
3325
+ put_bits x3, x11
3326
+ cbnz x9, 1b
3327
+ b 6f
3328
+ 4:
3329
+ movi v21.8h, #0x0010
3330
+ clz v0.8h, v0.8h
3331
+ clz v1.8h, v1.8h
3332
+ clz v2.8h, v2.8h
3333
+ clz v3.8h, v3.8h
3334
+ clz v4.8h, v4.8h
3335
+ clz v5.8h, v5.8h
3336
+ clz v6.8h, v6.8h
3337
+ clz v7.8h, v7.8h
3338
+ ushl v24.8h, v24.8h, v0.8h
3339
+ ushl v25.8h, v25.8h, v1.8h
3340
+ ushl v26.8h, v26.8h, v2.8h
3341
+ ushl v27.8h, v27.8h, v3.8h
3342
+ ushl v28.8h, v28.8h, v4.8h
3343
+ ushl v29.8h, v29.8h, v5.8h
3344
+ ushl v30.8h, v30.8h, v6.8h
3345
+ ushl v31.8h, v31.8h, v7.8h
3346
+ neg v0.8h, v0.8h
3347
+ neg v1.8h, v1.8h
3348
+ neg v2.8h, v2.8h
3349
+ neg v3.8h, v3.8h
3350
+ neg v4.8h, v4.8h
3351
+ neg v5.8h, v5.8h
3352
+ neg v6.8h, v6.8h
3353
+ neg v7.8h, v7.8h
3354
+ ushl v24.8h, v24.8h, v0.8h
3355
+ ushl v25.8h, v25.8h, v1.8h
3356
+ ushl v26.8h, v26.8h, v2.8h
3357
+ ushl v27.8h, v27.8h, v3.8h
3358
+ ushl v28.8h, v28.8h, v4.8h
3359
+ ushl v29.8h, v29.8h, v5.8h
3360
+ ushl v30.8h, v30.8h, v6.8h
3361
+ ushl v31.8h, v31.8h, v7.8h
3362
+ add v0.8h, v21.8h, v0.8h
3363
+ add v1.8h, v21.8h, v1.8h
3364
+ add v2.8h, v21.8h, v2.8h
3365
+ add v3.8h, v21.8h, v3.8h
3366
+ add v4.8h, v21.8h, v4.8h
3367
+ add v5.8h, v21.8h, v5.8h
3368
+ add v6.8h, v21.8h, v6.8h
3369
+ add v7.8h, v21.8h, v7.8h
3370
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
3371
+ st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
3372
+ st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
3373
+ st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
3374
+ 1:
3375
+ clz x2, x9
3376
+ add x15, x15, x2, lsl #1
3377
+ lsl x9, x9, x2
3378
+ ldrh w11, [x15, #-126]
3379
+ 2:
3380
+ cmp x2, #0x10
3381
+ b.lt 3f
3382
+ sub x2, x2, #0x10
3383
+ checkbuf47
3384
+ put_bits x13, x14
3385
+ b 2b
3386
+ 3:
3387
+ ldrh w3, [x15, #2]!
3388
+ add x2, x11, x2, lsl #4
3389
+ lsl x9, x9, #0x1
3390
+ ldr w12, [x5, x2, lsl #2]
3391
+ ldrb w10, [x4, x2]
3392
+ checkbuf31
3393
+ put_bits x12, x10
3394
+ put_bits x3, x11
3395
+ cbnz x9, 1b
3396
+ 6:
3397
+ add x13, sp, #0x10e
3398
+ cmp x15, x13
3399
+ b.hs 1f
3400
+ ldr w12, [x5]
3401
+ ldrb w14, [x4]
3402
+ checkbuf47
3403
+ put_bits x12, x14
3404
+ 1:
3405
+ str PUT_BUFFER, [x0, #0x10]
3406
+ str PUT_BITSw, [x0, #0x18]
3407
+ ldp x19, x20, [sp], 16
3408
+ add x0, BUFFER, #0x1
3409
+ add sp, sp, 256
3410
+ br x30
3411
+
3412
+ .endm
3413
+
3414
+ /* Expand the encoder macro twice to emit two variants of the      */
+ /* Huffman one-block encoder.  NOTE(review): the .macro header is  */
+ /* above this hunk; the 1/0 flag appears to select between the two */
+ /* coefficient-load sequences guarded by the .if/.endif inside the */
+ /* macro body (presumably a fast-table vs. slow-table variant) --  */
+ /* confirm against the full file.                                  */
+ generate_jsimd_huff_encode_one_block 1
3415
+ generate_jsimd_huff_encode_one_block 0
3416
+
3417
+ /* Release the symbolic register aliases (bound with .req earlier  */
+ /* in the file) so the names are free for any code that follows.   */
+ .unreq BUFFER
3418
+ .unreq PUT_BUFFER
3419
+ .unreq PUT_BITS
3420
+ .unreq PUT_BITSw
3421
+
3422
+ /* Delete the bit-emission helper macros now that both encoder     */
+ /* variants have been expanded; they are local to this section.    */
+ .purgem emit_byte
3423
+ .purgem put_bits
3424
+ .purgem checkbuf31
3425
+ .purgem checkbuf47