epeg 1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only, and it reflects the changes between package versions as they appear in their respective public registries.
Files changed (504)
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/MANIFEST +5 -0
  4. data/TODO +1 -0
  5. data/epeg/.dockerignore +4 -0
  6. data/epeg/.gitignore +5 -0
  7. data/epeg/CMakeLists.txt +30 -0
  8. data/epeg/Dockerfile +23 -0
  9. data/epeg/Epeg.h +90 -0
  10. data/epeg/README.md +42 -0
  11. data/epeg/epeg_main.c +1642 -0
  12. data/epeg/epeg_private.h +85 -0
  13. data/epeg/example/.gitignore +1 -0
  14. data/epeg/example/CMakeLists.txt +20 -0
  15. data/epeg/example/example.jpg +0 -0
  16. data/epeg/example/rotatetest.c +29 -0
  17. data/epeg/example/scaletest.c +48 -0
  18. data/epeg/vendor/libjpeg-turbo-2.0.4/BUILDING.md +828 -0
  19. data/epeg/vendor/libjpeg-turbo-2.0.4/CMakeLists.txt +1420 -0
  20. data/epeg/vendor/libjpeg-turbo-2.0.4/ChangeLog.md +1494 -0
  21. data/epeg/vendor/libjpeg-turbo-2.0.4/LICENSE.md +132 -0
  22. data/epeg/vendor/libjpeg-turbo-2.0.4/README.ijg +277 -0
  23. data/epeg/vendor/libjpeg-turbo-2.0.4/README.md +356 -0
  24. data/epeg/vendor/libjpeg-turbo-2.0.4/cderror.h +137 -0
  25. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.c +145 -0
  26. data/epeg/vendor/libjpeg-turbo-2.0.4/cdjpeg.h +157 -0
  27. data/epeg/vendor/libjpeg-turbo-2.0.4/change.log +315 -0
  28. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.1 +354 -0
  29. data/epeg/vendor/libjpeg-turbo-2.0.4/cjpeg.c +695 -0
  30. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/BuildPackages.cmake +182 -0
  31. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/GNUInstallDirs.cmake +416 -0
  32. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/cmake_uninstall.cmake.in +24 -0
  33. data/epeg/vendor/libjpeg-turbo-2.0.4/cmakescripts/testclean.cmake +41 -0
  34. data/epeg/vendor/libjpeg-turbo-2.0.4/cmyk.h +61 -0
  35. data/epeg/vendor/libjpeg-turbo-2.0.4/coderules.txt +78 -0
  36. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.1 +296 -0
  37. data/epeg/vendor/libjpeg-turbo-2.0.4/djpeg.c +822 -0
  38. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/annotated.html +104 -0
  39. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bc_s.png +0 -0
  40. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/bdwn.png +0 -0
  41. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/classes.html +106 -0
  42. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/closed.png +0 -0
  43. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen-extra.css +3 -0
  44. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.css +1184 -0
  45. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/doxygen.png +0 -0
  46. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/dynsections.js +97 -0
  47. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2blank.png +0 -0
  48. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2cl.png +0 -0
  49. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2doc.png +0 -0
  50. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderclosed.png +0 -0
  51. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2folderopen.png +0 -0
  52. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2lastnode.png +0 -0
  53. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2link.png +0 -0
  54. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mlastnode.png +0 -0
  55. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mnode.png +0 -0
  56. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2mo.png +0 -0
  57. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2node.png +0 -0
  58. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2ns.png +0 -0
  59. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2plastnode.png +0 -0
  60. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2pnode.png +0 -0
  61. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2splitbar.png +0 -0
  62. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/ftv2vertline.png +0 -0
  63. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions.html +134 -0
  64. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/functions_vars.html +134 -0
  65. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/group___turbo_j_p_e_g.html +2775 -0
  66. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/index.html +90 -0
  67. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/jquery.js +8 -0
  68. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/modules.html +95 -0
  69. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_f.png +0 -0
  70. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_g.png +0 -0
  71. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/nav_h.png +0 -0
  72. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/open.png +0 -0
  73. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.html +26 -0
  74. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_63.js +4 -0
  75. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.html +26 -0
  76. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_64.js +5 -0
  77. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.html +26 -0
  78. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_68.js +4 -0
  79. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.html +26 -0
  80. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6e.js +4 -0
  81. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.html +26 -0
  82. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_6f.js +5 -0
  83. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.html +26 -0
  84. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_72.js +4 -0
  85. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.html +26 -0
  86. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_74.js +102 -0
  87. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.html +26 -0
  88. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_77.js +4 -0
  89. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.html +26 -0
  90. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_78.js +4 -0
  91. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.html +26 -0
  92. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/all_79.js +4 -0
  93. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.html +26 -0
  94. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/classes_74.js +6 -0
  95. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/close.png +0 -0
  96. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.html +26 -0
  97. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enums_74.js +8 -0
  98. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.html +26 -0
  99. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/enumvalues_74.js +37 -0
  100. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.html +26 -0
  101. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/functions_74.js +31 -0
  102. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.html +26 -0
  103. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/groups_74.js +4 -0
  104. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/mag_sel.png +0 -0
  105. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/nomatches.html +12 -0
  106. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.css +271 -0
  107. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search.js +809 -0
  108. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_l.png +0 -0
  109. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_m.png +0 -0
  110. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/search_r.png +0 -0
  111. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.html +26 -0
  112. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/typedefs_74.js +5 -0
  113. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.html +26 -0
  114. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_63.js +4 -0
  115. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.html +26 -0
  116. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_64.js +5 -0
  117. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.html +26 -0
  118. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_68.js +4 -0
  119. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.html +26 -0
  120. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6e.js +4 -0
  121. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.html +26 -0
  122. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_6f.js +5 -0
  123. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.html +26 -0
  124. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_72.js +4 -0
  125. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.html +26 -0
  126. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_74.js +10 -0
  127. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.html +26 -0
  128. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_77.js +4 -0
  129. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.html +26 -0
  130. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_78.js +4 -0
  131. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.html +26 -0
  132. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/search/variables_79.js +4 -0
  133. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjregion.html +186 -0
  134. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjscalingfactor.html +148 -0
  135. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/structtjtransform.html +212 -0
  136. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_off.png +0 -0
  137. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/sync_on.png +0 -0
  138. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_a.png +0 -0
  139. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_b.png +0 -0
  140. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_h.png +0 -0
  141. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tab_s.png +0 -0
  142. data/epeg/vendor/libjpeg-turbo-2.0.4/doc/html/tabs.css +60 -0
  143. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen-extra.css +3 -0
  144. data/epeg/vendor/libjpeg-turbo-2.0.4/doxygen.config +16 -0
  145. data/epeg/vendor/libjpeg-turbo-2.0.4/example.txt +464 -0
  146. data/epeg/vendor/libjpeg-turbo-2.0.4/jaricom.c +157 -0
  147. data/epeg/vendor/libjpeg-turbo-2.0.4/java/CMakeLists.txt +88 -0
  148. data/epeg/vendor/libjpeg-turbo-2.0.4/java/MANIFEST.MF +2 -0
  149. data/epeg/vendor/libjpeg-turbo-2.0.4/java/README +52 -0
  150. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJBench.java +1021 -0
  151. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJExample.java +405 -0
  152. data/epeg/vendor/libjpeg-turbo-2.0.4/java/TJUnitTest.java +960 -0
  153. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-frame.html +24 -0
  154. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/allclasses-noframe.html +24 -0
  155. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/constant-values.html +532 -0
  156. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/deprecated-list.html +252 -0
  157. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/help-doc.html +210 -0
  158. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index-all.html +1029 -0
  159. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/index.html +71 -0
  160. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJ.html +1356 -0
  161. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCompressor.html +926 -0
  162. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJCustomFilter.html +241 -0
  163. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJDecompressor.html +1255 -0
  164. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJException.html +340 -0
  165. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJScalingFactor.html +343 -0
  166. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransform.html +751 -0
  167. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/TJTransformer.html +421 -0
  168. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/YUVImage.html +765 -0
  169. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-frame.html +31 -0
  170. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-summary.html +202 -0
  171. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/org/libjpegturbo/turbojpeg/package-tree.html +160 -0
  172. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/overview-tree.html +164 -0
  173. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/package-list +1 -0
  174. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/background.gif +0 -0
  175. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/tab.gif +0 -0
  176. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar.gif +0 -0
  177. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/resources/titlebar_end.gif +0 -0
  178. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/script.js +30 -0
  179. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/serialized-form.html +176 -0
  180. data/epeg/vendor/libjpeg-turbo-2.0.4/java/doc/stylesheet.css +474 -0
  181. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJ.java +584 -0
  182. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCompressor.java +677 -0
  183. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJCustomFilter.java +76 -0
  184. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJDecompressor.java +931 -0
  185. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJException.java +78 -0
  186. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-unix.java.in +59 -0
  187. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJLoader-win.java.in +35 -0
  188. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJScalingFactor.java +115 -0
  189. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransform.java +227 -0
  190. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/TJTransformer.java +163 -0
  191. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org/libjpegturbo/turbojpeg/YUVImage.java +445 -0
  192. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJ.h +129 -0
  193. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJCompressor.h +101 -0
  194. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJDecompressor.h +101 -0
  195. data/epeg/vendor/libjpeg-turbo-2.0.4/java/org_libjpegturbo_turbojpeg_TJTransformer.h +29 -0
  196. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapimin.c +295 -0
  197. data/epeg/vendor/libjpeg-turbo-2.0.4/jcapistd.c +162 -0
  198. data/epeg/vendor/libjpeg-turbo-2.0.4/jcarith.c +932 -0
  199. data/epeg/vendor/libjpeg-turbo-2.0.4/jccoefct.c +449 -0
  200. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolext.c +144 -0
  201. data/epeg/vendor/libjpeg-turbo-2.0.4/jccolor.c +710 -0
  202. data/epeg/vendor/libjpeg-turbo-2.0.4/jcdctmgr.c +721 -0
  203. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.c +1096 -0
  204. data/epeg/vendor/libjpeg-turbo-2.0.4/jchuff.h +42 -0
  205. data/epeg/vendor/libjpeg-turbo-2.0.4/jcicc.c +105 -0
  206. data/epeg/vendor/libjpeg-turbo-2.0.4/jcinit.c +77 -0
  207. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmainct.c +162 -0
  208. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmarker.c +664 -0
  209. data/epeg/vendor/libjpeg-turbo-2.0.4/jcmaster.c +640 -0
  210. data/epeg/vendor/libjpeg-turbo-2.0.4/jcomapi.c +109 -0
  211. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.h.in +73 -0
  212. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfig.txt +143 -0
  213. data/epeg/vendor/libjpeg-turbo-2.0.4/jconfigint.h.in +31 -0
  214. data/epeg/vendor/libjpeg-turbo-2.0.4/jcparam.c +541 -0
  215. data/epeg/vendor/libjpeg-turbo-2.0.4/jcphuff.c +1105 -0
  216. data/epeg/vendor/libjpeg-turbo-2.0.4/jcprepct.c +351 -0
  217. data/epeg/vendor/libjpeg-turbo-2.0.4/jcsample.c +539 -0
  218. data/epeg/vendor/libjpeg-turbo-2.0.4/jcstest.c +126 -0
  219. data/epeg/vendor/libjpeg-turbo-2.0.4/jctrans.c +400 -0
  220. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapimin.c +407 -0
  221. data/epeg/vendor/libjpeg-turbo-2.0.4/jdapistd.c +639 -0
  222. data/epeg/vendor/libjpeg-turbo-2.0.4/jdarith.c +773 -0
  223. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst-tj.c +203 -0
  224. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatadst.c +293 -0
  225. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc-tj.c +194 -0
  226. data/epeg/vendor/libjpeg-turbo-2.0.4/jdatasrc.c +295 -0
  227. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.c +692 -0
  228. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcoefct.h +82 -0
  229. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcol565.c +384 -0
  230. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolext.c +143 -0
  231. data/epeg/vendor/libjpeg-turbo-2.0.4/jdcolor.c +883 -0
  232. data/epeg/vendor/libjpeg-turbo-2.0.4/jdct.h +208 -0
  233. data/epeg/vendor/libjpeg-turbo-2.0.4/jddctmgr.c +352 -0
  234. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.c +831 -0
  235. data/epeg/vendor/libjpeg-turbo-2.0.4/jdhuff.h +238 -0
  236. data/epeg/vendor/libjpeg-turbo-2.0.4/jdicc.c +171 -0
  237. data/epeg/vendor/libjpeg-turbo-2.0.4/jdinput.c +408 -0
  238. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.c +460 -0
  239. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmainct.h +71 -0
  240. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmarker.c +1377 -0
  241. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.c +737 -0
  242. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmaster.h +28 -0
  243. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmerge.c +617 -0
  244. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrg565.c +354 -0
  245. data/epeg/vendor/libjpeg-turbo-2.0.4/jdmrgext.c +184 -0
  246. data/epeg/vendor/libjpeg-turbo-2.0.4/jdphuff.c +687 -0
  247. data/epeg/vendor/libjpeg-turbo-2.0.4/jdpostct.c +294 -0
  248. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.c +518 -0
  249. data/epeg/vendor/libjpeg-turbo-2.0.4/jdsample.h +50 -0
  250. data/epeg/vendor/libjpeg-turbo-2.0.4/jdtrans.c +155 -0
  251. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.c +251 -0
  252. data/epeg/vendor/libjpeg-turbo-2.0.4/jerror.h +316 -0
  253. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctflt.c +169 -0
  254. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctfst.c +227 -0
  255. data/epeg/vendor/libjpeg-turbo-2.0.4/jfdctint.c +288 -0
  256. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctflt.c +240 -0
  257. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctfst.c +371 -0
  258. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctint.c +2627 -0
  259. data/epeg/vendor/libjpeg-turbo-2.0.4/jidctred.c +409 -0
  260. data/epeg/vendor/libjpeg-turbo-2.0.4/jinclude.h +88 -0
  261. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemmgr.c +1179 -0
  262. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemnobs.c +115 -0
  263. data/epeg/vendor/libjpeg-turbo-2.0.4/jmemsys.h +178 -0
  264. data/epeg/vendor/libjpeg-turbo-2.0.4/jmorecfg.h +421 -0
  265. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeg_nbits_table.h +4098 -0
  266. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegcomp.h +31 -0
  267. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegint.h +368 -0
  268. data/epeg/vendor/libjpeg-turbo-2.0.4/jpeglib.h +1132 -0
  269. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.1 +295 -0
  270. data/epeg/vendor/libjpeg-turbo-2.0.4/jpegtran.c +601 -0
  271. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant1.c +859 -0
  272. data/epeg/vendor/libjpeg-turbo-2.0.4/jquant2.c +1285 -0
  273. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd.h +117 -0
  274. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimd_none.c +418 -0
  275. data/epeg/vendor/libjpeg-turbo-2.0.4/jsimddct.h +70 -0
  276. data/epeg/vendor/libjpeg-turbo-2.0.4/jstdhuff.c +143 -0
  277. data/epeg/vendor/libjpeg-turbo-2.0.4/jutils.c +133 -0
  278. data/epeg/vendor/libjpeg-turbo-2.0.4/jversion.h +52 -0
  279. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.map.in +11 -0
  280. data/epeg/vendor/libjpeg-turbo-2.0.4/libjpeg.txt +3144 -0
  281. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/CMakeLists.txt +1 -0
  282. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.c +275 -0
  283. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5.h +57 -0
  284. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5cmp.c +59 -0
  285. data/epeg/vendor/libjpeg-turbo-2.0.4/md5/md5hl.c +125 -0
  286. data/epeg/vendor/libjpeg-turbo-2.0.4/rdbmp.c +689 -0
  287. data/epeg/vendor/libjpeg-turbo-2.0.4/rdcolmap.c +254 -0
  288. data/epeg/vendor/libjpeg-turbo-2.0.4/rdgif.c +39 -0
  289. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.1 +63 -0
  290. data/epeg/vendor/libjpeg-turbo-2.0.4/rdjpgcom.c +510 -0
  291. data/epeg/vendor/libjpeg-turbo-2.0.4/rdppm.c +766 -0
  292. data/epeg/vendor/libjpeg-turbo-2.0.4/rdrle.c +389 -0
  293. data/epeg/vendor/libjpeg-turbo-2.0.4/rdswitch.c +424 -0
  294. data/epeg/vendor/libjpeg-turbo-2.0.4/rdtarga.c +509 -0
  295. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Distribution.xml.in +24 -0
  296. data/epeg/vendor/libjpeg-turbo-2.0.4/release/License.rtf +20 -0
  297. data/epeg/vendor/libjpeg-turbo-2.0.4/release/ReadMe.txt +5 -0
  298. data/epeg/vendor/libjpeg-turbo-2.0.4/release/Welcome.rtf +17 -0
  299. data/epeg/vendor/libjpeg-turbo-2.0.4/release/deb-control.in +31 -0
  300. data/epeg/vendor/libjpeg-turbo-2.0.4/release/installer.nsi.in +191 -0
  301. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libjpeg.pc.in +10 -0
  302. data/epeg/vendor/libjpeg-turbo-2.0.4/release/libturbojpeg.pc.in +10 -0
  303. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makecygwinpkg.in +66 -0
  304. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makedpkg.in +115 -0
  305. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makemacpkg.in +284 -0
  306. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makerpm.in +30 -0
  307. data/epeg/vendor/libjpeg-turbo-2.0.4/release/makesrpm.in +48 -0
  308. data/epeg/vendor/libjpeg-turbo-2.0.4/release/maketarball.in +51 -0
  309. data/epeg/vendor/libjpeg-turbo-2.0.4/release/rpm.spec.in +221 -0
  310. data/epeg/vendor/libjpeg-turbo-2.0.4/release/uninstall.in +113 -0
  311. data/epeg/vendor/libjpeg-turbo-2.0.4/sharedlib/CMakeLists.txt +99 -0
  312. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/CMakeLists.txt +385 -0
  313. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd.c +721 -0
  314. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm/jsimd_neon.S +2878 -0
  315. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd.c +798 -0
  316. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/arm64/jsimd_neon.S +3433 -0
  317. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/gas-preprocessor.in +1 -0
  318. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-avx2.asm +578 -0
  319. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-mmx.asm +476 -0
  320. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolext-sse2.asm +503 -0
  321. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-avx2.asm +121 -0
  322. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-mmx.asm +121 -0
  323. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jccolor-sse2.asm +120 -0
  324. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-avx2.asm +113 -0
  325. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-mmx.asm +113 -0
  326. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgray-sse2.asm +112 -0
  327. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-avx2.asm +457 -0
  328. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-mmx.asm +355 -0
  329. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcgryext-sse2.asm +382 -0
  330. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jchuff-sse2.asm +424 -0
  331. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcphuff-sse2.asm +660 -0
  332. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-avx2.asm +388 -0
  333. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-mmx.asm +324 -0
  334. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jcsample-sse2.asm +351 -0
  335. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-avx2.asm +515 -0
  336. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-mmx.asm +404 -0
  337. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolext-sse2.asm +458 -0
  338. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-avx2.asm +118 -0
  339. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-mmx.asm +117 -0
  340. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdcolor-sse2.asm +117 -0
  341. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-avx2.asm +136 -0
  342. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-mmx.asm +123 -0
  343. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmerge-sse2.asm +135 -0
  344. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-avx2.asm +575 -0
  345. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-mmx.asm +460 -0
  346. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdmrgext-sse2.asm +517 -0
  347. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-avx2.asm +760 -0
  348. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-mmx.asm +731 -0
  349. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jdsample-sse2.asm +724 -0
  350. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-3dn.asm +318 -0
  351. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctflt-sse.asm +369 -0
  352. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-mmx.asm +395 -0
  353. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctfst-sse2.asm +403 -0
  354. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-avx2.asm +331 -0
  355. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-mmx.asm +620 -0
  356. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jfdctint-sse2.asm +633 -0
  357. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-3dn.asm +451 -0
  358. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse.asm +571 -0
  359. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctflt-sse2.asm +497 -0
  360. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-mmx.asm +499 -0
  361. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctfst-sse2.asm +501 -0
  362. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-avx2.asm +453 -0
  363. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-mmx.asm +851 -0
  364. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctint-sse2.asm +858 -0
  365. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-mmx.asm +704 -0
  366. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jidctred-sse2.asm +592 -0
  367. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-3dn.asm +230 -0
  368. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-mmx.asm +276 -0
  369. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquant-sse.asm +208 -0
  370. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquantf-sse2.asm +168 -0
  371. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-avx2.asm +188 -0
  372. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jquanti-sse2.asm +201 -0
  373. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimd.c +1253 -0
  374. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/i386/jsimdcpu.asm +135 -0
  375. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/jsimd.h +1083 -0
  376. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolext-mmi.c +483 -0
  377. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jccolor-mmi.c +148 -0
  378. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample-mmi.c +100 -0
  379. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jcsample.h +28 -0
  380. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolext-mmi.c +424 -0
  381. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdcolor-mmi.c +139 -0
  382. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jdsample-mmi.c +245 -0
  383. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jfdctint-mmi.c +398 -0
  384. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jidctint-mmi.c +571 -0
  385. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jquanti-mmi.c +130 -0
  386. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd.c +610 -0
  387. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/jsimd_mmi.h +57 -0
  388. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/loongson/loongson-mmintrin.h +1324 -0
  389. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd.c +1123 -0
  390. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2.S +4479 -0
  391. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/mips/jsimd_dspr2_asm.h +292 -0
  392. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jcolsamp.inc +135 -0
  393. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jdct.inc +31 -0
  394. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jpeg_nbits_table.inc +4097 -0
  395. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc +93 -0
  396. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdcfg.inc.h +131 -0
  397. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/nasm/jsimdext.inc +479 -0
  398. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolext-altivec.c +269 -0
  399. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jccolor-altivec.c +116 -0
  400. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgray-altivec.c +111 -0
  401. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcgryext-altivec.c +228 -0
  402. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample-altivec.c +159 -0
  403. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jcsample.h +28 -0
  404. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolext-altivec.c +276 -0
  405. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdcolor-altivec.c +106 -0
  406. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmerge-altivec.c +130 -0
  407. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdmrgext-altivec.c +329 -0
  408. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jdsample-altivec.c +400 -0
  409. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctfst-altivec.c +154 -0
  410. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jfdctint-altivec.c +258 -0
  411. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctfst-altivec.c +255 -0
  412. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jidctint-altivec.c +357 -0
  413. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jquanti-altivec.c +250 -0
  414. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd.c +872 -0
  415. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/powerpc/jsimd_altivec.h +98 -0
  416. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-avx2.asm +558 -0
  417. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolext-sse2.asm +483 -0
  418. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-avx2.asm +121 -0
  419. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jccolor-sse2.asm +120 -0
  420. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-avx2.asm +113 -0
  421. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgray-sse2.asm +112 -0
  422. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-avx2.asm +437 -0
  423. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcgryext-sse2.asm +362 -0
  424. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jchuff-sse2.asm +346 -0
  425. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcphuff-sse2.asm +637 -0
  426. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-avx2.asm +366 -0
  427. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jcsample-sse2.asm +329 -0
  428. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-avx2.asm +495 -0
  429. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolext-sse2.asm +438 -0
  430. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-avx2.asm +118 -0
  431. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdcolor-sse2.asm +117 -0
  432. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-avx2.asm +136 -0
  433. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmerge-sse2.asm +135 -0
  434. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-avx2.asm +593 -0
  435. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdmrgext-sse2.asm +535 -0
  436. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-avx2.asm +695 -0
  437. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jdsample-sse2.asm +664 -0
  438. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctflt-sse.asm +355 -0
  439. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctfst-sse2.asm +389 -0
  440. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-avx2.asm +320 -0
  441. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jfdctint-sse2.asm +619 -0
  442. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctflt-sse2.asm +481 -0
  443. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctfst-sse2.asm +490 -0
  444. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-avx2.asm +417 -0
  445. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctint-sse2.asm +846 -0
  446. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jidctred-sse2.asm +573 -0
  447. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquantf-sse2.asm +154 -0
  448. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-avx2.asm +162 -0
  449. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jquanti-sse2.asm +187 -0
  450. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimd.c +1076 -0
  451. data/epeg/vendor/libjpeg-turbo-2.0.4/simd/x86_64/jsimdcpu.asm +86 -0
  452. data/epeg/vendor/libjpeg-turbo-2.0.4/structure.txt +904 -0
  453. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.bmp +0 -0
  454. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/nightshot_iso_100.txt +25 -0
  455. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test.scan +5 -0
  456. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc +0 -0
  457. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test1.icc.txt +20 -0
  458. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc +0 -0
  459. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/test2.icc.txt +20 -0
  460. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgari.jpg +0 -0
  461. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testimgint.jpg +0 -0
  462. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.jpg +0 -0
  463. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig.ppm +4 -0
  464. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/testorig12.jpg +0 -0
  465. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_5674_0098.bmp +0 -0
  466. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6434_0018a.bmp +0 -0
  467. data/epeg/vendor/libjpeg-turbo-2.0.4/testimages/vgl_6548_0026a.bmp +0 -0
  468. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbench.c +1031 -0
  469. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.in +256 -0
  470. data/epeg/vendor/libjpeg-turbo-2.0.4/tjbenchtest.java.in +215 -0
  471. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexample.c +396 -0
  472. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.in +149 -0
  473. data/epeg/vendor/libjpeg-turbo-2.0.4/tjexampletest.java.in +151 -0
  474. data/epeg/vendor/libjpeg-turbo-2.0.4/tjunittest.c +931 -0
  475. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.c +70 -0
  476. data/epeg/vendor/libjpeg-turbo-2.0.4/tjutil.h +47 -0
  477. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.c +1628 -0
  478. data/epeg/vendor/libjpeg-turbo-2.0.4/transupp.h +210 -0
  479. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-jni.c +1246 -0
  480. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile +65 -0
  481. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg-mapfile.jni +101 -0
  482. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.c +2152 -0
  483. data/epeg/vendor/libjpeg-turbo-2.0.4/turbojpeg.h +1744 -0
  484. data/epeg/vendor/libjpeg-turbo-2.0.4/usage.txt +635 -0
  485. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jconfig.h.in +34 -0
  486. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62-memsrcdst.def +108 -0
  487. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg62.def +106 -0
  488. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7-memsrcdst.def +110 -0
  489. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg7.def +108 -0
  490. data/epeg/vendor/libjpeg-turbo-2.0.4/win/jpeg8.def +111 -0
  491. data/epeg/vendor/libjpeg-turbo-2.0.4/wizard.txt +212 -0
  492. data/epeg/vendor/libjpeg-turbo-2.0.4/wrbmp.c +558 -0
  493. data/epeg/vendor/libjpeg-turbo-2.0.4/wrgif.c +413 -0
  494. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.1 +103 -0
  495. data/epeg/vendor/libjpeg-turbo-2.0.4/wrjpgcom.c +591 -0
  496. data/epeg/vendor/libjpeg-turbo-2.0.4/wrppm.c +365 -0
  497. data/epeg/vendor/libjpeg-turbo-2.0.4/wrrle.c +309 -0
  498. data/epeg/vendor/libjpeg-turbo-2.0.4/wrtarga.c +261 -0
  499. data/epeg.c +131 -0
  500. data/epeg.gemspec +18 -0
  501. data/extconf.rb +80 -0
  502. data/test.jpg +0 -0
  503. data/test.rb +42 -0
  504. metadata +546 -0
@@ -0,0 +1,2878 @@
1
+ /*
2
+ * ARMv7 NEON optimizations for libjpeg-turbo
3
+ *
4
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
5
+ * All Rights Reserved.
6
+ * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
7
+ * Copyright (C) 2014, Siarhei Siamashka. All Rights Reserved.
8
+ * Copyright (C) 2014, Linaro Limited. All Rights Reserved.
9
+ * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
10
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois. All Rights Reserved.
11
+ *
12
+ * This software is provided 'as-is', without any express or implied
13
+ * warranty. In no event will the authors be held liable for any damages
14
+ * arising from the use of this software.
15
+ *
16
+ * Permission is granted to anyone to use this software for any purpose,
17
+ * including commercial applications, and to alter it and redistribute it
18
+ * freely, subject to the following restrictions:
19
+ *
20
+ * 1. The origin of this software must not be misrepresented; you must not
21
+ * claim that you wrote the original software. If you use this software
22
+ * in a product, an acknowledgment in the product documentation would be
23
+ * appreciated but is not required.
24
+ * 2. Altered source versions must be plainly marked as such, and must not be
25
+ * misrepresented as being the original software.
26
+ * 3. This notice may not be removed or altered from any source distribution.
27
+ */
28
+
29
+ #if defined(__linux__) && defined(__ELF__)
30
+ .section .note.GNU-stack, "", %progbits /* mark stack as non-executable */
31
+ #endif
32
+
33
+ .text
34
+ .fpu neon
35
+ .arch armv7a
36
+ .object_arch armv4
37
+ .arm
38
+ .syntax unified
39
+
40
+
41
+ #define RESPECT_STRICT_ALIGNMENT 1
42
+
43
+
44
+ /*****************************************************************************/
45
+
46
+ /* Supplementary macro for setting function attributes */
47
+ .macro asm_function fname
48
+ #ifdef __APPLE__
49
+ .private_extern _\fname
50
+ .globl _\fname
51
+ _\fname:
52
+ #else
53
+ .global \fname
54
+ #ifdef __ELF__
55
+ .hidden \fname
56
+ .type \fname, %function
57
+ #endif
58
+ \fname:
59
+ #endif
60
+ .endm
61
+
62
+ /* Transpose a block of 4x4 coefficients in four 64-bit registers */
63
+ .macro transpose_4x4 x0, x1, x2, x3
64
+ vtrn.16 \x0, \x1
65
+ vtrn.16 \x2, \x3
66
+ vtrn.32 \x0, \x2
67
+ vtrn.32 \x1, \x3
68
+ .endm
69
+
70
+
71
+ #define CENTERJSAMPLE 128
72
+
73
+ /*****************************************************************************/
74
+
75
+ /*
76
+ * Perform dequantization and inverse DCT on one block of coefficients.
77
+ *
78
+ * GLOBAL(void)
79
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
80
+ * JSAMPARRAY output_buf, JDIMENSION output_col)
81
+ */
82
+
83
+ #define FIX_0_298631336 (2446)
84
+ #define FIX_0_390180644 (3196)
85
+ #define FIX_0_541196100 (4433)
86
+ #define FIX_0_765366865 (6270)
87
+ #define FIX_0_899976223 (7373)
88
+ #define FIX_1_175875602 (9633)
89
+ #define FIX_1_501321110 (12299)
90
+ #define FIX_1_847759065 (15137)
91
+ #define FIX_1_961570560 (16069)
92
+ #define FIX_2_053119869 (16819)
93
+ #define FIX_2_562915447 (20995)
94
+ #define FIX_3_072711026 (25172)
95
+
96
+ #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
97
+ #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
98
+ #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
99
+ #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
100
+ #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
101
+ #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
102
+ #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
103
+ #define FIX_0_541196100_PLUS_0_765366865 (FIX_0_541196100 + FIX_0_765366865)
104
+
105
+ /*
106
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
107
+ * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
108
+ */
109
+ #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
110
+ DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
111
+ JLONG q1, q2, q3, q4, q5, q6, q7; \
112
+ JLONG tmp11_plus_tmp2, tmp11_minus_tmp2; \
113
+ \
114
+ /* 1-D iDCT input data */ \
115
+ row0 = xrow0; \
116
+ row1 = xrow1; \
117
+ row2 = xrow2; \
118
+ row3 = xrow3; \
119
+ row4 = xrow4; \
120
+ row5 = xrow5; \
121
+ row6 = xrow6; \
122
+ row7 = xrow7; \
123
+ \
124
+ q5 = row7 + row3; \
125
+ q4 = row5 + row1; \
126
+ q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
127
+ MULTIPLY(q4, FIX_1_175875602); \
128
+ q7 = MULTIPLY(q5, FIX_1_175875602) + \
129
+ MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
130
+ q2 = MULTIPLY(row2, FIX_0_541196100) + \
131
+ MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
132
+ q4 = q6; \
133
+ q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
134
+ q6 += MULTIPLY(row5, -FIX_2_562915447) + \
135
+ MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
136
+ /* now we can use q1 (reloadable constants have been used up) */ \
137
+ q1 = q3 + q2; \
138
+ q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
139
+ MULTIPLY(row1, -FIX_0_899976223); \
140
+ q5 = q7; \
141
+ q1 = q1 + q6; \
142
+ q7 += MULTIPLY(row7, -FIX_0_899976223) + \
143
+ MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
144
+ \
145
+ /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
146
+ tmp11_plus_tmp2 = q1; \
147
+ row1 = 0; \
148
+ \
149
+ q1 = q1 - q6; \
150
+ q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
151
+ MULTIPLY(row3, -FIX_2_562915447); \
152
+ q1 = q1 - q6; \
153
+ q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
154
+ MULTIPLY(row6, FIX_0_541196100); \
155
+ q3 = q3 - q2; \
156
+ \
157
+ /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
158
+ tmp11_minus_tmp2 = q1; \
159
+ \
160
+ q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
161
+ q2 = q1 + q6; \
162
+ q1 = q1 - q6; \
163
+ \
164
+ /* pick up the results */ \
165
+ tmp0 = q4; \
166
+ tmp1 = q5; \
167
+ tmp2 = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
168
+ tmp3 = q7; \
169
+ tmp10 = q2; \
170
+ tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
171
+ tmp12 = q3; \
172
+ tmp13 = q1; \
173
+ }
174
+
175
+ #define XFIX_0_899976223 d0[0]
176
+ #define XFIX_0_541196100 d0[1]
177
+ #define XFIX_2_562915447 d0[2]
178
+ #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
179
+ #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
180
+ #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
181
+ #define XFIX_0_541196100_PLUS_0_765366865 d1[2]
182
+ #define XFIX_1_175875602 d1[3]
183
+ #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
184
+ #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
185
+ #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
186
+ #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
187
+
188
+ .balign 16
189
+ jsimd_idct_islow_neon_consts:
190
+ .short FIX_0_899976223 /* d0[0] */
191
+ .short FIX_0_541196100 /* d0[1] */
192
+ .short FIX_2_562915447 /* d0[2] */
193
+ .short FIX_0_298631336_MINUS_0_899976223 /* d0[3] */
194
+ .short FIX_1_501321110_MINUS_0_899976223 /* d1[0] */
195
+ .short FIX_2_053119869_MINUS_2_562915447 /* d1[1] */
196
+ .short FIX_0_541196100_PLUS_0_765366865 /* d1[2] */
197
+ .short FIX_1_175875602 /* d1[3] */
198
+ /* reloadable constants */
199
+ .short FIX_1_175875602_MINUS_0_390180644 /* d2[0] */
200
+ .short FIX_0_541196100_MINUS_1_847759065 /* d2[1] */
201
+ .short FIX_3_072711026_MINUS_2_562915447 /* d2[2] */
202
+ .short FIX_1_175875602_MINUS_1_961570560 /* d2[3] */
203
+
204
+ asm_function jsimd_idct_islow_neon
205
+
206
+ DCT_TABLE .req r0
207
+ COEF_BLOCK .req r1
208
+ OUTPUT_BUF .req r2
209
+ OUTPUT_COL .req r3
210
+ TMP1 .req r0
211
+ TMP2 .req r1
212
+ TMP3 .req r2
213
+ TMP4 .req ip
214
+
215
+ ROW0L .req d16
216
+ ROW0R .req d17
217
+ ROW1L .req d18
218
+ ROW1R .req d19
219
+ ROW2L .req d20
220
+ ROW2R .req d21
221
+ ROW3L .req d22
222
+ ROW3R .req d23
223
+ ROW4L .req d24
224
+ ROW4R .req d25
225
+ ROW5L .req d26
226
+ ROW5R .req d27
227
+ ROW6L .req d28
228
+ ROW6R .req d29
229
+ ROW7L .req d30
230
+ ROW7R .req d31
231
+
232
+ /* Load and dequantize coefficients into NEON registers
233
+ * with the following allocation:
234
+ * 0 1 2 3 | 4 5 6 7
235
+ * ---------+--------
236
+ * 0 | d16 | d17 ( q8 )
237
+ * 1 | d18 | d19 ( q9 )
238
+ * 2 | d20 | d21 ( q10 )
239
+ * 3 | d22 | d23 ( q11 )
240
+ * 4 | d24 | d25 ( q12 )
241
+ * 5 | d26 | d27 ( q13 )
242
+ * 6 | d28 | d29 ( q14 )
243
+ * 7 | d30 | d31 ( q15 )
244
+ */
245
+ adr ip, jsimd_idct_islow_neon_consts
246
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
247
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
248
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
249
+ vmul.s16 q8, q8, q0
250
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
251
+ vmul.s16 q9, q9, q1
252
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
253
+ vmul.s16 q10, q10, q2
254
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
255
+ vmul.s16 q11, q11, q3
256
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
257
+ vmul.s16 q12, q12, q0
258
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
259
+ vmul.s16 q14, q14, q2
260
+ vmul.s16 q13, q13, q1
261
+ vld1.16 {d0, d1, d2, d3}, [ip, :128] /* load constants */
262
+ add ip, ip, #16
263
+ vmul.s16 q15, q15, q3
264
+ vpush {d8-d15} /* save NEON registers */
265
+ /* 1-D IDCT, pass 1, left 4x8 half */
266
+ vadd.s16 d4, ROW7L, ROW3L
267
+ vadd.s16 d5, ROW5L, ROW1L
268
+ vmull.s16 q6, d4, XFIX_1_175875602_MINUS_1_961570560
269
+ vmlal.s16 q6, d5, XFIX_1_175875602
270
+ vmull.s16 q7, d4, XFIX_1_175875602
271
+ /* Check for the zero coefficients in the right 4x8 half */
272
+ push {r4, r5}
273
+ vmlal.s16 q7, d5, XFIX_1_175875602_MINUS_0_390180644
274
+ vsubl.s16 q3, ROW0L, ROW4L
275
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
276
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
277
+ vmlal.s16 q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
278
+ orr r0, r4, r5
279
+ vmov q4, q6
280
+ vmlsl.s16 q6, ROW5L, XFIX_2_562915447
281
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
282
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
283
+ vshl.s32 q3, q3, #13
284
+ orr r0, r0, r4
285
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
286
+ orr r0, r0, r5
287
+ vadd.s32 q1, q3, q2
288
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
289
+ vmov q5, q7
290
+ vadd.s32 q1, q1, q6
291
+ orr r0, r0, r4
292
+ vmlsl.s16 q7, ROW7L, XFIX_0_899976223
293
+ orr r0, r0, r5
294
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
295
+ vrshrn.s32 ROW1L, q1, #11
296
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
297
+ vsub.s32 q1, q1, q6
298
+ vmlal.s16 q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
299
+ orr r0, r0, r4
300
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
301
+ orr r0, r0, r5
302
+ vsub.s32 q1, q1, q6
303
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
304
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
305
+ vmlal.s16 q6, ROW6L, XFIX_0_541196100
306
+ vsub.s32 q3, q3, q2
307
+ orr r0, r0, r4
308
+ vrshrn.s32 ROW6L, q1, #11
309
+ orr r0, r0, r5
310
+ vadd.s32 q1, q3, q5
311
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
312
+ vsub.s32 q3, q3, q5
313
+ vaddl.s16 q5, ROW0L, ROW4L
314
+ orr r0, r0, r4
315
+ vrshrn.s32 ROW2L, q1, #11
316
+ orr r0, r0, r5
317
+ vrshrn.s32 ROW5L, q3, #11
318
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
319
+ vshl.s32 q5, q5, #13
320
+ vmlal.s16 q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
321
+ orr r0, r0, r4
322
+ vadd.s32 q2, q5, q6
323
+ orrs r0, r0, r5
324
+ vsub.s32 q1, q5, q6
325
+ vadd.s32 q6, q2, q7
326
+ ldrd r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
327
+ vsub.s32 q2, q2, q7
328
+ vadd.s32 q5, q1, q4
329
+ orr r0, r4, r5
330
+ vsub.s32 q3, q1, q4
331
+ pop {r4, r5}
332
+ vrshrn.s32 ROW7L, q2, #11
333
+ vrshrn.s32 ROW3L, q5, #11
334
+ vrshrn.s32 ROW0L, q6, #11
335
+ vrshrn.s32 ROW4L, q3, #11
336
+
337
+ beq 3f /* Go to do some special handling for the sparse
338
+ right 4x8 half */
339
+
340
+ /* 1-D IDCT, pass 1, right 4x8 half */
341
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
342
+ vadd.s16 d10, ROW7R, ROW3R
343
+ vadd.s16 d8, ROW5R, ROW1R
344
+ /* Transpose left 4x8 half */
345
+ vtrn.16 ROW6L, ROW7L
346
+ vmull.s16 q6, d10, XFIX_1_175875602_MINUS_1_961570560
347
+ vmlal.s16 q6, d8, XFIX_1_175875602
348
+ vtrn.16 ROW2L, ROW3L
349
+ vmull.s16 q7, d10, XFIX_1_175875602
350
+ vmlal.s16 q7, d8, XFIX_1_175875602_MINUS_0_390180644
351
+ vtrn.16 ROW0L, ROW1L
352
+ vsubl.s16 q3, ROW0R, ROW4R
353
+ vmull.s16 q2, ROW2R, XFIX_0_541196100
354
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
355
+ vtrn.16 ROW4L, ROW5L
356
+ vmov q4, q6
357
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
358
+ vmlal.s16 q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
359
+ vtrn.32 ROW1L, ROW3L
360
+ vshl.s32 q3, q3, #13
361
+ vmlsl.s16 q4, ROW1R, XFIX_0_899976223
362
+ vtrn.32 ROW4L, ROW6L
363
+ vadd.s32 q1, q3, q2
364
+ vmov q5, q7
365
+ vadd.s32 q1, q1, q6
366
+ vtrn.32 ROW0L, ROW2L
367
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
368
+ vmlal.s16 q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
369
+ vrshrn.s32 ROW1R, q1, #11
370
+ vtrn.32 ROW5L, ROW7L
371
+ vsub.s32 q1, q1, q6
372
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
373
+ vmlsl.s16 q5, ROW3R, XFIX_2_562915447
374
+ vsub.s32 q1, q1, q6
375
+ vmull.s16 q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
376
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
377
+ vsub.s32 q3, q3, q2
378
+ vrshrn.s32 ROW6R, q1, #11
379
+ vadd.s32 q1, q3, q5
380
+ vsub.s32 q3, q3, q5
381
+ vaddl.s16 q5, ROW0R, ROW4R
382
+ vrshrn.s32 ROW2R, q1, #11
383
+ vrshrn.s32 ROW5R, q3, #11
384
+ vshl.s32 q5, q5, #13
385
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
386
+ vadd.s32 q2, q5, q6
387
+ vsub.s32 q1, q5, q6
388
+ vadd.s32 q6, q2, q7
389
+ vsub.s32 q2, q2, q7
390
+ vadd.s32 q5, q1, q4
391
+ vsub.s32 q3, q1, q4
392
+ vrshrn.s32 ROW7R, q2, #11
393
+ vrshrn.s32 ROW3R, q5, #11
394
+ vrshrn.s32 ROW0R, q6, #11
395
+ vrshrn.s32 ROW4R, q3, #11
396
+ /* Transpose right 4x8 half */
397
+ vtrn.16 ROW6R, ROW7R
398
+ vtrn.16 ROW2R, ROW3R
399
+ vtrn.16 ROW0R, ROW1R
400
+ vtrn.16 ROW4R, ROW5R
401
+ vtrn.32 ROW1R, ROW3R
402
+ vtrn.32 ROW4R, ROW6R
403
+ vtrn.32 ROW0R, ROW2R
404
+ vtrn.32 ROW5R, ROW7R
405
+
406
+ 1: /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
407
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
408
+ vmull.s16 q6, ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
409
+ vmlal.s16 q6, ROW1L, XFIX_1_175875602
410
+ vmlal.s16 q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
411
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
412
+ vmull.s16 q7, ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
413
+ vmlal.s16 q7, ROW3L, XFIX_1_175875602
414
+ vmlal.s16 q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
415
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
416
+ vsubl.s16 q3, ROW0L, ROW0R /* ROW4L <-> ROW0R */
417
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
418
+ vmlal.s16 q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
419
+ vmov q4, q6
420
+ vmlsl.s16 q6, ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
421
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
422
+ vshl.s32 q3, q3, #13
423
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
424
+ vadd.s32 q1, q3, q2
425
+ vmov q5, q7
426
+ vadd.s32 q1, q1, q6
427
+ vmlsl.s16 q7, ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
428
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
429
+ vshrn.s32 ROW1L, q1, #16
430
+ vsub.s32 q1, q1, q6
431
+ vmlal.s16 q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
432
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
433
+ vsub.s32 q1, q1, q6
434
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
435
+ vmlal.s16 q6, ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
436
+ vsub.s32 q3, q3, q2
437
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
438
+ vadd.s32 q1, q3, q5
439
+ vsub.s32 q3, q3, q5
440
+ vaddl.s16 q5, ROW0L, ROW0R /* ROW4L <-> ROW0R */
441
+ vshrn.s32 ROW2L, q1, #16
442
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
443
+ vshl.s32 q5, q5, #13
444
+ vmlal.s16 q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
445
+ vadd.s32 q2, q5, q6
446
+ vsub.s32 q1, q5, q6
447
+ vadd.s32 q6, q2, q7
448
+ vsub.s32 q2, q2, q7
449
+ vadd.s32 q5, q1, q4
450
+ vsub.s32 q3, q1, q4
451
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
452
+ vshrn.s32 ROW3L, q5, #16
453
+ vshrn.s32 ROW0L, q6, #16
454
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
455
+ /* 1-D IDCT, pass 2, right 4x8 half */
456
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
457
+ vmull.s16 q6, ROW5R, XFIX_1_175875602
458
+ vmlal.s16 q6, ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
459
+ vmlal.s16 q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
460
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
461
+ vmull.s16 q7, ROW7R, XFIX_1_175875602
462
+ vmlal.s16 q7, ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
463
+ vmlal.s16 q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
464
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
465
+ vsubl.s16 q3, ROW4L, ROW4R /* ROW4L <-> ROW0R */
466
+ vmull.s16 q2, ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
467
+ vmlal.s16 q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
468
+ vmov q4, q6
469
+ vmlsl.s16 q6, ROW5R, XFIX_2_562915447
470
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
471
+ vshl.s32 q3, q3, #13
472
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
473
+ vadd.s32 q1, q3, q2
474
+ vmov q5, q7
475
+ vadd.s32 q1, q1, q6
476
+ vmlsl.s16 q7, ROW7R, XFIX_0_899976223
477
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
478
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
479
+ vsub.s32 q1, q1, q6
480
+ vmlal.s16 q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
481
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
482
+ vsub.s32 q1, q1, q6
483
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
484
+ vmlal.s16 q6, ROW6R, XFIX_0_541196100
485
+ vsub.s32 q3, q3, q2
486
+ vshrn.s32 ROW6R, q1, #16
487
+ vadd.s32 q1, q3, q5
488
+ vsub.s32 q3, q3, q5
489
+ vaddl.s16 q5, ROW4L, ROW4R /* ROW4L <-> ROW0R */
490
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
491
+ vshrn.s32 ROW5R, q3, #16
492
+ vshl.s32 q5, q5, #13
493
+ vmlal.s16 q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
494
+ vadd.s32 q2, q5, q6
495
+ vsub.s32 q1, q5, q6
496
+ vadd.s32 q6, q2, q7
497
+ vsub.s32 q2, q2, q7
498
+ vadd.s32 q5, q1, q4
499
+ vsub.s32 q3, q1, q4
500
+ vshrn.s32 ROW7R, q2, #16
501
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
502
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
503
+ vshrn.s32 ROW4R, q3, #16
504
+
505
+ 2: /* Descale to 8-bit and range limit */
506
+ vqrshrn.s16 d16, q8, #2
507
+ vqrshrn.s16 d17, q9, #2
508
+ vqrshrn.s16 d18, q10, #2
509
+ vqrshrn.s16 d19, q11, #2
510
+ vpop {d8-d15} /* restore NEON registers */
511
+ vqrshrn.s16 d20, q12, #2
512
+ /* Transpose the final 8-bit samples and do signed->unsigned conversion */
513
+ vtrn.16 q8, q9
514
+ vqrshrn.s16 d21, q13, #2
515
+ vqrshrn.s16 d22, q14, #2
516
+ vmov.u8 q0, #(CENTERJSAMPLE)
517
+ vqrshrn.s16 d23, q15, #2
518
+ vtrn.8 d16, d17
519
+ vtrn.8 d18, d19
520
+ vadd.u8 q8, q8, q0
521
+ vadd.u8 q9, q9, q0
522
+ vtrn.16 q10, q11
523
+ /* Store results to the output buffer */
524
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
525
+ add TMP1, TMP1, OUTPUT_COL
526
+ add TMP2, TMP2, OUTPUT_COL
527
+ vst1.8 {d16}, [TMP1]
528
+ vtrn.8 d20, d21
529
+ vst1.8 {d17}, [TMP2]
530
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
531
+ add TMP1, TMP1, OUTPUT_COL
532
+ add TMP2, TMP2, OUTPUT_COL
533
+ vst1.8 {d18}, [TMP1]
534
+ vadd.u8 q10, q10, q0
535
+ vst1.8 {d19}, [TMP2]
536
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
537
+ add TMP1, TMP1, OUTPUT_COL
538
+ add TMP2, TMP2, OUTPUT_COL
539
+ add TMP3, TMP3, OUTPUT_COL
540
+ add TMP4, TMP4, OUTPUT_COL
541
+ vtrn.8 d22, d23
542
+ vst1.8 {d20}, [TMP1]
543
+ vadd.u8 q11, q11, q0
544
+ vst1.8 {d21}, [TMP2]
545
+ vst1.8 {d22}, [TMP3]
546
+ vst1.8 {d23}, [TMP4]
547
+ bx lr
548
+
549
+ 3: /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
550
+
551
+ /* Transpose left 4x8 half */
552
+ vtrn.16 ROW6L, ROW7L
553
+ vtrn.16 ROW2L, ROW3L
554
+ vtrn.16 ROW0L, ROW1L
555
+ vtrn.16 ROW4L, ROW5L
556
+ vshl.s16 ROW0R, ROW0R, #2 /* PASS1_BITS */
557
+ vtrn.32 ROW1L, ROW3L
558
+ vtrn.32 ROW4L, ROW6L
559
+ vtrn.32 ROW0L, ROW2L
560
+ vtrn.32 ROW5L, ROW7L
561
+
562
+ cmp r0, #0
563
+ beq 4f /* Right 4x8 half has all zeros, go to 'sparse' second
564
+ pass */
565
+
566
+ /* Only row 0 is non-zero for the right 4x8 half */
567
+ vdup.s16 ROW1R, ROW0R[1]
568
+ vdup.s16 ROW2R, ROW0R[2]
569
+ vdup.s16 ROW3R, ROW0R[3]
570
+ vdup.s16 ROW4R, ROW0R[0]
571
+ vdup.s16 ROW5R, ROW0R[1]
572
+ vdup.s16 ROW6R, ROW0R[2]
573
+ vdup.s16 ROW7R, ROW0R[3]
574
+ vdup.s16 ROW0R, ROW0R[0]
575
+ b 1b /* Go to 'normal' second pass */
576
+
577
+ 4: /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
578
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
579
+ vmull.s16 q6, ROW1L, XFIX_1_175875602
580
+ vmlal.s16 q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
581
+ vmull.s16 q7, ROW3L, XFIX_1_175875602
582
+ vmlal.s16 q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
583
+ vmull.s16 q2, ROW2L, XFIX_0_541196100
584
+ vshll.s16 q3, ROW0L, #13
585
+ vmov q4, q6
586
+ vmlal.s16 q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
587
+ vmlsl.s16 q4, ROW1L, XFIX_0_899976223
588
+ vadd.s32 q1, q3, q2
589
+ vmov q5, q7
590
+ vmlal.s16 q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
591
+ vadd.s32 q1, q1, q6
592
+ vadd.s32 q6, q6, q6
593
+ vmlsl.s16 q5, ROW3L, XFIX_2_562915447
594
+ vshrn.s32 ROW1L, q1, #16
595
+ vsub.s32 q1, q1, q6
596
+ vmull.s16 q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
597
+ vsub.s32 q3, q3, q2
598
+ vshrn.s32 ROW2R, q1, #16 /* ROW6L <-> ROW2R */
599
+ vadd.s32 q1, q3, q5
600
+ vsub.s32 q3, q3, q5
601
+ vshll.s16 q5, ROW0L, #13
602
+ vshrn.s32 ROW2L, q1, #16
603
+ vshrn.s32 ROW1R, q3, #16 /* ROW5L <-> ROW1R */
604
+ vadd.s32 q2, q5, q6
605
+ vsub.s32 q1, q5, q6
606
+ vadd.s32 q6, q2, q7
607
+ vsub.s32 q2, q2, q7
608
+ vadd.s32 q5, q1, q4
609
+ vsub.s32 q3, q1, q4
610
+ vshrn.s32 ROW3R, q2, #16 /* ROW7L <-> ROW3R */
611
+ vshrn.s32 ROW3L, q5, #16
612
+ vshrn.s32 ROW0L, q6, #16
613
+ vshrn.s32 ROW0R, q3, #16 /* ROW4L <-> ROW0R */
614
+ /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
615
+ vld1.s16 {d2}, [ip, :64] /* reload constants */
616
+ vmull.s16 q6, ROW5L, XFIX_1_175875602
617
+ vmlal.s16 q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
618
+ vmull.s16 q7, ROW7L, XFIX_1_175875602
619
+ vmlal.s16 q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
620
+ vmull.s16 q2, ROW6L, XFIX_0_541196100
621
+ vshll.s16 q3, ROW4L, #13
622
+ vmov q4, q6
623
+ vmlal.s16 q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
624
+ vmlsl.s16 q4, ROW5L, XFIX_0_899976223
625
+ vadd.s32 q1, q3, q2
626
+ vmov q5, q7
627
+ vmlal.s16 q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
628
+ vadd.s32 q1, q1, q6
629
+ vadd.s32 q6, q6, q6
630
+ vmlsl.s16 q5, ROW7L, XFIX_2_562915447
631
+ vshrn.s32 ROW5L, q1, #16 /* ROW5L <-> ROW1R */
632
+ vsub.s32 q1, q1, q6
633
+ vmull.s16 q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
634
+ vsub.s32 q3, q3, q2
635
+ vshrn.s32 ROW6R, q1, #16
636
+ vadd.s32 q1, q3, q5
637
+ vsub.s32 q3, q3, q5
638
+ vshll.s16 q5, ROW4L, #13
639
+ vshrn.s32 ROW6L, q1, #16 /* ROW6L <-> ROW2R */
640
+ vshrn.s32 ROW5R, q3, #16
641
+ vadd.s32 q2, q5, q6
642
+ vsub.s32 q1, q5, q6
643
+ vadd.s32 q6, q2, q7
644
+ vsub.s32 q2, q2, q7
645
+ vadd.s32 q5, q1, q4
646
+ vsub.s32 q3, q1, q4
647
+ vshrn.s32 ROW7R, q2, #16
648
+ vshrn.s32 ROW7L, q5, #16 /* ROW7L <-> ROW3R */
649
+ vshrn.s32 ROW4L, q6, #16 /* ROW4L <-> ROW0R */
650
+ vshrn.s32 ROW4R, q3, #16
651
+ b 2b /* Go to epilogue */
652
+
653
+ .unreq DCT_TABLE
654
+ .unreq COEF_BLOCK
655
+ .unreq OUTPUT_BUF
656
+ .unreq OUTPUT_COL
657
+ .unreq TMP1
658
+ .unreq TMP2
659
+ .unreq TMP3
660
+ .unreq TMP4
661
+
662
+ .unreq ROW0L
663
+ .unreq ROW0R
664
+ .unreq ROW1L
665
+ .unreq ROW1R
666
+ .unreq ROW2L
667
+ .unreq ROW2R
668
+ .unreq ROW3L
669
+ .unreq ROW3R
670
+ .unreq ROW4L
671
+ .unreq ROW4R
672
+ .unreq ROW5L
673
+ .unreq ROW5R
674
+ .unreq ROW6L
675
+ .unreq ROW6R
676
+ .unreq ROW7L
677
+ .unreq ROW7R
678
+
679
+
680
+ /*****************************************************************************/
681
+
682
+ /*
683
+ * jsimd_idct_ifast_neon
684
+ *
685
+ * This function contains a fast, not so accurate integer implementation of
686
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
687
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
688
+ * function from jidctfst.c
689
+ *
690
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
691
+ * But in ARM NEON case some extra additions are required because VQDMULH
692
+ * instruction can't handle the constants larger than 1. So the expressions
693
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
694
+ * which introduces an extra addition. Overall, there are 6 extra additions
695
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
696
+ */
697
+
698
+ #define XFIX_1_082392200 d0[0]
699
+ #define XFIX_1_414213562 d0[1]
700
+ #define XFIX_1_847759065 d0[2]
701
+ #define XFIX_2_613125930 d0[3]
702
+
703
+ .balign 16
704
+ jsimd_idct_ifast_neon_consts:
705
+ .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
706
+ .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
707
+ .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
708
+ .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
709
+
710
+ asm_function jsimd_idct_ifast_neon
711
+
712
+ DCT_TABLE .req r0
713
+ COEF_BLOCK .req r1
714
+ OUTPUT_BUF .req r2
715
+ OUTPUT_COL .req r3
716
+ TMP1 .req r0
717
+ TMP2 .req r1
718
+ TMP3 .req r2
719
+ TMP4 .req ip
720
+
721
+ /* Load and dequantize coefficients into NEON registers
722
+ * with the following allocation:
723
+ * 0 1 2 3 | 4 5 6 7
724
+ * ---------+--------
725
+ * 0 | d16 | d17 ( q8 )
726
+ * 1 | d18 | d19 ( q9 )
727
+ * 2 | d20 | d21 ( q10 )
728
+ * 3 | d22 | d23 ( q11 )
729
+ * 4 | d24 | d25 ( q12 )
730
+ * 5 | d26 | d27 ( q13 )
731
+ * 6 | d28 | d29 ( q14 )
732
+ * 7 | d30 | d31 ( q15 )
733
+ */
734
+ adr ip, jsimd_idct_ifast_neon_consts
735
+ vld1.16 {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
736
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
737
+ vld1.16 {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
738
+ vmul.s16 q8, q8, q0
739
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
740
+ vmul.s16 q9, q9, q1
741
+ vld1.16 {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
742
+ vmul.s16 q10, q10, q2
743
+ vld1.16 {d0, d1, d2, d3}, [DCT_TABLE, :128]!
744
+ vmul.s16 q11, q11, q3
745
+ vld1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]
746
+ vmul.s16 q12, q12, q0
747
+ vld1.16 {d4, d5, d6, d7}, [DCT_TABLE, :128]!
748
+ vmul.s16 q14, q14, q2
749
+ vmul.s16 q13, q13, q1
750
+ vld1.16 {d0}, [ip, :64] /* load constants */
751
+ vmul.s16 q15, q15, q3
752
+ vpush {d8-d13} /* save NEON registers */
753
+ /* 1-D IDCT, pass 1 */
754
+ vsub.s16 q2, q10, q14
755
+ vadd.s16 q14, q10, q14
756
+ vsub.s16 q1, q11, q13
757
+ vadd.s16 q13, q11, q13
758
+ vsub.s16 q5, q9, q15
759
+ vadd.s16 q15, q9, q15
760
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
761
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
762
+ vadd.s16 q3, q1, q1
763
+ vsub.s16 q1, q5, q1
764
+ vadd.s16 q10, q2, q4
765
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
766
+ vsub.s16 q2, q15, q13
767
+ vadd.s16 q3, q3, q6
768
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
769
+ vadd.s16 q1, q1, q4
770
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
771
+ vsub.s16 q10, q10, q14
772
+ vadd.s16 q2, q2, q6
773
+ vsub.s16 q6, q8, q12
774
+ vadd.s16 q12, q8, q12
775
+ vadd.s16 q9, q5, q4
776
+ vadd.s16 q5, q6, q10
777
+ vsub.s16 q10, q6, q10
778
+ vadd.s16 q6, q15, q13
779
+ vadd.s16 q8, q12, q14
780
+ vsub.s16 q3, q6, q3
781
+ vsub.s16 q12, q12, q14
782
+ vsub.s16 q3, q3, q1
783
+ vsub.s16 q1, q9, q1
784
+ vadd.s16 q2, q3, q2
785
+ vsub.s16 q15, q8, q6
786
+ vadd.s16 q1, q1, q2
787
+ vadd.s16 q8, q8, q6
788
+ vadd.s16 q14, q5, q3
789
+ vsub.s16 q9, q5, q3
790
+ vsub.s16 q13, q10, q2
791
+ vadd.s16 q10, q10, q2
792
+ /* Transpose */
793
+ vtrn.16 q8, q9
794
+ vsub.s16 q11, q12, q1
795
+ vtrn.16 q14, q15
796
+ vadd.s16 q12, q12, q1
797
+ vtrn.16 q10, q11
798
+ vtrn.16 q12, q13
799
+ vtrn.32 q9, q11
800
+ vtrn.32 q12, q14
801
+ vtrn.32 q8, q10
802
+ vtrn.32 q13, q15
803
+ vswp d28, d21
804
+ vswp d26, d19
805
+ /* 1-D IDCT, pass 2 */
806
+ vsub.s16 q2, q10, q14
807
+ vswp d30, d23
808
+ vadd.s16 q14, q10, q14
809
+ vswp d24, d17
810
+ vsub.s16 q1, q11, q13
811
+ vadd.s16 q13, q11, q13
812
+ vsub.s16 q5, q9, q15
813
+ vadd.s16 q15, q9, q15
814
+ vqdmulh.s16 q4, q2, XFIX_1_414213562
815
+ vqdmulh.s16 q6, q1, XFIX_2_613125930
816
+ vadd.s16 q3, q1, q1
817
+ vsub.s16 q1, q5, q1
818
+ vadd.s16 q10, q2, q4
819
+ vqdmulh.s16 q4, q1, XFIX_1_847759065
820
+ vsub.s16 q2, q15, q13
821
+ vadd.s16 q3, q3, q6
822
+ vqdmulh.s16 q6, q2, XFIX_1_414213562
823
+ vadd.s16 q1, q1, q4
824
+ vqdmulh.s16 q4, q5, XFIX_1_082392200
825
+ vsub.s16 q10, q10, q14
826
+ vadd.s16 q2, q2, q6
827
+ vsub.s16 q6, q8, q12
828
+ vadd.s16 q12, q8, q12
829
+ vadd.s16 q9, q5, q4
830
+ vadd.s16 q5, q6, q10
831
+ vsub.s16 q10, q6, q10
832
+ vadd.s16 q6, q15, q13
833
+ vadd.s16 q8, q12, q14
834
+ vsub.s16 q3, q6, q3
835
+ vsub.s16 q12, q12, q14
836
+ vsub.s16 q3, q3, q1
837
+ vsub.s16 q1, q9, q1
838
+ vadd.s16 q2, q3, q2
839
+ vsub.s16 q15, q8, q6
840
+ vadd.s16 q1, q1, q2
841
+ vadd.s16 q8, q8, q6
842
+ vadd.s16 q14, q5, q3
843
+ vsub.s16 q9, q5, q3
844
+ vsub.s16 q13, q10, q2
845
+ vpop {d8-d13} /* restore NEON registers */
846
+ vadd.s16 q10, q10, q2
847
+ vsub.s16 q11, q12, q1
848
+ vadd.s16 q12, q12, q1
849
+ /* Descale to 8-bit and range limit */
850
+ vmov.u8 q0, #0x80
851
+ vqshrn.s16 d16, q8, #5
852
+ vqshrn.s16 d17, q9, #5
853
+ vqshrn.s16 d18, q10, #5
854
+ vqshrn.s16 d19, q11, #5
855
+ vqshrn.s16 d20, q12, #5
856
+ vqshrn.s16 d21, q13, #5
857
+ vqshrn.s16 d22, q14, #5
858
+ vqshrn.s16 d23, q15, #5
859
+ vadd.u8 q8, q8, q0
860
+ vadd.u8 q9, q9, q0
861
+ vadd.u8 q10, q10, q0
862
+ vadd.u8 q11, q11, q0
863
+ /* Transpose the final 8-bit samples */
864
+ vtrn.16 q8, q9
865
+ vtrn.16 q10, q11
866
+ vtrn.32 q8, q10
867
+ vtrn.32 q9, q11
868
+ vtrn.8 d16, d17
869
+ vtrn.8 d18, d19
870
+ /* Store results to the output buffer */
871
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
872
+ add TMP1, TMP1, OUTPUT_COL
873
+ add TMP2, TMP2, OUTPUT_COL
874
+ vst1.8 {d16}, [TMP1]
875
+ vst1.8 {d17}, [TMP2]
876
+ ldmia OUTPUT_BUF!, {TMP1, TMP2}
877
+ add TMP1, TMP1, OUTPUT_COL
878
+ add TMP2, TMP2, OUTPUT_COL
879
+ vst1.8 {d18}, [TMP1]
880
+ vtrn.8 d20, d21
881
+ vst1.8 {d19}, [TMP2]
882
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
883
+ add TMP1, TMP1, OUTPUT_COL
884
+ add TMP2, TMP2, OUTPUT_COL
885
+ add TMP3, TMP3, OUTPUT_COL
886
+ add TMP4, TMP4, OUTPUT_COL
887
+ vst1.8 {d20}, [TMP1]
888
+ vtrn.8 d22, d23
889
+ vst1.8 {d21}, [TMP2]
890
+ vst1.8 {d22}, [TMP3]
891
+ vst1.8 {d23}, [TMP4]
892
+ bx lr
893
+
894
+ .unreq DCT_TABLE
895
+ .unreq COEF_BLOCK
896
+ .unreq OUTPUT_BUF
897
+ .unreq OUTPUT_COL
898
+ .unreq TMP1
899
+ .unreq TMP2
900
+ .unreq TMP3
901
+ .unreq TMP4
902
+
903
+
904
+ /*****************************************************************************/
905
+
906
+ /*
907
+ * jsimd_idct_4x4_neon
908
+ *
909
+ * This function contains inverse-DCT code for getting reduced-size
910
+ * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
911
+ * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
912
+ * function from jpeg-6b (jidctred.c).
913
+ *
914
+ * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
915
+ * requires much less arithmetic operations and hence should be faster.
916
+ * The primary purpose of this particular NEON optimized function is
917
+ * bit exact compatibility with jpeg-6b.
918
+ *
919
+ * TODO: a bit better instructions scheduling can be achieved by expanding
920
+ * idct_helper/transpose_4x4 macros and reordering instructions,
921
+ * but readability will suffer somewhat.
922
+ */
923
+
924
+ #define CONST_BITS 13
925
+
926
+ #define FIX_0_211164243 (1730) /* FIX(0.211164243) */
927
+ #define FIX_0_509795579 (4176) /* FIX(0.509795579) */
928
+ #define FIX_0_601344887 (4926) /* FIX(0.601344887) */
929
+ #define FIX_0_720959822 (5906) /* FIX(0.720959822) */
930
+ #define FIX_0_765366865 (6270) /* FIX(0.765366865) */
931
+ #define FIX_0_850430095 (6967) /* FIX(0.850430095) */
932
+ #define FIX_0_899976223 (7373) /* FIX(0.899976223) */
933
+ #define FIX_1_061594337 (8697) /* FIX(1.061594337) */
934
+ #define FIX_1_272758580 (10426) /* FIX(1.272758580) */
935
+ #define FIX_1_451774981 (11893) /* FIX(1.451774981) */
936
+ #define FIX_1_847759065 (15137) /* FIX(1.847759065) */
937
+ #define FIX_2_172734803 (17799) /* FIX(2.172734803) */
938
+ #define FIX_2_562915447 (20995) /* FIX(2.562915447) */
939
+ #define FIX_3_624509785 (29692) /* FIX(3.624509785) */
940
+
941
+ .balign 16
942
+ jsimd_idct_4x4_neon_consts:
943
+ .short FIX_1_847759065 /* d0[0] */
944
+ .short -FIX_0_765366865 /* d0[1] */
945
+ .short -FIX_0_211164243 /* d0[2] */
946
+ .short FIX_1_451774981 /* d0[3] */
947
+ .short -FIX_2_172734803 /* d1[0] */
948
+ .short FIX_1_061594337 /* d1[1] */
949
+ .short -FIX_0_509795579 /* d1[2] */
950
+ .short -FIX_0_601344887 /* d1[3] */
951
+ .short FIX_0_899976223 /* d2[0] */
952
+ .short FIX_2_562915447 /* d2[1] */
953
+ .short 1 << (CONST_BITS + 1) /* d2[2] */
954
+ .short 0 /* d2[3] */
955
+
956
+ .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
957
+ vmull.s16 q14, \x4, d2[2]
958
+ vmlal.s16 q14, \x8, d0[0]
959
+ vmlal.s16 q14, \x14, d0[1]
960
+
961
+ vmull.s16 q13, \x16, d1[2]
962
+ vmlal.s16 q13, \x12, d1[3]
963
+ vmlal.s16 q13, \x10, d2[0]
964
+ vmlal.s16 q13, \x6, d2[1]
965
+
966
+ vmull.s16 q15, \x4, d2[2]
967
+ vmlsl.s16 q15, \x8, d0[0]
968
+ vmlsl.s16 q15, \x14, d0[1]
969
+
970
+ vmull.s16 q12, \x16, d0[2]
971
+ vmlal.s16 q12, \x12, d0[3]
972
+ vmlal.s16 q12, \x10, d1[0]
973
+ vmlal.s16 q12, \x6, d1[1]
974
+
975
+ vadd.s32 q10, q14, q13
976
+ vsub.s32 q14, q14, q13
977
+
978
+ .if \shift > 16
979
+ vrshr.s32 q10, q10, #\shift
980
+ vrshr.s32 q14, q14, #\shift
981
+ vmovn.s32 \y26, q10
982
+ vmovn.s32 \y29, q14
983
+ .else
984
+ vrshrn.s32 \y26, q10, #\shift
985
+ vrshrn.s32 \y29, q14, #\shift
986
+ .endif
987
+
988
+ vadd.s32 q10, q15, q12
989
+ vsub.s32 q15, q15, q12
990
+
991
+ .if \shift > 16
992
+ vrshr.s32 q10, q10, #\shift
993
+ vrshr.s32 q15, q15, #\shift
994
+ vmovn.s32 \y27, q10
995
+ vmovn.s32 \y28, q15
996
+ .else
997
+ vrshrn.s32 \y27, q10, #\shift
998
+ vrshrn.s32 \y28, q15, #\shift
999
+ .endif
1000
+ .endm
1001
+
1002
+ asm_function jsimd_idct_4x4_neon
1003
+
1004
+ DCT_TABLE .req r0
1005
+ COEF_BLOCK .req r1
1006
+ OUTPUT_BUF .req r2
1007
+ OUTPUT_COL .req r3
1008
+ TMP1 .req r0
1009
+ TMP2 .req r1
1010
+ TMP3 .req r2
1011
+ TMP4 .req ip
1012
+
1013
+ vpush {d8-d15}
1014
+
1015
+ /* Load constants (d3 is just used for padding) */
1016
+ adr TMP4, jsimd_idct_4x4_neon_consts
1017
+ vld1.16 {d0, d1, d2, d3}, [TMP4, :128]
1018
+
1019
+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
1020
+ * 0 1 2 3 | 4 5 6 7
1021
+ * ---------+--------
1022
+ * 0 | d4 | d5
1023
+ * 1 | d6 | d7
1024
+ * 2 | d8 | d9
1025
+ * 3 | d10 | d11
1026
+ * 4 | - | -
1027
+ * 5 | d12 | d13
1028
+ * 6 | d14 | d15
1029
+ * 7 | d16 | d17
1030
+ */
1031
+ vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1032
+ vld1.16 {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
1033
+ add COEF_BLOCK, COEF_BLOCK, #16
1034
+ vld1.16 {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
1035
+ vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
1036
+ /* dequantize */
1037
+ vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1038
+ vmul.s16 q2, q2, q9
1039
+ vld1.16 {d22, d23, d24, d25}, [DCT_TABLE, :128]!
1040
+ vmul.s16 q3, q3, q10
1041
+ vmul.s16 q4, q4, q11
1042
+ add DCT_TABLE, DCT_TABLE, #16
1043
+ vld1.16 {d26, d27, d28, d29}, [DCT_TABLE, :128]!
1044
+ vmul.s16 q5, q5, q12
1045
+ vmul.s16 q6, q6, q13
1046
+ vld1.16 {d30, d31}, [DCT_TABLE, :128]!
1047
+ vmul.s16 q7, q7, q14
1048
+ vmul.s16 q8, q8, q15
1049
+
1050
+ /* Pass 1 */
1051
+ idct_helper d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
1052
+ transpose_4x4 d4, d6, d8, d10
1053
+ idct_helper d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
1054
+ transpose_4x4 d5, d7, d9, d11
1055
+
1056
+ /* Pass 2 */
1057
+ idct_helper d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
1058
+ transpose_4x4 d26, d27, d28, d29
1059
+
1060
+ /* Range limit */
1061
+ vmov.u16 q15, #0x80
1062
+ vadd.s16 q13, q13, q15
1063
+ vadd.s16 q14, q14, q15
1064
+ vqmovun.s16 d26, q13
1065
+ vqmovun.s16 d27, q14
1066
+
1067
+ /* Store results to the output buffer */
1068
+ ldmia OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
1069
+ add TMP1, TMP1, OUTPUT_COL
1070
+ add TMP2, TMP2, OUTPUT_COL
1071
+ add TMP3, TMP3, OUTPUT_COL
1072
+ add TMP4, TMP4, OUTPUT_COL
1073
+
1074
+ #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
1075
+ /* We can use much less instructions on little endian systems if the
1076
+ * OS kernel is not configured to trap unaligned memory accesses
1077
+ */
1078
+ vst1.32 {d26[0]}, [TMP1]!
1079
+ vst1.32 {d27[0]}, [TMP3]!
1080
+ vst1.32 {d26[1]}, [TMP2]!
1081
+ vst1.32 {d27[1]}, [TMP4]!
1082
+ #else
1083
+ vst1.8 {d26[0]}, [TMP1]!
1084
+ vst1.8 {d27[0]}, [TMP3]!
1085
+ vst1.8 {d26[1]}, [TMP1]!
1086
+ vst1.8 {d27[1]}, [TMP3]!
1087
+ vst1.8 {d26[2]}, [TMP1]!
1088
+ vst1.8 {d27[2]}, [TMP3]!
1089
+ vst1.8 {d26[3]}, [TMP1]!
1090
+ vst1.8 {d27[3]}, [TMP3]!
1091
+
1092
+ vst1.8 {d26[4]}, [TMP2]!
1093
+ vst1.8 {d27[4]}, [TMP4]!
1094
+ vst1.8 {d26[5]}, [TMP2]!
1095
+ vst1.8 {d27[5]}, [TMP4]!
1096
+ vst1.8 {d26[6]}, [TMP2]!
1097
+ vst1.8 {d27[6]}, [TMP4]!
1098
+ vst1.8 {d26[7]}, [TMP2]!
1099
+ vst1.8 {d27[7]}, [TMP4]!
1100
+ #endif
1101
+
1102
+ vpop {d8-d15}
1103
+ bx lr
1104
+
1105
+ .unreq DCT_TABLE
1106
+ .unreq COEF_BLOCK
1107
+ .unreq OUTPUT_BUF
1108
+ .unreq OUTPUT_COL
1109
+ .unreq TMP1
1110
+ .unreq TMP2
1111
+ .unreq TMP3
1112
+ .unreq TMP4
1113
+
1114
+ .purgem idct_helper
1115
+
1116
+
1117
+ /*****************************************************************************/
1118
+
1119
+ /*
1120
+ * jsimd_idct_2x2_neon
1121
+ *
1122
+ * This function contains inverse-DCT code for getting reduced-size
1123
+ * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
1124
+ * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
1125
+ * function from jpeg-6b (jidctred.c).
1126
+ *
1127
+ * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
1128
+ * requires much less arithmetic operations and hence should be faster.
1129
+ * The primary purpose of this particular NEON optimized function is
1130
+ * bit exact compatibility with jpeg-6b.
1131
+ */
1132
+
1133
+ .balign 8
1134
+ jsimd_idct_2x2_neon_consts:
1135
+ .short -FIX_0_720959822 /* d0[0] */
1136
+ .short FIX_0_850430095 /* d0[1] */
1137
+ .short -FIX_1_272758580 /* d0[2] */
1138
+ .short FIX_3_624509785 /* d0[3] */
1139
+
1140
+ .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
1141
+ vshll.s16 q14, \x4, #15
1142
+ vmull.s16 q13, \x6, d0[3]
1143
+ vmlal.s16 q13, \x10, d0[2]
1144
+ vmlal.s16 q13, \x12, d0[1]
1145
+ vmlal.s16 q13, \x16, d0[0]
1146
+
1147
+ vadd.s32 q10, q14, q13
1148
+ vsub.s32 q14, q14, q13
1149
+
1150
+ .if \shift > 16
1151
+ vrshr.s32 q10, q10, #\shift
1152
+ vrshr.s32 q14, q14, #\shift
1153
+ vmovn.s32 \y26, q10
1154
+ vmovn.s32 \y27, q14
1155
+ .else
1156
+ vrshrn.s32 \y26, q10, #\shift
1157
+ vrshrn.s32 \y27, q14, #\shift
1158
+ .endif
1159
+ .endm
1160
+
1161
+ asm_function jsimd_idct_2x2_neon
1162
+
1163
+ DCT_TABLE .req r0
1164
+ COEF_BLOCK .req r1
1165
+ OUTPUT_BUF .req r2
1166
+ OUTPUT_COL .req r3
1167
+ TMP1 .req r0
1168
+ TMP2 .req ip
1169
+
1170
+ vpush {d8-d15}
1171
+
1172
+ /* Load constants */
1173
+ adr TMP2, jsimd_idct_2x2_neon_consts
1174
+ vld1.16 {d0}, [TMP2, :64]
1175
+
1176
+ /* Load all COEF_BLOCK into NEON registers with the following allocation:
1177
+ * 0 1 2 3 | 4 5 6 7
1178
+ * ---------+--------
1179
+ * 0 | d4 | d5
1180
+ * 1 | d6 | d7
1181
+ * 2 | - | -
1182
+ * 3 | d10 | d11
1183
+ * 4 | - | -
1184
+ * 5 | d12 | d13
1185
+ * 6 | - | -
1186
+ * 7 | d16 | d17
1187
+ */
1188
+ vld1.16 {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
1189
+ add COEF_BLOCK, COEF_BLOCK, #16
1190
+ vld1.16 {d10, d11}, [COEF_BLOCK, :128]!
1191
+ add COEF_BLOCK, COEF_BLOCK, #16
1192
+ vld1.16 {d12, d13}, [COEF_BLOCK, :128]!
1193
+ add COEF_BLOCK, COEF_BLOCK, #16
1194
+ vld1.16 {d16, d17}, [COEF_BLOCK, :128]!
1195
+ /* Dequantize */
1196
+ vld1.16 {d18, d19, d20, d21}, [DCT_TABLE, :128]!
1197
+ vmul.s16 q2, q2, q9
1198
+ vmul.s16 q3, q3, q10
1199
+ add DCT_TABLE, DCT_TABLE, #16
1200
+ vld1.16 {d24, d25}, [DCT_TABLE, :128]!
1201
+ vmul.s16 q5, q5, q12
1202
+ add DCT_TABLE, DCT_TABLE, #16
1203
+ vld1.16 {d26, d27}, [DCT_TABLE, :128]!
1204
+ vmul.s16 q6, q6, q13
1205
+ add DCT_TABLE, DCT_TABLE, #16
1206
+ vld1.16 {d30, d31}, [DCT_TABLE, :128]!
1207
+ vmul.s16 q8, q8, q15
1208
+
1209
+ /* Pass 1 */
1210
+ #if 0
1211
+ idct_helper d4, d6, d10, d12, d16, 13, d4, d6
1212
+ transpose_4x4 d4, d6, d8, d10
1213
+ idct_helper d5, d7, d11, d13, d17, 13, d5, d7
1214
+ transpose_4x4 d5, d7, d9, d11
1215
+ #else
1216
+ vmull.s16 q13, d6, d0[3]
1217
+ vmlal.s16 q13, d10, d0[2]
1218
+ vmlal.s16 q13, d12, d0[1]
1219
+ vmlal.s16 q13, d16, d0[0]
1220
+ vmull.s16 q12, d7, d0[3]
1221
+ vmlal.s16 q12, d11, d0[2]
1222
+ vmlal.s16 q12, d13, d0[1]
1223
+ vmlal.s16 q12, d17, d0[0]
1224
+ vshll.s16 q14, d4, #15
1225
+ vshll.s16 q15, d5, #15
1226
+ vadd.s32 q10, q14, q13
1227
+ vsub.s32 q14, q14, q13
1228
+ vrshrn.s32 d4, q10, #13
1229
+ vrshrn.s32 d6, q14, #13
1230
+ vadd.s32 q10, q15, q12
1231
+ vsub.s32 q14, q15, q12
1232
+ vrshrn.s32 d5, q10, #13
1233
+ vrshrn.s32 d7, q14, #13
1234
+ vtrn.16 q2, q3
1235
+ vtrn.32 q3, q5
1236
+ #endif
1237
+
1238
+ /* Pass 2 */
1239
+ idct_helper d4, d6, d10, d7, d11, 20, d26, d27
1240
+
1241
+ /* Range limit */
1242
+ vmov.u16 q15, #0x80
1243
+ vadd.s16 q13, q13, q15
1244
+ vqmovun.s16 d26, q13
1245
+ vqmovun.s16 d27, q13
1246
+
1247
+ /* Store results to the output buffer */
1248
+ ldmia OUTPUT_BUF, {TMP1, TMP2}
1249
+ add TMP1, TMP1, OUTPUT_COL
1250
+ add TMP2, TMP2, OUTPUT_COL
1251
+
1252
+ vst1.8 {d26[0]}, [TMP1]!
1253
+ vst1.8 {d27[4]}, [TMP1]!
1254
+ vst1.8 {d26[1]}, [TMP2]!
1255
+ vst1.8 {d27[5]}, [TMP2]!
1256
+
1257
+ vpop {d8-d15}
1258
+ bx lr
1259
+
1260
+ .unreq DCT_TABLE
1261
+ .unreq COEF_BLOCK
1262
+ .unreq OUTPUT_BUF
1263
+ .unreq OUTPUT_COL
1264
+ .unreq TMP1
1265
+ .unreq TMP2
1266
+
1267
+ .purgem idct_helper
1268
+
1269
+
1270
+ /*****************************************************************************/
1271
+
1272
+ /*
1273
+ * jsimd_ycc_extrgb_convert_neon
1274
+ * jsimd_ycc_extbgr_convert_neon
1275
+ * jsimd_ycc_extrgbx_convert_neon
1276
+ * jsimd_ycc_extbgrx_convert_neon
1277
+ * jsimd_ycc_extxbgr_convert_neon
1278
+ * jsimd_ycc_extxrgb_convert_neon
1279
+ *
1280
+ * Colorspace conversion YCbCr -> RGB
1281
+ */
1282
+
1283
+
1284
+ .macro do_load size
1285
+ .if \size == 8
1286
+ vld1.8 {d4}, [U, :64]!
1287
+ vld1.8 {d5}, [V, :64]!
1288
+ vld1.8 {d0}, [Y, :64]!
1289
+ pld [U, #64]
1290
+ pld [V, #64]
1291
+ pld [Y, #64]
1292
+ .elseif \size == 4
1293
+ vld1.8 {d4[0]}, [U]!
1294
+ vld1.8 {d4[1]}, [U]!
1295
+ vld1.8 {d4[2]}, [U]!
1296
+ vld1.8 {d4[3]}, [U]!
1297
+ vld1.8 {d5[0]}, [V]!
1298
+ vld1.8 {d5[1]}, [V]!
1299
+ vld1.8 {d5[2]}, [V]!
1300
+ vld1.8 {d5[3]}, [V]!
1301
+ vld1.8 {d0[0]}, [Y]!
1302
+ vld1.8 {d0[1]}, [Y]!
1303
+ vld1.8 {d0[2]}, [Y]!
1304
+ vld1.8 {d0[3]}, [Y]!
1305
+ .elseif \size == 2
1306
+ vld1.8 {d4[4]}, [U]!
1307
+ vld1.8 {d4[5]}, [U]!
1308
+ vld1.8 {d5[4]}, [V]!
1309
+ vld1.8 {d5[5]}, [V]!
1310
+ vld1.8 {d0[4]}, [Y]!
1311
+ vld1.8 {d0[5]}, [Y]!
1312
+ .elseif \size == 1
1313
+ vld1.8 {d4[6]}, [U]!
1314
+ vld1.8 {d5[6]}, [V]!
1315
+ vld1.8 {d0[6]}, [Y]!
1316
+ .else
1317
+ .error unsupported macroblock size
1318
+ .endif
1319
+ .endm
1320
+
1321
+ .macro do_store bpp, size
1322
+ .if \bpp == 24
1323
+ .if \size == 8
1324
+ vst3.8 {d10, d11, d12}, [RGB]!
1325
+ .elseif \size == 4
1326
+ vst3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1327
+ vst3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1328
+ vst3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1329
+ vst3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1330
+ .elseif \size == 2
1331
+ vst3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1332
+ vst3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1333
+ .elseif \size == 1
1334
+ vst3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1335
+ .else
1336
+ .error unsupported macroblock size
1337
+ .endif
1338
+ .elseif \bpp == 32
1339
+ .if \size == 8
1340
+ vst4.8 {d10, d11, d12, d13}, [RGB]!
1341
+ .elseif \size == 4
1342
+ vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1343
+ vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1344
+ vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1345
+ vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1346
+ .elseif \size == 2
1347
+ vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1348
+ vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1349
+ .elseif \size == 1
1350
+ vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1351
+ .else
1352
+ .error unsupported macroblock size
1353
+ .endif
1354
+ .elseif \bpp == 16
1355
+ .if \size == 8
1356
+ vst1.16 {q15}, [RGB]!
1357
+ .elseif \size == 4
1358
+ vst1.16 {d30}, [RGB]!
1359
+ .elseif \size == 2
1360
+ vst1.16 {d31[0]}, [RGB]!
1361
+ vst1.16 {d31[1]}, [RGB]!
1362
+ .elseif \size == 1
1363
+ vst1.16 {d31[2]}, [RGB]!
1364
+ .else
1365
+ .error unsupported macroblock size
1366
+ .endif
1367
+ .else
1368
+ .error unsupported bpp
1369
+ .endif
1370
+ .endm
1371
+
1372
+ .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1373
+
1374
+ /*
1375
+ * 2-stage pipelined YCbCr->RGB conversion
1376
+ */
1377
+
1378
+ .macro do_yuv_to_rgb_stage1
1379
+ vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1380
+ vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
1381
+ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1382
+ vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1383
+ vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1384
+ vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1385
+ vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1386
+ vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1387
+ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1388
+ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
1389
+ .endm
1390
+
1391
+ .macro do_yuv_to_rgb_stage2
1392
+ vrshrn.s32 d20, q10, #15
1393
+ vrshrn.s32 d21, q11, #15
1394
+ vrshrn.s32 d24, q12, #14
1395
+ vrshrn.s32 d25, q13, #14
1396
+ vrshrn.s32 d28, q14, #14
1397
+ vrshrn.s32 d29, q15, #14
1398
+ vaddw.u8 q11, q10, d0
1399
+ vaddw.u8 q12, q12, d0
1400
+ vaddw.u8 q14, q14, d0
1401
+ .if \bpp != 16
1402
+ vqmovun.s16 d1\g_offs, q11
1403
+ vqmovun.s16 d1\r_offs, q12
1404
+ vqmovun.s16 d1\b_offs, q14
1405
+ .else /* rgb565 */
1406
+ vqshlu.s16 q13, q11, #8
1407
+ vqshlu.s16 q15, q12, #8
1408
+ vqshlu.s16 q14, q14, #8
1409
+ vsri.u16 q15, q13, #5
1410
+ vsri.u16 q15, q14, #11
1411
+ .endif
1412
+ .endm
1413
+
1414
+ .macro do_yuv_to_rgb_stage2_store_load_stage1
1415
+ /* "do_yuv_to_rgb_stage2" and "store" */
1416
+ vrshrn.s32 d20, q10, #15
1417
+ /* "load" and "do_yuv_to_rgb_stage1" */
1418
+ pld [U, #64]
1419
+ vrshrn.s32 d21, q11, #15
1420
+ pld [V, #64]
1421
+ vrshrn.s32 d24, q12, #14
1422
+ vrshrn.s32 d25, q13, #14
1423
+ vld1.8 {d4}, [U, :64]!
1424
+ vrshrn.s32 d28, q14, #14
1425
+ vld1.8 {d5}, [V, :64]!
1426
+ vrshrn.s32 d29, q15, #14
1427
+ vaddw.u8 q3, q1, d4 /* q3 = u - 128 */
1428
+ vaddw.u8 q4, q1, d5 /* q2 = v - 128 */
1429
+ vaddw.u8 q11, q10, d0
1430
+ vmull.s16 q10, d6, d1[1] /* multiply by -11277 */
1431
+ vmlal.s16 q10, d8, d1[2] /* multiply by -23401 */
1432
+ vaddw.u8 q12, q12, d0
1433
+ vaddw.u8 q14, q14, d0
1434
+ .if \bpp != 16 /**************** rgb24/rgb32 ******************************/
1435
+ vqmovun.s16 d1\g_offs, q11
1436
+ pld [Y, #64]
1437
+ vqmovun.s16 d1\r_offs, q12
1438
+ vld1.8 {d0}, [Y, :64]!
1439
+ vqmovun.s16 d1\b_offs, q14
1440
+ vmull.s16 q11, d7, d1[1] /* multiply by -11277 */
1441
+ vmlal.s16 q11, d9, d1[2] /* multiply by -23401 */
1442
+ do_store \bpp, 8
1443
+ vmull.s16 q12, d8, d1[0] /* multiply by 22971 */
1444
+ vmull.s16 q13, d9, d1[0] /* multiply by 22971 */
1445
+ vmull.s16 q14, d6, d1[3] /* multiply by 29033 */
1446
+ vmull.s16 q15, d7, d1[3] /* multiply by 29033 */
1447
+ .else /**************************** rgb565 ********************************/
1448
+ vqshlu.s16 q13, q11, #8
1449
+ pld [Y, #64]
1450
+ vqshlu.s16 q15, q12, #8
1451
+ vqshlu.s16 q14, q14, #8
1452
+ vld1.8 {d0}, [Y, :64]!
1453
+ vmull.s16 q11, d7, d1[1]
1454
+ vmlal.s16 q11, d9, d1[2]
1455
+ vsri.u16 q15, q13, #5
1456
+ vmull.s16 q12, d8, d1[0]
1457
+ vsri.u16 q15, q14, #11
1458
+ vmull.s16 q13, d9, d1[0]
1459
+ vmull.s16 q14, d6, d1[3]
1460
+ do_store \bpp, 8
1461
+ vmull.s16 q15, d7, d1[3]
1462
+ .endif
1463
+ .endm
1464
+
1465
+ .macro do_yuv_to_rgb
1466
+ do_yuv_to_rgb_stage1
1467
+ do_yuv_to_rgb_stage2
1468
+ .endm
1469
+
1470
+ /* Apple gas crashes on adrl, work around that by using adr.
1471
+ * But this requires a copy of these constants for each function.
1472
+ */
1473
+
1474
+ .balign 16
1475
+ jsimd_ycc_\colorid\()_neon_consts:
1476
+ .short 0, 0, 0, 0
1477
+ .short 22971, -11277, -23401, 29033
1478
+ .short -128, -128, -128, -128
1479
+ .short -128, -128, -128, -128
1480
+
1481
+ asm_function jsimd_ycc_\colorid\()_convert_neon
1482
+ OUTPUT_WIDTH .req r0
1483
+ INPUT_BUF .req r1
1484
+ INPUT_ROW .req r2
1485
+ OUTPUT_BUF .req r3
1486
+ NUM_ROWS .req r4
1487
+
1488
+ INPUT_BUF0 .req r5
1489
+ INPUT_BUF1 .req r6
1490
+ INPUT_BUF2 .req INPUT_BUF
1491
+
1492
+ RGB .req r7
1493
+ Y .req r8
1494
+ U .req r9
1495
+ V .req r10
1496
+ N .req ip
1497
+
1498
+ /* Load constants to d1, d2, d3 (d0 is just used for padding) */
1499
+ adr ip, jsimd_ycc_\colorid\()_neon_consts
1500
+ vld1.16 {d0, d1, d2, d3}, [ip, :128]
1501
+
1502
+ /* Save ARM registers and handle input arguments */
1503
+ push {r4, r5, r6, r7, r8, r9, r10, lr}
1504
+ ldr NUM_ROWS, [sp, #(4 * 8)]
1505
+ ldr INPUT_BUF0, [INPUT_BUF]
1506
+ ldr INPUT_BUF1, [INPUT_BUF, #4]
1507
+ ldr INPUT_BUF2, [INPUT_BUF, #8]
1508
+ .unreq INPUT_BUF
1509
+
1510
+ /* Save NEON registers */
1511
+ vpush {d8-d15}
1512
+
1513
+ /* Initially set d10, d11, d12, d13 to 0xFF */
1514
+ vmov.u8 q5, #255
1515
+ vmov.u8 q6, #255
1516
+
1517
+ /* Outer loop over scanlines */
1518
+ cmp NUM_ROWS, #1
1519
+ blt 9f
1520
+ 0:
1521
+ ldr Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
1522
+ ldr U, [INPUT_BUF1, INPUT_ROW, lsl #2]
1523
+ mov N, OUTPUT_WIDTH
1524
+ ldr V, [INPUT_BUF2, INPUT_ROW, lsl #2]
1525
+ add INPUT_ROW, INPUT_ROW, #1
1526
+ ldr RGB, [OUTPUT_BUF], #4
1527
+
1528
+ /* Inner loop over pixels */
1529
+ subs N, N, #8
1530
+ blt 3f
1531
+ do_load 8
1532
+ do_yuv_to_rgb_stage1
1533
+ subs N, N, #8
1534
+ blt 2f
1535
+ 1:
1536
+ do_yuv_to_rgb_stage2_store_load_stage1
1537
+ subs N, N, #8
1538
+ bge 1b
1539
+ 2:
1540
+ do_yuv_to_rgb_stage2
1541
+ do_store \bpp, 8
1542
+ tst N, #7
1543
+ beq 8f
1544
+ 3:
1545
+ tst N, #4
1546
+ beq 3f
1547
+ do_load 4
1548
+ 3:
1549
+ tst N, #2
1550
+ beq 4f
1551
+ do_load 2
1552
+ 4:
1553
+ tst N, #1
1554
+ beq 5f
1555
+ do_load 1
1556
+ 5:
1557
+ do_yuv_to_rgb
1558
+ tst N, #4
1559
+ beq 6f
1560
+ do_store \bpp, 4
1561
+ 6:
1562
+ tst N, #2
1563
+ beq 7f
1564
+ do_store \bpp, 2
1565
+ 7:
1566
+ tst N, #1
1567
+ beq 8f
1568
+ do_store \bpp, 1
1569
+ 8:
1570
+ subs NUM_ROWS, NUM_ROWS, #1
1571
+ bgt 0b
1572
+ 9:
1573
+ /* Restore all registers and return */
1574
+ vpop {d8-d15}
1575
+ pop {r4, r5, r6, r7, r8, r9, r10, pc}
1576
+
1577
+ .unreq OUTPUT_WIDTH
1578
+ .unreq INPUT_ROW
1579
+ .unreq OUTPUT_BUF
1580
+ .unreq NUM_ROWS
1581
+ .unreq INPUT_BUF0
1582
+ .unreq INPUT_BUF1
1583
+ .unreq INPUT_BUF2
1584
+ .unreq RGB
1585
+ .unreq Y
1586
+ .unreq U
1587
+ .unreq V
1588
+ .unreq N
1589
+
1590
+ .purgem do_yuv_to_rgb
1591
+ .purgem do_yuv_to_rgb_stage1
1592
+ .purgem do_yuv_to_rgb_stage2
1593
+ .purgem do_yuv_to_rgb_stage2_store_load_stage1
1594
+
1595
+ .endm
1596
+
1597
+ /*--------------------------------- id ----- bpp R G B */
1598
+ generate_jsimd_ycc_rgb_convert_neon extrgb, 24, 0, 1, 2
1599
+ generate_jsimd_ycc_rgb_convert_neon extbgr, 24, 2, 1, 0
1600
+ generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
1601
+ generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
1602
+ generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
1603
+ generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
1604
+ generate_jsimd_ycc_rgb_convert_neon rgb565, 16, 0, 0, 0
1605
+
1606
+ .purgem do_load
1607
+ .purgem do_store
1608
+
1609
+
1610
+ /*****************************************************************************/
1611
+
1612
+ /*
1613
+ * jsimd_extrgb_ycc_convert_neon
1614
+ * jsimd_extbgr_ycc_convert_neon
1615
+ * jsimd_extrgbx_ycc_convert_neon
1616
+ * jsimd_extbgrx_ycc_convert_neon
1617
+ * jsimd_extxbgr_ycc_convert_neon
1618
+ * jsimd_extxrgb_ycc_convert_neon
1619
+ *
1620
+ * Colorspace conversion RGB -> YCbCr
1621
+ */
1622
+
1623
+ .macro do_store size
1624
+ .if \size == 8
1625
+ vst1.8 {d20}, [Y]!
1626
+ vst1.8 {d21}, [U]!
1627
+ vst1.8 {d22}, [V]!
1628
+ .elseif \size == 4
1629
+ vst1.8 {d20[0]}, [Y]!
1630
+ vst1.8 {d20[1]}, [Y]!
1631
+ vst1.8 {d20[2]}, [Y]!
1632
+ vst1.8 {d20[3]}, [Y]!
1633
+ vst1.8 {d21[0]}, [U]!
1634
+ vst1.8 {d21[1]}, [U]!
1635
+ vst1.8 {d21[2]}, [U]!
1636
+ vst1.8 {d21[3]}, [U]!
1637
+ vst1.8 {d22[0]}, [V]!
1638
+ vst1.8 {d22[1]}, [V]!
1639
+ vst1.8 {d22[2]}, [V]!
1640
+ vst1.8 {d22[3]}, [V]!
1641
+ .elseif \size == 2
1642
+ vst1.8 {d20[4]}, [Y]!
1643
+ vst1.8 {d20[5]}, [Y]!
1644
+ vst1.8 {d21[4]}, [U]!
1645
+ vst1.8 {d21[5]}, [U]!
1646
+ vst1.8 {d22[4]}, [V]!
1647
+ vst1.8 {d22[5]}, [V]!
1648
+ .elseif \size == 1
1649
+ vst1.8 {d20[6]}, [Y]!
1650
+ vst1.8 {d21[6]}, [U]!
1651
+ vst1.8 {d22[6]}, [V]!
1652
+ .else
1653
+ .error unsupported macroblock size
1654
+ .endif
1655
+ .endm
1656
+
1657
+ .macro do_load bpp, size
1658
+ .if \bpp == 24
1659
+ .if \size == 8
1660
+ vld3.8 {d10, d11, d12}, [RGB]!
1661
+ pld [RGB, #128]
1662
+ .elseif \size == 4
1663
+ vld3.8 {d10[0], d11[0], d12[0]}, [RGB]!
1664
+ vld3.8 {d10[1], d11[1], d12[1]}, [RGB]!
1665
+ vld3.8 {d10[2], d11[2], d12[2]}, [RGB]!
1666
+ vld3.8 {d10[3], d11[3], d12[3]}, [RGB]!
1667
+ .elseif \size == 2
1668
+ vld3.8 {d10[4], d11[4], d12[4]}, [RGB]!
1669
+ vld3.8 {d10[5], d11[5], d12[5]}, [RGB]!
1670
+ .elseif \size == 1
1671
+ vld3.8 {d10[6], d11[6], d12[6]}, [RGB]!
1672
+ .else
1673
+ .error unsupported macroblock size
1674
+ .endif
1675
+ .elseif \bpp == 32
1676
+ .if \size == 8
1677
+ vld4.8 {d10, d11, d12, d13}, [RGB]!
1678
+ pld [RGB, #128]
1679
+ .elseif \size == 4
1680
+ vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
1681
+ vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
1682
+ vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
1683
+ vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
1684
+ .elseif \size == 2
1685
+ vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
1686
+ vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
1687
+ .elseif \size == 1
1688
+ vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
1689
+ .else
1690
+ .error unsupported macroblock size
1691
+ .endif
1692
+ .else
1693
+ .error unsupported bpp
1694
+ .endif
1695
+ .endm
1696
+
1697
+ .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
1698
+
1699
+ /*
1700
+ * 2-stage pipelined RGB->YCbCr conversion
1701
+ */
1702
+
1703
+ .macro do_rgb_to_yuv_stage1
1704
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1705
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1706
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1707
+ vmull.u16 q7, d4, d0[0]
1708
+ vmlal.u16 q7, d6, d0[1]
1709
+ vmlal.u16 q7, d8, d0[2]
1710
+ vmull.u16 q8, d5, d0[0]
1711
+ vmlal.u16 q8, d7, d0[1]
1712
+ vmlal.u16 q8, d9, d0[2]
1713
+ vrev64.32 q9, q1
1714
+ vrev64.32 q13, q1
1715
+ vmlsl.u16 q9, d4, d0[3]
1716
+ vmlsl.u16 q9, d6, d1[0]
1717
+ vmlal.u16 q9, d8, d1[1]
1718
+ vmlsl.u16 q13, d5, d0[3]
1719
+ vmlsl.u16 q13, d7, d1[0]
1720
+ vmlal.u16 q13, d9, d1[1]
1721
+ vrev64.32 q14, q1
1722
+ vrev64.32 q15, q1
1723
+ vmlal.u16 q14, d4, d1[1]
1724
+ vmlsl.u16 q14, d6, d1[2]
1725
+ vmlsl.u16 q14, d8, d1[3]
1726
+ vmlal.u16 q15, d5, d1[1]
1727
+ vmlsl.u16 q15, d7, d1[2]
1728
+ vmlsl.u16 q15, d9, d1[3]
1729
+ .endm
1730
+
1731
+ .macro do_rgb_to_yuv_stage2
1732
+ vrshrn.u32 d20, q7, #16
1733
+ vrshrn.u32 d21, q8, #16
1734
+ vshrn.u32 d22, q9, #16
1735
+ vshrn.u32 d23, q13, #16
1736
+ vshrn.u32 d24, q14, #16
1737
+ vshrn.u32 d25, q15, #16
1738
+ vmovn.u16 d20, q10 /* d20 = y */
1739
+ vmovn.u16 d21, q11 /* d21 = u */
1740
+ vmovn.u16 d22, q12 /* d22 = v */
1741
+ .endm
1742
+
1743
+ .macro do_rgb_to_yuv
1744
+ do_rgb_to_yuv_stage1
1745
+ do_rgb_to_yuv_stage2
1746
+ .endm
1747
+
1748
+ .macro do_rgb_to_yuv_stage2_store_load_stage1
1749
+ vrshrn.u32 d20, q7, #16
1750
+ vrshrn.u32 d21, q8, #16
1751
+ vshrn.u32 d22, q9, #16
1752
+ vrev64.32 q9, q1
1753
+ vshrn.u32 d23, q13, #16
1754
+ vrev64.32 q13, q1
1755
+ vshrn.u32 d24, q14, #16
1756
+ vshrn.u32 d25, q15, #16
1757
+ do_load \bpp, 8
1758
+ vmovn.u16 d20, q10 /* d20 = y */
1759
+ vmovl.u8 q2, d1\r_offs /* r = { d4, d5 } */
1760
+ vmovn.u16 d21, q11 /* d21 = u */
1761
+ vmovl.u8 q3, d1\g_offs /* g = { d6, d7 } */
1762
+ vmovn.u16 d22, q12 /* d22 = v */
1763
+ vmovl.u8 q4, d1\b_offs /* b = { d8, d9 } */
1764
+ vmull.u16 q7, d4, d0[0]
1765
+ vmlal.u16 q7, d6, d0[1]
1766
+ vmlal.u16 q7, d8, d0[2]
1767
+ vst1.8 {d20}, [Y]!
1768
+ vmull.u16 q8, d5, d0[0]
1769
+ vmlal.u16 q8, d7, d0[1]
1770
+ vmlal.u16 q8, d9, d0[2]
1771
+ vmlsl.u16 q9, d4, d0[3]
1772
+ vmlsl.u16 q9, d6, d1[0]
1773
+ vmlal.u16 q9, d8, d1[1]
1774
+ vst1.8 {d21}, [U]!
1775
+ vmlsl.u16 q13, d5, d0[3]
1776
+ vmlsl.u16 q13, d7, d1[0]
1777
+ vmlal.u16 q13, d9, d1[1]
1778
+ vrev64.32 q14, q1
1779
+ vrev64.32 q15, q1
1780
+ vmlal.u16 q14, d4, d1[1]
1781
+ vmlsl.u16 q14, d6, d1[2]
1782
+ vmlsl.u16 q14, d8, d1[3]
1783
+ vst1.8 {d22}, [V]!
1784
+ vmlal.u16 q15, d5, d1[1]
1785
+ vmlsl.u16 q15, d7, d1[2]
1786
+ vmlsl.u16 q15, d9, d1[3]
1787
+ .endm
1788
+
1789
/* Fixed-point RGB->YCbCr coefficients, scaled by 2^16
 * (e.g. 19595 ~= 0.29900 * 65536); the trailing 32767/128 pairs seed the
 * chroma accumulators with the rounding bias + 128 level shift. */
.balign 16
jsimd_\colorid\()_ycc_neon_consts:
    .short 19595, 38470, 7471, 11059
    .short 21709, 32768, 27439, 5329
    .short 32767, 128, 32767, 128
    .short 32767, 128, 32767, 128

/* void jsimd_<colorid>_ycc_convert_neon(JDIMENSION output_width,
 *                                       JSAMPARRAY input_buf,
 *                                       JSAMPIMAGE output_buf,
 *                                       JDIMENSION output_row,
 *                                       int num_rows);
 * Converts num_rows scanlines of packed RGB (layout fixed by the macro
 * parameters bpp/r_offs/g_offs/b_offs) into the three Y/Cb/Cr planes. */
asm_function jsimd_\colorid\()_ycc_convert_neon
    OUTPUT_WIDTH .req r0
    INPUT_BUF .req r1
    OUTPUT_BUF .req r2
    OUTPUT_ROW .req r3
    NUM_ROWS .req r4

    OUTPUT_BUF0 .req r5
    OUTPUT_BUF1 .req r6
    OUTPUT_BUF2 .req OUTPUT_BUF

    RGB .req r7
    Y .req r8
    U .req r9
    V .req r10
    N .req ip

    /* Load constants to d0, d1, d2, d3 */
    adr ip, jsimd_\colorid\()_ycc_neon_consts
    vld1.16 {d0, d1, d2, d3}, [ip, :128]

    /* Save ARM registers and handle input arguments */
    push {r4, r5, r6, r7, r8, r9, r10, lr}
    ldr NUM_ROWS, [sp, #(4 * 8)]   /* 5th argument, above the 8 saved regs */
    ldr OUTPUT_BUF0, [OUTPUT_BUF]      /* Y plane row-pointer array */
    ldr OUTPUT_BUF1, [OUTPUT_BUF, #4]  /* Cb plane */
    ldr OUTPUT_BUF2, [OUTPUT_BUF, #8]  /* Cr plane */
    .unreq OUTPUT_BUF

    /* Save NEON registers (d8-d15 are callee-saved per AAPCS) */
    vpush {d8-d15}

    /* Outer loop over scanlines */
    cmp NUM_ROWS, #1
    blt 9f
0:
    ldr Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
    ldr U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
    mov N, OUTPUT_WIDTH
    ldr V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
    add OUTPUT_ROW, OUTPUT_ROW, #1
    ldr RGB, [INPUT_BUF], #4

    /* Inner loop over pixels: pipelined 8-pixel batches, then a
     * 1-7 pixel tail handled bit-by-bit via N's low bits. */
    subs N, N, #8
    blt 3f                 /* row shorter than 8 pixels: tail only */
    do_load \bpp, 8
    do_rgb_to_yuv_stage1
    subs N, N, #8
    blt 2f
1:  /* steady state: convert previous batch while loading the next */
    do_rgb_to_yuv_stage2_store_load_stage1
    subs N, N, #8
    bge 1b
2:  /* drain the pipeline */
    do_rgb_to_yuv_stage2
    do_store 8
    tst N, #7
    beq 8f
3:  /* tail: gather remaining 1-7 pixels into the batch registers */
    tst N, #4
    beq 3f
    do_load \bpp, 4
3:
    tst N, #2
    beq 4f
    do_load \bpp, 2
4:
    tst N, #1
    beq 5f
    do_load \bpp, 1
5:  /* convert and scatter the tail pixels */
    do_rgb_to_yuv
    tst N, #4
    beq 6f
    do_store 4
6:
    tst N, #2
    beq 7f
    do_store 2
7:
    tst N, #1
    beq 8f
    do_store 1
8:
    subs NUM_ROWS, NUM_ROWS, #1
    bgt 0b
9:
    /* Restore all registers and return */
    vpop {d8-d15}
    pop {r4, r5, r6, r7, r8, r9, r10, pc}

    .unreq OUTPUT_WIDTH
    .unreq OUTPUT_ROW
    .unreq INPUT_BUF
    .unreq NUM_ROWS
    .unreq OUTPUT_BUF0
    .unreq OUTPUT_BUF1
    .unreq OUTPUT_BUF2
    .unreq RGB
    .unreq Y
    .unreq U
    .unreq V
    .unreq N

    .purgem do_rgb_to_yuv
    .purgem do_rgb_to_yuv_stage1
    .purgem do_rgb_to_yuv_stage2
    .purgem do_rgb_to_yuv_stage2_store_load_stage1
1905
+
1906
+ .endm
1907
+
1908
/* Instantiate one conversion function per supported extended-RGB pixel
 * layout (bpp = bits per pixel; R/G/B = byte offset of each channel
 * within a pixel). */
/*--------------------------------- id ----- bpp R G B */
generate_jsimd_rgb_ycc_convert_neon extrgb, 24, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgr, 24, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3

/* The per-format load/store helpers are no longer needed */
.purgem do_load
.purgem do_store
1918
+
1919
+
1920
+ /*****************************************************************************/
1921
+
1922
/*
 * void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
 *                          DCTELEM *workspace);
 *
 * Load an 8x8 block of samples into the workspace, applying the
 * unsigned->signed conversion (subtract the 128 center value from each
 * byte while widening to 16 bits).  Loads, subtracts, and stores are
 * interleaved to hide latencies.
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 * rid of VST1.16 instructions
 */

asm_function jsimd_convsamp_neon
    SAMPLE_DATA .req r0
    START_COL .req r1
    WORKSPACE .req r2
    TMP1 .req r3
    TMP2 .req r4
    TMP3 .req r5
    TMP4 .req ip

    push {r4, r5}
    vmov.u8 d0, #128   /* centering constant */

    /* Rows 0-3: fetch row pointers, offset by start_col, widen-subtract */
    ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    vld1.8 {d16}, [TMP1]
    vsubl.u8 q8, d16, d0
    vld1.8 {d18}, [TMP2]
    vsubl.u8 q9, d18, d0
    vld1.8 {d20}, [TMP3]
    vsubl.u8 q10, d20, d0
    vld1.8 {d22}, [TMP4]
    /* Rows 4-7: start fetching pointers while finishing rows 0-3 */
    ldmia SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
    vsubl.u8 q11, d22, d0
    vst1.16 {d16, d17, d18, d19}, [WORKSPACE, :128]!
    add TMP1, TMP1, START_COL
    add TMP2, TMP2, START_COL
    vst1.16 {d20, d21, d22, d23}, [WORKSPACE, :128]!
    add TMP3, TMP3, START_COL
    add TMP4, TMP4, START_COL
    vld1.8 {d24}, [TMP1]
    vsubl.u8 q12, d24, d0
    vld1.8 {d26}, [TMP2]
    vsubl.u8 q13, d26, d0
    vld1.8 {d28}, [TMP3]
    vsubl.u8 q14, d28, d0
    vld1.8 {d30}, [TMP4]
    vsubl.u8 q15, d30, d0
    vst1.16 {d24, d25, d26, d27}, [WORKSPACE, :128]!
    vst1.16 {d28, d29, d30, d31}, [WORKSPACE, :128]!
    pop {r4, r5}
    bx lr

    .unreq SAMPLE_DATA
    .unreq START_COL
    .unreq WORKSPACE
    .unreq TMP1
    .unreq TMP2
    .unreq TMP3
    .unreq TMP4
1981
+
1982
+
1983
+ /*****************************************************************************/
1984
+
1985
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 * rid of a bunch of VLD1.16 instructions
 */

/* Fixed-point multipliers for the rotation stages, consumed via VQDMULH
 * (which computes (2*a*b)>>16).  Stored as value*128; the 1.306562965
 * constant has 1.0 (256*128) subtracted so it fits in a signed 16-bit
 * lane - NOTE(review): presumed from the scaling arithmetic, matches the
 * ifast algorithm's use of (c - 1) multipliers. */
#define XFIX_0_382683433 d0[0]
#define XFIX_0_541196100 d0[1]
#define XFIX_0_707106781 d0[2]
#define XFIX_1_306562965 d0[3]

.balign 16
jsimd_fdct_ifast_neon_consts:
    .short (98 * 128)               /* XFIX_0_382683433 */
    .short (139 * 128)              /* XFIX_0_541196100 */
    .short (181 * 128)              /* XFIX_0_707106781 */
    .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */

/* void jsimd_fdct_ifast_neon(DCTELEM *data);
 * In-place 2-D forward DCT on one 8x8 block: the 1-D pass is run twice,
 * transposing the block before each pass so the same column code handles
 * rows and columns. */
asm_function jsimd_fdct_ifast_neon

    DATA .req r0
    TMP .req ip

    vpush {d8-d15}

    /* Load constants */
    adr TMP, jsimd_fdct_ifast_neon_consts
    vld1.16 {d0}, [TMP, :64]

    /* Load all DATA into NEON registers with the following allocation:
     *        0 1 2 3 | 4 5 6 7
     *       ---------+--------
     *   0   | d16    | d17    | q8
     *   1   | d18    | d19    | q9
     *   2   | d20    | d21    | q10
     *   3   | d22    | d23    | q11
     *   4   | d24    | d25    | q12
     *   5   | d26    | d27    | q13
     *   6   | d28    | d29    | q14
     *   7   | d30    | d31    | q15
     */

    vld1.16 {d16, d17, d18, d19}, [DATA, :128]!
    vld1.16 {d20, d21, d22, d23}, [DATA, :128]!
    vld1.16 {d24, d25, d26, d27}, [DATA, :128]!
    vld1.16 {d28, d29, d30, d31}, [DATA, :128]
    sub DATA, DATA, #(128 - 32)   /* rewind to the start of the block */

    mov TMP, #2   /* two 1-D passes (rows, then columns) */
1:
    /* Transpose the 8x8 block (16-bit, then 32-bit trn + d-register swaps) */
    vtrn.16 q12, q13
    vtrn.16 q10, q11
    vtrn.16 q8, q9
    vtrn.16 q14, q15
    vtrn.32 q9, q11
    vtrn.32 q13, q15
    vtrn.32 q8, q10
    vtrn.32 q12, q14
    vswp d30, d23
    vswp d24, d17
    vswp d26, d19
    /* 1-D FDCT (butterflies + fixed-point rotations, as in jpeg_fdct_ifast) */
    vadd.s16 q2, q11, q12
    vswp d28, d21
    vsub.s16 q12, q11, q12
    vsub.s16 q6, q10, q13
    vadd.s16 q10, q10, q13
    vsub.s16 q7, q9, q14
    vadd.s16 q9, q9, q14
    vsub.s16 q1, q8, q15
    vadd.s16 q8, q8, q15
    vsub.s16 q4, q9, q10
    vsub.s16 q5, q8, q2
    vadd.s16 q3, q9, q10
    vadd.s16 q4, q4, q5
    vadd.s16 q2, q8, q2
    vqdmulh.s16 q4, q4, XFIX_0_707106781
    vadd.s16 q11, q12, q6
    vadd.s16 q8, q2, q3
    vsub.s16 q12, q2, q3
    vadd.s16 q3, q6, q7
    vadd.s16 q7, q7, q1
    vqdmulh.s16 q3, q3, XFIX_0_707106781
    vsub.s16 q6, q11, q7
    vadd.s16 q10, q5, q4
    vqdmulh.s16 q6, q6, XFIX_0_382683433
    vsub.s16 q14, q5, q4
    vqdmulh.s16 q11, q11, XFIX_0_541196100
    vqdmulh.s16 q5, q7, XFIX_1_306562965
    vadd.s16 q4, q1, q3
    vsub.s16 q3, q1, q3
    vadd.s16 q7, q7, q6
    vadd.s16 q11, q11, q6
    vadd.s16 q7, q7, q5
    vadd.s16 q13, q3, q11
    vsub.s16 q11, q3, q11
    vadd.s16 q9, q4, q7
    vsub.s16 q15, q4, q7
    subs TMP, TMP, #1
    bne 1b

    /* store results */
    vst1.16 {d16, d17, d18, d19}, [DATA, :128]!
    vst1.16 {d20, d21, d22, d23}, [DATA, :128]!
    vst1.16 {d24, d25, d26, d27}, [DATA, :128]!
    vst1.16 {d28, d29, d30, d31}, [DATA, :128]

    vpop {d8-d15}
    bx lr

    .unreq DATA
    .unreq TMP
2104
+
2105
+
2106
+ /*****************************************************************************/
2107
+
2108
/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 *
 * Quantize the 64 DCT coefficients in 'workspace' into 'coef_block':
 * per coefficient, |x| + correction, multiply by the 16-bit reciprocal,
 * variable right shift, then reapply the original sign (XOR/SUB with the
 * arithmetic-shift sign mask).  The divisors table holds reciprocals at
 * offset 0, corrections at +64*2, and shift counts at +64*6.
 *
 * Note: the code uses 2 stage pipelining in order to improve instructions
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared to the non-pipelined variant).
 *       The instructions which belong to the second stage use different
 *       indentation for better readability.
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK .req r0
    DIVISORS .req r1
    WORKSPACE .req r2

    RECIPROCAL .req DIVISORS
    CORRECTION .req r3
    SHIFT .req ip
    LOOP_COUNT .req r4

    /* Prologue iteration: stage 1 only (no previous results to store) */
    vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
    vabs.s16 q12, q0
    add CORRECTION, DIVISORS, #(64 * 2)
    add SHIFT, DIVISORS, #(64 * 6)
    vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16 q13, q1
    vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16 q12, q12, q10 /* add correction */
    vadd.u16 q13, q13, q11
    vmull.u16 q10, d24, d16 /* multiply by reciprocal */
    vmull.u16 q11, d25, d17
    vmull.u16 q8, d26, d18
    vmull.u16 q9, d27, d19
    vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
    vshrn.u32 d20, q10, #16
    vshrn.u32 d21, q11, #16
    vshrn.u32 d22, q8, #16
    vshrn.u32 d23, q9, #16
    vneg.s16 q12, q12      /* negate shift counts for vshl-right */
    vneg.s16 q13, q13
    vshr.s16 q2, q0, #15 /* extract sign */
    vshr.s16 q3, q1, #15
    vshl.u16 q14, q10, q12 /* shift */
    vshl.u16 q15, q11, q13

    push {r4, r5}
    mov LOOP_COUNT, #3
1:  /* steady state: stage 2 (sign fixup + store) of the previous 16
     * coefficients interleaved with stage 1 of the next 16 */
    vld1.16 {d0, d1, d2, d3}, [WORKSPACE, :128]!
    veor.u16 q14, q14, q2 /* restore sign */
    vabs.s16 q12, q0
    vld1.16 {d20, d21, d22, d23}, [CORRECTION, :128]!
    vabs.s16 q13, q1
    veor.u16 q15, q15, q3
    vld1.16 {d16, d17, d18, d19}, [RECIPROCAL, :128]!
    vadd.u16 q12, q12, q10 /* add correction */
    vadd.u16 q13, q13, q11
    vmull.u16 q10, d24, d16 /* multiply by reciprocal */
    vmull.u16 q11, d25, d17
    vmull.u16 q8, d26, d18
    vmull.u16 q9, d27, d19
    vsub.u16 q14, q14, q2
    vld1.16 {d24, d25, d26, d27}, [SHIFT, :128]!
    vsub.u16 q15, q15, q3
    vshrn.u32 d20, q10, #16
    vshrn.u32 d21, q11, #16
    vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
    vshrn.u32 d22, q8, #16
    vshrn.u32 d23, q9, #16
    vneg.s16 q12, q12
    vneg.s16 q13, q13
    vshr.s16 q2, q0, #15 /* extract sign */
    vshr.s16 q3, q1, #15
    vshl.u16 q14, q10, q12 /* shift */
    vshl.u16 q15, q11, q13
    subs LOOP_COUNT, LOOP_COUNT, #1
    bne 1b
    pop {r4, r5}

    /* Epilogue: stage 2 for the final 16 coefficients */
    veor.u16 q14, q14, q2 /* restore sign */
    veor.u16 q15, q15, q3
    vsub.u16 q14, q14, q2
    vsub.u16 q15, q15, q3
    vst1.16 {d28, d29, d30, d31}, [COEF_BLOCK, :128]!

    bx lr /* return */

    .unreq COEF_BLOCK
    .unreq DIVISORS
    .unreq WORKSPACE
    .unreq RECIPROCAL
    .unreq CORRECTION
    .unreq SHIFT
    .unreq LOOP_COUNT
2205
+
2206
+
2207
+ /*****************************************************************************/
2208
+
2209
/*
 * GLOBAL(void)
 * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
 *                                JDIMENSION downsampled_width,
 *                                JSAMPARRAY input_data,
 *                                JSAMPARRAY *output_data_ptr);
 *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code, which can be potentially solved to get up to tens
 *       of percents performance improvement on Cortex-A8/Cortex-A9.
 */

/*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded to q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
 * Register d28 is used for multiplication by 3. Register q15 is used
 * for adding +1 bias.
 */
.macro upsample16 OUTPTR, INPTR
    vld1.8 {q0}, [\INPTR]!
    vmovl.u8 q8, d0
    vext.8 q2, q1, q0, #15      /* q2 = pixels shifted back by one */
    vmovl.u8 q9, d1
    vaddw.u8 q10, q15, d4       /* +1 bias for the second output phase */
    vaddw.u8 q11, q15, d5
    vmlal.u8 q8, d4, d28        /* 3*neighbor + center */
    vmlal.u8 q9, d5, d28
    vmlal.u8 q10, d0, d28
    vmlal.u8 q11, d1, d28
    vmov q1, q0 /* backup source pixels to q1 */
    vrshrn.u16 d6, q8, #2       /* rounding via vrshrn for one phase... */
    vrshrn.u16 d7, q9, #2
    vshrn.u16 d8, q10, #2       /* ...plain shift for the +1-biased phase */
    vshrn.u16 d9, q11, #2
    vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!   /* interleave the two phases */
.endm
2246
+
2247
/*
 * Upsample 32 source pixels to 64 destination pixels. Compared to 'upsample16'
 * macro, the roles of q0 and q1 registers are reversed for even and odd
 * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
 * Also this unrolling allows to reorder loads and stores to compensate
 * multiplication latency and reduce stalls.
 */
.macro upsample32 OUTPTR, INPTR
    /* even 16 pixels group */
    vld1.8 {q0}, [\INPTR]!
    vmovl.u8 q8, d0
    vext.8 q2, q1, q0, #15
    vmovl.u8 q9, d1
    vaddw.u8 q10, q15, d4
    vaddw.u8 q11, q15, d5
    vmlal.u8 q8, d4, d28
    vmlal.u8 q9, d5, d28
    vmlal.u8 q10, d0, d28
    vmlal.u8 q11, d1, d28
    /* odd 16 pixels group */
    vld1.8 {q1}, [\INPTR]!      /* load early to hide multiply latency */
    vrshrn.u16 d6, q8, #2
    vrshrn.u16 d7, q9, #2
    vshrn.u16 d8, q10, #2
    vshrn.u16 d9, q11, #2
    vmovl.u8 q8, d2
    vext.8 q2, q0, q1, #15      /* roles of q0/q1 swapped vs even group */
    vmovl.u8 q9, d3
    vaddw.u8 q10, q15, d4
    vaddw.u8 q11, q15, d5
    vmlal.u8 q8, d4, d28
    vmlal.u8 q9, d5, d28
    vmlal.u8 q10, d2, d28
    vmlal.u8 q11, d3, d28
    vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
    vrshrn.u16 d6, q8, #2
    vrshrn.u16 d7, q9, #2
    vshrn.u16 d8, q10, #2
    vshrn.u16 d9, q11, #2
    vst2.8 {d6, d7, d8, d9}, [\OUTPTR]!
.endm
2288
+
2289
/*
 * Upsample a row of WIDTH pixels from INPTR to OUTPTR: bulk 32- and
 * 16-pixel batches via upsample32/upsample16, then a 1-15 pixel tail
 * gathered right-to-left into d0/d1 and processed with the same math.
 * The first and last destination pixels are copied directly (no
 * neighbor to blend with).
 */
.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
    /* special case for the first and last pixels */
    sub \WIDTH, \WIDTH, #1
    add \OUTPTR, \OUTPTR, #1
    ldrb \TMP1, [\INPTR, \WIDTH]
    strb \TMP1, [\OUTPTR, \WIDTH, asl #1]   /* last output pixel */
    ldrb \TMP1, [\INPTR], #1
    strb \TMP1, [\OUTPTR, #-1]              /* first output pixel */
    vmov.8 d3[7], \TMP1    /* seed "previous pixel" lane for upsample16/32 */

    subs \WIDTH, \WIDTH, #32
    blt 5f
0:  /* process 32 pixels per iteration */
    upsample32 \OUTPTR, \INPTR
    subs \WIDTH, \WIDTH, #32
    bge 0b
5:
    adds \WIDTH, \WIDTH, #16
    blt 1f
0:  /* process 16 pixels if needed */
    upsample16 \OUTPTR, \INPTR
    subs \WIDTH, \WIDTH, #16
1:
    adds \WIDTH, \WIDTH, #16
    beq 9f

    /* load the remaining 1-15 pixels (gathered from the end backwards,
     * one bit of WIDTH at a time) */
    add \INPTR, \INPTR, \WIDTH
    tst \WIDTH, #1
    beq 2f
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[0]}, [\INPTR]
2:
    tst \WIDTH, #2
    beq 2f
    vext.8 d0, d0, d0, #6
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[1]}, [\INPTR]
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[0]}, [\INPTR]
2:
    tst \WIDTH, #4
    beq 2f
    vrev64.32 d0, d0
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[3]}, [\INPTR]
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[2]}, [\INPTR]
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[1]}, [\INPTR]
    sub \INPTR, \INPTR, #1
    vld1.8 {d0[0]}, [\INPTR]
2:
    tst \WIDTH, #8
    beq 2f
    vmov d1, d0
    sub \INPTR, \INPTR, #8
    vld1.8 {d0}, [\INPTR]
2: /* upsample the remaining pixels */
    vmovl.u8 q8, d0
    vext.8 q2, q1, q0, #15
    vmovl.u8 q9, d1
    vaddw.u8 q10, q15, d4
    vaddw.u8 q11, q15, d5
    vmlal.u8 q8, d4, d28
    vmlal.u8 q9, d5, d28
    vmlal.u8 q10, d0, d28
    vmlal.u8 q11, d1, d28
    vrshrn.u16 d10, q8, #2
    vrshrn.u16 d12, q9, #2
    vshrn.u16 d11, q10, #2
    vshrn.u16 d13, q11, #2
    vzip.8 d10, d11
    vzip.8 d12, d13
    /* store the remaining pixels (again bit-by-bit of WIDTH) */
    tst \WIDTH, #8
    beq 2f
    vst1.8 {d10, d11}, [\OUTPTR]!
    vmov q5, q6
2:
    tst \WIDTH, #4
    beq 2f
    vst1.8 {d10}, [\OUTPTR]!
    vmov d10, d11
2:
    tst \WIDTH, #2
    beq 2f
    vst1.8 {d10[0]}, [\OUTPTR]!
    vst1.8 {d10[1]}, [\OUTPTR]!
    vst1.8 {d10[2]}, [\OUTPTR]!
    vst1.8 {d10[3]}, [\OUTPTR]!
    vext.8 d10, d10, d10, #4
2:
    tst \WIDTH, #1
    beq 2f
    vst1.8 {d10[0]}, [\OUTPTR]!
    vst1.8 {d10[1]}, [\OUTPTR]!
2:
9:
.endm
2392
+
2393
/* Driver: run upsample_row over max_v_samp_factor rows, advancing through
 * the input/output row-pointer arrays. */
asm_function jsimd_h2v1_fancy_upsample_neon

    MAX_V_SAMP_FACTOR .req r0
    DOWNSAMPLED_WIDTH .req r1
    INPUT_DATA .req r2
    OUTPUT_DATA_PTR .req r3
    OUTPUT_DATA .req OUTPUT_DATA_PTR

    OUTPTR .req r4
    INPTR .req r5
    WIDTH .req ip
    TMP .req lr

    push {r4, r5, r6, lr}
    vpush {d8-d15}

    ldr OUTPUT_DATA, [OUTPUT_DATA_PTR]
    cmp MAX_V_SAMP_FACTOR, #0
    ble 99f

    /* initialize constants */
    vmov.u8 d28, #3     /* multiplier for 3*near + far */
    vmov.u16 q15, #1    /* +1 bias for one output phase */
11:
    ldr INPTR, [INPUT_DATA], #4
    ldr OUTPTR, [OUTPUT_DATA], #4
    mov WIDTH, DOWNSAMPLED_WIDTH
    upsample_row OUTPTR, INPTR, WIDTH, TMP
    subs MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
    bgt 11b

99:
    vpop {d8-d15}
    pop {r4, r5, r6, pc}

    .unreq MAX_V_SAMP_FACTOR
    .unreq DOWNSAMPLED_WIDTH
    .unreq INPUT_DATA
    .unreq OUTPUT_DATA_PTR
    .unreq OUTPUT_DATA

    .unreq OUTPTR
    .unreq INPTR
    .unreq WIDTH
    .unreq TMP

    .purgem upsample16
    .purgem upsample32
    .purgem upsample_row
2442
+
2443
+
2444
+ /*****************************************************************************/
2445
+
2446
/*
 * GLOBAL(JOCTET *)
 * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
 *                             JCOEFPTR block, int last_dc_val,
 *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
 *
 */

/* Emit the top 8 bits of PUT_BUFFER to BUFFER (pre-incremented store).
 * If the byte is 0xFF, a stuffed 0x00 follows, per the JPEG byte-stuffing
 * rule.  Clobbers TMP; ZERO must hold 0. */
.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
    sub \PUT_BITS, \PUT_BITS, #0x8
    lsr \TMP, \PUT_BUFFER, \PUT_BITS
    uxtb \TMP, \TMP
    strb \TMP, [\BUFFER, #1]!
    cmp \TMP, #0xff
    /*it eq*/              /* needed if assembled as Thumb-2 */
    strbeq \ZERO, [\BUFFER, #1]!
.endm
2463
+
2464
/* Append SIZE bits of CODE to the bit buffer: shift PUT_BUFFER left by
 * SIZE and OR in CODE (done in a single ORR with shifted operand), and
 * bump the PUT_BITS count. */
.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
    /*lsl \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
    add \PUT_BITS, \SIZE
    /*orr \PUT_BUFFER, \PUT_BUFFER, \CODE*/
    orr \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
.endm
2470
+
2471
/* If 16 or more bits are pending in the bit buffer, flush two bytes to
 * BUFFER (keeping the pending count below 16).  Clobbers ZERO and TMP. */
.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
    cmp \PUT_BITS, #0x10
    blt 15f
    eor \ZERO, \ZERO, \ZERO
    emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
    emit_byte \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
15:
.endm
2479
+
2480
/* Per-lane bit masks used to compact the zero/nonzero flags of the 64
 * coefficients into a 64-bit bitmap. */
.balign 16
jsimd_huff_encode_one_block_neon_consts:
    .byte 0x01
    .byte 0x02
    .byte 0x04
    .byte 0x08
    .byte 0x10
    .byte 0x20
    .byte 0x40
    .byte 0x80

/* Huffman-encode one quantized 8x8 block.  The NEON part gathers the
 * coefficients in reordered (zigzag-style) sequence - NOTE(review): the
 * byte offsets appear to follow the JPEG zigzag order, confirm against
 * jpeg_natural_order - and precomputes per-coefficient nbits and
 * diff values plus a 64-bit nonzero bitmap; the scalar part then walks
 * the bitmap with CLZ, emitting run-length/size Huffman codes. */
asm_function jsimd_huff_encode_one_block_neon
    push {r4, r5, r6, r7, r8, r9, r10, r11, lr}
    add r7, sp, #0x1c        /* r7 = original sp (stack args base) */
    sub r4, sp, #0x40
    bfc r4, #0, #5
    mov sp, r4 /* align sp on 32 bytes */
    vst1.64 {d8, d9, d10, d11}, [r4, :128]!   /* save callee-saved d8-d15 */
    vst1.64 {d12, d13, d14, d15}, [r4, :128]
    sub sp, #0x140 /* reserve 320 bytes */
    str r0, [sp, #0x18] /* working state -> sp + 0x18 */
    add r4, sp, #0x20 /* r4 = t1 */
    ldr lr, [r7, #0x8] /* lr = dctbl */
    sub r10, r1, #0x1 /* r10 = buffer-- (emit_byte pre-increments) */
    ldrsh r1, [r2]
    mov r9, #0x10
    mov r8, #0x1
    adr r5, jsimd_huff_encode_one_block_neon_consts
    /* prepare data */
    vld1.8 {d26}, [r5, :64]
    veor q8, q8, q8
    veor q9, q9, q9
    vdup.16 q14, r9          /* q14 = 16 (for 16 - clz = nbits) */
    vdup.16 q15, r8          /* q15 = 1 */
    veor q10, q10, q10
    veor q11, q11, q11
    sub r1, r1, r3           /* DC difference: block[0] - last_dc_val */
    /* Gather first 32 coefficients into d0-d7, four lanes at a time */
    add r9, r2, #0x22
    add r8, r2, #0x18
    add r3, r2, #0x36
    vmov.16 d0[0], r1
    vld1.16 {d2[0]}, [r9, :16]
    vld1.16 {d4[0]}, [r8, :16]
    vld1.16 {d6[0]}, [r3, :16]
    add r1, r2, #0x2
    add r9, r2, #0x30
    add r8, r2, #0x26
    add r3, r2, #0x28
    vld1.16 {d0[1]}, [r1, :16]
    vld1.16 {d2[1]}, [r9, :16]
    vld1.16 {d4[1]}, [r8, :16]
    vld1.16 {d6[1]}, [r3, :16]
    add r1, r2, #0x10
    add r9, r2, #0x40
    add r8, r2, #0x34
    add r3, r2, #0x1a
    vld1.16 {d0[2]}, [r1, :16]
    vld1.16 {d2[2]}, [r9, :16]
    vld1.16 {d4[2]}, [r8, :16]
    vld1.16 {d6[2]}, [r3, :16]
    add r1, r2, #0x20
    add r9, r2, #0x32
    add r8, r2, #0x42
    add r3, r2, #0xc
    vld1.16 {d0[3]}, [r1, :16]
    vld1.16 {d2[3]}, [r9, :16]
    vld1.16 {d4[3]}, [r8, :16]
    vld1.16 {d6[3]}, [r3, :16]
    add r1, r2, #0x12
    add r9, r2, #0x24
    add r8, r2, #0x50
    add r3, r2, #0xe
    vld1.16 {d1[0]}, [r1, :16]
    vld1.16 {d3[0]}, [r9, :16]
    vld1.16 {d5[0]}, [r8, :16]
    vld1.16 {d7[0]}, [r3, :16]
    add r1, r2, #0x4
    add r9, r2, #0x16
    add r8, r2, #0x60
    add r3, r2, #0x1c
    vld1.16 {d1[1]}, [r1, :16]
    vld1.16 {d3[1]}, [r9, :16]
    vld1.16 {d5[1]}, [r8, :16]
    vld1.16 {d7[1]}, [r3, :16]
    add r1, r2, #0x6
    add r9, r2, #0x8
    add r8, r2, #0x52
    add r3, r2, #0x2a
    vld1.16 {d1[2]}, [r1, :16]
    vld1.16 {d3[2]}, [r9, :16]
    vld1.16 {d5[2]}, [r8, :16]
    vld1.16 {d7[2]}, [r3, :16]
    add r1, r2, #0x14
    add r9, r2, #0xa
    add r8, r2, #0x44
    add r3, r2, #0x38
    vld1.16 {d1[3]}, [r1, :16]
    vld1.16 {d3[3]}, [r9, :16]
    vld1.16 {d5[3]}, [r8, :16]
    vld1.16 {d7[3]}, [r3, :16]
    /* temp = (x < 0) ? ~x : x  (one's-complement magnitude encoding) */
    vcgt.s16 q8, q8, q0
    vcgt.s16 q9, q9, q1
    vcgt.s16 q10, q10, q2
    vcgt.s16 q11, q11, q3
    vabs.s16 q0, q0
    vabs.s16 q1, q1
    vabs.s16 q2, q2
    vabs.s16 q3, q3
    veor q8, q8, q0
    veor q9, q9, q1
    veor q10, q10, q2
    veor q11, q11, q3
    add r9, r4, #0x20
    add r8, r4, #0x80
    add r3, r4, #0xa0
    /* nbits = 16 - clz(|x|) */
    vclz.i16 q0, q0
    vclz.i16 q1, q1
    vclz.i16 q2, q2
    vclz.i16 q3, q3
    vsub.i16 q0, q14, q0
    vsub.i16 q1, q14, q1
    vsub.i16 q2, q14, q2
    vsub.i16 q3, q14, q3
    vst1.16 {d0, d1, d2, d3}, [r4, :256]   /* t1[0..31] = nbits */
    vst1.16 {d4, d5, d6, d7}, [r9, :256]
    /* mask = (1 << nbits) - 1; diff = temp & mask */
    vshl.s16 q0, q15, q0
    vshl.s16 q1, q15, q1
    vshl.s16 q2, q15, q2
    vshl.s16 q3, q15, q3
    vsub.i16 q0, q0, q15
    vsub.i16 q1, q1, q15
    vsub.i16 q2, q2, q15
    vsub.i16 q3, q3, q15
    vand q8, q8, q0
    vand q9, q9, q1
    vand q10, q10, q2
    vand q11, q11, q3
    vst1.16 {d16, d17, d18, d19}, [r8, :256]   /* t2[0..31] = diff */
    vst1.16 {d20, d21, d22, d23}, [r3, :256]
    /* Gather remaining 32 coefficients into d8-d15 */
    add r1, r2, #0x46
    add r9, r2, #0x3a
    add r8, r2, #0x74
    add r3, r2, #0x6a
    vld1.16 {d8[0]}, [r1, :16]
    vld1.16 {d10[0]}, [r9, :16]
    vld1.16 {d12[0]}, [r8, :16]
    vld1.16 {d14[0]}, [r3, :16]
    veor q8, q8, q8
    veor q9, q9, q9
    veor q10, q10, q10
    veor q11, q11, q11
    add r1, r2, #0x54
    add r9, r2, #0x2c
    add r8, r2, #0x76
    add r3, r2, #0x78
    vld1.16 {d8[1]}, [r1, :16]
    vld1.16 {d10[1]}, [r9, :16]
    vld1.16 {d12[1]}, [r8, :16]
    vld1.16 {d14[1]}, [r3, :16]
    add r1, r2, #0x62
    add r9, r2, #0x1e
    add r8, r2, #0x68
    add r3, r2, #0x7a
    vld1.16 {d8[2]}, [r1, :16]
    vld1.16 {d10[2]}, [r9, :16]
    vld1.16 {d12[2]}, [r8, :16]
    vld1.16 {d14[2]}, [r3, :16]
    add r1, r2, #0x70
    add r9, r2, #0x2e
    add r8, r2, #0x5a
    add r3, r2, #0x6c
    vld1.16 {d8[3]}, [r1, :16]
    vld1.16 {d10[3]}, [r9, :16]
    vld1.16 {d12[3]}, [r8, :16]
    vld1.16 {d14[3]}, [r3, :16]
    add r1, r2, #0x72
    add r9, r2, #0x3c
    add r8, r2, #0x4c
    add r3, r2, #0x5e
    vld1.16 {d9[0]}, [r1, :16]
    vld1.16 {d11[0]}, [r9, :16]
    vld1.16 {d13[0]}, [r8, :16]
    vld1.16 {d15[0]}, [r3, :16]
    add r1, r2, #0x64
    add r9, r2, #0x4a
    add r8, r2, #0x3e
    add r3, r2, #0x6e
    vld1.16 {d9[1]}, [r1, :16]
    vld1.16 {d11[1]}, [r9, :16]
    vld1.16 {d13[1]}, [r8, :16]
    vld1.16 {d15[1]}, [r3, :16]
    add r1, r2, #0x56
    add r9, r2, #0x58
    add r8, r2, #0x4e
    add r3, r2, #0x7c
    vld1.16 {d9[2]}, [r1, :16]
    vld1.16 {d11[2]}, [r9, :16]
    vld1.16 {d13[2]}, [r8, :16]
    vld1.16 {d15[2]}, [r3, :16]
    add r1, r2, #0x48
    add r9, r2, #0x66
    add r8, r2, #0x5c
    add r3, r2, #0x7e
    vld1.16 {d9[3]}, [r1, :16]
    vld1.16 {d11[3]}, [r9, :16]
    vld1.16 {d13[3]}, [r8, :16]
    vld1.16 {d15[3]}, [r3, :16]
    /* same temp/nbits/diff computation for coefficients 32..63 */
    vcgt.s16 q8, q8, q4
    vcgt.s16 q9, q9, q5
    vcgt.s16 q10, q10, q6
    vcgt.s16 q11, q11, q7
    vabs.s16 q4, q4
    vabs.s16 q5, q5
    vabs.s16 q6, q6
    vabs.s16 q7, q7
    veor q8, q8, q4
    veor q9, q9, q5
    veor q10, q10, q6
    veor q11, q11, q7
    add r1, r4, #0x40
    add r9, r4, #0x60
    add r8, r4, #0xc0
    add r3, r4, #0xe0
    vclz.i16 q4, q4
    vclz.i16 q5, q5
    vclz.i16 q6, q6
    vclz.i16 q7, q7
    vsub.i16 q4, q14, q4
    vsub.i16 q5, q14, q5
    vsub.i16 q6, q14, q6
    vsub.i16 q7, q14, q7
    vst1.16 {d8, d9, d10, d11}, [r1, :256]     /* t1[32..63] = nbits */
    vst1.16 {d12, d13, d14, d15}, [r9, :256]
    vshl.s16 q4, q15, q4
    vshl.s16 q5, q15, q5
    vshl.s16 q6, q15, q6
    vshl.s16 q7, q15, q7
    vsub.i16 q4, q4, q15
    vsub.i16 q5, q5, q15
    vsub.i16 q6, q6, q15
    vsub.i16 q7, q7, q15
    vand q8, q8, q4
    vand q9, q9, q5
    vand q10, q10, q6
    vand q11, q11, q7
    vst1.16 {d16, d17, d18, d19}, [r8, :256]   /* t2[32..63] = diff */
    vst1.16 {d20, d21, d22, d23}, [r3, :256]
    ldr r12, [r7, #0xc] /* r12 = actbl */
    add r1, lr, #0x400 /* r1 = dctbl->ehufsi */
    mov r9, r12 /* r9 = actbl */
    add r6, r4, #0x80 /* r6 = t2 */
    ldr r11, [r0, #0x8] /* r11 = put_buffer */
    ldr r4, [r0, #0xc] /* r4 = put_bits */
    /* Emit the DC coefficient */
    ldrh r2, [r6, #-128] /* r2 = nbits */
    ldrh r3, [r6] /* r3 = temp2 & (((JLONG)1)<<nbits) - 1; */
    ldr r0, [lr, r2, lsl #2]
    ldrb r5, [r1, r2]
    put_bits r11, r4, r0, r5
    checkbuf15 r10, r11, r4, r5, r0
    put_bits r11, r4, r3, r2
    checkbuf15 r10, r11, r4, r5, r0
    mov lr, r6 /* lr = t2 */
    add r5, r9, #0x400 /* r5 = actbl->ehufsi */
    ldrsb r6, [r5, #0xf0] /* r6 = actbl->ehufsi[0xf0] */
    /* Build 64-bit nonzero bitmap: compare nbits against zero, narrow,
     * mask each lane with its bit from d26, and tree-reduce with vpadd */
    veor q8, q8, q8
    vceq.i16 q0, q0, q8
    vceq.i16 q1, q1, q8
    vceq.i16 q2, q2, q8
    vceq.i16 q3, q3, q8
    vceq.i16 q4, q4, q8
    vceq.i16 q5, q5, q8
    vceq.i16 q6, q6, q8
    vceq.i16 q7, q7, q8
    vmovn.i16 d0, q0
    vmovn.i16 d2, q1
    vmovn.i16 d4, q2
    vmovn.i16 d6, q3
    vmovn.i16 d8, q4
    vmovn.i16 d10, q5
    vmovn.i16 d12, q6
    vmovn.i16 d14, q7
    vand d0, d0, d26
    vand d2, d2, d26
    vand d4, d4, d26
    vand d6, d6, d26
    vand d8, d8, d26
    vand d10, d10, d26
    vand d12, d12, d26
    vand d14, d14, d26
    vpadd.i8 d0, d0, d2
    vpadd.i8 d4, d4, d6
    vpadd.i8 d8, d8, d10
    vpadd.i8 d12, d12, d14
    vpadd.i8 d0, d0, d4
    vpadd.i8 d8, d8, d12
    vpadd.i8 d0, d0, d8
    vmov.32 r1, d0[1]
    vmov.32 r8, d0[0]
    mvn r1, r1      /* invert: 1 bit = nonzero coefficient */
    mvn r8, r8
    lsrs r1, r1, #0x1
    rrx r8, r8 /* shift in last r1 bit while shifting out DC bit */
    rbit r1, r1 /* r1 = index1 */
    rbit r8, r8 /* r8 = index0 */
    ldr r0, [r9, #0x3c0] /* r0 = actbl->ehufco[0xf0] */
    str r1, [sp, #0x14] /* index1 -> sp + 0x14 */
    cmp r8, #0x0
    beq 6f
1:  /* first half: skip the zero run via CLZ, then emit the code */
    clz r2, r8
    add lr, lr, r2, lsl #1
    lsl r8, r8, r2
    ldrh r1, [lr, #-126]   /* nbits for this coefficient (t1 = t2 - 128, +2) */
2:  /* emit ZRL (run of 16 zeroes) codes while run length >= 16 */
    cmp r2, #0x10
    blt 3f
    sub r2, r2, #0x10
    put_bits r11, r4, r0, r6
    cmp r4, #0x10
    blt 2b
    eor r3, r3, r3
    emit_byte r10, r11, r4, r3, r12
    emit_byte r10, r11, r4, r3, r12
    b 2b
3:  /* emit (run,size) Huffman code followed by the diff bits */
    add r2, r1, r2, lsl #4
    ldrh r3, [lr, #2]!
    ldr r12, [r9, r2, lsl #2]
    ldrb r2, [r5, r2]
    put_bits r11, r4, r12, r2
    checkbuf15 r10, r11, r4, r2, r12
    put_bits r11, r4, r3, r1
    checkbuf15 r10, r11, r4, r2, r12
    lsls r8, r8, #0x1
    bne 1b
6:  /* second half: coefficients 32..63, bitmap in index1 */
    add r12, sp, #0x20 /* r12 = t1 */
    ldr r8, [sp, #0x14] /* r8 = index1 */
    adds r12, #0xc0 /* r12 = t2 + (DCTSIZE2/2) */
    cmp r8, #0x0
    beq 6f
    clz r2, r8
    sub r12, r12, lr       /* carry any zero run across the half boundary */
    lsl r8, r8, r2
    add r2, r2, r12, lsr #1
    add lr, lr, r2, lsl #1
    b 7f
1:
    clz r2, r8
    add lr, lr, r2, lsl #1
    lsl r8, r8, r2
7:
    ldrh r1, [lr, #-126]
2:  /* emit ZRL codes while run length >= 16 */
    cmp r2, #0x10
    blt 3f
    sub r2, r2, #0x10
    put_bits r11, r4, r0, r6
    cmp r4, #0x10
    blt 2b
    eor r3, r3, r3
    emit_byte r10, r11, r4, r3, r12
    emit_byte r10, r11, r4, r3, r12
    b 2b
3:
    add r2, r1, r2, lsl #4
    ldrh r3, [lr, #2]!
    ldr r12, [r9, r2, lsl #2]
    ldrb r2, [r5, r2]
    put_bits r11, r4, r12, r2
    checkbuf15 r10, r11, r4, r2, r12
    put_bits r11, r4, r3, r1
    checkbuf15 r10, r11, r4, r2, r12
    lsls r8, r8, #0x1
    bne 1b
6:  /* emit EOB if the block did not end on the last coefficient */
    add r0, sp, #0x20
    add r0, #0xfe
    cmp lr, r0
    bhs 1f
    ldr r1, [r9]
    ldrb r0, [r5]
    put_bits r11, r4, r1, r0
    checkbuf15 r10, r11, r4, r0, r1
1:  /* write back bit-buffer state, restore registers, return buffer+1 */
    ldr r12, [sp, #0x18]
    str r11, [r12, #0x8]
    str r4, [r12, #0xc]
    add r0, r10, #0x1
    add r4, sp, #0x140
    vld1.64 {d8, d9, d10, d11}, [r4, :128]!
    vld1.64 {d12, d13, d14, d15}, [r4, :128]
    sub r4, r7, #0x1c
    mov sp, r4
    pop {r4, r5, r6, r7, r8, r9, r10, r11, pc}

.purgem emit_byte
.purgem put_bits
.purgem checkbuf15