image_pack 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (319) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +18 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +140 -0
  5. data/THIRD_PARTY_NOTICES.md +8 -0
  6. data/ext/image_pack/extconf.rb +515 -0
  7. data/ext/image_pack/image_pack.c +1618 -0
  8. data/ext/image_pack/vendor/.vendored +1 -0
  9. data/ext/image_pack/vendor/mozjpeg/BUILDING.txt +902 -0
  10. data/ext/image_pack/vendor/mozjpeg/CMakeLists.txt +1593 -0
  11. data/ext/image_pack/vendor/mozjpeg/LICENSE.md +132 -0
  12. data/ext/image_pack/vendor/mozjpeg/README-mozilla.txt +194 -0
  13. data/ext/image_pack/vendor/mozjpeg/README-turbo.txt +346 -0
  14. data/ext/image_pack/vendor/mozjpeg/README.ijg +258 -0
  15. data/ext/image_pack/vendor/mozjpeg/README.md +29 -0
  16. data/ext/image_pack/vendor/mozjpeg/cderror.h +128 -0
  17. data/ext/image_pack/vendor/mozjpeg/cdjpeg.c +156 -0
  18. data/ext/image_pack/vendor/mozjpeg/cdjpeg.h +171 -0
  19. data/ext/image_pack/vendor/mozjpeg/cjpeg.c +961 -0
  20. data/ext/image_pack/vendor/mozjpeg/cmyk.h +60 -0
  21. data/ext/image_pack/vendor/mozjpeg/coderules.txt +78 -0
  22. data/ext/image_pack/vendor/mozjpeg/croptest.in +95 -0
  23. data/ext/image_pack/vendor/mozjpeg/djpeg.c +855 -0
  24. data/ext/image_pack/vendor/mozjpeg/example.txt +464 -0
  25. data/ext/image_pack/vendor/mozjpeg/jaricom.c +157 -0
  26. data/ext/image_pack/vendor/mozjpeg/jcapimin.c +307 -0
  27. data/ext/image_pack/vendor/mozjpeg/jcapistd.c +168 -0
  28. data/ext/image_pack/vendor/mozjpeg/jcarith.c +972 -0
  29. data/ext/image_pack/vendor/mozjpeg/jccoefct.c +609 -0
  30. data/ext/image_pack/vendor/mozjpeg/jccolext.c +144 -0
  31. data/ext/image_pack/vendor/mozjpeg/jccolor.c +721 -0
  32. data/ext/image_pack/vendor/mozjpeg/jcdctmgr.c +1776 -0
  33. data/ext/image_pack/vendor/mozjpeg/jcext.c +219 -0
  34. data/ext/image_pack/vendor/mozjpeg/jchuff.c +1146 -0
  35. data/ext/image_pack/vendor/mozjpeg/jchuff.h +57 -0
  36. data/ext/image_pack/vendor/mozjpeg/jcicc.c +105 -0
  37. data/ext/image_pack/vendor/mozjpeg/jcinit.c +82 -0
  38. data/ext/image_pack/vendor/mozjpeg/jcmainct.c +162 -0
  39. data/ext/image_pack/vendor/mozjpeg/jcmarker.c +844 -0
  40. data/ext/image_pack/vendor/mozjpeg/jcmaster.c +958 -0
  41. data/ext/image_pack/vendor/mozjpeg/jcmaster.h +56 -0
  42. data/ext/image_pack/vendor/mozjpeg/jcomapi.c +109 -0
  43. data/ext/image_pack/vendor/mozjpeg/jconfig.h.in +37 -0
  44. data/ext/image_pack/vendor/mozjpeg/jconfig.txt +93 -0
  45. data/ext/image_pack/vendor/mozjpeg/jconfigint.h.in +44 -0
  46. data/ext/image_pack/vendor/mozjpeg/jcparam.c +991 -0
  47. data/ext/image_pack/vendor/mozjpeg/jcphuff.c +1123 -0
  48. data/ext/image_pack/vendor/mozjpeg/jcprepct.c +351 -0
  49. data/ext/image_pack/vendor/mozjpeg/jcsample.c +522 -0
  50. data/ext/image_pack/vendor/mozjpeg/jcstest.c +126 -0
  51. data/ext/image_pack/vendor/mozjpeg/jctrans.c +408 -0
  52. data/ext/image_pack/vendor/mozjpeg/jdapimin.c +407 -0
  53. data/ext/image_pack/vendor/mozjpeg/jdapistd.c +691 -0
  54. data/ext/image_pack/vendor/mozjpeg/jdarith.c +782 -0
  55. data/ext/image_pack/vendor/mozjpeg/jdatadst-tj.c +198 -0
  56. data/ext/image_pack/vendor/mozjpeg/jdatadst.c +299 -0
  57. data/ext/image_pack/vendor/mozjpeg/jdatasrc-tj.c +194 -0
  58. data/ext/image_pack/vendor/mozjpeg/jdatasrc.c +295 -0
  59. data/ext/image_pack/vendor/mozjpeg/jdcoefct.c +881 -0
  60. data/ext/image_pack/vendor/mozjpeg/jdcoefct.h +83 -0
  61. data/ext/image_pack/vendor/mozjpeg/jdcol565.c +384 -0
  62. data/ext/image_pack/vendor/mozjpeg/jdcolext.c +141 -0
  63. data/ext/image_pack/vendor/mozjpeg/jdcolor.c +881 -0
  64. data/ext/image_pack/vendor/mozjpeg/jdct.h +208 -0
  65. data/ext/image_pack/vendor/mozjpeg/jddctmgr.c +367 -0
  66. data/ext/image_pack/vendor/mozjpeg/jdhuff.c +834 -0
  67. data/ext/image_pack/vendor/mozjpeg/jdhuff.h +247 -0
  68. data/ext/image_pack/vendor/mozjpeg/jdicc.c +167 -0
  69. data/ext/image_pack/vendor/mozjpeg/jdinput.c +408 -0
  70. data/ext/image_pack/vendor/mozjpeg/jdmainct.c +460 -0
  71. data/ext/image_pack/vendor/mozjpeg/jdmainct.h +71 -0
  72. data/ext/image_pack/vendor/mozjpeg/jdmarker.c +1374 -0
  73. data/ext/image_pack/vendor/mozjpeg/jdmaster.c +727 -0
  74. data/ext/image_pack/vendor/mozjpeg/jdmaster.h +33 -0
  75. data/ext/image_pack/vendor/mozjpeg/jdmerge.c +587 -0
  76. data/ext/image_pack/vendor/mozjpeg/jdmerge.h +47 -0
  77. data/ext/image_pack/vendor/mozjpeg/jdmrg565.c +354 -0
  78. data/ext/image_pack/vendor/mozjpeg/jdmrgext.c +184 -0
  79. data/ext/image_pack/vendor/mozjpeg/jdphuff.c +679 -0
  80. data/ext/image_pack/vendor/mozjpeg/jdpostct.c +294 -0
  81. data/ext/image_pack/vendor/mozjpeg/jdsample.c +524 -0
  82. data/ext/image_pack/vendor/mozjpeg/jdsample.h +50 -0
  83. data/ext/image_pack/vendor/mozjpeg/jdtrans.c +156 -0
  84. data/ext/image_pack/vendor/mozjpeg/jerror.c +251 -0
  85. data/ext/image_pack/vendor/mozjpeg/jerror.h +335 -0
  86. data/ext/image_pack/vendor/mozjpeg/jfdctflt.c +169 -0
  87. data/ext/image_pack/vendor/mozjpeg/jfdctfst.c +227 -0
  88. data/ext/image_pack/vendor/mozjpeg/jfdctint.c +288 -0
  89. data/ext/image_pack/vendor/mozjpeg/jidctflt.c +240 -0
  90. data/ext/image_pack/vendor/mozjpeg/jidctfst.c +371 -0
  91. data/ext/image_pack/vendor/mozjpeg/jidctint.c +2627 -0
  92. data/ext/image_pack/vendor/mozjpeg/jidctred.c +409 -0
  93. data/ext/image_pack/vendor/mozjpeg/jinclude.h +147 -0
  94. data/ext/image_pack/vendor/mozjpeg/jmemmgr.c +1180 -0
  95. data/ext/image_pack/vendor/mozjpeg/jmemnobs.c +110 -0
  96. data/ext/image_pack/vendor/mozjpeg/jmemsys.h +178 -0
  97. data/ext/image_pack/vendor/mozjpeg/jmorecfg.h +382 -0
  98. data/ext/image_pack/vendor/mozjpeg/jpeg_nbits_table.h +4098 -0
  99. data/ext/image_pack/vendor/mozjpeg/jpegcomp.h +32 -0
  100. data/ext/image_pack/vendor/mozjpeg/jpegint.h +453 -0
  101. data/ext/image_pack/vendor/mozjpeg/jpeglib.h +1211 -0
  102. data/ext/image_pack/vendor/mozjpeg/jpegtran.c +827 -0
  103. data/ext/image_pack/vendor/mozjpeg/jpegyuv.c +172 -0
  104. data/ext/image_pack/vendor/mozjpeg/jquant1.c +856 -0
  105. data/ext/image_pack/vendor/mozjpeg/jquant2.c +1286 -0
  106. data/ext/image_pack/vendor/mozjpeg/jsimd.h +123 -0
  107. data/ext/image_pack/vendor/mozjpeg/jsimd_none.c +431 -0
  108. data/ext/image_pack/vendor/mozjpeg/jsimddct.h +70 -0
  109. data/ext/image_pack/vendor/mozjpeg/jstdhuff.c +144 -0
  110. data/ext/image_pack/vendor/mozjpeg/jutils.c +133 -0
  111. data/ext/image_pack/vendor/mozjpeg/jversion.h.in +56 -0
  112. data/ext/image_pack/vendor/mozjpeg/libjpeg.map.in +11 -0
  113. data/ext/image_pack/vendor/mozjpeg/libjpeg.txt +3150 -0
  114. data/ext/image_pack/vendor/mozjpeg/rdbmp.c +690 -0
  115. data/ext/image_pack/vendor/mozjpeg/rdcolmap.c +253 -0
  116. data/ext/image_pack/vendor/mozjpeg/rdgif.c +720 -0
  117. data/ext/image_pack/vendor/mozjpeg/rdjpeg.c +160 -0
  118. data/ext/image_pack/vendor/mozjpeg/rdjpgcom.c +494 -0
  119. data/ext/image_pack/vendor/mozjpeg/rdpng.c +194 -0
  120. data/ext/image_pack/vendor/mozjpeg/rdppm.c +781 -0
  121. data/ext/image_pack/vendor/mozjpeg/rdswitch.c +642 -0
  122. data/ext/image_pack/vendor/mozjpeg/rdtarga.c +508 -0
  123. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jccolext-neon.c +148 -0
  124. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jchuff-neon.c +334 -0
  125. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd.c +976 -0
  126. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch32/jsimd_neon.S +1200 -0
  127. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jccolext-neon.c +316 -0
  128. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jchuff-neon.c +411 -0
  129. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd.c +1053 -0
  130. data/ext/image_pack/vendor/mozjpeg/simd/arm/aarch64/jsimd_neon.S +2254 -0
  131. data/ext/image_pack/vendor/mozjpeg/simd/arm/align.h +28 -0
  132. data/ext/image_pack/vendor/mozjpeg/simd/arm/jccolor-neon.c +160 -0
  133. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgray-neon.c +120 -0
  134. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcgryext-neon.c +106 -0
  135. data/ext/image_pack/vendor/mozjpeg/simd/arm/jchuff.h +131 -0
  136. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcphuff-neon.c +623 -0
  137. data/ext/image_pack/vendor/mozjpeg/simd/arm/jcsample-neon.c +192 -0
  138. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolext-neon.c +374 -0
  139. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdcolor-neon.c +141 -0
  140. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmerge-neon.c +144 -0
  141. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdmrgext-neon.c +723 -0
  142. data/ext/image_pack/vendor/mozjpeg/simd/arm/jdsample-neon.c +569 -0
  143. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctfst-neon.c +214 -0
  144. data/ext/image_pack/vendor/mozjpeg/simd/arm/jfdctint-neon.c +376 -0
  145. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctfst-neon.c +472 -0
  146. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctint-neon.c +801 -0
  147. data/ext/image_pack/vendor/mozjpeg/simd/arm/jidctred-neon.c +486 -0
  148. data/ext/image_pack/vendor/mozjpeg/simd/arm/jquanti-neon.c +193 -0
  149. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h +26 -0
  150. data/ext/image_pack/vendor/mozjpeg/simd/arm/neon-compat.h.in +37 -0
  151. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-avx2.asm +578 -0
  152. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-mmx.asm +476 -0
  153. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolext-sse2.asm +503 -0
  154. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-avx2.asm +121 -0
  155. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-mmx.asm +121 -0
  156. data/ext/image_pack/vendor/mozjpeg/simd/i386/jccolor-sse2.asm +120 -0
  157. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-avx2.asm +113 -0
  158. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-mmx.asm +113 -0
  159. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgray-sse2.asm +112 -0
  160. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-avx2.asm +457 -0
  161. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-mmx.asm +355 -0
  162. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcgryext-sse2.asm +382 -0
  163. data/ext/image_pack/vendor/mozjpeg/simd/i386/jchuff-sse2.asm +761 -0
  164. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcphuff-sse2.asm +662 -0
  165. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-avx2.asm +388 -0
  166. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-mmx.asm +324 -0
  167. data/ext/image_pack/vendor/mozjpeg/simd/i386/jcsample-sse2.asm +351 -0
  168. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-avx2.asm +515 -0
  169. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-mmx.asm +404 -0
  170. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolext-sse2.asm +458 -0
  171. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-avx2.asm +118 -0
  172. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-mmx.asm +117 -0
  173. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdcolor-sse2.asm +117 -0
  174. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-avx2.asm +136 -0
  175. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-mmx.asm +123 -0
  176. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmerge-sse2.asm +135 -0
  177. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-avx2.asm +575 -0
  178. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-mmx.asm +460 -0
  179. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdmrgext-sse2.asm +517 -0
  180. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-avx2.asm +760 -0
  181. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-mmx.asm +731 -0
  182. data/ext/image_pack/vendor/mozjpeg/simd/i386/jdsample-sse2.asm +724 -0
  183. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-3dn.asm +318 -0
  184. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctflt-sse.asm +369 -0
  185. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-mmx.asm +395 -0
  186. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctfst-sse2.asm +403 -0
  187. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-avx2.asm +331 -0
  188. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-mmx.asm +620 -0
  189. data/ext/image_pack/vendor/mozjpeg/simd/i386/jfdctint-sse2.asm +633 -0
  190. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-3dn.asm +451 -0
  191. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse.asm +571 -0
  192. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctflt-sse2.asm +497 -0
  193. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-mmx.asm +499 -0
  194. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctfst-sse2.asm +501 -0
  195. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-avx2.asm +453 -0
  196. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-mmx.asm +851 -0
  197. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctint-sse2.asm +858 -0
  198. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-mmx.asm +704 -0
  199. data/ext/image_pack/vendor/mozjpeg/simd/i386/jidctred-sse2.asm +592 -0
  200. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-3dn.asm +230 -0
  201. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-mmx.asm +276 -0
  202. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquant-sse.asm +208 -0
  203. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquantf-sse2.asm +168 -0
  204. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-avx2.asm +188 -0
  205. data/ext/image_pack/vendor/mozjpeg/simd/i386/jquanti-sse2.asm +201 -0
  206. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimd.c +1312 -0
  207. data/ext/image_pack/vendor/mozjpeg/simd/i386/jsimdcpu.asm +135 -0
  208. data/ext/image_pack/vendor/mozjpeg/simd/jsimd.h +1258 -0
  209. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd.c +1143 -0
  210. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2.S +4543 -0
  211. data/ext/image_pack/vendor/mozjpeg/simd/mips/jsimd_dspr2_asm.h +292 -0
  212. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolext-mmi.c +455 -0
  213. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jccolor-mmi.c +148 -0
  214. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgray-mmi.c +132 -0
  215. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcgryext-mmi.c +374 -0
  216. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample-mmi.c +98 -0
  217. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jcsample.h +28 -0
  218. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolext-mmi.c +415 -0
  219. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdcolor-mmi.c +139 -0
  220. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmerge-mmi.c +149 -0
  221. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdmrgext-mmi.c +615 -0
  222. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jdsample-mmi.c +304 -0
  223. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctfst-mmi.c +255 -0
  224. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jfdctint-mmi.c +398 -0
  225. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctfst-mmi.c +395 -0
  226. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jidctint-mmi.c +571 -0
  227. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jquanti-mmi.c +124 -0
  228. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd.c +866 -0
  229. data/ext/image_pack/vendor/mozjpeg/simd/mips64/jsimd_mmi.h +69 -0
  230. data/ext/image_pack/vendor/mozjpeg/simd/mips64/loongson-mmintrin.h +1334 -0
  231. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jcolsamp.inc +135 -0
  232. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jdct.inc +31 -0
  233. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc +93 -0
  234. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdcfg.inc.h +133 -0
  235. data/ext/image_pack/vendor/mozjpeg/simd/nasm/jsimdext.inc +520 -0
  236. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolext-altivec.c +269 -0
  237. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jccolor-altivec.c +116 -0
  238. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgray-altivec.c +111 -0
  239. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcgryext-altivec.c +228 -0
  240. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample-altivec.c +159 -0
  241. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jcsample.h +28 -0
  242. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolext-altivec.c +276 -0
  243. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdcolor-altivec.c +106 -0
  244. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmerge-altivec.c +130 -0
  245. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdmrgext-altivec.c +329 -0
  246. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jdsample-altivec.c +400 -0
  247. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctfst-altivec.c +154 -0
  248. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jfdctint-altivec.c +258 -0
  249. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctfst-altivec.c +255 -0
  250. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jidctint-altivec.c +357 -0
  251. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jquanti-altivec.c +250 -0
  252. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd.c +884 -0
  253. data/ext/image_pack/vendor/mozjpeg/simd/powerpc/jsimd_altivec.h +98 -0
  254. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-avx2.asm +559 -0
  255. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolext-sse2.asm +484 -0
  256. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-avx2.asm +121 -0
  257. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jccolor-sse2.asm +120 -0
  258. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-avx2.asm +113 -0
  259. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgray-sse2.asm +112 -0
  260. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-avx2.asm +438 -0
  261. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcgryext-sse2.asm +363 -0
  262. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jchuff-sse2.asm +583 -0
  263. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcphuff-sse2.asm +639 -0
  264. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-avx2.asm +367 -0
  265. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jcsample-sse2.asm +330 -0
  266. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-avx2.asm +496 -0
  267. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolext-sse2.asm +439 -0
  268. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-avx2.asm +118 -0
  269. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdcolor-sse2.asm +117 -0
  270. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-avx2.asm +136 -0
  271. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmerge-sse2.asm +135 -0
  272. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-avx2.asm +596 -0
  273. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdmrgext-sse2.asm +538 -0
  274. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-avx2.asm +696 -0
  275. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jdsample-sse2.asm +665 -0
  276. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctflt-sse.asm +355 -0
  277. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctfst-sse2.asm +389 -0
  278. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-avx2.asm +320 -0
  279. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jfdctint-sse2.asm +619 -0
  280. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctflt-sse2.asm +482 -0
  281. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctfst-sse2.asm +491 -0
  282. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-avx2.asm +418 -0
  283. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctint-sse2.asm +847 -0
  284. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jidctred-sse2.asm +574 -0
  285. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquantf-sse2.asm +155 -0
  286. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-avx2.asm +163 -0
  287. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jquanti-sse2.asm +188 -0
  288. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimd.c +1110 -0
  289. data/ext/image_pack/vendor/mozjpeg/simd/x86_64/jsimdcpu.asm +86 -0
  290. data/ext/image_pack/vendor/mozjpeg/strtest.c +170 -0
  291. data/ext/image_pack/vendor/mozjpeg/structure.txt +900 -0
  292. data/ext/image_pack/vendor/mozjpeg/tjbench.c +1044 -0
  293. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.in +256 -0
  294. data/ext/image_pack/vendor/mozjpeg/tjbenchtest.java.in +215 -0
  295. data/ext/image_pack/vendor/mozjpeg/tjexample.c +406 -0
  296. data/ext/image_pack/vendor/mozjpeg/tjexampletest.in +149 -0
  297. data/ext/image_pack/vendor/mozjpeg/tjexampletest.java.in +151 -0
  298. data/ext/image_pack/vendor/mozjpeg/tjunittest.c +961 -0
  299. data/ext/image_pack/vendor/mozjpeg/tjutil.c +70 -0
  300. data/ext/image_pack/vendor/mozjpeg/tjutil.h +53 -0
  301. data/ext/image_pack/vendor/mozjpeg/transupp.c +2373 -0
  302. data/ext/image_pack/vendor/mozjpeg/transupp.h +243 -0
  303. data/ext/image_pack/vendor/mozjpeg/turbojpeg-jni.c +1259 -0
  304. data/ext/image_pack/vendor/mozjpeg/turbojpeg.c +2320 -0
  305. data/ext/image_pack/vendor/mozjpeg/turbojpeg.h +1784 -0
  306. data/ext/image_pack/vendor/mozjpeg/usage.txt +679 -0
  307. data/ext/image_pack/vendor/mozjpeg/wizard.txt +220 -0
  308. data/ext/image_pack/vendor/mozjpeg/wrbmp.c +552 -0
  309. data/ext/image_pack/vendor/mozjpeg/wrgif.c +580 -0
  310. data/ext/image_pack/vendor/mozjpeg/wrjpgcom.c +577 -0
  311. data/ext/image_pack/vendor/mozjpeg/wrppm.c +366 -0
  312. data/ext/image_pack/vendor/mozjpeg/wrtarga.c +258 -0
  313. data/ext/image_pack/vendor/mozjpeg/yuvjpeg.c +268 -0
  314. data/lib/image_pack/backend.rb +8 -0
  315. data/lib/image_pack/configuration.rb +23 -0
  316. data/lib/image_pack/errors.rb +13 -0
  317. data/lib/image_pack/version.rb +5 -0
  318. data/lib/image_pack.rb +208 -0
  319. metadata +433 -0
@@ -0,0 +1,761 @@
1
+ ;
2
+ ; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
3
+ ;
4
+ ; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
5
+ ; Copyright (C) 2015, Matthieu Darbois.
6
+ ; Copyright (C) 2018, Matthias Räncker.
7
+ ;
8
+ ; Based on the x86 SIMD extension for IJG JPEG library
9
+ ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10
+ ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11
+ ;
12
+ ; This file should be assembled with NASM (Netwide Assembler),
13
+ ; can *not* be assembled with Microsoft's MASM or any compatible
14
+ ; assembler (including Borland's Turbo Assembler).
15
+ ; NASM is available from http://nasm.sourceforge.net/ or
16
+ ; http://sourceforge.net/project/showfiles.php?group_id=6208
17
+ ;
18
+ ; This file contains an SSE2 implementation for Huffman coding of one block.
19
+ ; The following code is based on jchuff.c; see jchuff.c for more details.
20
+
21
+ %include "jsimdext.inc"
22
+
23
+ struc working_state ; field offsets for the `working_state *state` argument of the encoder below
24
+ .next_output_byte: resp 1 ; => next byte to write in buffer (resp is defined outside this file; presumably one pointer-sized slot - confirm)
25
+ .free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
26
+ .cur.put_buffer.simd resq 1 ; current bit accumulation buffer (one qword, i.e. 64 bits)
27
+ .cur.free_bits resd 1 ; # of bits available in it (one dword)
28
+ .cur.last_dc_val resd 4 ; last DC coef for each component (four dword slots)
29
+ .cinfo: resp 1 ; dump_buffer needs access to this
30
+ endstruc
31
+
32
+ struc c_derived_tbl ; field offsets for the dctbl / actbl arguments of the encoder below
33
+ .ehufco: resd 256 ; code for each symbol (256 dword entries, offset 0)
34
+ .ehufsi: resb 256 ; length of code for each symbol (256 byte entries, offset 256 * 4 = 1024)
35
+ ; If no code has been allocated for a symbol S, ehufsi[S] contains 0
36
+ endstruc
37
+
38
+ ; --------------------------------------------------------------------------
39
+ SECTION SEG_CONST
40
+
41
+ GLOBAL_DATA(jconst_huff_encode_one_block)
42
+
43
+ EXTN(jconst_huff_encode_one_block):
44
+
45
+ alignz 32
46
+
47
+ jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007 ; 16 qword masks: entry n = (1 << n) - 1 for n = 0..15
48
+ dq 0x000f, 0x001f, 0x003f, 0x007f
49
+ dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
50
+ dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
51
+
52
+ times 1 << 14 db 15 ; start of the mirrored half (32768 bytes below the label): jpeg_nbits_table[-1 - k] = bit length of k
53
+ times 1 << 13 db 14
54
+ times 1 << 12 db 13
55
+ times 1 << 11 db 12
56
+ times 1 << 10 db 11
57
+ times 1 << 9 db 10
58
+ times 1 << 8 db 9
59
+ times 1 << 7 db 8
60
+ times 1 << 6 db 7
61
+ times 1 << 5 db 6
62
+ times 1 << 4 db 5
63
+ times 1 << 3 db 4
64
+ times 1 << 2 db 3
65
+ times 1 << 1 db 2
66
+ times 1 << 0 db 1
67
+ times 1 db 0 ; jpeg_nbits_table[-1] = 0 (bit length of 0)
68
+ jpeg_nbits_table: ; label sits in the middle: valid byte indices are -32768 .. 32767
69
+ times 1 db 0 ; forward half: jpeg_nbits_table[k] = bit length of k, k = 0..32767
70
+ times 1 << 0 db 1
71
+ times 1 << 1 db 2
72
+ times 1 << 2 db 3
73
+ times 1 << 3 db 4
74
+ times 1 << 4 db 5
75
+ times 1 << 5 db 6
76
+ times 1 << 6 db 7
77
+ times 1 << 7 db 8
78
+ times 1 << 8 db 9
79
+ times 1 << 9 db 10
80
+ times 1 << 10 db 11
81
+ times 1 << 11 db 12
82
+ times 1 << 12 db 13
83
+ times 1 << 13 db 14
84
+ times 1 << 14 db 15 ; k = 16384..32767
85
+
86
+ alignz 32
87
+
88
+ %ifdef PIC ; PIC builds address the tables through nbits_base rather than an absolute symbol
89
+ %define NBITS(x) nbits_base + x ; nbits_base is defined outside this excerpt; presumably a register holding the address of jpeg_nbits_table - confirm
90
+ %else
91
+ %define NBITS(x) jpeg_nbits_table + x ; absolute address of jpeg_nbits_table[x]
92
+ %endif
93
+ %define MASK_BITS(x) NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table) ; address of jpeg_mask_bits[x] (8-byte entries), formed as an offset from the NBITS base
94
+
95
+ ; --------------------------------------------------------------------------
96
+ SECTION SEG_TEXT
97
+ BITS 32 ; everything below is 32-bit code
98
+
99
+ %define mm_put_buffer mm0 ; bit accumulation buffer
100
+ %define mm_all_0xff mm1 ; compared bytewise against the output qword in EMIT_QWORD to find 0xFF bytes (initialised outside this excerpt)
101
+ %define mm_temp mm2 ; scratch
102
+ %define mm_nbits mm3 ; shift count applied to put_buffer in EMIT_QWORD
103
+ %define mm_code_bits mm3 ; same physical register as mm_nbits (both are mm3)
104
+ %define mm_code mm4 ; code bits being emitted
105
+ %define mm_overflow_bits mm5 ; right-shift count applied to the code in EMIT_QWORD
106
+ %define mm_save_nbits mm6 ; holds nbits_base while EMIT_QWORD borrows it as its temp register
107
+
108
+ ; Shorthand used to describe SIMD operations:
109
+ ; wN: xmmN treated as eight signed 16-bit values
110
+ ; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
111
+ ; bN: xmmN treated as 16 unsigned 8-bit values, or
112
+ ; mmN treated as eight unsigned 8-bit values
113
+ ; bN[i]: perform the same operation on all unsigned 8-bit values,
114
+ ; i=0..15 (SSE register) or i=0..7 (MMX register)
115
+ ; Contents of SIMD registers are shown in memory order.
116
+
117
+ ; Fill the bit buffer to capacity with the leading bits from code, then output
118
+ ; the bit buffer and put the remaining bits from code into the bit buffer.
119
+ ;
120
+ ; Usage:
121
+ ; code - contains the bits to shift into the bit buffer (LSB-aligned)
122
+ ; %1 - temp register
123
+ ; %2 - low byte of temp register
124
+ ; %3 - second byte of temp register
125
+ ; %4-%8 - extra instructions to execute before the macro completes (each may be left empty, but all nine arguments must be present)
126
+ ; %9 - the label to which to jump when the macro completes
127
+ ;
128
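+ ; Upon completion, free_bits is 64 higher than on entry (it is negated twice and
129
+ ; then 64 is added; presumably this is the space left in the emptied bit buffer once
130
+ ; the remaining bits from code are stored - confirm), and put_buffer will contain those remaining bits. temp and code will be clobbered.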
131
+ ;
132
+ ; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
133
+ ; macro in jchuff.c.
134
+
135
+ %macro EMIT_QWORD 9
136
+ %define %%temp %1
137
+ %define %%tempb %2
138
+ %define %%temph %3
139
+ add nbits, free_bits ; nbits += free_bits;
140
+ neg free_bits ; free_bits = -free_bits;
141
+ movq mm_temp, mm_code ; temp = code;
142
+ movd mm_nbits, nbits ; nbits --> MMX register
143
+ movd mm_overflow_bits, free_bits ; overflow_bits (temp register) = free_bits;
144
+ neg free_bits ; free_bits = -free_bits;
145
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
146
+ psrlq mm_temp, mm_overflow_bits ; temp >>= overflow_bits;
147
+ add free_bits, 64 ; free_bits += 64;
148
+ por mm_temp, mm_put_buffer ; temp |= put_buffer;
149
+ %ifidn %%temp, nbits_base
150
+ movd mm_save_nbits, nbits_base ; save nbits_base (it is the temp register in this expansion)
151
+ %endif
152
+ movq mm_code_bits, mm_temp ; code_bits (temp register) = temp;
153
+ movq mm_put_buffer, mm_code ; put_buffer = code;
154
+ pcmpeqb mm_temp, mm_all_0xff ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
155
+ movq mm_code, mm_code_bits ; code = code_bits;
156
+ psrlq mm_code_bits, 32 ; code_bits >>= 32;
157
+ pmovmskb nbits, mm_temp ; nbits = 0; nbits |= ((b_temp[i] >> 7) << i);
158
+ movd %%temp, mm_code_bits ; temp = (uint32_t)code_bits; /* upper 32 bits of the qword */
159
+ bswap %%temp ; temp = htonl(temp);
160
+ test nbits, nbits ; if (nbits != 0) /* Some 0xFF bytes */
161
+ jnz %%.SLOW ; goto %%.SLOW
162
+ mov dword [buffer], %%temp ; *(uint32_t *)buffer = temp;
163
+ %ifidn %%temp, nbits_base
164
+ movd nbits_base, mm_save_nbits ; restore nbits_base
165
+ %endif
166
+ %4 ; caller-supplied extra instruction (may be empty)
167
+ movd nbits, mm_code ; nbits = (uint32_t)(code); /* lower 32 bits of the qword */
168
+ %5
169
+ bswap nbits ; nbits = htonl(nbits);
170
+ mov dword [buffer + 4], nbits ; *(uint32_t *)(buffer + 4) = nbits;
171
+ lea buffer, [buffer + 8] ; buffer += 8;
172
+ %6
173
+ %7
174
+ %8
175
+ jmp %9 ; return (fast path: no 0xFF byte, 8 bytes written)
176
+ %%.SLOW:
177
+ ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
178
+ ; bytes in the qword.
179
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
180
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
181
+ mov byte [buffer+1], 0 ; buffer[1] = 0; /* stuffed 0x00: kept only when temp[0] == 0xFF, otherwise overwritten by the next byte */
182
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0)); /* i.e. advance by 1, or by 2 after a 0xFF */
183
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
184
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
185
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
186
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
187
+ shr %%temp, 16 ; temp >>= 16;
188
+ mov byte [buffer], %%tempb ; buffer[0] = temp[0];
189
+ cmp %%tempb, 0xFF ; Set CF if temp[0] < 0xFF
190
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
191
+ sbb buffer, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
192
+ mov byte [buffer], %%temph ; buffer[0] = temp[1];
193
+ cmp %%temph, 0xFF ; Set CF if temp[1] < 0xFF
194
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
195
+ sbb buffer, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
196
+ movd nbits, mm_code ; nbits (temp register) = (uint32_t)(code) /* lower 32 bits of the qword */
197
+ %ifidn %%temp, nbits_base
198
+ movd nbits_base, mm_save_nbits ; restore nbits_base
199
+ %endif
200
+ bswap nbits ; nbits = htonl(nbits)
201
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
202
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
203
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
204
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
205
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
206
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
207
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
208
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
209
+ shr nbits, 16 ; nbits >>= 16;
210
+ mov byte [buffer], nbitsb ; buffer[0] = nbits[0];
211
+ cmp nbitsb, 0xFF ; Set CF if nbits[0] < 0xFF
212
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
213
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
214
+ mov byte [buffer], nbitsh ; buffer[0] = nbits[1];
215
+ %4 ; caller-supplied extra instruction, expanded ahead of the final cmp/sbb pair
216
+ cmp nbitsh, 0xFF ; Set CF if nbits[1] < 0xFF
217
+ mov byte [buffer+1], 0 ; buffer[1] = 0;
218
+ sbb buffer, -2 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
219
+ %5
220
+ %6
221
+ %7
222
+ %8
223
+ jmp %9 ; return;
224
+ %endmacro
225
+
226
+ %macro PUSH 1
227
+ push %1 ; push the operand; the next line records the 4 extra bytes in stack_offset
228
+ %assign stack_offset stack_offset + 4 ; assemble-time counter only, emits no code (stack_offset is initialised outside this excerpt)
229
+ %endmacro
230
+
231
+ %macro POP 1
232
+ pop %1 ; pop into the operand; the next line removes the 4 bytes from stack_offset
233
+ %assign stack_offset stack_offset - 4 ; assemble-time counter only, emits no code (mirror of PUSH)
234
+ %endmacro
235
+
236
+ ; If PIC is defined, load the address of a symbol defined in this file into a
237
+ ; register. Equivalent to
238
+ ; get_GOT %1
239
+ ; lea %1, [GOTOFF(%1, %2)]
240
+ ; without using the GOT.
241
+ ;
242
+ ; Usage:
243
+ ; %1 - register into which to load the address of the symbol
244
+ ; %2 - symbol whose address should be loaded
245
+ ; %3 - optional multi-line macro to execute before the symbol address is loaded
246
+ ; %4 - optional multi-line macro to execute after the symbol address is loaded
247
+ ;
248
+ ; If PIC is not defined, then %3 and %4 are executed in order.
249
+
250
+ %macro GET_SYM 2-4
251
+ %ifdef PIC
252
+ call %%.geteip ; pushes the address of %%.ref and enters the thunk below
253
+ %%.ref: ; execution resumes here with %1 = address of %%.ref
254
+ %4 ; runs here, before the add below turns %1 into the symbol address
255
+ add %1, %2 - %%.ref ; %1 = &%%.ref + (%2 - %%.ref) = address of %2
256
+ jmp short %%.done ; skip over the out-of-line thunk
257
+ align 32
258
+ %%.geteip:
259
+ %3 4 ; must adjust stack pointer because of call (%3 is invoked with 4 here, and with 0 in the non-PIC branch)
260
+ mov %1, POINTER [esp] ; %1 = [esp], the return address pushed by the call (assumes %3 leaves esp unchanged)
261
+ ret ; back to %%.ref
262
+ align 32
263
+ %%.done:
264
+ %else
265
+ %3 0 ; non-PIC: no call was made, so %3 is invoked with 0
266
+ %4
267
+ %endif
268
+ %endmacro
269
+
270
+ ;
271
+ ; Encode a single block's worth of coefficients.
272
+ ;
273
+ ; GLOBAL(JOCTET *)
274
+ ; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
275
+ ; JCOEFPTR block, int last_dc_val,
276
+ ; c_derived_tbl *dctbl, c_derived_tbl *actbl)
277
+ ;
278
+ ; Stack layout:
279
+ ; Function args
280
+ ; Return address
281
+ ; Saved ebx
282
+ ; Saved ebp
283
+ ; Saved esi
284
+ ; Saved edi <-- esp_save
285
+ ; ...
286
+ ; esp_save
287
+ ; t_ 64*2 bytes (aligned to 128 bytes)
288
+ ;
289
+ ; esp is used (as t) to point into t_ (data in lower indices is not used once
290
+ ; esp passes over them, so this is signal-safe.) Aligning to 128 bytes allows
291
+ ; us to find the rest of the data again.
292
+ ;
293
+ ; NOTES:
294
+ ; When shuffling data, we try to avoid pinsrw as much as possible, since it is
295
+ ; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
296
+ ; modern CPUs, so chains of pinsrw instructions (even with different outputs)
297
+ ; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
298
+ ; requires 2 µops (with memory operand) on Intel. In either case, only one
299
+ ; pinsrw instruction can be decoded per cycle (and nothing else if they are
300
+ ; back-to-back), so out-of-order execution cannot be used to work around long
301
+ ; pinsrw chains (though for Sandy Bridge and later, this may be less of a
302
+ ; problem if the code runs from the µop cache.)
303
+ ;
304
+ ; We use tzcnt instead of bsf without checking for support. The instruction is
305
+ ; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
306
+ ; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
307
+ ; an input dependency (although the behavior is not formally defined, Intel
308
+ ; CPUs usually leave the destination unmodified if the source is zero.) This
309
+ ; can prevent out-of-order execution, so we clear the destination before
310
+ ; invoking tzcnt.
311
+ ;
312
+ ; Initial register allocation
313
+ ; eax - frame --> buffer
314
+ ; ebx - nbits_base (PIC) / emit_temp
315
+ ; ecx - dctbl --> size --> state
316
+ ; edx - block --> nbits
317
+ ; esi - code_temp --> state --> actbl
318
+ ; edi - index_temp --> free_bits
319
+ ; esp - t
320
+ ; ebp - index
321
+
322
+ %define frame eax ; copy of esp taken after the four register saves; arguments are read as [frame + arg_*]
323
+ %ifdef PIC
324
+ %define nbits_base ebx ; only defined for PIC builds; shares ebx with emit_temp below
325
+ %endif
326
+ %define emit_temp ebx
327
+ %define emit_tempb bl
328
+ %define emit_temph bh
329
+ %define dctbl ecx
330
+ %define block edx
331
+ %define code_temp esi
332
+ %define index_temp edi
333
+ %define t esp ; the stack pointer itself walks the reordered coefficient array t_
334
+ %define index ebp
335
+ ; save_frame is the byte offset from &t_[0] (just past the 64-word t_ array) where the prologue stores frame
336
+ %assign save_frame DCTSIZE2 * SIZEOF_WORD
337
+
338
+ ; Step 1: Re-arrange input data according to jpeg_natural_order
339
+ ; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
340
+ ; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
341
+ ; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
342
+ ; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
343
+ ; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
344
+ ; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
345
+ ; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
346
+ ; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
347
+
348
+ align 32
349
+ GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
350
+
351
+ EXTN(jsimd_huff_encode_one_block_sse2):
352
+ ; Entry: save the callee-saved registers, carve the 128-byte-aligned scratch array t_ out of the stack, and build rows A-D of the reordered block.
353
+ %assign stack_offset 0
354
+ %define arg_state 4 + stack_offset ; arg_* are offsets from esp (later from frame); stack_offset is expanded at each use, so every PUSH/POP shifts them
355
+ %define arg_buffer 8 + stack_offset
356
+ %define arg_block 12 + stack_offset
357
+ %define arg_last_dc_val 16 + stack_offset
358
+ %define arg_dctbl 20 + stack_offset
359
+ %define arg_actbl 24 + stack_offset
360
+
361
+ ;X: X = code stream
362
+ mov block, [esp + arg_block]
363
+ PUSH ebx
364
+ PUSH ebp
365
+ movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
366
+ PUSH esi
367
+ PUSH edi
368
+ movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
369
+ mov frame, esp ; frame = esp after the four saves (points at saved edi)
370
+ lea t, [frame - (save_frame + 4)] ; reserve room for t_ plus the 4-byte saved-frame slot
371
+ movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
372
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t = &t_[0]
373
+ mov [t + save_frame], frame ; stash frame just above t_ so the epilogue can restore esp
374
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
375
+ punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
376
+ pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
377
+ pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
378
+ punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
379
+ punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
380
+ pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
381
+ ;A: (Row 0, offset 1)
382
+ pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
383
+ paddw xmm0, xmm4 ;A: w0[i] += w4[i];
384
+ movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
385
+
386
+ movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
387
+ pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
388
+ pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
389
+ movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
390
+ movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
391
+ punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
392
+ pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
393
+ pxor xmm4, xmm4 ;A: w4[i] = 0;
394
+ psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
395
+ pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
396
+ pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
397
+ ; (Row 1, offset 1)
398
+ pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
399
+ paddw xmm1, xmm4 ;B: w1[i] += w4[i];
400
+ movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
401
+ pxor xmm4, xmm4 ;B: w4[i] = 0;
402
+ pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
403
+
404
+ packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
405
+ ; w/ signed saturation
406
+
407
+ pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
408
+ pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
409
+ pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
410
+ pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
411
+ ; (Row 3, offset 1)
412
+ pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
413
+ paddw xmm3, xmm4 ;D: w3[i] += w4[i];
414
+ movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
415
+ pxor xmm4, xmm4 ;D: w4[i] = 0;
416
+ pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
417
+
418
+ pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
419
+ pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
420
+ pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
421
+ pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
422
+ pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
423
+ pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
424
+ ; (Row 2, offset 1)
425
+ pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
426
+ paddw xmm2, xmm4 ;C: w2[i] += w4[i];
427
+ movsx code_temp, word [block] ;Z: code_temp = block[0];
428
+
429
+ ; %1 - stack pointer adjustment (GET_SYM passes 4 when this body runs inside its call thunk, where esp is one return address lower, and 0 otherwise; only the t-relative store needs it)
430
+ %macro GET_SYM_BEFORE 1
431
+ movaps XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
432
+ ;C: t[i+16] = w2[i];
433
+ pxor xmm4, xmm4 ;C: w4[i] = 0;
434
+ pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
435
+ sub code_temp, [frame + arg_last_dc_val] ;Z: code_temp -= last_dc_val;
436
+
437
+ packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
438
+ ; w/ signed saturation
439
+
440
+ movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
441
+ pmovmskb index_temp, xmm2 ;Z: index_temp = 0; index_temp |= ((b2[i] >> 7) << i);
442
+ pmovmskb index, xmm0 ;Z: index = 0; index |= ((b0[i] >> 7) << i);
443
+ movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
444
+ punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
445
+ shl index_temp, 16 ;Z: index_temp <<= 16;
446
+ psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
447
+ pxor xmm2, xmm2 ;H: w2[i] = 0;
448
+ pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
449
+ or index, index_temp ;Z: index |= index_temp;
450
+ %undef index_temp
451
+ %define free_bits edi
452
+ %endmacro
453
+ ; Body passed to GET_SYM as its %4 argument. It takes no stack adjustment: its t-relative store is not offset, unlike the one in GET_SYM_BEFORE.
454
+ %macro GET_SYM_AFTER 0
455
+ movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
456
+ unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
457
+ pxor xmm0, xmm0 ;H: w0[i] = 0;
458
+ not index ;Z: index = ~index;
459
+ pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
460
+ ; (Row 7, offset 1)
461
+ pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
462
+ mov dctbl, [frame + arg_dctbl]
463
+ paddw xmm3, xmm2 ;H: w3[i] += w2[i];
464
+ movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
465
+ movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
466
+ pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
467
+ punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
468
+ movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
469
+ pcmpeqw mm_all_0xff, mm_all_0xff ;Z: all_0xff[i] = 0xFF;
470
+ %endmacro
471
+
472
+ GET_SYM nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
473
+ ; Build rows E-H, turn the DC difference in code_temp into its emitted form, load the DC Huffman code and size, then enter the AC loop at .BEGIN.
474
+ psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
475
+ shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
476
+ pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
477
+ pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
478
+ pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
479
+ pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
480
+ cmp code_temp, 1 << 31 ;Z: Set CF if code_temp < 0x80000000,
481
+ ;Z: i.e. if code_temp is non-negative (zero included)
482
+ pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
483
+ movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
484
+ pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
485
+ pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
486
+ pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
487
+ ; (Row 6, offset 1)
488
+ adc code_temp, -1 ;Z: code_temp += -1 + (code_temp >= 0 ? 1 : 0);
489
+ pxor xmm2, xmm2 ;G: w2[i] = 0;
490
+ pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
491
+ pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
492
+ paddw xmm4, xmm0 ;G: w4[i] += w0[i];
493
+ movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
494
+ movd mm_temp, code_temp ;Z: temp = code_temp
495
+ pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
496
+ ; (Row 5, offset 1)
497
+ pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
498
+
499
+ packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
500
+ ; w/ signed saturation
501
+
502
+ lea t, [t - SIZEOF_WORD] ;Z: t = &t[-1] (the two remaining row stores below use (40+1) and (32+1) to compensate)
503
+ pxor xmm0, xmm0 ;F: w0[i] = 0;
504
+ pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
505
+ paddw xmm1, xmm2 ;F: w1[i] += w2[i];
506
+ movaps XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
507
+ pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
508
+ pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
509
+ pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
510
+ pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
511
+ pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
512
+ ; (Row 4, offset 1)
513
+ %undef block ; last read of the input block is above; edx is reused as nbits from here on
514
+ %define nbits edx
515
+ %define nbitsb dl
516
+ %define nbitsh dh
517
+ movzx nbits, byte [NBITS(code_temp)] ;Z: nbits = JPEG_NBITS(code_temp);
518
+ %undef code_temp
519
+ %define state esi
520
+ pxor xmm2, xmm2 ;E: w2[i] = 0;
521
+ mov state, [frame + arg_state]
522
+ movd mm_nbits, nbits ;Z: nbits --> MMX register
523
+ pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
524
+ movd mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
525
+ ;Z: code = dctbl->ehufco[nbits];
526
+ %define size ecx
527
+ %define sizeb cl
528
+ %define sizeh ch
529
+ paddw xmm5, xmm0 ;E: w5[i] += w0[i];
530
+ movaps XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
531
+ movzx size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
532
+ ;Z: size = dctbl->ehufsi[nbits];
533
+ %undef dctbl
534
+ pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
535
+
536
+ packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
537
+ ; w/ signed saturation
538
+
539
+ movq mm_put_buffer, [state + working_state.cur.put_buffer.simd]
540
+ ;Z: put_buffer = state->cur.put_buffer.simd;
541
+ mov free_bits, [state + working_state.cur.free_bits]
542
+ ;Z: free_bits = state->cur.free_bits;
543
+ %undef state
544
+ %define actbl esi
545
+ mov actbl, [frame + arg_actbl]
546
+ %define buffer eax
547
+ mov buffer, [frame + arg_buffer] ; overwrites frame (also eax); the copy stored at [t_ + save_frame] is what the epilogue uses
548
+ %undef frame
549
+ jmp .BEGIN ; enter the loop with temp/code/size/nbits describing the DC coefficient
550
+
551
+ ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
552
+
553
+ align 16
554
+ ; size <= 32, so this is not really a loop
555
+ .BRLOOP1: ; .BRLOOP1: reached from .BLOOP1 when size > 16; emits one actbl[0xf0] code and continues at .ERLOOP1
556
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
557
+ ; nbits = actbl->ehufsi[0xf0];
558
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
559
+ ; code = actbl->ehufco[0xf0];
560
+ and index, 0x7ffffff ; clear index if size == 32 (only bit 31 was set then, and the shift in .BLOOP1 left it in place; for 17 <= size <= 31 the shifted index has at most 15 bits, so this 27-bit mask never drops a live bit)
561
+ sub size, 16 ; size -= 16;
562
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
563
+ jle .EMIT_BRLOOP1 ; goto .EMIT_BRLOOP1;
564
+ movd mm_nbits, nbits ; nbits --> MMX register
565
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
566
+ por mm_put_buffer, mm_code ; put_buffer |= code;
567
+ jmp .ERLOOP1 ; goto .ERLOOP1;
568
+
569
+ ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
570
+
571
+ align 16
572
+ %ifdef PIC
573
+ times 6 nop
574
+ %else
575
+ times 2 nop
576
+ %endif
577
+ .BLOOP1: ; do { /* size = # of zero bits/elements to skip */
578
+ ; if size == 32, index remains unchanged. Corrected in .BRLOOP1.
579
+ shr index, sizeb ; index >>= size;
580
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
581
+ cmp size, 16 ; if (size > 16)
582
+ jg .BRLOOP1 ; goto .BRLOOP1;
583
+ .ERLOOP1: ; .ERLOOP1:
584
+ movsx nbits, word [t] ; nbits = *t;
585
+ %ifdef PIC
586
+ add size, size ; size += size;
587
+ %else
588
+ lea size, [size * 2] ; size += size;
589
+ %endif
590
+ movd mm_temp, nbits ; temp = nbits;
591
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
592
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits; (16 * original size + nbits in total; the table lookups below subtract 16)
593
+ movd mm_nbits, nbits ; nbits --> MMX register
594
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
595
+ ; code = actbl->ehufco[size-16];
596
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
597
+ ; size = actbl->ehufsi[size-16];
598
+ .BEGIN: ; .BEGIN: also the entry point from the prologue, with temp/code/size/nbits set up from dctbl
599
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
600
+ psllq mm_code, mm_nbits ; code <<= nbits;
601
+ add nbits, size ; nbits += size;
602
+ por mm_code, mm_temp ; code |= temp;
603
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
604
+ jle .EMIT_ERLOOP1 ; insert code, flush buffer, init size, goto .BLOOP1
605
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
606
+ tzcnt size, index ; size = # of trailing 0 bits in index
607
+ movd mm_nbits, nbits ; nbits --> MMX register
608
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
609
+ inc size ; ++size;
610
+ por mm_put_buffer, mm_code ; put_buffer |= code;
611
+ test index, index
612
+ jnz .BLOOP1 ; } while (index != 0);
613
+ ; Round 2
614
+ ; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
615
+ .ELOOP1: ; .ELOOP1: build the second 32-bit index from the byte masks kept in xmm5 (low 16 bits) and xmm4 (high 16 bits)
616
+ pmovmskb size, xmm4 ; size = 0; size |= ((b4[i] >> 7) << i);
617
+ pmovmskb index, xmm5 ; index = 0; index |= ((b5[i] >> 7) << i);
618
+ shl size, 16 ; size <<= 16;
619
+ or index, size ; index |= size;
620
+ not index ; index = ~index;
621
+ lea nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
622
+ ; nbits = t + 1 + 64;
623
+ and nbits, -DCTSIZE2 * SIZEOF_WORD ; nbits &= -128; /* now points to &t_[64] */
624
+ sub nbits, t ; nbits -= t;
625
+ shr nbits, 1 ; nbits >>= 1; /* # of leading 0 bits in old index + 33 */
626
+ tzcnt size, index ; size = # of trailing 0 bits in index
627
+ inc size ; ++size;
628
+ test index, index ; if (index == 0)
629
+ jz .ELOOP2 ; goto .ELOOP2;
630
+ ; NOTE: size == 32 cannot happen, since the last element is always 0.
631
+ shr index, sizeb ; index >>= size;
632
+ lea size, [size + nbits - 33] ; size = size + nbits - 33; (adds the zeros left over from the end of round 1 to this skip count)
633
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
634
+ cmp size, 16 ; if (size <= 16)
635
+ jle .ERLOOP2 ; goto .ERLOOP2;
636
+ .BRLOOP2: ; do { (unlike .BRLOOP1 this can repeat: it re-tests size after each 0xf0 code)
637
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
638
+ ; nbits = actbl->ehufsi[0xf0];
639
+ sub size, 16 ; size -= 16;
640
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
641
+ ; code = actbl->ehufco[0xf0];
642
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
643
+ jle .EMIT_BRLOOP2 ; insert code and flush put_buffer
644
+ movd mm_nbits, nbits ; else { nbits --> MMX register
645
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
646
+ por mm_put_buffer, mm_code ; put_buffer |= code;
647
+ cmp size, 16 ; if (size <= 16)
648
+ jle .ERLOOP2 ; goto .ERLOOP2;
649
+ jmp .BRLOOP2 ; } while (1);
650
+
651
+ ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
652
+
653
+ align 16
654
+ .BLOOP2: ; do { /* size = # of zero bits/elements to skip */
655
+ shr index, sizeb ; index >>= size;
656
+ lea t, [t + size * SIZEOF_WORD] ; t += size;
657
+ cmp size, 16 ; if (size > 16)
658
+ jg .BRLOOP2 ; goto .BRLOOP2;
659
+ .ERLOOP2: ; .ERLOOP2:
660
+ movsx nbits, word [t] ; nbits = *t;
661
+ add size, size ; size += size;
662
+ movd mm_temp, nbits ; temp = nbits;
663
+ movzx nbits, byte [NBITS(nbits)] ; nbits = JPEG_NBITS(nbits);
664
+ movd mm_nbits, nbits ; nbits --> MMX register
665
+ lea size, [size * 8 + nbits] ; size = size * 8 + nbits; (16 * original size + nbits in total; the table lookups below subtract 16)
666
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
667
+ ; code = actbl->ehufco[size-16];
668
+ movzx size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
669
+ ; size = actbl->ehufsi[size-16];
670
+ psllq mm_code, mm_nbits ; code <<= nbits;
671
+ pand mm_temp, [MASK_BITS(nbits)] ; temp &= (1 << nbits) - 1;
672
+ lea nbits, [nbits + size] ; nbits += size;
673
+ por mm_code, mm_temp ; code |= temp;
674
+ xor size, size ; size = 0; /* kill tzcnt input dependency */
675
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
676
+ jle .EMIT_ERLOOP2 ; insert code, flush buffer, init size, goto .BLOOP2
677
+ tzcnt size, index ; size = # of trailing 0 bits in index
678
+ movd mm_nbits, nbits ; nbits --> MMX register
679
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
680
+ inc size ; ++size;
681
+ por mm_put_buffer, mm_code ; put_buffer |= code;
682
+ test index, index
683
+ jnz .BLOOP2 ; } while (index != 0);
684
+ .ELOOP2: ; .ELOOP2: all non-zero coefficients are coded; emit actbl[0] unless t stopped at word offset 62, then store the state back and return
685
+ mov nbits, t ; nbits = t;
686
+ lea t, [t + SIZEOF_WORD] ; t = &t[1];
687
+ and nbits, DCTSIZE2 * SIZEOF_WORD - 1 ; nbits &= 127;
688
+ and t, -DCTSIZE2 * SIZEOF_WORD ; t &= -128; /* t = &t_[0]; */
689
+ cmp nbits, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (nbits != 62 * 2)
690
+ je .EFN ; {
691
+ movd mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
692
+ ; code = actbl->ehufco[0];
693
+ movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
694
+ ; nbits = actbl->ehufsi[0];
695
+ sub free_bits, nbits ; if ((free_bits -= nbits) <= 0)
696
+ jg .EFN_SKIP_EMIT_CODE ; {
697
+ EMIT_QWORD size, sizeb, sizeh, , , , , , .EFN ; insert code, flush put_buffer
698
+ align 16
699
+ .EFN_SKIP_EMIT_CODE: ; } else {
700
+ movd mm_nbits, nbits ; nbits --> MMX register
701
+ psllq mm_put_buffer, mm_nbits ; put_buffer <<= nbits;
702
+ por mm_put_buffer, mm_code ; put_buffer |= code;
703
+ .EFN: ; } }
704
+ %define frame esp
705
+ mov frame, [t + save_frame] ; frame and t are both esp here: reload esp with the frame pointer the prologue stored above t_
706
+ %define state ecx
707
+ mov state, [frame + arg_state]
708
+ movq [state + working_state.cur.put_buffer.simd], mm_put_buffer
709
+ ; state->cur.put_buffer.simd = put_buffer;
710
+ emms
711
+ mov [state + working_state.cur.free_bits], free_bits
712
+ ; state->cur.free_bits = free_bits;
713
+ POP edi
714
+ POP esi
715
+ POP ebp
716
+ POP ebx
717
+ ret ; eax (aliased as buffer) is left untouched by this epilogue
718
+
719
+ ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
720
+
721
+ align 16
722
+ .EMIT_BRLOOP1: ; reached from .BRLOOP1 when free_bits <= 0; no extra instructions are passed, control returns to .ERLOOP1
723
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , , , \
724
+ .ERLOOP1
725
+
726
+ ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
727
+
728
+ align 16
729
+ .EMIT_ERLOOP1: ; reached from .BEGIN when free_bits <= 0; the five passed instructions repeat the tail of the .BEGIN path (next size, loop test), falling back to .ELOOP1 when index is 0
730
+ EMIT_QWORD size, sizeb, sizeh, \
731
+ { xor size, size }, \
732
+ { tzcnt size, index }, \
733
+ { inc size }, \
734
+ { test index, index }, \
735
+ { jnz .BLOOP1 }, \
736
+ .ELOOP1
737
+
738
+ ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
739
+
740
+ align 16
741
+ .EMIT_BRLOOP2: ; reached from .BRLOOP2 when free_bits <= 0; the passed cmp/jle repeats the "size <= 16" exit test, otherwise control loops back to .BRLOOP2
742
+ EMIT_QWORD emit_temp, emit_tempb, emit_temph, , , , \
743
+ { cmp size, 16 }, \
744
+ { jle .ERLOOP2 }, \
745
+ .BRLOOP2
746
+
747
+ ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
748
+
749
+ align 16
750
+ .EMIT_ERLOOP2: ; reached from .ERLOOP2 when free_bits <= 0; same shape as .EMIT_ERLOOP1 but targeting .BLOOP2 / .ELOOP2
751
+ EMIT_QWORD size, sizeb, sizeh, \
752
+ { xor size, size }, \
753
+ { tzcnt size, index }, \
754
+ { inc size }, \
755
+ { test index, index }, \
756
+ { jnz .BLOOP2 }, \
757
+ .ELOOP2
758
+
759
+ ; For some reason, the OS X linker does not honor the request to align the
760
+ ; segment unless we do this.
761
+ align 32