@shopify/react-native-skia 0.1.233 → 0.1.236

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (294)
  1. package/android/CMakeLists.txt +0 -1
  2. package/android/cpp/jni/JniSkiaManager.cpp +0 -1
  3. package/android/cpp/rnskia-android/RNSkAndroidView.h +0 -1
  4. package/android/cpp/rnskia-android/RNSkOpenGLCanvasProvider.cpp +1 -1
  5. package/android/cpp/rnskia-android/RNSkOpenGLCanvasProvider.h +1 -2
  6. package/cpp/api/JsiSkTypefaceFactory.h +2 -1
  7. package/cpp/skia/include/android/AHardwareBufferUtils.h +23 -0
  8. package/cpp/skia/include/android/GrAHardwareBufferUtils.h +2 -0
  9. package/cpp/skia/include/android/graphite/SurfaceAndroid.h +59 -0
  10. package/cpp/skia/include/codec/SkAvifDecoder.h +1 -1
  11. package/cpp/skia/include/codec/SkBmpDecoder.h +1 -1
  12. package/cpp/skia/include/codec/SkCodec.h +21 -3
  13. package/cpp/skia/include/codec/SkGifDecoder.h +1 -1
  14. package/cpp/skia/include/codec/SkIcoDecoder.h +1 -1
  15. package/cpp/skia/include/codec/SkJpegDecoder.h +1 -1
  16. package/cpp/skia/include/codec/SkJpegxlDecoder.h +1 -1
  17. package/cpp/skia/include/codec/SkPngDecoder.h +1 -1
  18. package/cpp/skia/include/codec/SkRawDecoder.h +1 -1
  19. package/cpp/skia/include/codec/SkWbmpDecoder.h +1 -1
  20. package/cpp/skia/include/codec/SkWebpDecoder.h +1 -1
  21. package/cpp/skia/include/config/SkUserConfig.h +3 -1
  22. package/cpp/skia/include/core/SkCanvas.h +66 -37
  23. package/cpp/skia/include/core/SkColorFilter.h +5 -2
  24. package/cpp/skia/include/core/SkContourMeasure.h +1 -0
  25. package/cpp/skia/include/core/SkDocument.h +1 -0
  26. package/cpp/skia/include/core/SkFont.h +14 -24
  27. package/cpp/skia/include/core/SkFontArguments.h +1 -1
  28. package/cpp/skia/include/core/SkFontMetrics.h +1 -1
  29. package/cpp/skia/include/core/SkFontMgr.h +0 -7
  30. package/cpp/skia/include/core/SkGraphics.h +13 -0
  31. package/cpp/skia/include/core/SkMesh.h +9 -13
  32. package/cpp/skia/include/core/SkMilestone.h +1 -1
  33. package/cpp/skia/include/core/SkPathMeasure.h +2 -0
  34. package/cpp/skia/include/core/SkSerialProcs.h +29 -11
  35. package/cpp/skia/include/core/SkSize.h +3 -3
  36. package/cpp/skia/include/core/SkStream.h +3 -13
  37. package/cpp/skia/include/core/SkSurface.h +6 -3
  38. package/cpp/skia/include/core/SkSurfaceProps.h +2 -4
  39. package/cpp/skia/include/core/SkTraceMemoryDump.h +15 -0
  40. package/cpp/skia/include/core/SkTypeface.h +8 -56
  41. package/cpp/skia/include/core/SkTypes.h +8 -0
  42. package/cpp/skia/include/core/SkVertices.h +1 -1
  43. package/cpp/skia/include/docs/SkMultiPictureDocument.h +53 -0
  44. package/cpp/skia/include/docs/SkPDFDocument.h +11 -0
  45. package/cpp/skia/include/effects/SkGradientShader.h +9 -0
  46. package/cpp/skia/include/effects/SkRuntimeEffect.h +3 -7
  47. package/cpp/skia/include/gpu/GrBackendSemaphore.h +33 -47
  48. package/cpp/skia/include/gpu/GrBackendSurface.h +2 -3
  49. package/cpp/skia/include/gpu/GrContextOptions.h +0 -6
  50. package/cpp/skia/include/gpu/GrContextThreadSafeProxy.h +44 -28
  51. package/cpp/skia/include/gpu/GrDirectContext.h +12 -31
  52. package/cpp/skia/include/gpu/GrTypes.h +1 -16
  53. package/cpp/skia/include/gpu/MutableTextureState.h +35 -80
  54. package/cpp/skia/include/gpu/ShaderErrorHandler.h +11 -1
  55. package/cpp/skia/include/gpu/ganesh/SkImageGanesh.h +2 -2
  56. package/cpp/skia/include/gpu/ganesh/SkSurfaceGanesh.h +1 -1
  57. package/cpp/skia/include/gpu/ganesh/gl/GrGLDirectContext.h +3 -2
  58. package/cpp/skia/include/gpu/ganesh/vk/GrVkBackendSemaphore.h +20 -0
  59. package/cpp/skia/include/gpu/ganesh/vk/GrVkDirectContext.h +30 -0
  60. package/cpp/skia/include/gpu/gl/GrGLFunctions.h +1 -1
  61. package/cpp/skia/include/gpu/gl/GrGLInterface.h +2 -0
  62. package/cpp/skia/include/gpu/gl/glx/GrGLMakeGLXInterface.h +6 -0
  63. package/cpp/skia/include/gpu/graphite/BackendSemaphore.h +3 -3
  64. package/cpp/skia/include/gpu/graphite/BackendTexture.h +39 -27
  65. package/cpp/skia/include/gpu/graphite/Context.h +39 -13
  66. package/cpp/skia/include/gpu/graphite/ContextOptions.h +2 -0
  67. package/cpp/skia/include/gpu/graphite/GraphiteTypes.h +2 -1
  68. package/cpp/skia/include/gpu/graphite/Image.h +106 -87
  69. package/cpp/skia/include/gpu/graphite/Recorder.h +24 -3
  70. package/cpp/skia/include/gpu/graphite/Surface.h +7 -2
  71. package/cpp/skia/include/gpu/graphite/dawn/DawnBackendContext.h +41 -2
  72. package/cpp/skia/include/gpu/graphite/dawn/DawnTypes.h +11 -6
  73. package/cpp/skia/include/gpu/graphite/mtl/MtlGraphiteTypes.h +1 -2
  74. package/cpp/skia/include/gpu/graphite/vk/VulkanGraphiteTypes.h +6 -6
  75. package/cpp/skia/include/gpu/mock/GrMockTypes.h +1 -0
  76. package/cpp/skia/include/gpu/vk/GrVkBackendContext.h +1 -1
  77. package/cpp/skia/include/gpu/vk/GrVkTypes.h +1 -44
  78. package/cpp/skia/include/gpu/vk/VulkanExtensions.h +1 -1
  79. package/cpp/skia/include/gpu/vk/VulkanMutableTextureState.h +25 -0
  80. package/cpp/skia/include/gpu/vk/VulkanTypes.h +44 -0
  81. package/cpp/skia/include/ports/SkFontConfigInterface.h +3 -6
  82. package/cpp/skia/include/private/SkEncodedInfo.h +5 -0
  83. package/cpp/skia/include/private/SkExif.h +102 -0
  84. package/cpp/skia/include/private/SkGainmapInfo.h +11 -1
  85. package/cpp/skia/include/private/base/SkAssert.h +16 -0
  86. package/cpp/skia/include/private/base/SkDeque.h +2 -7
  87. package/cpp/skia/include/private/base/SkLoadUserConfig.h +1 -1
  88. package/cpp/skia/include/private/base/SkTArray.h +69 -28
  89. package/cpp/skia/include/private/base/SkThreadAnnotations.h +18 -5
  90. package/cpp/skia/include/private/chromium/GrSurfaceCharacterization.h +26 -30
  91. package/cpp/skia/include/private/chromium/GrVkSecondaryCBDrawContext.h +4 -3
  92. package/cpp/skia/include/private/chromium/SkImageChromium.h +1 -1
  93. package/cpp/skia/include/private/gpu/ganesh/GrTypesPriv.h +8 -6
  94. package/cpp/skia/include/private/gpu/graphite/ContextOptionsPriv.h +29 -0
  95. package/cpp/skia/include/private/gpu/graphite/DawnTypesPriv.h +12 -9
  96. package/cpp/skia/include/private/gpu/graphite/VulkanGraphiteTypesPriv.h +16 -11
  97. package/cpp/skia/include/third_party/vulkan/vulkan/vk_platform.h +2 -2
  98. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h264std.h +312 -0
  99. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h264std_decode.h +77 -0
  100. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h265std.h +446 -0
  101. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h265std_decode.h +67 -0
  102. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codecs_common.h +36 -0
  103. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan.h +9 -2
  104. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_android.h +31 -3
  105. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_core.h +10624 -5716
  106. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_ios.h +2 -1
  107. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_macos.h +2 -1
  108. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_win32.h +28 -1
  109. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_xcb.h +2 -1
  110. package/cpp/skia/include/utils/mac/SkCGUtils.h +23 -11
  111. package/cpp/skia/modules/skcms/skcms.h +2 -410
  112. package/cpp/skia/modules/skcms/src/Transform_inl.h +831 -704
  113. package/cpp/skia/modules/skcms/src/skcms_Transform.h +161 -0
  114. package/cpp/skia/modules/skcms/src/skcms_internals.h +136 -0
  115. package/cpp/skia/modules/skcms/src/skcms_public.h +404 -0
  116. package/cpp/skia/modules/skparagraph/include/FontArguments.h +1 -1
  117. package/cpp/skia/modules/skparagraph/include/FontCollection.h +2 -0
  118. package/cpp/skia/modules/skparagraph/include/Paragraph.h +2 -2
  119. package/cpp/skia/modules/skparagraph/include/TextStyle.h +4 -3
  120. package/cpp/skia/modules/skparagraph/include/TypefaceFontProvider.h +1 -3
  121. package/cpp/skia/modules/skresources/include/SkResources.h +28 -17
  122. package/cpp/skia/modules/skunicode/include/SkUnicode.h +12 -0
  123. package/cpp/skia/modules/svg/include/SkSVGDOM.h +4 -1
  124. package/cpp/skia/modules/svg/include/SkSVGRenderContext.h +4 -1
  125. package/cpp/skia/src/base/SkUTF.h +7 -0
  126. package/cpp/skia/src/core/SkTHash.h +20 -8
  127. package/lib/commonjs/dom/nodes/JsiSkDOM.d.ts +3 -2
  128. package/lib/commonjs/dom/nodes/JsiSkDOM.js +56 -57
  129. package/lib/commonjs/dom/nodes/JsiSkDOM.js.map +1 -1
  130. package/lib/commonjs/external/reanimated/index.d.ts +1 -0
  131. package/lib/commonjs/external/reanimated/index.js +13 -0
  132. package/lib/commonjs/external/reanimated/index.js.map +1 -1
  133. package/lib/commonjs/external/reanimated/interpolators.js +16 -2
  134. package/lib/commonjs/external/reanimated/interpolators.js.map +1 -1
  135. package/lib/commonjs/external/reanimated/moduleWrapper.d.ts +1 -0
  136. package/lib/commonjs/external/reanimated/moduleWrapper.js +5 -3
  137. package/lib/commonjs/external/reanimated/moduleWrapper.js.map +1 -1
  138. package/lib/commonjs/external/reanimated/textures.d.ts +5 -0
  139. package/lib/commonjs/external/reanimated/textures.js +52 -0
  140. package/lib/commonjs/external/reanimated/textures.js.map +1 -0
  141. package/lib/commonjs/headless/index.js +1 -1
  142. package/lib/commonjs/headless/index.js.map +1 -1
  143. package/lib/commonjs/mock/index.js +3 -0
  144. package/lib/commonjs/mock/index.js.map +1 -1
  145. package/lib/commonjs/renderer/Canvas.js +6 -5
  146. package/lib/commonjs/renderer/Canvas.js.map +1 -1
  147. package/lib/commonjs/renderer/Container.d.ts +1 -1
  148. package/lib/commonjs/renderer/Container.js +2 -1
  149. package/lib/commonjs/renderer/Container.js.map +1 -1
  150. package/lib/commonjs/renderer/Offscreen.d.ts +1 -0
  151. package/lib/commonjs/renderer/Offscreen.js +18 -5
  152. package/lib/commonjs/renderer/Offscreen.js.map +1 -1
  153. package/lib/commonjs/renderer/Reconciler.d.ts +1 -1
  154. package/lib/commonjs/renderer/Reconciler.js +7 -4
  155. package/lib/commonjs/renderer/Reconciler.js.map +1 -1
  156. package/lib/commonjs/skia/types/Matrix4.d.ts +2 -2
  157. package/lib/commonjs/skia/types/Matrix4.js.map +1 -1
  158. package/lib/commonjs/skia/types/Shader/Shader.d.ts +1 -1
  159. package/lib/commonjs/skia/types/Shader/Shader.js.map +1 -1
  160. package/lib/commonjs/skia/web/JsiSkPath.d.ts +2 -2
  161. package/lib/commonjs/skia/web/JsiSkPath.js +10 -2
  162. package/lib/commonjs/skia/web/JsiSkPath.js.map +1 -1
  163. package/lib/commonjs/views/SkiaJSDomView.d.ts +31 -0
  164. package/lib/commonjs/views/SkiaJSDomView.js +161 -0
  165. package/lib/commonjs/views/SkiaJSDomView.js.map +1 -0
  166. package/lib/commonjs/views/SkiaJSDomView.web.d.ts +1 -0
  167. package/lib/commonjs/views/SkiaJSDomView.web.js +14 -0
  168. package/lib/commonjs/views/SkiaJSDomView.web.js.map +1 -0
  169. package/lib/module/dom/nodes/JsiSkDOM.d.ts +3 -2
  170. package/lib/module/dom/nodes/JsiSkDOM.js +56 -56
  171. package/lib/module/dom/nodes/JsiSkDOM.js.map +1 -1
  172. package/lib/module/external/reanimated/index.d.ts +1 -0
  173. package/lib/module/external/reanimated/index.js +1 -0
  174. package/lib/module/external/reanimated/index.js.map +1 -1
  175. package/lib/module/external/reanimated/interpolators.js +15 -2
  176. package/lib/module/external/reanimated/interpolators.js.map +1 -1
  177. package/lib/module/external/reanimated/moduleWrapper.d.ts +1 -0
  178. package/lib/module/external/reanimated/moduleWrapper.js +3 -2
  179. package/lib/module/external/reanimated/moduleWrapper.js.map +1 -1
  180. package/lib/module/external/reanimated/textures.d.ts +5 -0
  181. package/lib/module/external/reanimated/textures.js +35 -0
  182. package/lib/module/external/reanimated/textures.js.map +1 -0
  183. package/lib/module/headless/index.js +1 -1
  184. package/lib/module/headless/index.js.map +1 -1
  185. package/lib/module/mock/index.js +3 -0
  186. package/lib/module/mock/index.js.map +1 -1
  187. package/lib/module/renderer/Canvas.js +5 -4
  188. package/lib/module/renderer/Canvas.js.map +1 -1
  189. package/lib/module/renderer/Container.d.ts +1 -1
  190. package/lib/module/renderer/Container.js +2 -1
  191. package/lib/module/renderer/Container.js.map +1 -1
  192. package/lib/module/renderer/Offscreen.d.ts +1 -0
  193. package/lib/module/renderer/Offscreen.js +11 -3
  194. package/lib/module/renderer/Offscreen.js.map +1 -1
  195. package/lib/module/renderer/Reconciler.d.ts +1 -1
  196. package/lib/module/renderer/Reconciler.js +7 -4
  197. package/lib/module/renderer/Reconciler.js.map +1 -1
  198. package/lib/module/skia/types/Matrix4.d.ts +2 -2
  199. package/lib/module/skia/types/Matrix4.js.map +1 -1
  200. package/lib/module/skia/types/Shader/Shader.d.ts +1 -1
  201. package/lib/module/skia/types/Shader/Shader.js.map +1 -1
  202. package/lib/module/skia/web/JsiSkPath.d.ts +2 -2
  203. package/lib/module/skia/web/JsiSkPath.js +10 -2
  204. package/lib/module/skia/web/JsiSkPath.js.map +1 -1
  205. package/lib/module/views/SkiaJSDomView.d.ts +31 -0
  206. package/lib/module/views/SkiaJSDomView.js +136 -0
  207. package/lib/module/views/SkiaJSDomView.js.map +1 -0
  208. package/lib/module/views/SkiaJSDomView.web.d.ts +1 -0
  209. package/lib/module/views/SkiaJSDomView.web.js +2 -0
  210. package/lib/module/views/SkiaJSDomView.web.js.map +1 -0
  211. package/lib/typescript/src/dom/nodes/JsiSkDOM.d.ts +3 -2
  212. package/lib/typescript/src/external/reanimated/index.d.ts +1 -0
  213. package/lib/typescript/src/external/reanimated/moduleWrapper.d.ts +1 -0
  214. package/lib/typescript/src/external/reanimated/textures.d.ts +5 -0
  215. package/lib/typescript/src/renderer/Container.d.ts +1 -1
  216. package/lib/typescript/src/renderer/Offscreen.d.ts +1 -0
  217. package/lib/typescript/src/renderer/Reconciler.d.ts +1 -1
  218. package/lib/typescript/src/skia/types/Matrix4.d.ts +2 -2
  219. package/lib/typescript/src/skia/types/Shader/Shader.d.ts +1 -1
  220. package/lib/typescript/src/skia/web/JsiSkPath.d.ts +2 -2
  221. package/lib/typescript/src/views/SkiaJSDomView.d.ts +31 -0
  222. package/lib/typescript/src/views/SkiaJSDomView.web.d.ts +1 -0
  223. package/libs/android/arm64-v8a/libskia.a +0 -0
  224. package/libs/android/arm64-v8a/libskottie.a +0 -0
  225. package/libs/android/arm64-v8a/libskparagraph.a +0 -0
  226. package/libs/android/arm64-v8a/libsksg.a +0 -0
  227. package/libs/android/arm64-v8a/libskshaper.a +0 -0
  228. package/libs/android/arm64-v8a/libskunicode.a +0 -0
  229. package/libs/android/arm64-v8a/libsvg.a +0 -0
  230. package/libs/android/armeabi-v7a/libskia.a +0 -0
  231. package/libs/android/armeabi-v7a/libskottie.a +0 -0
  232. package/libs/android/armeabi-v7a/libskparagraph.a +0 -0
  233. package/libs/android/armeabi-v7a/libsksg.a +0 -0
  234. package/libs/android/armeabi-v7a/libskshaper.a +0 -0
  235. package/libs/android/armeabi-v7a/libskunicode.a +0 -0
  236. package/libs/android/armeabi-v7a/libsvg.a +0 -0
  237. package/libs/android/x86/libskia.a +0 -0
  238. package/libs/android/x86/libskottie.a +0 -0
  239. package/libs/android/x86/libskparagraph.a +0 -0
  240. package/libs/android/x86/libsksg.a +0 -0
  241. package/libs/android/x86/libskshaper.a +0 -0
  242. package/libs/android/x86/libskunicode.a +0 -0
  243. package/libs/android/x86/libsvg.a +0 -0
  244. package/libs/android/x86_64/libskia.a +0 -0
  245. package/libs/android/x86_64/libskottie.a +0 -0
  246. package/libs/android/x86_64/libskparagraph.a +0 -0
  247. package/libs/android/x86_64/libsksg.a +0 -0
  248. package/libs/android/x86_64/libskshaper.a +0 -0
  249. package/libs/android/x86_64/libskunicode.a +0 -0
  250. package/libs/android/x86_64/libsvg.a +0 -0
  251. package/libs/ios/libskia.xcframework/ios-arm64_arm64e/libskia.a +0 -0
  252. package/libs/ios/libskia.xcframework/ios-arm64_arm64e_x86_64-simulator/libskia.a +0 -0
  253. package/libs/ios/libskottie.xcframework/ios-arm64_arm64e/libskottie.a +0 -0
  254. package/libs/ios/libskottie.xcframework/ios-arm64_arm64e_x86_64-simulator/libskottie.a +0 -0
  255. package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e/libskparagraph.a +0 -0
  256. package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e_x86_64-simulator/libskparagraph.a +0 -0
  257. package/libs/ios/libsksg.xcframework/ios-arm64_arm64e/libsksg.a +0 -0
  258. package/libs/ios/libsksg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsksg.a +0 -0
  259. package/libs/ios/libskshaper.xcframework/Info.plist +5 -5
  260. package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e/libskshaper.a +0 -0
  261. package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e_x86_64-simulator/libskshaper.a +0 -0
  262. package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e/libskunicode.a +0 -0
  263. package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e_x86_64-simulator/libskunicode.a +0 -0
  264. package/libs/ios/libsvg.xcframework/Info.plist +5 -5
  265. package/libs/ios/libsvg.xcframework/ios-arm64_arm64e/libsvg.a +0 -0
  266. package/libs/ios/libsvg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsvg.a +0 -0
  267. package/package.json +1 -1
  268. package/src/dom/nodes/JsiSkDOM.ts +55 -56
  269. package/src/external/reanimated/index.ts +1 -0
  270. package/src/external/reanimated/interpolators.ts +17 -3
  271. package/src/external/reanimated/moduleWrapper.ts +1 -0
  272. package/src/external/reanimated/textures.tsx +50 -0
  273. package/src/headless/index.ts +1 -1
  274. package/src/mock/index.ts +3 -0
  275. package/src/renderer/Canvas.tsx +4 -3
  276. package/src/renderer/Container.tsx +3 -2
  277. package/src/renderer/Offscreen.tsx +12 -3
  278. package/src/renderer/Reconciler.tsx +5 -2
  279. package/src/skia/types/Matrix4.ts +2 -2
  280. package/src/skia/types/Shader/Shader.ts +6 -1
  281. package/src/skia/web/JsiSkPath.ts +23 -3
  282. package/src/views/SkiaJSDomView.tsx +126 -0
  283. package/src/views/SkiaJSDomView.web.tsx +1 -0
  284. package/android/cpp/jni/include/JniSkiaDrawView.h +0 -90
  285. package/cpp/rnskia/RNSkJsView.cpp +0 -236
  286. package/cpp/rnskia/RNSkJsView.h +0 -121
  287. package/cpp/skia/include/gpu/GrSurfaceInfo.h +0 -142
  288. package/cpp/skia/include/private/gpu/ganesh/GrGLTypesPriv.h +0 -107
  289. package/cpp/skia/include/private/gpu/ganesh/GrMockTypesPriv.h +0 -32
  290. package/cpp/skia/include/private/gpu/ganesh/GrMtlTypesPriv.h +0 -83
  291. package/cpp/skia/include/private/gpu/ganesh/GrVkTypesPriv.h +0 -47
  292. package/cpp/skia/include/private/gpu/vk/VulkanTypesPriv.h +0 -57
  293. package/cpp/skia/include/utils/SkBase64.h +0 -53
  294. package/cpp/skia/modules/skcms/skcms_internal.h +0 -56
package/cpp/skia/modules/skcms/src/Transform_inl.h
@@ -8,26 +8,29 @@
 // Intentionally NO #pragma once... included multiple times.
 
 // This file is included from skcms.cc in a namespace with some pre-defines:
-//    - N:    depth of all vectors, 1,4,8, or 16 (preprocessor define)
+//    - N:    SIMD width of all vectors; 1, 4, 8 or 16 (preprocessor define)
 //    - V<T>: a template to create a vector of N T's.
 
-using F   = V<Color>;   // Called F for historic reasons... maybe rename C?
+using F   = V<float>;
 using I32 = V<int32_t>;
 using U64 = V<uint64_t>;
 using U32 = V<uint32_t>;
 using U16 = V<uint16_t>;
 using U8  = V<uint8_t>;
 
-
 #if defined(__GNUC__) && !defined(__clang__)
-    // Once again, GCC is kind of weird, not allowing vector = scalar directly.
+    // GCC is kind of weird, not allowing vector = scalar directly.
     static constexpr F F0 = F() + 0.0f,
                        F1 = F() + 1.0f,
+                       FHalf = F() + 0.5f,
                        FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
+    static constexpr I32 F16InfBits = I32() + 0x4780'0000;
 #else
     static constexpr F F0 = 0.0f,
                        F1 = 1.0f,
+                       FHalf = 0.5f,
                        FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
+    static constexpr I32 F16InfBits = 0x4780'0000; // equals +Inf in half float, shifted to 32-bits
 #endif
 
 // Instead of checking __AVX__ below, we'll check USING_AVX.
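The new F16InfBits constant is the 32-bit float bit pattern corresponding to f16 +Inf: the f16 infinity exponent (31), un-biased by 15, re-biased by 127 for f32, and shifted into the f32 exponent field at bit 23. A standalone sanity check of that arithmetic (not part of the diff):

    // 0x4780'0000 is the f32 bit pattern of 2^16 = 65536.0f, the value an f16
    // +Inf widens to: biased exponent (31 - 15) + 127 = 143, mantissa all zero.
    static_assert(((31 - 15 + 127) << 23) == 0x47800000, "f16 +Inf widened to f32 bits");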
@@ -84,19 +87,11 @@ using U8  = V<uint8_t>;
     #endif
 #endif
 
-#if defined(__clang__)
-    #define FALLTHROUGH [[clang::fallthrough]]
-#else
-    #define FALLTHROUGH
-#endif
-
 // We tag most helper functions as SI, to enforce good code generation
 // but also work around what we think is a bug in GCC: when targeting 32-bit
 // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
 // MMX mm0 register, which seems to mess with unrelated code that later uses
 // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
-//
-// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
 #if defined(__clang__) || defined(__GNUC__)
     #define SI static inline __attribute__((always_inline))
 #else
@@ -106,12 +101,12 @@ using U8  = V<uint8_t>;
 template <typename T, typename P>
 SI T load(const P* ptr) {
     T val;
-    small_memcpy(&val, ptr, sizeof(val));
+    memcpy(&val, ptr, sizeof(val));
     return val;
 }
 template <typename T, typename P>
 SI void store(P* ptr, const T& val) {
-    small_memcpy(ptr, &val, sizeof(val));
+    memcpy(ptr, &val, sizeof(val));
 }
 
 // (T)v is a cast when N == 1 and a bit-pun when N>1,
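The load/store helpers in the hunk above switched from small_memcpy to plain memcpy, but the technique is unchanged: a fixed-size byte copy is a defined-behavior bit-pun that compilers lower to a single (possibly unaligned) load or store. A minimal self-contained sketch of the same pattern:

    #include <cstring>

    // Fixed-size memcpy bit-pun: defined behavior even for misaligned or
    // differently-typed pointers; optimizers turn it into one load/store.
    template <typename T, typename P>
    static inline T load(const P* ptr) {
        T val;
        std::memcpy(&val, ptr, sizeof(val));
        return val;
    }
    template <typename T, typename P>
    static inline void store(P* ptr, const T& val) {
        std::memcpy(ptr, &val, sizeof(val));
    }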
@@ -142,7 +137,6 @@ SI D bit_pun(const S& v) {
 // To serve both those ends, we use this function to_fixed() instead of direct cast().
 SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
 
-
 // Sometimes we do something crazy on one branch of a conditonal,
 // like divide by zero or convert a huge float to an integer,
 // but then harmlessly select the other side. That trips up N==1
@@ -159,7 +153,22 @@ SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
     }
 #endif
 
+#if defined(USING_NEON)
+    SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
+    SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
+
+    SI I32 min_(I32 x, I32 y) { return (I32)vminq_s32((int32x4_t)x, (int32x4_t)y); }
+    SI I32 max_(I32 x, I32 y) { return (I32)vmaxq_s32((int32x4_t)x, (int32x4_t)y); }
+#else
+    SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
+    SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
+
+    SI I32 min_(I32 x, I32 y) { return if_then_else(x > y, y, x); }
+    SI I32 max_(I32 x, I32 y) { return if_then_else(x < y, y, x); }
+#endif
 
+// KEEP IN SYNC with skvx::from_half to ensure that f16 colors are computed consistently in both
+// skcms and skvx.
 SI F F_from_Half(U16 half) {
 #if defined(USING_NEON_F16C)
     return vcvt_f32_f16((float16x4_t)half);
@@ -169,24 +178,27 @@ SI F F_from_Half(U16 half) {
     typedef int16_t __attribute__((vector_size(16))) I16;
     return __builtin_ia32_vcvtph2ps256((I16)half);
 #else
-    U32 wide = cast<U32>(half);
+    I32 wide = cast<I32>(half);
     // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
-    U32 s  = wide & 0x8000,
-        em = wide ^ s;
-
-    // Constructing the float is easy if the half is not denormalized.
-    F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
-
-    // Simply flush all denorm half floats to zero.
-    return if_then_else(em < 0x0400, F0, norm);
+    // To match intrinsic behavior, this preserves denormal values, infinities, and NaNs, which
+    // helps improve consistency between architectures.
+    I32 s  = wide & 0x8000,
+        em = wide ^ s,
+        inf_or_nan = (em >= (31 << 10)) & (255 << 23),  // Expands exponent to fill 8 bits
+        is_norm = em > 0x3ff,
+        // denormalized f16's are 2^-14*0.[m0:9] == 2^-24*[m0:9].0
+        sub = bit_pun<I32>(cast<F>(em) * (1.f/(1<<24))),
+        norm = ((em<<13) + ((127-15)<<23)),  // Shifts mantissa, shifts + re-biases exponent
+        finite = if_then_else(is_norm, norm, sub);
+    // If 'x' is f16 +/- infinity, inf_or_nan will be the filled 8-bit exponent but 'norm' will be
+    // all 0s since 'x's mantissa is 0. Thus norm | inf_or_nan becomes f32 infinity. However, if
+    // 'x' is an f16 NaN, some bits of 'norm' will be non-zero, so it stays an f32 NaN after the OR.
+    return bit_pun<F>((s<<16) | finite | inf_or_nan);
 #endif
 }
 
-#if defined(__clang__)
-    // The -((127-15)<<10) underflows that side of the math when
-    // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
-    __attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
+// KEEP IN SYNC with skvx::to_half to ensure that f16 colors are computed consistently in both
+// skcms and skvx.
 SI U16 Half_from_F(F f) {
 #if defined(USING_NEON_F16C)
     return (U16)vcvt_f16_f32(f);
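For reference, the new portable half-to-float path reads naturally one lane at a time: widen the f16 exponent/mantissa into f32 position, compute subnormals exactly as em * 2^-24 instead of flushing them to zero, and OR in a filled 8-bit exponent for Inf/NaN inputs. A scalar transcription (illustrative sketch, not code from the package):

    #include <cstdint>
    #include <cstring>

    static float float_from_half(uint16_t h) {
        uint32_t wide = h;
        uint32_t s  = wide & 0x8000,                        // sign bit
                 em = wide ^ s;                             // exponent + mantissa
        uint32_t inf_or_nan = (em >= (31u << 10)) ? (255u << 23) : 0u;
        bool     is_norm    = em > 0x3ff;
        float    sub_f = (float)em * (1.0f / (1 << 24));    // subnormal f16 == em * 2^-24, exact in f32
        uint32_t sub;  std::memcpy(&sub, &sub_f, 4);
        uint32_t norm = (em << 13) + ((127 - 15) << 23);    // shift mantissa, re-bias exponent
        uint32_t bits = (s << 16) | (is_norm ? norm : sub) | inf_or_nan;
        float out;  std::memcpy(&out, &bits, 4);
        return out;   // e.g. h=0x3C00 -> 1.0f, h=0x7C00 -> +Inf, h=0x0001 -> 2^-24
    }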
@@ -196,13 +208,23 @@ SI U16 Half_from_F(F f) {
     return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
 #else
     // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
-    U32 sem = bit_pun<U32>(f),
-        s   = sem & 0x80000000,
-        em  = sem ^ s;
-
-    // For simplicity we flush denorm half floats (including all denorm floats) to zero.
-    return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
-                                                 , (s>>16) + (em>>13) - ((127-15)<<10)));
+    // To match intrinsic behavior, this implements round-to-nearest-even, converting floats to
+    // denormal f16 values, overflowing to infinity and preserving infinity. However, it does not
+    // handle NaN float values (they become infinity).
+    I32 sem = bit_pun<I32>(f),
+        s   = sem & 0x8000'0000,
+        em  = min_(sem ^ s, F16InfBits), // |x| clamped to f16 infinity
+        // F(em)*8192 increases the exponent by 13, which when added back to em will shift the
+        // mantissa bits 13 to the right. We clamp to 1/2 for subnormal values, which
+        // automatically shifts the mantissa to match 2^-14 expected for a subnorm f16.
+        magic = bit_pun<I32>(max_(bit_pun<F>(em) * 8192.f, FHalf)) & (255 << 23),
+        // Shift mantissa with automatic round-to-even
+        rounded = bit_pun<I32>((bit_pun<F>(em) + bit_pun<F>(magic))),
+        // Subtract 127 for f32 bias, subtract 13 to undo the *8192, subtract 1 to remove
+        // the implicit leading 1., and add 15 to get the f16 biased exponent.
+        exp = ((magic >> 13) - ((127-15+13+1)<<10)), // shift and re-bias exponent
+        f16 = rounded + exp; // use + if 'rounded' rolled over into first exponent bit
+    return cast<U16>((s>>16) | f16);
 #endif
 }
 
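The float-to-half direction is denser: the input is clamped to the bits of 2^16 (so overflow and +Inf both land on f16 +Inf), and adding a "magic" power of two 13 above the value's own exponent shifts the mantissa right 13 places, with the hardware's round-to-nearest-even doing the rounding. A scalar transcription of the same steps (illustrative sketch; like the vector code, float NaNs come out as infinity):

    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    static uint16_t half_from_float(float x) {
        uint32_t sem;  std::memcpy(&sem, &x, 4);
        uint32_t s  = sem & 0x80000000u,
                 em = std::min(sem ^ s, 0x47800000u);       // |x| clamped to f16 infinity
        float fem;  std::memcpy(&fem, &em, 4);
        float fmagic = std::max(fem * 8192.0f, 0.5f);       // exponent + 13, clamped for subnormals
        uint32_t magic;  std::memcpy(&magic, &fmagic, 4);
        magic &= 255u << 23;                                // keep only the exponent bits
        float fm;  std::memcpy(&fm, &magic, 4);
        float fr = fem + fm;                                // mantissa shifts right 13, rounds to even
        uint32_t rounded;  std::memcpy(&rounded, &fr, 4);
        uint32_t exp = (magic >> 13) - ((127 - 15 + 13 + 1) << 10);  // re-bias for f16
        return (uint16_t)((s >> 16) | (rounded + exp));     // truncation drops the f32 exponent bits
    }

For example, 1.0f (bits 0x3F800000) rounds through 8193.0f and comes out as 0x3C00, the f16 encoding of 1.0.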
@@ -218,14 +240,6 @@ SI U64 swap_endian_16x4(const U64& rgba) {
          | (rgba & 0xff00ff00ff00ff00) >> 8;
 }
 
-#if defined(USING_NEON)
-    SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
-    SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
-#else
-    SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
-    SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
-#endif
-
 SI F floor_(F x) {
 #if N == 1
     return floorf_(x);
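min_ and max_ are not gone here; they moved above F_from_Half and gained I32 overloads, because the rewritten Half_from_F clamps with min_/max_ before this old point of definition. The portable fallback is a branchless lanewise select; reduced to one lane, the pattern looks like this (illustrative sketch, not skcms code):

    #include <cstdint>
    #include <cstring>

    // One-lane analogue of the mask-based if_then_else select: a comparison
    // yields an all-ones/all-zeros mask, and (mask & t) | (~mask & e) picks.
    static inline float if_then_else1(bool cond, float t, float e) {
        uint32_t mask = cond ? 0xFFFFFFFFu : 0u;
        uint32_t tb, eb;
        std::memcpy(&tb, &t, 4);
        std::memcpy(&eb, &e, 4);
        uint32_t r = (mask & tb) | (~mask & eb);
        float out;  std::memcpy(&out, &r, 4);
        return out;
    }
    static inline float min1(float x, float y) { return if_then_else1(x > y, y, x); }
    static inline float max1(float x, float y) { return if_then_else1(x < y, y, x); }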
@@ -292,19 +306,35 @@ SI F approx_exp(F x) {
     return approx_exp2(log2_e * x);
 }
 
+SI F strip_sign(F x, U32* sign) {
+    U32 bits = bit_pun<U32>(x);
+    *sign = bits & 0x80000000;
+    return bit_pun<F>(bits ^ *sign);
+}
+
+SI F apply_sign(F x, U32 sign) {
+    return bit_pun<F>(sign | bit_pun<U32>(x));
+}
+
 // Return tf(x).
 SI F apply_tf(const skcms_TransferFunction* tf, F x) {
     // Peel off the sign bit and set x = |x|.
-    U32 bits = bit_pun<U32>(x),
-        sign = bits & 0x80000000;
-    x = bit_pun<F>(bits ^ sign);
+    U32 sign;
+    x = strip_sign(x, &sign);
 
     // The transfer function has a linear part up to d, exponential at d and after.
     F v = if_then_else(x < tf->d, tf->c*x + tf->f
                                 , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
 
     // Tack the sign bit back on.
-    return bit_pun<F>(sign | bit_pun<U32>(v));
+    return apply_sign(v, sign);
+}
+
+// Return the gamma function (|x|^G with the original sign re-applied to x).
+SI F apply_gamma(const skcms_TransferFunction* tf, F x) {
+    U32 sign;
+    x = strip_sign(x, &sign);
+    return apply_sign(approx_pow(x, tf->g), sign);
 }
 
 SI F apply_pq(const skcms_TransferFunction* tf, F x) {
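strip_sign/apply_sign factor out the sign mirroring that apply_tf already performed, which lets the new apply_gamma reuse it: a pure power curve applied to |x| with the original sign restored, i.e. g(x) = sign(x) * |x|^gamma. A scalar illustration of that contract (sketch only):

    #include <cmath>

    // Negative inputs map symmetrically: the curve sees |x|, the sign survives.
    static float apply_gamma1(float gamma, float x) {
        return std::copysign(std::pow(std::fabs(x), gamma), x);
    }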
@@ -717,12 +747,12 @@ static void clut(uint32_t input_channels, uint32_t output_channels,
         switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
             case 3: ix += index [3 + (combo&8)/2];
                     w  *= weight[3 + (combo&8)/2];
-                    FALLTHROUGH;
+                    SKCMS_FALLTHROUGH;
                     // fall through
 
             case 2: ix += index [2 + (combo&4)*1];
                     w  *= weight[2 + (combo&4)*1];
-                    FALLTHROUGH;
+                    SKCMS_FALLTHROUGH;
                     // fall through
 
             case 1: ix += index [1 + (combo&2)*2];
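FALLTHROUGH was defined locally in this file (removed in the hunk at old line 84 above); the CLUT interpolator's switch now uses SKCMS_FALLTHROUGH instead, which presumably lives with the other shared macros in the new src/skcms_internals.h listed among the files above. An assumed definition along the lines of the old one (not copied from the package):

    // Assumed shape of the shared macro, mirroring the removed local FALLTHROUGH:
    #if defined(__clang__)
        #define SKCMS_FALLTHROUGH [[clang::fallthrough]]
    #else
        #define SKCMS_FALLTHROUGH
    #endif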
@@ -755,643 +785,763 @@ static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
755
785
  r,g,b,a);
756
786
  }
757
787
 
758
- static void exec_ops(const Op* ops, const void** args,
759
- const char* src, char* dst, int i) {
760
- F r = F0, g = F0, b = F0, a = F1;
761
- while (true) {
762
- switch (*ops++) {
763
- case Op_load_a8:{
764
- a = F_from_U8(load<U8>(src + 1*i));
765
- } break;
766
-
767
- case Op_load_g8:{
768
- r = g = b = F_from_U8(load<U8>(src + 1*i));
769
- } break;
770
-
771
- case Op_load_4444:{
772
- U16 abgr = load<U16>(src + 2*i);
773
-
774
- r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
775
- g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
776
- b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
777
- a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
778
- } break;
779
-
780
- case Op_load_565:{
781
- U16 rgb = load<U16>(src + 2*i);
782
-
783
- r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
784
- g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
785
- b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
786
- } break;
787
-
788
- case Op_load_888:{
789
- const uint8_t* rgb = (const uint8_t*)(src + 3*i);
790
- #if defined(USING_NEON)
791
- // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
792
- // a time. Since we're doing that, we might as well load them into 16-bit lanes.
793
- // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
794
- uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
795
- v = vld3_lane_u8(rgb+0, v, 0);
796
- v = vld3_lane_u8(rgb+3, v, 2);
797
- v = vld3_lane_u8(rgb+6, v, 4);
798
- v = vld3_lane_u8(rgb+9, v, 6);
799
-
800
- // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
801
- // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
802
- // ARMv7 and ARMv8 impls.)
803
- r = cast<F>((U16)v.val[0]) * (1/255.0f);
804
- g = cast<F>((U16)v.val[1]) * (1/255.0f);
805
- b = cast<F>((U16)v.val[2]) * (1/255.0f);
806
- #else
807
- r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
808
- g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
809
- b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
810
- #endif
811
- } break;
812
-
813
- case Op_load_8888:{
814
- U32 rgba = load<U32>(src + 4*i);
815
-
816
- r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
817
- g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
818
- b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
819
- a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
820
- } break;
821
-
822
- case Op_load_8888_palette8:{
823
- const uint8_t* palette = (const uint8_t*) *args++;
824
- I32 ix = cast<I32>(load<U8>(src + 1*i));
825
- U32 rgba = gather_32(palette, ix);
826
-
827
- r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
828
- g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
829
- b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
830
- a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
831
- } break;
832
-
833
- case Op_load_1010102:{
834
- U32 rgba = load<U32>(src + 4*i);
835
-
836
- r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
837
- g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
838
- b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
839
- a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
840
- } break;
841
-
842
- case Op_load_101010x_XR:{
843
- static constexpr float min = -0.752941f;
844
- static constexpr float max = 1.25098f;
845
- static constexpr float range = max - min;
846
- U32 rgba = load<U32>(src + 4*i);
847
- r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f) * range + min;
848
- g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
849
- b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
850
- } break;
851
-
852
- case Op_load_161616LE:{
853
- uintptr_t ptr = (uintptr_t)(src + 6*i);
854
- assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
855
- const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
856
- #if defined(USING_NEON)
857
- uint16x4x3_t v = vld3_u16(rgb);
858
- r = cast<F>((U16)v.val[0]) * (1/65535.0f);
859
- g = cast<F>((U16)v.val[1]) * (1/65535.0f);
860
- b = cast<F>((U16)v.val[2]) * (1/65535.0f);
861
- #else
862
- r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
863
- g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
864
- b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
865
- #endif
866
- } break;
867
-
868
- case Op_load_16161616LE:{
869
- uintptr_t ptr = (uintptr_t)(src + 8*i);
870
- assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
871
- const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
872
- #if defined(USING_NEON)
873
- uint16x4x4_t v = vld4_u16(rgba);
874
- r = cast<F>((U16)v.val[0]) * (1/65535.0f);
875
- g = cast<F>((U16)v.val[1]) * (1/65535.0f);
876
- b = cast<F>((U16)v.val[2]) * (1/65535.0f);
877
- a = cast<F>((U16)v.val[3]) * (1/65535.0f);
878
- #else
879
- U64 px = load<U64>(rgba);
880
-
881
- r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
882
- g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
883
- b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
884
- a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
885
- #endif
886
- } break;
887
-
888
- case Op_load_161616BE:{
889
- uintptr_t ptr = (uintptr_t)(src + 6*i);
890
- assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
891
- const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
892
- #if defined(USING_NEON)
893
- uint16x4x3_t v = vld3_u16(rgb);
894
- r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
895
- g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
896
- b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
897
- #else
898
- U32 R = load_3<U32>(rgb+0),
899
- G = load_3<U32>(rgb+1),
900
- B = load_3<U32>(rgb+2);
901
- // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
902
- r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
903
- g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
904
- b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
905
- #endif
906
- } break;
907
-
908
- case Op_load_16161616BE:{
909
- uintptr_t ptr = (uintptr_t)(src + 8*i);
910
- assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
911
- const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
912
- #if defined(USING_NEON)
913
- uint16x4x4_t v = vld4_u16(rgba);
914
- r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
915
- g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
916
- b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
917
- a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
918
- #else
919
- U64 px = swap_endian_16x4(load<U64>(rgba));
920
-
921
- r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
922
- g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
923
- b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
924
- a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
925
- #endif
926
- } break;
927
-
928
- case Op_load_hhh:{
929
- uintptr_t ptr = (uintptr_t)(src + 6*i);
930
- assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
931
- const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
932
- #if defined(USING_NEON)
933
- uint16x4x3_t v = vld3_u16(rgb);
934
- U16 R = (U16)v.val[0],
935
- G = (U16)v.val[1],
936
- B = (U16)v.val[2];
937
- #else
938
- U16 R = load_3<U16>(rgb+0),
939
- G = load_3<U16>(rgb+1),
940
- B = load_3<U16>(rgb+2);
941
- #endif
942
- r = F_from_Half(R);
943
- g = F_from_Half(G);
944
- b = F_from_Half(B);
945
- } break;
946
-
947
- case Op_load_hhhh:{
948
- uintptr_t ptr = (uintptr_t)(src + 8*i);
949
- assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
950
- const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
951
- #if defined(USING_NEON)
952
- uint16x4x4_t v = vld4_u16(rgba);
953
- U16 R = (U16)v.val[0],
954
- G = (U16)v.val[1],
955
- B = (U16)v.val[2],
956
- A = (U16)v.val[3];
957
- #else
958
- U64 px = load<U64>(rgba);
959
- U16 R = cast<U16>((px >> 0) & 0xffff),
960
- G = cast<U16>((px >> 16) & 0xffff),
961
- B = cast<U16>((px >> 32) & 0xffff),
962
- A = cast<U16>((px >> 48) & 0xffff);
963
- #endif
964
- r = F_from_Half(R);
965
- g = F_from_Half(G);
966
- b = F_from_Half(B);
967
- a = F_from_Half(A);
968
- } break;
969
-
970
- case Op_load_fff:{
971
- uintptr_t ptr = (uintptr_t)(src + 12*i);
972
- assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
973
- const float* rgb = (const float*)ptr; // cast to const float* to be safe.
974
- #if defined(USING_NEON)
975
- float32x4x3_t v = vld3q_f32(rgb);
976
- r = (F)v.val[0];
977
- g = (F)v.val[1];
978
- b = (F)v.val[2];
979
- #else
980
- r = load_3<F>(rgb+0);
981
- g = load_3<F>(rgb+1);
982
- b = load_3<F>(rgb+2);
983
- #endif
984
- } break;
985
-
986
- case Op_load_ffff:{
987
- uintptr_t ptr = (uintptr_t)(src + 16*i);
988
- assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
989
- const float* rgba = (const float*)ptr; // cast to const float* to be safe.
990
- #if defined(USING_NEON)
991
- float32x4x4_t v = vld4q_f32(rgba);
992
- r = (F)v.val[0];
993
- g = (F)v.val[1];
994
- b = (F)v.val[2];
995
- a = (F)v.val[3];
996
- #else
997
- r = load_4<F>(rgba+0);
998
- g = load_4<F>(rgba+1);
999
- b = load_4<F>(rgba+2);
1000
- a = load_4<F>(rgba+3);
1001
- #endif
1002
- } break;
1003
-
1004
- case Op_swap_rb:{
1005
- F t = r;
1006
- r = b;
1007
- b = t;
1008
- } break;
1009
-
1010
- case Op_clamp:{
1011
- r = max_(F0, min_(r, F1));
1012
- g = max_(F0, min_(g, F1));
1013
- b = max_(F0, min_(b, F1));
1014
- a = max_(F0, min_(a, F1));
1015
- } break;
1016
-
1017
- case Op_invert:{
1018
- r = F1 - r;
1019
- g = F1 - g;
1020
- b = F1 - b;
1021
- a = F1 - a;
1022
- } break;
1023
-
1024
- case Op_force_opaque:{
1025
- a = F1;
1026
- } break;
1027
-
1028
- case Op_premul:{
1029
- r *= a;
1030
- g *= a;
1031
- b *= a;
1032
- } break;
1033
-
1034
- case Op_unpremul:{
1035
- F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1036
- r *= scale;
1037
- g *= scale;
1038
- b *= scale;
1039
- } break;
1040
-
1041
- case Op_matrix_3x3:{
1042
- const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
1043
- const float* m = &matrix->vals[0][0];
1044
-
1045
- F R = m[0]*r + m[1]*g + m[2]*b,
1046
- G = m[3]*r + m[4]*g + m[5]*b,
1047
- B = m[6]*r + m[7]*g + m[8]*b;
1048
-
1049
- r = R;
1050
- g = G;
1051
- b = B;
1052
- } break;
1053
-
1054
- case Op_matrix_3x4:{
1055
- const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
1056
- const float* m = &matrix->vals[0][0];
1057
-
1058
- F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1059
- G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1060
- B = m[8]*r + m[9]*g + m[10]*b + m[11];
1061
-
1062
- r = R;
1063
- g = G;
1064
- b = B;
1065
- } break;
1066
-
1067
- case Op_lab_to_xyz:{
1068
- // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
1069
- F L = r * 100.0f,
1070
- A = g * 255.0f - 128.0f,
1071
- B = b * 255.0f - 128.0f;
1072
-
1073
- // Convert to CIE XYZ.
1074
- F Y = (L + 16.0f) * (1/116.0f),
1075
- X = Y + A*(1/500.0f),
1076
- Z = Y - B*(1/200.0f);
1077
-
1078
- X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1079
- Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1080
- Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1081
-
1082
- // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1083
- r = X * 0.9642f;
1084
- g = Y ;
1085
- b = Z * 0.8249f;
1086
- } break;
1087
-
1088
- // As above, in reverse.
1089
- case Op_xyz_to_lab:{
1090
- F X = r * (1/0.9642f),
1091
- Y = g,
1092
- Z = b * (1/0.8249f);
1093
-
1094
- X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
1095
- Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
1096
- Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
1097
-
1098
- F L = Y*116.0f - 16.0f,
1099
- A = (X-Y)*500.0f,
1100
- B = (Y-Z)*200.0f;
1101
-
1102
- r = L * (1/100.f);
1103
- g = (A + 128.0f) * (1/255.0f);
1104
- b = (B + 128.0f) * (1/255.0f);
1105
- } break;
1106
-
1107
- case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1108
- case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1109
- case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1110
- case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1111
-
1112
- case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1113
- case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1114
- case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1115
- case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1116
-
1117
- case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1118
- case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1119
- case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1120
- case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1121
-
1122
- case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1123
- case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1124
- case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1125
- case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1126
-
1127
- case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1128
- case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1129
- case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1130
- case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1131
-
1132
- case Op_clut_A2B: {
1133
- const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1134
- clut(a2b, &r,&g,&b,a);
1135
-
1136
- if (a2b->input_channels == 4) {
1137
- // CMYK is opaque.
1138
- a = F1;
1139
- }
1140
- } break;
1141
-
1142
- case Op_clut_B2A: {
1143
- const skcms_B2A* b2a = (const skcms_B2A*) *args++;
1144
- clut(b2a, &r,&g,&b,&a);
1145
- } break;
1146
-
1147
- // Notice, from here on down the store_ ops all return, ending the loop.
1148
-
1149
- case Op_store_a8: {
1150
- store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1151
- } return;
1152
-
1153
- case Op_store_g8: {
1154
- // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1155
- store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1156
- } return;
1157
-
1158
- case Op_store_4444: {
1159
- store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1160
- | cast<U16>(to_fixed(g * 15) << 8)
1161
- | cast<U16>(to_fixed(b * 15) << 4)
1162
- | cast<U16>(to_fixed(a * 15) << 0));
1163
- } return;
1164
-
1165
- case Op_store_565: {
1166
- store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1167
- | cast<U16>(to_fixed(g * 63) << 5 )
1168
- | cast<U16>(to_fixed(b * 31) << 11 ));
1169
- } return;
1170
-
1171
- case Op_store_888: {
1172
- uint8_t* rgb = (uint8_t*)dst + 3*i;
1173
- #if defined(USING_NEON)
1174
- // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1175
- // get there via U16 to save some instructions converting to float. And just
1176
- // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1177
- U16 R = cast<U16>(to_fixed(r * 255)),
1178
- G = cast<U16>(to_fixed(g * 255)),
1179
- B = cast<U16>(to_fixed(b * 255));
1180
-
1181
- uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1182
- vst3_lane_u8(rgb+0, v, 0);
1183
- vst3_lane_u8(rgb+3, v, 2);
1184
- vst3_lane_u8(rgb+6, v, 4);
1185
- vst3_lane_u8(rgb+9, v, 6);
1186
- #else
1187
- store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1188
- store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1189
- store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1190
- #endif
1191
- } return;
1192
-
1193
- case Op_store_8888: {
1194
- store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
1195
- | cast<U32>(to_fixed(g * 255)) << 8
1196
- | cast<U32>(to_fixed(b * 255)) << 16
1197
- | cast<U32>(to_fixed(a * 255)) << 24);
1198
- } return;
1199
-
1200
- case Op_store_101010x_XR: {
1201
- static constexpr float min = -0.752941f;
1202
- static constexpr float max = 1.25098f;
1203
- static constexpr float range = max - min;
1204
- store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) << 0
1205
- | cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
1206
- | cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
1207
- return;
788
+ struct NoCtx {};
789
+
790
+ struct Ctx {
791
+ const void* fArg;
792
+ operator NoCtx() { return NoCtx{}; }
793
+ template <typename T> operator T*() { return (const T*)fArg; }
794
+ };
795
+
796
+ #define STAGE_PARAMS(MAYBE_REF) SKCMS_MAYBE_UNUSED const char* src, \
797
+ SKCMS_MAYBE_UNUSED char* dst, \
798
+ SKCMS_MAYBE_UNUSED F MAYBE_REF r, \
799
+ SKCMS_MAYBE_UNUSED F MAYBE_REF g, \
800
+ SKCMS_MAYBE_UNUSED F MAYBE_REF b, \
801
+ SKCMS_MAYBE_UNUSED F MAYBE_REF a, \
802
+ SKCMS_MAYBE_UNUSED int i
803
+
804
+ #if SKCMS_HAS_MUSTTAIL
805
+
806
+ // Stages take a stage list, and each stage is responsible for tail-calling the next one.
807
+ //
808
+ // Unfortunately, we can't declare a StageFn as a function pointer which takes a pointer to
809
+ // another StageFn; declaring this leads to a circular dependency. To avoid this, StageFn is
810
+ // wrapped in a single-element `struct StageList` which we are able to forward-declare.
811
+ struct StageList;
812
+ using StageFn = void (*)(StageList stages, const void** ctx, STAGE_PARAMS());
813
+ struct StageList {
814
+ const StageFn* fn;
815
+ };
816
+
817
+ #define DECLARE_STAGE(name, arg, CALL_NEXT) \
818
+ SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
819
+ \
820
+ SI void Exec_##name(StageList list, const void** ctx, STAGE_PARAMS()) { \
821
+ Exec_##name##_k(Ctx{*ctx}, src, dst, r, g, b, a, i); \
822
+ ++list.fn; ++ctx; \
823
+ CALL_NEXT; \
824
+ } \
825
+ \
826
+ SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
827
+
828
+ #define STAGE(name, arg) \
829
+ DECLARE_STAGE(name, arg, [[clang::musttail]] return (*list.fn)(list, ctx, src, dst, \
830
+ r, g, b, a, i))
831
+
832
+ #define FINAL_STAGE(name, arg) \
833
+ DECLARE_STAGE(name, arg, /* Stop executing stages and return to the caller. */)
834
+
835
+ #else
836
+
837
+ #define DECLARE_STAGE(name, arg) \
838
+ SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
839
+ \
840
+ SI void Exec_##name(const void* ctx, STAGE_PARAMS(&)) { \
841
+ Exec_##name##_k(Ctx{ctx}, src, dst, r, g, b, a, i); \
842
+ } \
843
+ \
844
+ SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
845
+
846
+ #define STAGE(name, arg) DECLARE_STAGE(name, arg)
847
+ #define FINAL_STAGE(name, arg) DECLARE_STAGE(name, arg)
848
+
849
+ #endif
850
+
851
+ STAGE(load_a8, NoCtx) {
852
+ a = F_from_U8(load<U8>(src + 1*i));
853
+ }
854
+
855
+ STAGE(load_g8, NoCtx) {
856
+ r = g = b = F_from_U8(load<U8>(src + 1*i));
857
+ }
858
+
859
+ STAGE(load_4444, NoCtx) {
860
+ U16 abgr = load<U16>(src + 2*i);
861
+
862
+ r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
863
+ g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
864
+ b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
865
+ a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
866
+ }
867
+
868
+ STAGE(load_565, NoCtx) {
869
+ U16 rgb = load<U16>(src + 2*i);
870
+
871
+ r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
872
+ g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
873
+ b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
874
+ }
875
+
876
+ STAGE(load_888, NoCtx) {
877
+ const uint8_t* rgb = (const uint8_t*)(src + 3*i);
878
+ #if defined(USING_NEON)
879
+ // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
880
+ // a time. Since we're doing that, we might as well load them into 16-bit lanes.
881
+ // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
882
+ uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
883
+ v = vld3_lane_u8(rgb+0, v, 0);
884
+ v = vld3_lane_u8(rgb+3, v, 2);
885
+ v = vld3_lane_u8(rgb+6, v, 4);
886
+ v = vld3_lane_u8(rgb+9, v, 6);
887
+
888
+ // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
889
+ // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
890
+ // ARMv7 and ARMv8 impls.)
891
+ r = cast<F>((U16)v.val[0]) * (1/255.0f);
892
+ g = cast<F>((U16)v.val[1]) * (1/255.0f);
893
+ b = cast<F>((U16)v.val[2]) * (1/255.0f);
894
+ #else
895
+ r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
896
+ g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
897
+ b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
898
+ #endif
899
+ }
900
+
901
+ STAGE(load_8888, NoCtx) {
902
+ U32 rgba = load<U32>(src + 4*i);
903
+
904
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
905
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
906
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
907
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
908
+ }
909
+
910
+ STAGE(load_1010102, NoCtx) {
911
+ U32 rgba = load<U32>(src + 4*i);
912
+
913
+ r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
914
+ g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
915
+ b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
916
+ a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
917
+ }
918
+
919
+ STAGE(load_101010x_XR, NoCtx) {
920
+ static constexpr float min = -0.752941f;
921
+ static constexpr float max = 1.25098f;
922
+ static constexpr float range = max - min;
923
+ U32 rgba = load<U32>(src + 4*i);
924
+ r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f) * range + min;
925
+ g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
926
+ b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
927
+ }
928
+
929
+ STAGE(load_161616LE, NoCtx) {
930
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
931
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
932
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
933
+ #if defined(USING_NEON)
934
+ uint16x4x3_t v = vld3_u16(rgb);
935
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
936
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
937
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
938
+ #else
939
+ r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
940
+ g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
941
+ b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
942
+ #endif
943
+ }
944
+
945
+ STAGE(load_16161616LE, NoCtx) {
946
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
947
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
948
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
949
+ #if defined(USING_NEON)
950
+ uint16x4x4_t v = vld4_u16(rgba);
951
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
952
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
953
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
954
+ a = cast<F>((U16)v.val[3]) * (1/65535.0f);
955
+ #else
956
+ U64 px = load<U64>(rgba);
957
+
958
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
959
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
960
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
961
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
962
+ #endif
963
+ }
964
+
965
+ STAGE(load_161616BE, NoCtx) {
966
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
967
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
968
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
969
+ #if defined(USING_NEON)
970
+ uint16x4x3_t v = vld3_u16(rgb);
971
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
972
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
973
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
974
+ #else
975
+ U32 R = load_3<U32>(rgb+0),
976
+ G = load_3<U32>(rgb+1),
977
+ B = load_3<U32>(rgb+2);
978
+ // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
979
+ r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
980
+ g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
981
+ b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
982
+ #endif
983
+ }
984
+
985
+ STAGE(load_16161616BE, NoCtx) {
986
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
987
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
988
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
989
+ #if defined(USING_NEON)
990
+ uint16x4x4_t v = vld4_u16(rgba);
991
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
992
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
993
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
994
+ a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
995
+ #else
996
+ U64 px = swap_endian_16x4(load<U64>(rgba));
997
+
998
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
999
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
1000
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
1001
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
1002
+ #endif
1003
+ }
1004
+
1005
+ STAGE(load_hhh, NoCtx) {
1006
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
1007
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1008
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1009
+ #if defined(USING_NEON)
1010
+ uint16x4x3_t v = vld3_u16(rgb);
1011
+ U16 R = (U16)v.val[0],
1012
+ G = (U16)v.val[1],
1013
+ B = (U16)v.val[2];
1014
+ #else
1015
+ U16 R = load_3<U16>(rgb+0),
1016
+ G = load_3<U16>(rgb+1),
1017
+ B = load_3<U16>(rgb+2);
1018
+ #endif
1019
+ r = F_from_Half(R);
1020
+ g = F_from_Half(G);
1021
+ b = F_from_Half(B);
1022
+ }
1023
+
1024
+ STAGE(load_hhhh, NoCtx) {
1025
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
1026
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1027
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1028
+ #if defined(USING_NEON)
1029
+ uint16x4x4_t v = vld4_u16(rgba);
1030
+ U16 R = (U16)v.val[0],
1031
+ G = (U16)v.val[1],
1032
+ B = (U16)v.val[2],
1033
+ A = (U16)v.val[3];
1034
+ #else
1035
+ U64 px = load<U64>(rgba);
1036
+ U16 R = cast<U16>((px >> 0) & 0xffff),
1037
+ G = cast<U16>((px >> 16) & 0xffff),
1038
+ B = cast<U16>((px >> 32) & 0xffff),
1039
+ A = cast<U16>((px >> 48) & 0xffff);
1040
+ #endif
1041
+ r = F_from_Half(R);
1042
+ g = F_from_Half(G);
1043
+ b = F_from_Half(B);
1044
+ a = F_from_Half(A);
1045
+ }
1046
+
+ STAGE(load_fff, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(src + 12*i);
+     assert( (ptr & 3) == 0 );               // src must be 4-byte aligned for this
+     const float* rgb = (const float*)ptr;   // cast to const float* to be safe.
+ #if defined(USING_NEON)
+     float32x4x3_t v = vld3q_f32(rgb);
+     r = (F)v.val[0];
+     g = (F)v.val[1];
+     b = (F)v.val[2];
+ #else
+     r = load_3<F>(rgb+0);
+     g = load_3<F>(rgb+1);
+     b = load_3<F>(rgb+2);
+ #endif
+ }
+
+ STAGE(load_ffff, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(src + 16*i);
+     assert( (ptr & 3) == 0 );               // src must be 4-byte aligned for this
+     const float* rgba = (const float*)ptr;  // cast to const float* to be safe.
+ #if defined(USING_NEON)
+     float32x4x4_t v = vld4q_f32(rgba);
+     r = (F)v.val[0];
+     g = (F)v.val[1];
+     b = (F)v.val[2];
+     a = (F)v.val[3];
+ #else
+     r = load_4<F>(rgba+0);
+     g = load_4<F>(rgba+1);
+     b = load_4<F>(rgba+2);
+     a = load_4<F>(rgba+3);
+ #endif
+ }
+
+ STAGE(swap_rb, NoCtx) {
+     F t = r;
+     r = b;
+     b = t;
+ }
+
+ STAGE(clamp, NoCtx) {
+     r = max_(F0, min_(r, F1));
+     g = max_(F0, min_(g, F1));
+     b = max_(F0, min_(b, F1));
+     a = max_(F0, min_(a, F1));
+ }
+
+ STAGE(invert, NoCtx) {
+     r = F1 - r;
+     g = F1 - g;
+     b = F1 - b;
+     a = F1 - a;
+ }
+
+ STAGE(force_opaque, NoCtx) {
+     a = F1;
+ }
+
+ STAGE(premul, NoCtx) {
+     r *= a;
+     g *= a;
+     b *= a;
+ }
+
+ STAGE(unpremul, NoCtx) {
+     F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
+     r *= scale;
+     g *= scale;
+     b *= scale;
+ }
+
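The if_then_else guard in unpremul exists for zero alpha: 1.0f/0.0f is +inf, and inf * 0 would turn r,g,b into NaNs, so that case is mapped to a scale of zero instead. Scalar equivalent of the guard (assuming <cmath> for INFINITY):

    float scale = (1.0f / a < INFINITY) ? 1.0f / a : 0.0f;
    // a == 0: 1/a == inf, the comparison fails, scale == 0 -> pixel stays (0,0,0,0)
    // a > 0:  scale == 1/a, undoing the premul stage above

So premul followed by unpremul is the identity for a > 0 and safely zero otherwise.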
+ STAGE(matrix_3x3, const skcms_Matrix3x3* matrix) {
+     const float* m = &matrix->vals[0][0];
+
+     F R = m[0]*r + m[1]*g + m[2]*b,
+       G = m[3]*r + m[4]*g + m[5]*b,
+       B = m[6]*r + m[7]*g + m[8]*b;
+
+     r = R;
+     g = G;
+     b = B;
+ }
+
+ STAGE(matrix_3x4, const skcms_Matrix3x4* matrix) {
+     const float* m = &matrix->vals[0][0];
+
+     F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
+       G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
+       B = m[8]*r + m[9]*g + m[10]*b + m[11];
+
+     r = R;
+     g = G;
+     b = B;
+ }
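matrix_3x4 is the affine variant of matrix_3x3: skcms_Matrix3x4 holds vals[3][4] row-major, so m[3], m[7], and m[11] act as a translate column on top of the 3x3 linear part. A hypothetical matrix illustrating the layout (scale each channel by 2, then add 0.1):

    skcms_Matrix3x4 m = {{
        { 2, 0, 0, 0.1f },   // R' = 2*R + 0.1
        { 0, 2, 0, 0.1f },   // G' = 2*G + 0.1
        { 0, 0, 2, 0.1f },   // B' = 2*B + 0.1
    }};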
+
+ STAGE(lab_to_xyz, NoCtx) {
+     // The L*a*b values are in r,g,b, but normalized to [0,1].  Reconstruct them:
+     F L = r * 100.0f,
+       A = g * 255.0f - 128.0f,
+       B = b * 255.0f - 128.0f;
+
+     // Convert to CIE XYZ.
+     F Y = (L + 16.0f) * (1/116.0f),
+       X = Y + A*(1/500.0f),
+       Z = Y - B*(1/200.0f);
+
+     X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
+     Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
+     Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
+
+     // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
+     r = X * 0.9642f;
+     g = Y;
+     b = Z * 0.8249f;
+ }
+
+ // As above, in reverse.
+ STAGE(xyz_to_lab, NoCtx) {
+     F X = r * (1/0.9642f),
+       Y = g,
+       Z = b * (1/0.8249f);
+
+     X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
+     Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
+     Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
+
+     F L = Y*116.0f - 16.0f,
+       A = (X-Y)*500.0f,
+       B = (Y-Z)*200.0f;
+
+     r = L * (1/100.f);
+     g = (A + 128.0f) * (1/255.0f);
+     b = (B + 128.0f) * (1/255.0f);
+ }
+
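A quick numeric check of lab_to_xyz (illustrative scalar arithmetic, not diff content): mid-gray, L* = 50 with a* = b* = 0, arrives as r = 0.5 and g = b = 128/255, and should land near 18.4% luminance.

    float L = 50.f, A = 0.f, Bv = 0.f;        // decoded from r,g,b as above
    float Y = (L + 16.f) / 116.f;             // 0.56897
    float X = Y + A / 500.f;                  // 0.56897
    float Z = Y - Bv / 200.f;                 // 0.56897
    float y3 = Y * Y * Y;                     // 0.18419 > 0.008856f, so the cube is kept
    // Result, scaled to the D50 white point:
    //   X = 0.18419 * 0.9642,  Y = 0.18419,  Z = 0.18419 * 0.8249

Feeding that back through xyz_to_lab recovers (0.5, 128/255, 128/255); the two stages are inverses up to the approx_pow tolerance.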
+ STAGE(gamma_r, const skcms_TransferFunction* tf) { r = apply_gamma(tf, r); }
+ STAGE(gamma_g, const skcms_TransferFunction* tf) { g = apply_gamma(tf, g); }
+ STAGE(gamma_b, const skcms_TransferFunction* tf) { b = apply_gamma(tf, b); }
+ STAGE(gamma_a, const skcms_TransferFunction* tf) { a = apply_gamma(tf, a); }
+
+ STAGE(gamma_rgb, const skcms_TransferFunction* tf) {
+     r = apply_gamma(tf, r);
+     g = apply_gamma(tf, g);
+     b = apply_gamma(tf, b);
+ }
+
+ STAGE(tf_r, const skcms_TransferFunction* tf) { r = apply_tf(tf, r); }
+ STAGE(tf_g, const skcms_TransferFunction* tf) { g = apply_tf(tf, g); }
+ STAGE(tf_b, const skcms_TransferFunction* tf) { b = apply_tf(tf, b); }
+ STAGE(tf_a, const skcms_TransferFunction* tf) { a = apply_tf(tf, a); }
+
+ STAGE(tf_rgb, const skcms_TransferFunction* tf) {
+     r = apply_tf(tf, r);
+     g = apply_tf(tf, g);
+     b = apply_tf(tf, b);
+ }
+
+ STAGE(pq_r, const skcms_TransferFunction* tf) { r = apply_pq(tf, r); }
+ STAGE(pq_g, const skcms_TransferFunction* tf) { g = apply_pq(tf, g); }
+ STAGE(pq_b, const skcms_TransferFunction* tf) { b = apply_pq(tf, b); }
+ STAGE(pq_a, const skcms_TransferFunction* tf) { a = apply_pq(tf, a); }
+
+ STAGE(pq_rgb, const skcms_TransferFunction* tf) {
+     r = apply_pq(tf, r);
+     g = apply_pq(tf, g);
+     b = apply_pq(tf, b);
+ }
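apply_pq evaluates the SMPTE ST 2084 (PQ) curve using coefficients carried in the skcms_TransferFunction. For orientation, a sketch with the standard PQ EOTF constants hard-coded (illustrative only; skcms reads these from tf, and its helper also handles signed input):

    #include <math.h>

    // PQ EOTF: encoded signal e in [0,1] -> linear light, where 1.0 == 10,000 nits.
    static inline float pq_eotf(float e) {
        const float m1 = 2610.f / 16384.f;          // 0.1593017578125
        const float m2 = 2523.f / 4096.f * 128.f;   // 78.84375
        const float c1 = 3424.f / 4096.f;           // 0.8359375
        const float c2 = 2413.f / 4096.f * 32.f;    // 18.8515625
        const float c3 = 2392.f / 4096.f * 32.f;    // 18.6875
        float p = powf(e, 1.f / m2);
        return powf(fmaxf(p - c1, 0.f) / (c2 - c3 * p), 1.f / m1);
    }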
+
+ STAGE(hlg_r, const skcms_TransferFunction* tf) { r = apply_hlg(tf, r); }
+ STAGE(hlg_g, const skcms_TransferFunction* tf) { g = apply_hlg(tf, g); }
+ STAGE(hlg_b, const skcms_TransferFunction* tf) { b = apply_hlg(tf, b); }
+ STAGE(hlg_a, const skcms_TransferFunction* tf) { a = apply_hlg(tf, a); }
+
+ STAGE(hlg_rgb, const skcms_TransferFunction* tf) {
+     r = apply_hlg(tf, r);
+     g = apply_hlg(tf, g);
+     b = apply_hlg(tf, b);
+ }
+
+ STAGE(hlginv_r, const skcms_TransferFunction* tf) { r = apply_hlginv(tf, r); }
+ STAGE(hlginv_g, const skcms_TransferFunction* tf) { g = apply_hlginv(tf, g); }
+ STAGE(hlginv_b, const skcms_TransferFunction* tf) { b = apply_hlginv(tf, b); }
+ STAGE(hlginv_a, const skcms_TransferFunction* tf) { a = apply_hlginv(tf, a); }
+
+ STAGE(hlginv_rgb, const skcms_TransferFunction* tf) {
+     r = apply_hlginv(tf, r);
+     g = apply_hlginv(tf, g);
+     b = apply_hlginv(tf, b);
+ }
+
+ STAGE(table_r, const skcms_Curve* curve) { r = table(curve, r); }
+ STAGE(table_g, const skcms_Curve* curve) { g = table(curve, g); }
+ STAGE(table_b, const skcms_Curve* curve) { b = table(curve, b); }
+ STAGE(table_a, const skcms_Curve* curve) { a = table(curve, a); }
+
+ STAGE(clut_A2B, const skcms_A2B* a2b) {
+     clut(a2b, &r,&g,&b,a);
+
+     if (a2b->input_channels == 4) {
+         // CMYK is opaque.
+         a = F1;
+     }
+ }
+
+ STAGE(clut_B2A, const skcms_B2A* b2a) {
+     clut(b2a, &r,&g,&b,&a);
+ }
+
+ // From here on down, the store_ ops are all "final stages," terminating processing of this group.
+
+ FINAL_STAGE(store_a8, NoCtx) {
+     store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
+ }
+
+ FINAL_STAGE(store_g8, NoCtx) {
+     // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
+     store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
+ }
+
+ FINAL_STAGE(store_4444, NoCtx) {
+     store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
+                         | cast<U16>(to_fixed(g * 15) <<  8)
+                         | cast<U16>(to_fixed(b * 15) <<  4)
+                         | cast<U16>(to_fixed(a * 15) <<  0));
+ }
+
+ FINAL_STAGE(store_565, NoCtx) {
+     store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) <<  0 )
+                         | cast<U16>(to_fixed(g * 63) <<  5 )
+                         | cast<U16>(to_fixed(b * 31) << 11 ));
+ }
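Note the field order: unlike store_4444, store_565 shifts red into the low five bits (the little-endian RGB565 layout). With r,g,b already clamped to [0,1], the packing is equivalent to this scalar sketch, where to_fixed(x) rounds to nearest, i.e. (int)(x + 0.5f) for non-negative x:

    uint16_t px = (uint16_t)( (unsigned)(r * 31 + 0.5f) <<  0    // 5 bits of red
                            | (unsigned)(g * 63 + 0.5f) <<  5    // 6 bits of green
                            | (unsigned)(b * 31 + 0.5f) << 11 ); // 5 bits of blue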
+
+ FINAL_STAGE(store_888, NoCtx) {
+     uint8_t* rgb = (uint8_t*)dst + 3*i;
+ #if defined(USING_NEON)
+     // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
+     // get there via U16 to save some instructions converting to float.  And just
+     // like load_888, we'd prefer to go via U32 but for ARMv7 support.
+     U16 R = cast<U16>(to_fixed(r * 255)),
+         G = cast<U16>(to_fixed(g * 255)),
+         B = cast<U16>(to_fixed(b * 255));
+
+     uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
+     vst3_lane_u8(rgb+0, v, 0);
+     vst3_lane_u8(rgb+3, v, 2);
+     vst3_lane_u8(rgb+6, v, 4);
+     vst3_lane_u8(rgb+9, v, 6);
+ #else
+     store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
+     store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
+     store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
+ #endif
+ }
+
+ FINAL_STAGE(store_8888, NoCtx) {
+     store(dst + 4*i, cast<U32>(to_fixed(r * 255)) <<  0
+                    | cast<U32>(to_fixed(g * 255)) <<  8
+                    | cast<U32>(to_fixed(b * 255)) << 16
+                    | cast<U32>(to_fixed(a * 255)) << 24);
+ }
+
+ FINAL_STAGE(store_101010x_XR, NoCtx) {
+     static constexpr float min = -0.752941f;
+     static constexpr float max = 1.25098f;
+     static constexpr float range = max - min;
+     store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) <<  0
+                    | cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
+                    | cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
+     return;
+ }
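store_101010x_XR writes the extended-range 10-bit format: the interval [-0.752941, 1.25098] is mapped linearly onto codes 0..1023, leaving foot- and head-room so values outside [0,1] survive the encoding. Scalar sketch for one channel (encode_xr10 is hypothetical):

    uint32_t encode_xr10(float x) {                   // x may lie outside [0,1]
        const float lo = -0.752941f, hi = 1.25098f;   // constants from the stage above
        return (uint32_t)(((x - lo) / (hi - lo)) * 1023 + 0.5f);
    }
    // encode_xr10(0.0f) == 384 and encode_xr10(1.0f) == 895, so 384 codes sit
    // below zero and 128 above one for out-of-range values.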
+
+ FINAL_STAGE(store_1010102, NoCtx) {
+     store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
+                    | cast<U32>(to_fixed(g * 1023)) << 10
+                    | cast<U32>(to_fixed(b * 1023)) << 20
+                    | cast<U32>(to_fixed(a *    3)) << 30);
+ }
+
+ FINAL_STAGE(store_161616LE, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 6*i);
+     assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
+     uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON)
+     uint16x4x3_t v = {{
+         (uint16x4_t)U16_from_F(r),
+         (uint16x4_t)U16_from_F(g),
+         (uint16x4_t)U16_from_F(b),
+     }};
+     vst3_u16(rgb, v);
+ #else
+     store_3(rgb+0, U16_from_F(r));
+     store_3(rgb+1, U16_from_F(g));
+     store_3(rgb+2, U16_from_F(b));
+ #endif
+
+ }
+
+ FINAL_STAGE(store_16161616LE, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 8*i);
+     assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
+     uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON)
+     uint16x4x4_t v = {{
+         (uint16x4_t)U16_from_F(r),
+         (uint16x4_t)U16_from_F(g),
+         (uint16x4_t)U16_from_F(b),
+         (uint16x4_t)U16_from_F(a),
+     }};
+     vst4_u16(rgba, v);
+ #else
+     U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
+            | cast<U64>(to_fixed(g * 65535)) << 16
+            | cast<U64>(to_fixed(b * 65535)) << 32
+            | cast<U64>(to_fixed(a * 65535)) << 48;
+     store(rgba, px);
+ #endif
+ }
+
+ FINAL_STAGE(store_161616BE, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 6*i);
+     assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
+     uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON)
+     uint16x4x3_t v = {{
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
+     }};
+     vst3_u16(rgb, v);
+ #else
+     U32 R = to_fixed(r * 65535),
+         G = to_fixed(g * 65535),
+         B = to_fixed(b * 65535);
+     store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
+     store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
+     store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
+ #endif
+
+ }
+
+ FINAL_STAGE(store_16161616BE, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 8*i);
+     assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
+     uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON)
+     uint16x4x4_t v = {{
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
+     }};
+     vst4_u16(rgba, v);
+ #else
+     U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
+            | cast<U64>(to_fixed(g * 65535)) << 16
+            | cast<U64>(to_fixed(b * 65535)) << 32
+            | cast<U64>(to_fixed(a * 65535)) << 48;
+     store(rgba, swap_endian_16x4(px));
+ #endif
+ }
+
+ FINAL_STAGE(store_hhh, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 6*i);
+     assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
+     uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
+
+     U16 R = Half_from_F(r),
+         G = Half_from_F(g),
+         B = Half_from_F(b);
+ #if defined(USING_NEON)
+     uint16x4x3_t v = {{
+         (uint16x4_t)R,
+         (uint16x4_t)G,
+         (uint16x4_t)B,
+     }};
+     vst3_u16(rgb, v);
+ #else
+     store_3(rgb+0, R);
+     store_3(rgb+1, G);
+     store_3(rgb+2, B);
+ #endif
+ }
+
+ FINAL_STAGE(store_hhhh, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 8*i);
+     assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
+     uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
+
+     U16 R = Half_from_F(r),
+         G = Half_from_F(g),
+         B = Half_from_F(b),
+         A = Half_from_F(a);
+ #if defined(USING_NEON)
+     uint16x4x4_t v = {{
+         (uint16x4_t)R,
+         (uint16x4_t)G,
+         (uint16x4_t)B,
+         (uint16x4_t)A,
+     }};
+     vst4_u16(rgba, v);
+ #else
+     store(rgba, cast<U64>(R) <<  0
+               | cast<U64>(G) << 16
+               | cast<U64>(B) << 32
+               | cast<U64>(A) << 48);
+ #endif
+ }
+
+ FINAL_STAGE(store_fff, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 12*i);
+     assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
+     float* rgb = (float*)ptr;   // for this cast to float* to be safe.
+ #if defined(USING_NEON)
+     float32x4x3_t v = {{
+         (float32x4_t)r,
+         (float32x4_t)g,
+         (float32x4_t)b,
+     }};
+     vst3q_f32(rgb, v);
+ #else
+     store_3(rgb+0, r);
+     store_3(rgb+1, g);
+     store_3(rgb+2, b);
+ #endif
+ }
+
+ FINAL_STAGE(store_ffff, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 16*i);
+     assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
+     float* rgba = (float*)ptr;  // for this cast to float* to be safe.
+ #if defined(USING_NEON)
+     float32x4x4_t v = {{
+         (float32x4_t)r,
+         (float32x4_t)g,
+         (float32x4_t)b,
+         (float32x4_t)a,
+     }};
+     vst4q_f32(rgba, v);
+ #else
+     store_4(rgba+0, r);
+     store_4(rgba+1, g);
+     store_4(rgba+2, b);
+     store_4(rgba+3, a);
+ #endif
+ }
+
+ #if SKCMS_HAS_MUSTTAIL
+
+ SI void exec_stages(StageFn* stages, const void** contexts, const char* src, char* dst, int i) {
+     (*stages)({stages}, contexts, src, dst, F0, F0, F0, F1, i);
+ }
+
+ #else
+
+ static void exec_stages(const Op* ops, const void** contexts,
+                         const char* src, char* dst, int i) {
+     F r = F0, g = F0, b = F0, a = F1;
+     while (true) {
+         switch (*ops++) {
+ #define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
+             SKCMS_WORK_OPS(M)
+ #undef M
+ #define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
+             SKCMS_STORE_OPS(M)
+ #undef M
  }
- case Op_store_1010102: {
-     store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
-                    | cast<U32>(to_fixed(g * 1023)) << 10
-                    | cast<U32>(to_fixed(b * 1023)) << 20
-                    | cast<U32>(to_fixed(a *    3)) << 30);
- } return;
-
- case Op_store_161616LE: {
-     uintptr_t ptr = (uintptr_t)(dst + 6*i);
-     assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
-     uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
- #if defined(USING_NEON)
-     uint16x4x3_t v = {{
-         (uint16x4_t)U16_from_F(r),
-         (uint16x4_t)U16_from_F(g),
-         (uint16x4_t)U16_from_F(b),
-     }};
-     vst3_u16(rgb, v);
- #else
-     store_3(rgb+0, U16_from_F(r));
-     store_3(rgb+1, U16_from_F(g));
-     store_3(rgb+2, U16_from_F(b));
- #endif
-
- } return;
-
- case Op_store_16161616LE: {
-     uintptr_t ptr = (uintptr_t)(dst + 8*i);
-     assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
-     uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
- #if defined(USING_NEON)
-     uint16x4x4_t v = {{
-         (uint16x4_t)U16_from_F(r),
-         (uint16x4_t)U16_from_F(g),
-         (uint16x4_t)U16_from_F(b),
-         (uint16x4_t)U16_from_F(a),
-     }};
-     vst4_u16(rgba, v);
- #else
-     U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
-            | cast<U64>(to_fixed(g * 65535)) << 16
-            | cast<U64>(to_fixed(b * 65535)) << 32
-            | cast<U64>(to_fixed(a * 65535)) << 48;
-     store(rgba, px);
- #endif
- } return;
-
- case Op_store_161616BE: {
-     uintptr_t ptr = (uintptr_t)(dst + 6*i);
-     assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
-     uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
- #if defined(USING_NEON)
-     uint16x4x3_t v = {{
-         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
-         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
-         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
-     }};
-     vst3_u16(rgb, v);
- #else
-     U32 R = to_fixed(r * 65535),
-         G = to_fixed(g * 65535),
-         B = to_fixed(b * 65535);
-     store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
-     store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
-     store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
- #endif
-
- } return;
-
- case Op_store_16161616BE: {
-     uintptr_t ptr = (uintptr_t)(dst + 8*i);
-     assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
-     uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
- #if defined(USING_NEON)
-     uint16x4x4_t v = {{
-         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
-         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
-         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
-         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
-     }};
-     vst4_u16(rgba, v);
- #else
-     U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
-            | cast<U64>(to_fixed(g * 65535)) << 16
-            | cast<U64>(to_fixed(b * 65535)) << 32
-            | cast<U64>(to_fixed(a * 65535)) << 48;
-     store(rgba, swap_endian_16x4(px));
- #endif
- } return;
-
- case Op_store_hhh: {
-     uintptr_t ptr = (uintptr_t)(dst + 6*i);
-     assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
-     uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
-
-     U16 R = Half_from_F(r),
-         G = Half_from_F(g),
-         B = Half_from_F(b);
- #if defined(USING_NEON)
-     uint16x4x3_t v = {{
-         (uint16x4_t)R,
-         (uint16x4_t)G,
-         (uint16x4_t)B,
-     }};
-     vst3_u16(rgb, v);
- #else
-     store_3(rgb+0, R);
-     store_3(rgb+1, G);
-     store_3(rgb+2, B);
- #endif
- } return;
-
- case Op_store_hhhh: {
-     uintptr_t ptr = (uintptr_t)(dst + 8*i);
-     assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
-     uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
-
-     U16 R = Half_from_F(r),
-         G = Half_from_F(g),
-         B = Half_from_F(b),
-         A = Half_from_F(a);
- #if defined(USING_NEON)
-     uint16x4x4_t v = {{
-         (uint16x4_t)R,
-         (uint16x4_t)G,
-         (uint16x4_t)B,
-         (uint16x4_t)A,
-     }};
-     vst4_u16(rgba, v);
- #else
-     store(rgba, cast<U64>(R) <<  0
-               | cast<U64>(G) << 16
-               | cast<U64>(B) << 32
-               | cast<U64>(A) << 48);
- #endif
-
- } return;
-
- case Op_store_fff: {
-     uintptr_t ptr = (uintptr_t)(dst + 12*i);
-     assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
-     float* rgb = (float*)ptr;   // for this cast to float* to be safe.
- #if defined(USING_NEON)
-     float32x4x3_t v = {{
-         (float32x4_t)r,
-         (float32x4_t)g,
-         (float32x4_t)b,
-     }};
-     vst3q_f32(rgb, v);
- #else
-     store_3(rgb+0, r);
-     store_3(rgb+1, g);
-     store_3(rgb+2, b);
- #endif
- } return;
-
- case Op_store_ffff: {
-     uintptr_t ptr = (uintptr_t)(dst + 16*i);
-     assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
-     float* rgba = (float*)ptr;  // for this cast to float* to be safe.
- #if defined(USING_NEON)
-     float32x4x4_t v = {{
-         (float32x4_t)r,
-         (float32x4_t)g,
-         (float32x4_t)b,
-         (float32x4_t)a,
-     }};
-     vst4q_f32(rgba, v);
- #else
-     store_4(rgba+0, r);
-     store_4(rgba+1, g);
-     store_4(rgba+2, b);
-     store_4(rgba+3, a);
- #endif
- } return;
  }
  }
- }

+ #endif
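That #endif closes the core of this restructuring: when SKCMS_HAS_MUSTTAIL is set, every Exec_ stage generated by STAGE ends in a guaranteed tail call to the next function pointer, so r,g,b,a ride in registers across the whole pipeline instead of round-tripping through a switch on every op, while FINAL_STAGE bodies simply return. A self-contained sketch of the pattern (hypothetical names; the attribute is clang's, which is what SKCMS_HAS_MUSTTAIL gates on):

    using F = float;    // stand-in for the SIMD vector type
    struct Stage;
    using StageFn = void (*)(const Stage* next, F r, F g, F b, F a);
    struct Stage { StageFn fn; };

    static void premul_stage(const Stage* next, F r, F g, F b, F a) {
        r *= a; g *= a; b *= a;                               // the stage body
        [[clang::musttail]] return next->fn(next + 1, r, g, b, a);
    }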
+
+ // NOLINTNEXTLINE(misc-definitions-in-headers)
+ void run_program(const Op* program, const void** contexts, SKCMS_MAYBE_UNUSED ptrdiff_t programSize,
+                  const char* src, char* dst, int n,
+                  const size_t src_bpp, const size_t dst_bpp) {
+ #if SKCMS_HAS_MUSTTAIL
+     // Convert the program into an array of tailcall stages.
+     StageFn stages[32];
+     assert(programSize <= ARRAY_COUNT(stages));
+
+     static constexpr StageFn kStageFns[] = {
+ #define M(name) &Exec_##name,
+         SKCMS_WORK_OPS(M)
+         SKCMS_STORE_OPS(M)
+ #undef M
+     };
+
+     for (ptrdiff_t index = 0; index < programSize; ++index) {
+         stages[index] = kStageFns[(int)program[index]];
+     }
+ #else
+     // Use the op array as-is.
+     const Op* stages = program;
+ #endif

- static void run_program(const Op* program, const void** arguments,
-                         const char* src, char* dst, int n,
-                         const size_t src_bpp, const size_t dst_bpp) {
      int i = 0;
      while (n >= N) {
-         exec_ops(program, arguments, src, dst, i);
+         exec_stages(stages, contexts, src, dst, i);
          i += N;
          n -= N;
      }
@@ -1399,30 +1549,7 @@ static void run_program(const Op* program, const void** arguments,
      char tmp[4*4*N] = {0};

      memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
-     exec_ops(program, arguments, tmp, tmp, 0);
+     exec_stages(stages, contexts, tmp, tmp, 0);
      memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
  }
  }
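One detail the @@ hunk hides: the scratch-buffer tail above sits inside an if (n > 0) guard in the full source. run_program advances N pixels per pass (N is the SIMD lane count), and the final partial group is bounced through a zeroed stack buffer so the vector loads and stores in the stages never touch memory outside the caller's rows. The driver's overall shape, reconstructed as a sketch:

    int i = 0;
    while (n >= N) {                     // whole vectors first
        exec_stages(stages, contexts, src, dst, i);
        i += N;
        n -= N;
    }
    if (n > 0) {                         // then 0 < n < N leftover pixels
        char tmp[4*4*N] = {0};           // worst case: 4 channels x 4 bytes x N pixels
        memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
        exec_stages(stages, contexts, tmp, tmp, 0);
        memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
    }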
-
- // Clean up any #defines we may have set so that we can be #included again.
- #if defined(USING_AVX)
-     #undef USING_AVX
- #endif
- #if defined(USING_AVX_F16C)
-     #undef USING_AVX_F16C
- #endif
- #if defined(USING_AVX2)
-     #undef USING_AVX2
- #endif
- #if defined(USING_AVX512F)
-     #undef USING_AVX512F
- #endif
-
- #if defined(USING_NEON)
-     #undef USING_NEON
- #endif
- #if defined(USING_NEON_F16C)
-     #undef USING_NEON_F16C
- #endif
-
- #undef FALLTHROUGH