@shopify/react-native-skia 0.1.234 → 0.1.236

Files changed (262)
  1. package/cpp/api/JsiSkTypefaceFactory.h +2 -1
  2. package/cpp/skia/include/android/AHardwareBufferUtils.h +23 -0
  3. package/cpp/skia/include/android/GrAHardwareBufferUtils.h +2 -0
  4. package/cpp/skia/include/android/graphite/SurfaceAndroid.h +59 -0
  5. package/cpp/skia/include/codec/SkAvifDecoder.h +1 -1
  6. package/cpp/skia/include/codec/SkBmpDecoder.h +1 -1
  7. package/cpp/skia/include/codec/SkCodec.h +21 -3
  8. package/cpp/skia/include/codec/SkGifDecoder.h +1 -1
  9. package/cpp/skia/include/codec/SkIcoDecoder.h +1 -1
  10. package/cpp/skia/include/codec/SkJpegDecoder.h +1 -1
  11. package/cpp/skia/include/codec/SkJpegxlDecoder.h +1 -1
  12. package/cpp/skia/include/codec/SkPngDecoder.h +1 -1
  13. package/cpp/skia/include/codec/SkRawDecoder.h +1 -1
  14. package/cpp/skia/include/codec/SkWbmpDecoder.h +1 -1
  15. package/cpp/skia/include/codec/SkWebpDecoder.h +1 -1
  16. package/cpp/skia/include/config/SkUserConfig.h +3 -1
  17. package/cpp/skia/include/core/SkCanvas.h +66 -37
  18. package/cpp/skia/include/core/SkColorFilter.h +5 -2
  19. package/cpp/skia/include/core/SkContourMeasure.h +1 -0
  20. package/cpp/skia/include/core/SkDocument.h +1 -0
  21. package/cpp/skia/include/core/SkFont.h +14 -24
  22. package/cpp/skia/include/core/SkFontArguments.h +1 -1
  23. package/cpp/skia/include/core/SkFontMetrics.h +1 -1
  24. package/cpp/skia/include/core/SkFontMgr.h +0 -7
  25. package/cpp/skia/include/core/SkGraphics.h +13 -0
  26. package/cpp/skia/include/core/SkMesh.h +9 -13
  27. package/cpp/skia/include/core/SkMilestone.h +1 -1
  28. package/cpp/skia/include/core/SkPathMeasure.h +2 -0
  29. package/cpp/skia/include/core/SkSerialProcs.h +29 -11
  30. package/cpp/skia/include/core/SkSize.h +3 -3
  31. package/cpp/skia/include/core/SkStream.h +3 -13
  32. package/cpp/skia/include/core/SkSurface.h +6 -3
  33. package/cpp/skia/include/core/SkSurfaceProps.h +2 -4
  34. package/cpp/skia/include/core/SkTraceMemoryDump.h +15 -0
  35. package/cpp/skia/include/core/SkTypeface.h +8 -56
  36. package/cpp/skia/include/core/SkTypes.h +8 -0
  37. package/cpp/skia/include/core/SkVertices.h +1 -1
  38. package/cpp/skia/include/docs/SkMultiPictureDocument.h +53 -0
  39. package/cpp/skia/include/docs/SkPDFDocument.h +11 -0
  40. package/cpp/skia/include/effects/SkGradientShader.h +9 -0
  41. package/cpp/skia/include/effects/SkRuntimeEffect.h +3 -7
  42. package/cpp/skia/include/gpu/GrBackendSemaphore.h +33 -47
  43. package/cpp/skia/include/gpu/GrBackendSurface.h +2 -3
  44. package/cpp/skia/include/gpu/GrContextOptions.h +0 -6
  45. package/cpp/skia/include/gpu/GrContextThreadSafeProxy.h +44 -28
  46. package/cpp/skia/include/gpu/GrDirectContext.h +12 -31
  47. package/cpp/skia/include/gpu/GrTypes.h +1 -16
  48. package/cpp/skia/include/gpu/MutableTextureState.h +35 -80
  49. package/cpp/skia/include/gpu/ShaderErrorHandler.h +11 -1
  50. package/cpp/skia/include/gpu/ganesh/SkImageGanesh.h +2 -2
  51. package/cpp/skia/include/gpu/ganesh/SkSurfaceGanesh.h +1 -1
  52. package/cpp/skia/include/gpu/ganesh/gl/GrGLDirectContext.h +3 -2
  53. package/cpp/skia/include/gpu/ganesh/vk/GrVkBackendSemaphore.h +20 -0
  54. package/cpp/skia/include/gpu/ganesh/vk/GrVkDirectContext.h +30 -0
  55. package/cpp/skia/include/gpu/gl/GrGLFunctions.h +1 -1
  56. package/cpp/skia/include/gpu/gl/GrGLInterface.h +2 -0
  57. package/cpp/skia/include/gpu/gl/glx/GrGLMakeGLXInterface.h +6 -0
  58. package/cpp/skia/include/gpu/graphite/BackendSemaphore.h +3 -3
  59. package/cpp/skia/include/gpu/graphite/BackendTexture.h +39 -27
  60. package/cpp/skia/include/gpu/graphite/Context.h +39 -13
  61. package/cpp/skia/include/gpu/graphite/ContextOptions.h +2 -0
  62. package/cpp/skia/include/gpu/graphite/GraphiteTypes.h +2 -1
  63. package/cpp/skia/include/gpu/graphite/Image.h +106 -87
  64. package/cpp/skia/include/gpu/graphite/Recorder.h +24 -3
  65. package/cpp/skia/include/gpu/graphite/Surface.h +7 -2
  66. package/cpp/skia/include/gpu/graphite/dawn/DawnBackendContext.h +41 -2
  67. package/cpp/skia/include/gpu/graphite/dawn/DawnTypes.h +11 -6
  68. package/cpp/skia/include/gpu/graphite/mtl/MtlGraphiteTypes.h +1 -2
  69. package/cpp/skia/include/gpu/graphite/vk/VulkanGraphiteTypes.h +6 -6
  70. package/cpp/skia/include/gpu/mock/GrMockTypes.h +1 -0
  71. package/cpp/skia/include/gpu/vk/GrVkBackendContext.h +1 -1
  72. package/cpp/skia/include/gpu/vk/GrVkTypes.h +1 -44
  73. package/cpp/skia/include/gpu/vk/VulkanExtensions.h +1 -1
  74. package/cpp/skia/include/gpu/vk/VulkanMutableTextureState.h +25 -0
  75. package/cpp/skia/include/gpu/vk/VulkanTypes.h +44 -0
  76. package/cpp/skia/include/ports/SkFontConfigInterface.h +3 -6
  77. package/cpp/skia/include/private/SkEncodedInfo.h +5 -0
  78. package/cpp/skia/include/private/SkExif.h +102 -0
  79. package/cpp/skia/include/private/SkGainmapInfo.h +11 -1
  80. package/cpp/skia/include/private/base/SkAssert.h +16 -0
  81. package/cpp/skia/include/private/base/SkDeque.h +2 -7
  82. package/cpp/skia/include/private/base/SkLoadUserConfig.h +1 -1
  83. package/cpp/skia/include/private/base/SkTArray.h +69 -28
  84. package/cpp/skia/include/private/base/SkThreadAnnotations.h +18 -5
  85. package/cpp/skia/include/private/chromium/GrSurfaceCharacterization.h +26 -30
  86. package/cpp/skia/include/private/chromium/GrVkSecondaryCBDrawContext.h +4 -3
  87. package/cpp/skia/include/private/chromium/SkImageChromium.h +1 -1
  88. package/cpp/skia/include/private/gpu/ganesh/GrTypesPriv.h +8 -6
  89. package/cpp/skia/include/private/gpu/graphite/ContextOptionsPriv.h +29 -0
  90. package/cpp/skia/include/private/gpu/graphite/DawnTypesPriv.h +12 -9
  91. package/cpp/skia/include/private/gpu/graphite/VulkanGraphiteTypesPriv.h +16 -11
  92. package/cpp/skia/include/third_party/vulkan/vulkan/vk_platform.h +2 -2
  93. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h264std.h +312 -0
  94. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h264std_decode.h +77 -0
  95. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h265std.h +446 -0
  96. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h265std_decode.h +67 -0
  97. package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codecs_common.h +36 -0
  98. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan.h +9 -2
  99. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_android.h +31 -3
  100. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_core.h +10624 -5716
  101. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_ios.h +2 -1
  102. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_macos.h +2 -1
  103. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_win32.h +28 -1
  104. package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_xcb.h +2 -1
  105. package/cpp/skia/include/utils/mac/SkCGUtils.h +23 -11
  106. package/cpp/skia/modules/skcms/skcms.h +2 -410
  107. package/cpp/skia/modules/skcms/src/Transform_inl.h +831 -704
  108. package/cpp/skia/modules/skcms/src/skcms_Transform.h +161 -0
  109. package/cpp/skia/modules/skcms/src/skcms_internals.h +136 -0
  110. package/cpp/skia/modules/skcms/src/skcms_public.h +404 -0
  111. package/cpp/skia/modules/skparagraph/include/FontArguments.h +1 -1
  112. package/cpp/skia/modules/skparagraph/include/FontCollection.h +2 -0
  113. package/cpp/skia/modules/skparagraph/include/Paragraph.h +2 -2
  114. package/cpp/skia/modules/skparagraph/include/TextStyle.h +4 -3
  115. package/cpp/skia/modules/skparagraph/include/TypefaceFontProvider.h +1 -3
  116. package/cpp/skia/modules/skresources/include/SkResources.h +28 -17
  117. package/cpp/skia/modules/skunicode/include/SkUnicode.h +12 -0
  118. package/cpp/skia/modules/svg/include/SkSVGDOM.h +4 -1
  119. package/cpp/skia/modules/svg/include/SkSVGRenderContext.h +4 -1
  120. package/cpp/skia/src/base/SkUTF.h +7 -0
  121. package/cpp/skia/src/core/SkTHash.h +20 -8
  122. package/lib/commonjs/dom/nodes/JsiSkDOM.d.ts +3 -2
  123. package/lib/commonjs/dom/nodes/JsiSkDOM.js +56 -57
  124. package/lib/commonjs/dom/nodes/JsiSkDOM.js.map +1 -1
  125. package/lib/commonjs/external/reanimated/index.d.ts +1 -0
  126. package/lib/commonjs/external/reanimated/index.js +13 -0
  127. package/lib/commonjs/external/reanimated/index.js.map +1 -1
  128. package/lib/commonjs/external/reanimated/interpolators.js +13 -1
  129. package/lib/commonjs/external/reanimated/interpolators.js.map +1 -1
  130. package/lib/commonjs/external/reanimated/moduleWrapper.d.ts +1 -0
  131. package/lib/commonjs/external/reanimated/moduleWrapper.js +5 -3
  132. package/lib/commonjs/external/reanimated/moduleWrapper.js.map +1 -1
  133. package/lib/commonjs/external/reanimated/textures.d.ts +5 -0
  134. package/lib/commonjs/external/reanimated/textures.js +52 -0
  135. package/lib/commonjs/external/reanimated/textures.js.map +1 -0
  136. package/lib/commonjs/headless/index.js +1 -1
  137. package/lib/commonjs/headless/index.js.map +1 -1
  138. package/lib/commonjs/mock/index.js +3 -0
  139. package/lib/commonjs/mock/index.js.map +1 -1
  140. package/lib/commonjs/renderer/Canvas.js +1 -1
  141. package/lib/commonjs/renderer/Canvas.js.map +1 -1
  142. package/lib/commonjs/renderer/Container.d.ts +1 -1
  143. package/lib/commonjs/renderer/Container.js +2 -1
  144. package/lib/commonjs/renderer/Container.js.map +1 -1
  145. package/lib/commonjs/renderer/Offscreen.d.ts +1 -0
  146. package/lib/commonjs/renderer/Offscreen.js +18 -5
  147. package/lib/commonjs/renderer/Offscreen.js.map +1 -1
  148. package/lib/commonjs/renderer/Reconciler.d.ts +1 -1
  149. package/lib/commonjs/renderer/Reconciler.js +7 -4
  150. package/lib/commonjs/renderer/Reconciler.js.map +1 -1
  151. package/lib/commonjs/skia/types/Matrix4.d.ts +2 -2
  152. package/lib/commonjs/skia/types/Matrix4.js.map +1 -1
  153. package/lib/commonjs/skia/types/Shader/Shader.d.ts +1 -1
  154. package/lib/commonjs/skia/types/Shader/Shader.js.map +1 -1
  155. package/lib/module/dom/nodes/JsiSkDOM.d.ts +3 -2
  156. package/lib/module/dom/nodes/JsiSkDOM.js +56 -56
  157. package/lib/module/dom/nodes/JsiSkDOM.js.map +1 -1
  158. package/lib/module/external/reanimated/index.d.ts +1 -0
  159. package/lib/module/external/reanimated/index.js +1 -0
  160. package/lib/module/external/reanimated/index.js.map +1 -1
  161. package/lib/module/external/reanimated/interpolators.js +13 -1
  162. package/lib/module/external/reanimated/interpolators.js.map +1 -1
  163. package/lib/module/external/reanimated/moduleWrapper.d.ts +1 -0
  164. package/lib/module/external/reanimated/moduleWrapper.js +3 -2
  165. package/lib/module/external/reanimated/moduleWrapper.js.map +1 -1
  166. package/lib/module/external/reanimated/textures.d.ts +5 -0
  167. package/lib/module/external/reanimated/textures.js +35 -0
  168. package/lib/module/external/reanimated/textures.js.map +1 -0
  169. package/lib/module/headless/index.js +1 -1
  170. package/lib/module/headless/index.js.map +1 -1
  171. package/lib/module/mock/index.js +3 -0
  172. package/lib/module/mock/index.js.map +1 -1
  173. package/lib/module/renderer/Canvas.js +1 -1
  174. package/lib/module/renderer/Canvas.js.map +1 -1
  175. package/lib/module/renderer/Container.d.ts +1 -1
  176. package/lib/module/renderer/Container.js +2 -1
  177. package/lib/module/renderer/Container.js.map +1 -1
  178. package/lib/module/renderer/Offscreen.d.ts +1 -0
  179. package/lib/module/renderer/Offscreen.js +11 -3
  180. package/lib/module/renderer/Offscreen.js.map +1 -1
  181. package/lib/module/renderer/Reconciler.d.ts +1 -1
  182. package/lib/module/renderer/Reconciler.js +7 -4
  183. package/lib/module/renderer/Reconciler.js.map +1 -1
  184. package/lib/module/skia/types/Matrix4.d.ts +2 -2
  185. package/lib/module/skia/types/Matrix4.js.map +1 -1
  186. package/lib/module/skia/types/Shader/Shader.d.ts +1 -1
  187. package/lib/module/skia/types/Shader/Shader.js.map +1 -1
  188. package/lib/typescript/src/dom/nodes/JsiSkDOM.d.ts +3 -2
  189. package/lib/typescript/src/external/reanimated/index.d.ts +1 -0
  190. package/lib/typescript/src/external/reanimated/moduleWrapper.d.ts +1 -0
  191. package/lib/typescript/src/external/reanimated/textures.d.ts +5 -0
  192. package/lib/typescript/src/renderer/Container.d.ts +1 -1
  193. package/lib/typescript/src/renderer/Offscreen.d.ts +1 -0
  194. package/lib/typescript/src/renderer/Reconciler.d.ts +1 -1
  195. package/lib/typescript/src/skia/types/Matrix4.d.ts +2 -2
  196. package/lib/typescript/src/skia/types/Shader/Shader.d.ts +1 -1
  197. package/libs/android/arm64-v8a/libskia.a +0 -0
  198. package/libs/android/arm64-v8a/libskottie.a +0 -0
  199. package/libs/android/arm64-v8a/libskparagraph.a +0 -0
  200. package/libs/android/arm64-v8a/libsksg.a +0 -0
  201. package/libs/android/arm64-v8a/libskshaper.a +0 -0
  202. package/libs/android/arm64-v8a/libskunicode.a +0 -0
  203. package/libs/android/arm64-v8a/libsvg.a +0 -0
  204. package/libs/android/armeabi-v7a/libskia.a +0 -0
  205. package/libs/android/armeabi-v7a/libskottie.a +0 -0
  206. package/libs/android/armeabi-v7a/libskparagraph.a +0 -0
  207. package/libs/android/armeabi-v7a/libsksg.a +0 -0
  208. package/libs/android/armeabi-v7a/libskshaper.a +0 -0
  209. package/libs/android/armeabi-v7a/libskunicode.a +0 -0
  210. package/libs/android/armeabi-v7a/libsvg.a +0 -0
  211. package/libs/android/x86/libskia.a +0 -0
  212. package/libs/android/x86/libskottie.a +0 -0
  213. package/libs/android/x86/libskparagraph.a +0 -0
  214. package/libs/android/x86/libsksg.a +0 -0
  215. package/libs/android/x86/libskshaper.a +0 -0
  216. package/libs/android/x86/libskunicode.a +0 -0
  217. package/libs/android/x86/libsvg.a +0 -0
  218. package/libs/android/x86_64/libskia.a +0 -0
  219. package/libs/android/x86_64/libskottie.a +0 -0
  220. package/libs/android/x86_64/libskparagraph.a +0 -0
  221. package/libs/android/x86_64/libsksg.a +0 -0
  222. package/libs/android/x86_64/libskshaper.a +0 -0
  223. package/libs/android/x86_64/libskunicode.a +0 -0
  224. package/libs/android/x86_64/libsvg.a +0 -0
  225. package/libs/ios/libskia.xcframework/ios-arm64_arm64e/libskia.a +0 -0
  226. package/libs/ios/libskia.xcframework/ios-arm64_arm64e_x86_64-simulator/libskia.a +0 -0
  227. package/libs/ios/libskottie.xcframework/ios-arm64_arm64e/libskottie.a +0 -0
  228. package/libs/ios/libskottie.xcframework/ios-arm64_arm64e_x86_64-simulator/libskottie.a +0 -0
  229. package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e/libskparagraph.a +0 -0
  230. package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e_x86_64-simulator/libskparagraph.a +0 -0
  231. package/libs/ios/libsksg.xcframework/ios-arm64_arm64e/libsksg.a +0 -0
  232. package/libs/ios/libsksg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsksg.a +0 -0
  233. package/libs/ios/libskshaper.xcframework/Info.plist +5 -5
  234. package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e/libskshaper.a +0 -0
  235. package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e_x86_64-simulator/libskshaper.a +0 -0
  236. package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e/libskunicode.a +0 -0
  237. package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e_x86_64-simulator/libskunicode.a +0 -0
  238. package/libs/ios/libsvg.xcframework/Info.plist +5 -5
  239. package/libs/ios/libsvg.xcframework/ios-arm64_arm64e/libsvg.a +0 -0
  240. package/libs/ios/libsvg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsvg.a +0 -0
  241. package/package.json +1 -1
  242. package/src/dom/nodes/JsiSkDOM.ts +55 -56
  243. package/src/external/reanimated/index.ts +1 -0
  244. package/src/external/reanimated/interpolators.ts +15 -2
  245. package/src/external/reanimated/moduleWrapper.ts +1 -0
  246. package/src/external/reanimated/textures.tsx +50 -0
  247. package/src/headless/index.ts +1 -1
  248. package/src/mock/index.ts +3 -0
  249. package/src/renderer/Canvas.tsx +1 -1
  250. package/src/renderer/Container.tsx +3 -2
  251. package/src/renderer/Offscreen.tsx +12 -3
  252. package/src/renderer/Reconciler.tsx +5 -2
  253. package/src/skia/types/Matrix4.ts +2 -2
  254. package/src/skia/types/Shader/Shader.ts +6 -1
  255. package/cpp/skia/include/gpu/GrSurfaceInfo.h +0 -142
  256. package/cpp/skia/include/private/gpu/ganesh/GrGLTypesPriv.h +0 -107
  257. package/cpp/skia/include/private/gpu/ganesh/GrMockTypesPriv.h +0 -32
  258. package/cpp/skia/include/private/gpu/ganesh/GrMtlTypesPriv.h +0 -83
  259. package/cpp/skia/include/private/gpu/ganesh/GrVkTypesPriv.h +0 -47
  260. package/cpp/skia/include/private/gpu/vk/VulkanTypesPriv.h +0 -57
  261. package/cpp/skia/include/utils/SkBase64.h +0 -53
  262. package/cpp/skia/modules/skcms/skcms_internal.h +0 -56
package/cpp/skia/modules/skcms/src/Transform_inl.h
@@ -8,26 +8,29 @@
  // Intentionally NO #pragma once... included multiple times.

  // This file is included from skcms.cc in a namespace with some pre-defines:
- // - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
+ // - N: SIMD width of all vectors; 1, 4, 8 or 16 (preprocessor define)
  // - V<T>: a template to create a vector of N T's.

- using F = V<Color>; // Called F for historic reasons... maybe rename C?
+ using F = V<float>;
  using I32 = V<int32_t>;
  using U64 = V<uint64_t>;
  using U32 = V<uint32_t>;
  using U16 = V<uint16_t>;
  using U8  = V<uint8_t>;

-
  #if defined(__GNUC__) && !defined(__clang__)
- // Once again, GCC is kind of weird, not allowing vector = scalar directly.
+ // GCC is kind of weird, not allowing vector = scalar directly.
  static constexpr F F0 = F() + 0.0f,
                     F1 = F() + 1.0f,
+                    FHalf = F() + 0.5f,
                     FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
+ static constexpr I32 F16InfBits = I32() + 0x4780'0000;
  #else
  static constexpr F F0 = 0.0f,
                     F1 = 1.0f,
+                    FHalf = 0.5f,
                     FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
+ static constexpr I32 F16InfBits = 0x4780'0000; // equals +Inf in half float, shifted to 32-bits
  #endif

  // Instead of checking __AVX__ below, we'll check USING_AVX.
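Note on the new constant: 0x4780'0000 is the IEEE-754 single-precision bit pattern of 65536.0f (2^16). The largest finite f16 value is 65504, so 65536 is exactly the magnitude at which a float overflows to f16 infinity; the rewritten Half_from_F further down clamps against this value in the integer domain. A standalone check of that claim (illustrative only, not part of the diff):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
        uint32_t bits = 0x47800000;         // the new F16InfBits value
        float f;
        std::memcpy(&f, &bits, sizeof f);   // the same bit-pun trick load<>() uses
        assert(f == 65536.0f);              // 2^16, i.e. f16 +Inf once converted
    }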
@@ -84,19 +87,11 @@ using U8 = V<uint8_t>;
  #endif
  #endif

- #if defined(__clang__)
-     #define FALLTHROUGH [[clang::fallthrough]]
- #else
-     #define FALLTHROUGH
- #endif
-
  // We tag most helper functions as SI, to enforce good code generation
  // but also work around what we think is a bug in GCC: when targeting 32-bit
  // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
  // MMX mm0 register, which seems to mess with unrelated code that later uses
  // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
- //
- // It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
  #if defined(__clang__) || defined(__GNUC__)
      #define SI static inline __attribute__((always_inline))
  #else
@@ -106,12 +101,12 @@ using U8 = V<uint8_t>;
  template <typename T, typename P>
  SI T load(const P* ptr) {
      T val;
-     small_memcpy(&val, ptr, sizeof(val));
+     memcpy(&val, ptr, sizeof(val));
      return val;
  }
  template <typename T, typename P>
  SI void store(P* ptr, const T& val) {
-     small_memcpy(ptr, &val, sizeof(val));
+     memcpy(ptr, &val, sizeof(val));
  }

  // (T)v is a cast when N == 1 and a bit-pun when N>1,
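The small_memcpy → memcpy swap works because the byte count here is a compile-time constant, so any optimizing compiler lowers the call to a plain (possibly unaligned) register move rather than a library call — presumably why the dedicated __builtin_memcpy wrapper mentioned in the removed comment is no longer needed. A minimal sketch of the pattern, using the same GCC/Clang vector-extension syntax this file's V<T> is built on (illustrative, not this file's exact definitions):

    #include <cstdint>
    #include <cstring>

    using U32x4 = uint32_t __attribute__((vector_size(16)));

    static inline U32x4 load_u32x4(const void* ptr) {
        U32x4 v;
        std::memcpy(&v, ptr, sizeof(v));  // defined behavior at any alignment;
        return v;                         // compiles to one vector load at -O1+
    }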
@@ -142,7 +137,6 @@ SI D bit_pun(const S& v) {
  // To serve both those ends, we use this function to_fixed() instead of direct cast().
  SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }

-
  // Sometimes we do something crazy on one branch of a conditonal,
  // like divide by zero or convert a huge float to an integer,
  // but then harmlessly select the other side. That trips up N==1
@@ -159,7 +153,22 @@ SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
  }
  #endif

+ #if defined(USING_NEON)
+     SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
+     SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
+
+     SI I32 min_(I32 x, I32 y) { return (I32)vminq_s32((int32x4_t)x, (int32x4_t)y); }
+     SI I32 max_(I32 x, I32 y) { return (I32)vmaxq_s32((int32x4_t)x, (int32x4_t)y); }
+ #else
+     SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
+     SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
+
+     SI I32 min_(I32 x, I32 y) { return if_then_else(x > y, y, x); }
+     SI I32 max_(I32 x, I32 y) { return if_then_else(x < y, y, x); }
+ #endif

+ // KEEP IN SYNC with skvx::from_half to ensure that f16 colors are computed consistently in both
+ // skcms and skvx.
  SI F F_from_Half(U16 half) {
  #if defined(USING_NEON_F16C)
      return vcvt_f32_f16((float16x4_t)half);
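The min_/max_ helpers move up here (they used to live after swap_endian_16x4, removed below) and gain I32 overloads, which the new Half_from_F needs to clamp and round in the bit domain. On non-NEON builds they reduce to if_then_else, which on GCC/Clang vector types is a branch-free lane-wise blend. A minimal model of how that works (assumed shape, not this file's exact definitions):

    #include <cstdint>

    using I32x4 = int32_t __attribute__((vector_size(16)));

    // Comparisons on vector extensions yield all-0/all-1 lane masks, so a
    // select is just bitwise blending: no branches, each lane independent.
    static inline I32x4 if_then_else(I32x4 cond, I32x4 t, I32x4 e) {
        return (cond & t) | (~cond & e);
    }
    static inline I32x4 min_(I32x4 x, I32x4 y) { return if_then_else(x > y, y, x); }
    static inline I32x4 max_(I32x4 x, I32x4 y) { return if_then_else(x < y, y, x); }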
@@ -169,24 +178,27 @@ SI F F_from_Half(U16 half) {
      typedef int16_t __attribute__((vector_size(16))) I16;
      return __builtin_ia32_vcvtph2ps256((I16)half);
  #else
-     U32 wide = cast<U32>(half);
+     I32 wide = cast<I32>(half);
      // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
-     U32 s  = wide & 0x8000,
-         em = wide ^ s;
-
-     // Constructing the float is easy if the half is not denormalized.
-     F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
-
-     // Simply flush all denorm half floats to zero.
-     return if_then_else(em < 0x0400, F0, norm);
+     // To match intrinsic behavior, this preserves denormal values, infinities, and NaNs, which
+     // helps improve consistency between architectures.
+     I32 s  = wide & 0x8000,
+         em = wide ^ s,
+         inf_or_nan = (em >= (31 << 10)) & (255 << 23),  // Expands exponent to fill 8 bits
+         is_norm = em > 0x3ff,
+         // denormalized f16's are 2^-14*0.[m0:9] == 2^-24*[m0:9].0
+         sub = bit_pun<I32>(cast<F>(em) * (1.f/(1<<24))),
+         norm = ((em<<13) + ((127-15)<<23)),  // Shifts mantissa, shifts + re-biases exponent
+         finite = if_then_else(is_norm, norm, sub);
+     // If 'x' is f16 +/- infinity, inf_or_nan will be the filled 8-bit exponent but 'norm' will be
+     // all 0s since 'x's mantissa is 0. Thus norm | inf_or_nan becomes f32 infinity. However, if
+     // 'x' is an f16 NaN, some bits of 'norm' will be non-zero, so it stays an f32 NaN after the OR.
+     return bit_pun<F>((s<<16) | finite | inf_or_nan);
  #endif
  }

- #if defined(__clang__)
-     // The -((127-15)<<10) underflows that side of the math when
-     // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
-     __attribute__((no_sanitize("unsigned-integer-overflow")))
- #endif
+ // KEEP IN SYNC with skvx::to_half to ensure that f16 colors are computed consistently in both
+ // skcms and skvx.
  SI U16 Half_from_F(F f) {
  #if defined(USING_NEON_F16C)
      return (U16)vcvt_f16_f32(f);
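The rewritten F_from_Half now matches the hardware F16C/NEON conversions: subnormals, infinities, and NaNs all survive instead of being flushed to zero. A scalar transcription of the new vector code, to make the bit manipulation easier to follow (illustrative; bit_pun is modeled with memcpy, and unsigned arithmetic stands in for the I32 lanes):

    #include <cstdint>
    #include <cstring>

    static float half_to_float(uint16_t h) {
        uint32_t wide = h,
                 s    = wide & 0x8000,
                 em   = wide ^ s,
                 inf_or_nan = (em >= (31u << 10)) ? (255u << 23) : 0u, // widen exponent field
                 norm = (em << 13) + ((127u - 15) << 23);              // shift mantissa, re-bias
        // Subnormal f16: mantissa * 2^-24, computed directly as a float.
        float sub_f = (float)em * (1.0f / (1 << 24));
        uint32_t sub;  std::memcpy(&sub, &sub_f, 4);
        uint32_t bits = (s << 16) | (em > 0x3ff ? norm : sub) | inf_or_nan;
        float out;     std::memcpy(&out, &bits, 4);
        return out;    // e.g. h=0x3C00 -> 1.0f, h=0x7C00 -> +Inf, denorms preserved
    }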
@@ -196,13 +208,23 @@ SI U16 Half_from_F(F f) {
      return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
  #else
      // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
-     U32 sem = bit_pun<U32>(f),
-         s   = sem & 0x80000000,
-         em  = sem ^ s;
-
-     // For simplicity we flush denorm half floats (including all denorm floats) to zero.
-     return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
-                                                  , (s>>16) + (em>>13) - ((127-15)<<10)));
+     // To match intrinsic behavior, this implements round-to-nearest-even, converting floats to
+     // denormal f16 values, overflowing to infinity and preserving infinity. However, it does not
+     // handle NaN float values (they become infinity).
+     I32 sem = bit_pun<I32>(f),
+         s   = sem & 0x8000'0000,
+         em  = min_(sem ^ s, F16InfBits),  // |x| clamped to f16 infinity
+         // F(em)*8192 increases the exponent by 13, which when added back to em will shift the
+         // mantissa bits 13 to the right. We clamp to 1/2 for subnormal values, which
+         // automatically shifts the mantissa to match 2^-14 expected for a subnorm f16.
+         magic = bit_pun<I32>(max_(bit_pun<F>(em) * 8192.f, FHalf)) & (255 << 23),
+         // Shift mantissa with automatic round-to-even
+         rounded = bit_pun<I32>((bit_pun<F>(em) + bit_pun<F>(magic))),
+         // Subtract 127 for f32 bias, subtract 13 to undo the *8192, subtract 1 to remove
+         // the implicit leading 1., and add 15 to get the f16 biased exponent.
+         exp = ((magic >> 13) - ((127-15+13+1)<<10)),  // shift and re-bias exponent
+         f16 = rounded + exp;  // use + if 'rounded' rolled over into first exponent bit
+     return cast<U16>((s>>16) | f16);
  #endif
  }

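And the scalar shape of the new encoder: the "magic" addend aligns the value so an ordinary float add performs the round-to-nearest-even shift of the mantissa, while the clamp against F16InfBits (65536.0f) makes overflow come out as f16 infinity. A standalone transcription (illustrative only):

    #include <cstdint>
    #include <cstring>

    static uint32_t f2b(float f) { uint32_t b; std::memcpy(&b, &f, 4); return b; }
    static float    b2f(uint32_t b) { float f; std::memcpy(&f, &b, 4); return f; }

    static uint16_t float_to_half(float x) {
        uint32_t sem = f2b(x),
                 s   = sem & 0x80000000u,
                 em  = sem ^ s;
        if (em > 0x47800000u) em = 0x47800000u;  // clamp |x| at 65536.0f == f16 Inf
        float scaled = b2f(em) * 8192.0f;        // *2^13 lines the mantissa up for f16
        if (scaled < 0.5f) scaled = 0.5f;        // pins subnormals to the 2^-14 scale
        uint32_t magic   = f2b(scaled) & (255u << 23);
        uint32_t rounded = f2b(b2f(em) + b2f(magic));  // the add rounds to nearest-even
        uint32_t exp     = (magic >> 13) - ((127u - 15 + 13 + 1) << 10);
        return (uint16_t)((s >> 16) | (rounded + exp));  // e.g. 1.0f -> 0x3C00
    }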
@@ -218,14 +240,6 @@ SI U64 swap_endian_16x4(const U64& rgba) {
           | (rgba & 0xff00ff00ff00ff00) >> 8;
  }

- #if defined(USING_NEON)
-     SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
-     SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
- #else
-     SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
-     SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
- #endif
-
  SI F floor_(F x) {
  #if N == 1
      return floorf_(x);
@@ -292,19 +306,35 @@ SI F approx_exp(F x) {
      return approx_exp2(log2_e * x);
  }

+ SI F strip_sign(F x, U32* sign) {
+     U32 bits = bit_pun<U32>(x);
+     *sign = bits & 0x80000000;
+     return bit_pun<F>(bits ^ *sign);
+ }
+
+ SI F apply_sign(F x, U32 sign) {
+     return bit_pun<F>(sign | bit_pun<U32>(x));
+ }
+
  // Return tf(x).
  SI F apply_tf(const skcms_TransferFunction* tf, F x) {
      // Peel off the sign bit and set x = |x|.
-     U32 bits = bit_pun<U32>(x),
-         sign = bits & 0x80000000;
-     x = bit_pun<F>(bits ^ sign);
+     U32 sign;
+     x = strip_sign(x, &sign);

      // The transfer function has a linear part up to d, exponential at d and after.
      F v = if_then_else(x < tf->d, tf->c*x + tf->f
                                  , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);

      // Tack the sign bit back on.
-     return bit_pun<F>(sign | bit_pun<U32>(v));
+     return apply_sign(v, sign);
+ }
+
+ // Return the gamma function (|x|^G with the original sign re-applied to x).
+ SI F apply_gamma(const skcms_TransferFunction* tf, F x) {
+     U32 sign;
+     x = strip_sign(x, &sign);
+     return apply_sign(approx_pow(x, tf->g), sign);
  }

  SI F apply_pq(const skcms_TransferFunction* tf, F x) {
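strip_sign/apply_sign factor out the sign-mirroring that apply_tf already did, so the new apply_gamma evaluates |x|^g and restores the sign, keeping pure-gamma curves odd-symmetric for negative (extended-range) inputs. The scalar equivalent is just:

    #include <cmath>

    // Scalar model of apply_gamma (the real code uses approx_pow on vectors):
    static inline float apply_gamma_scalar(float g, float x) {
        return std::copysign(std::pow(std::fabs(x), g), x);
    }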
@@ -717,12 +747,12 @@ static void clut(uint32_t input_channels, uint32_t output_channels,
      switch ((dim-1)&3) {  // This lets the compiler know there are no other cases to handle.
          case 3: ix += index [3 + (combo&8)/2];
                  w  *= weight[3 + (combo&8)/2];
-                 FALLTHROUGH;
+                 SKCMS_FALLTHROUGH;
                  // fall through

          case 2: ix += index [2 + (combo&4)*1];
                  w  *= weight[2 + (combo&4)*1];
-                 FALLTHROUGH;
+                 SKCMS_FALLTHROUGH;
                  // fall through

          case 1: ix += index [1 + (combo&2)*2];
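The file-local FALLTHROUGH macro removed earlier is replaced by SKCMS_FALLTHROUGH, presumably supplied by the new shared header skcms_internals.h (file #109 in this release). Judging from the deleted definition, it would have the same shape:

    #if defined(__clang__)
        #define SKCMS_FALLTHROUGH [[clang::fallthrough]]
    #else
        #define SKCMS_FALLTHROUGH
    #endif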
@@ -755,643 +785,763 @@ static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
           r,g,b,a);
  }

- static void exec_ops(const Op* ops, const void** args,
-                      const char* src, char* dst, int i) {
-     F r = F0, g = F0, b = F0, a = F1;
-     while (true) {
-         switch (*ops++) {
-             case Op_load_a8:{
-                 a = F_from_U8(load<U8>(src + 1*i));
-             } break;
-
-             case Op_load_g8:{
-                 r = g = b = F_from_U8(load<U8>(src + 1*i));
-             } break;
-
-             case Op_load_4444:{
-                 U16 abgr = load<U16>(src + 2*i);
-
-                 r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
-                 g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
-                 b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
-                 a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
-             } break;
-
-             case Op_load_565:{
-                 U16 rgb = load<U16>(src + 2*i);
-
-                 r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
-                 g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
-                 b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
-             } break;
-
-             case Op_load_888:{
-                 const uint8_t* rgb = (const uint8_t*)(src + 3*i);
- #if defined(USING_NEON)
-                 // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
-                 // a time. Since we're doing that, we might as well load them into 16-bit lanes.
-                 // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
-                 uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
-                 v = vld3_lane_u8(rgb+0, v, 0);
-                 v = vld3_lane_u8(rgb+3, v, 2);
-                 v = vld3_lane_u8(rgb+6, v, 4);
-                 v = vld3_lane_u8(rgb+9, v, 6);
-
-                 // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
-                 // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
-                 // ARMv7 and ARMv8 impls.)
-                 r = cast<F>((U16)v.val[0]) * (1/255.0f);
-                 g = cast<F>((U16)v.val[1]) * (1/255.0f);
-                 b = cast<F>((U16)v.val[2]) * (1/255.0f);
- #else
-                 r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
-                 g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
-                 b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
- #endif
-             } break;
-
-             case Op_load_8888:{
-                 U32 rgba = load<U32>(src + 4*i);
-
-                 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
-                 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
-                 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
-                 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
-             } break;
-
-             case Op_load_8888_palette8:{
-                 const uint8_t* palette = (const uint8_t*) *args++;
-                 I32 ix = cast<I32>(load<U8>(src + 1*i));
-                 U32 rgba = gather_32(palette, ix);
-
-                 r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
-                 g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
-                 b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
-                 a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
-             } break;
-
-             case Op_load_1010102:{
-                 U32 rgba = load<U32>(src + 4*i);
-
-                 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
-                 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
-                 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
-                 a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
-             } break;
-
-             case Op_load_101010x_XR:{
-                 static constexpr float min = -0.752941f;
-                 static constexpr float max = 1.25098f;
-                 static constexpr float range = max - min;
-                 U32 rgba = load<U32>(src + 4*i);
-                 r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f) * range + min;
-                 g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
-                 b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
-             } break;
-
-             case Op_load_161616LE:{
-                 uintptr_t ptr = (uintptr_t)(src + 6*i);
-                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
-                 const uint16_t* rgb = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
- #if defined(USING_NEON)
-                 uint16x4x3_t v = vld3_u16(rgb);
-                 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
-                 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
-                 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
- #else
-                 r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
-                 g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
-                 b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
- #endif
-             } break;
-
-             case Op_load_16161616LE:{
-                 uintptr_t ptr = (uintptr_t)(src + 8*i);
-                 assert( (ptr & 1) == 0 );                     // src must be 2-byte aligned for this
-                 const uint16_t* rgba = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
- #if defined(USING_NEON)
-                 uint16x4x4_t v = vld4_u16(rgba);
-                 r = cast<F>((U16)v.val[0]) * (1/65535.0f);
-                 g = cast<F>((U16)v.val[1]) * (1/65535.0f);
-                 b = cast<F>((U16)v.val[2]) * (1/65535.0f);
-                 a = cast<F>((U16)v.val[3]) * (1/65535.0f);
- #else
-                 U64 px = load<U64>(rgba);
-
-                 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
-                 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
-                 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
-                 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
- #endif
-             } break;
-
-             case Op_load_161616BE:{
-                 uintptr_t ptr = (uintptr_t)(src + 6*i);
-                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
-                 const uint16_t* rgb = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
- #if defined(USING_NEON)
-                 uint16x4x3_t v = vld3_u16(rgb);
-                 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
-                 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
-                 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
- #else
-                 U32 R = load_3<U32>(rgb+0),
-                     G = load_3<U32>(rgb+1),
-                     B = load_3<U32>(rgb+2);
-                 // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
-                 r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
-                 g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
-                 b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
- #endif
-             } break;
-
-             case Op_load_16161616BE:{
-                 uintptr_t ptr = (uintptr_t)(src + 8*i);
-                 assert( (ptr & 1) == 0 );                     // src must be 2-byte aligned for this
-                 const uint16_t* rgba = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
- #if defined(USING_NEON)
-                 uint16x4x4_t v = vld4_u16(rgba);
-                 r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
-                 g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
-                 b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
-                 a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
- #else
-                 U64 px = swap_endian_16x4(load<U64>(rgba));
-
-                 r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
-                 g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
-                 b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
-                 a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
- #endif
-             } break;
-
-             case Op_load_hhh:{
-                 uintptr_t ptr = (uintptr_t)(src + 6*i);
-                 assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
-                 const uint16_t* rgb = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
- #if defined(USING_NEON)
-                 uint16x4x3_t v = vld3_u16(rgb);
-                 U16 R = (U16)v.val[0],
-                     G = (U16)v.val[1],
-                     B = (U16)v.val[2];
- #else
-                 U16 R = load_3<U16>(rgb+0),
-                     G = load_3<U16>(rgb+1),
-                     B = load_3<U16>(rgb+2);
- #endif
-                 r = F_from_Half(R);
-                 g = F_from_Half(G);
-                 b = F_from_Half(B);
-             } break;
-
-             case Op_load_hhhh:{
-                 uintptr_t ptr = (uintptr_t)(src + 8*i);
-                 assert( (ptr & 1) == 0 );                     // src must be 2-byte aligned for this
-                 const uint16_t* rgba = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
- #if defined(USING_NEON)
-                 uint16x4x4_t v = vld4_u16(rgba);
-                 U16 R = (U16)v.val[0],
-                     G = (U16)v.val[1],
-                     B = (U16)v.val[2],
-                     A = (U16)v.val[3];
- #else
-                 U64 px = load<U64>(rgba);
-                 U16 R = cast<U16>((px >> 0) & 0xffff),
-                     G = cast<U16>((px >> 16) & 0xffff),
-                     B = cast<U16>((px >> 32) & 0xffff),
-                     A = cast<U16>((px >> 48) & 0xffff);
- #endif
-                 r = F_from_Half(R);
-                 g = F_from_Half(G);
-                 b = F_from_Half(B);
-                 a = F_from_Half(A);
-             } break;
-
-             case Op_load_fff:{
-                 uintptr_t ptr = (uintptr_t)(src + 12*i);
-                 assert( (ptr & 3) == 0 );               // src must be 4-byte aligned for this
-                 const float* rgb = (const float*)ptr;   // cast to const float* to be safe.
- #if defined(USING_NEON)
-                 float32x4x3_t v = vld3q_f32(rgb);
-                 r = (F)v.val[0];
-                 g = (F)v.val[1];
-                 b = (F)v.val[2];
- #else
-                 r = load_3<F>(rgb+0);
-                 g = load_3<F>(rgb+1);
-                 b = load_3<F>(rgb+2);
- #endif
-             } break;
-
-             case Op_load_ffff:{
-                 uintptr_t ptr = (uintptr_t)(src + 16*i);
-                 assert( (ptr & 3) == 0 );                // src must be 4-byte aligned for this
-                 const float* rgba = (const float*)ptr;   // cast to const float* to be safe.
- #if defined(USING_NEON)
-                 float32x4x4_t v = vld4q_f32(rgba);
-                 r = (F)v.val[0];
-                 g = (F)v.val[1];
-                 b = (F)v.val[2];
-                 a = (F)v.val[3];
- #else
-                 r = load_4<F>(rgba+0);
-                 g = load_4<F>(rgba+1);
-                 b = load_4<F>(rgba+2);
-                 a = load_4<F>(rgba+3);
- #endif
-             } break;
-
-             case Op_swap_rb:{
-                 F t = r;
-                 r = b;
-                 b = t;
-             } break;
-
-             case Op_clamp:{
-                 r = max_(F0, min_(r, F1));
-                 g = max_(F0, min_(g, F1));
-                 b = max_(F0, min_(b, F1));
-                 a = max_(F0, min_(a, F1));
-             } break;
-
-             case Op_invert:{
-                 r = F1 - r;
-                 g = F1 - g;
-                 b = F1 - b;
-                 a = F1 - a;
-             } break;
-
-             case Op_force_opaque:{
-                 a = F1;
-             } break;
-
-             case Op_premul:{
-                 r *= a;
-                 g *= a;
-                 b *= a;
-             } break;
-
-             case Op_unpremul:{
-                 F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
-                 r *= scale;
-                 g *= scale;
-                 b *= scale;
-             } break;
-
-             case Op_matrix_3x3:{
-                 const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
-                 const float* m = &matrix->vals[0][0];
-
-                 F R = m[0]*r + m[1]*g + m[2]*b,
-                   G = m[3]*r + m[4]*g + m[5]*b,
-                   B = m[6]*r + m[7]*g + m[8]*b;
-
-                 r = R;
-                 g = G;
-                 b = B;
-             } break;
-
-             case Op_matrix_3x4:{
-                 const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
-                 const float* m = &matrix->vals[0][0];
-
-                 F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
-                   G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
-                   B = m[8]*r + m[9]*g + m[10]*b + m[11];
-
-                 r = R;
-                 g = G;
-                 b = B;
-             } break;
-
-             case Op_lab_to_xyz:{
-                 // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
-                 F L = r * 100.0f,
-                   A = g * 255.0f - 128.0f,
-                   B = b * 255.0f - 128.0f;
-
-                 // Convert to CIE XYZ.
-                 F Y = (L + 16.0f) * (1/116.0f),
-                   X = Y + A*(1/500.0f),
-                   Z = Y - B*(1/200.0f);
-
-                 X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
-                 Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
-                 Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
-
-                 // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
-                 r = X * 0.9642f;
-                 g = Y ;
-                 b = Z * 0.8249f;
-             } break;
-
-             // As above, in reverse.
-             case Op_xyz_to_lab:{
-                 F X = r * (1/0.9642f),
-                   Y = g,
-                   Z = b * (1/0.8249f);
-
-                 X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
-                 Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
-                 Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
-
-                 F L = Y*116.0f - 16.0f,
-                   A = (X-Y)*500.0f,
-                   B = (Y-Z)*200.0f;
-
-                 r = L * (1/100.f);
-                 g = (A + 128.0f) * (1/255.0f);
-                 b = (B + 128.0f) * (1/255.0f);
-             } break;
-
-             case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
-             case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
-             case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
-             case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
-
-             case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
-             case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
-             case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
-             case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
-
-             case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
-             case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
-             case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
-             case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
-
-             case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
-             case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
-             case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
-             case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
-
-             case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
-             case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
-             case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
-             case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
-
-             case Op_clut_A2B: {
-                 const skcms_A2B* a2b = (const skcms_A2B*) *args++;
-                 clut(a2b, &r,&g,&b,a);
-
-                 if (a2b->input_channels == 4) {
-                     // CMYK is opaque.
-                     a = F1;
-                 }
-             } break;
-
-             case Op_clut_B2A: {
-                 const skcms_B2A* b2a = (const skcms_B2A*) *args++;
-                 clut(b2a, &r,&g,&b,&a);
-             } break;
-
-             // Notice, from here on down the store_ ops all return, ending the loop.
-
-             case Op_store_a8: {
-                 store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
-             } return;
-
-             case Op_store_g8: {
-                 // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
-                 store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
-             } return;
-
-             case Op_store_4444: {
-                 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
-                                     | cast<U16>(to_fixed(g * 15) << 8)
-                                     | cast<U16>(to_fixed(b * 15) << 4)
-                                     | cast<U16>(to_fixed(a * 15) << 0));
-             } return;
-
-             case Op_store_565: {
-                 store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
-                                     | cast<U16>(to_fixed(g * 63) << 5 )
-                                     | cast<U16>(to_fixed(b * 31) << 11 ));
-             } return;
-
-             case Op_store_888: {
-                 uint8_t* rgb = (uint8_t*)dst + 3*i;
- #if defined(USING_NEON)
-                 // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
-                 // get there via U16 to save some instructions converting to float. And just
-                 // like load_888, we'd prefer to go via U32 but for ARMv7 support.
-                 U16 R = cast<U16>(to_fixed(r * 255)),
-                     G = cast<U16>(to_fixed(g * 255)),
-                     B = cast<U16>(to_fixed(b * 255));
-
-                 uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
-                 vst3_lane_u8(rgb+0, v, 0);
-                 vst3_lane_u8(rgb+3, v, 2);
-                 vst3_lane_u8(rgb+6, v, 4);
-                 vst3_lane_u8(rgb+9, v, 6);
- #else
-                 store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
-                 store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
-                 store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
- #endif
-             } return;
-
-             case Op_store_8888: {
-                 store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
-                                | cast<U32>(to_fixed(g * 255)) << 8
-                                | cast<U32>(to_fixed(b * 255)) << 16
-                                | cast<U32>(to_fixed(a * 255)) << 24);
-             } return;
-
-             case Op_store_101010x_XR: {
-                 static constexpr float min = -0.752941f;
-                 static constexpr float max = 1.25098f;
-                 static constexpr float range = max - min;
-                 store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) << 0
-                                | cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
-                                | cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
-                 return;
+ struct NoCtx {};
+
+ struct Ctx {
+     const void* fArg;
+     operator NoCtx() { return NoCtx{}; }
+     template <typename T> operator T*() { return (const T*)fArg; }
+ };
+
+ #define STAGE_PARAMS(MAYBE_REF) SKCMS_MAYBE_UNUSED const char* src, \
+                                 SKCMS_MAYBE_UNUSED char* dst, \
+                                 SKCMS_MAYBE_UNUSED F MAYBE_REF r, \
+                                 SKCMS_MAYBE_UNUSED F MAYBE_REF g, \
+                                 SKCMS_MAYBE_UNUSED F MAYBE_REF b, \
+                                 SKCMS_MAYBE_UNUSED F MAYBE_REF a, \
+                                 SKCMS_MAYBE_UNUSED int i
+
+ #if SKCMS_HAS_MUSTTAIL
+
+     // Stages take a stage list, and each stage is responsible for tail-calling the next one.
+     //
+     // Unfortunately, we can't declare a StageFn as a function pointer which takes a pointer to
+     // another StageFn; declaring this leads to a circular dependency. To avoid this, StageFn is
+     // wrapped in a single-element `struct StageList` which we are able to forward-declare.
+     struct StageList;
+     using StageFn = void (*)(StageList stages, const void** ctx, STAGE_PARAMS());
+     struct StageList {
+         const StageFn* fn;
+     };
+
+     #define DECLARE_STAGE(name, arg, CALL_NEXT) \
+         SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
+         \
+         SI void Exec_##name(StageList list, const void** ctx, STAGE_PARAMS()) { \
+             Exec_##name##_k(Ctx{*ctx}, src, dst, r, g, b, a, i); \
+             ++list.fn; ++ctx; \
+             CALL_NEXT; \
+         } \
+         \
+         SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
+
+     #define STAGE(name, arg) \
+         DECLARE_STAGE(name, arg, [[clang::musttail]] return (*list.fn)(list, ctx, src, dst, \
+                                                                        r, g, b, a, i))
+
+     #define FINAL_STAGE(name, arg) \
+         DECLARE_STAGE(name, arg, /* Stop executing stages and return to the caller. */)
+
+ #else
+
+     #define DECLARE_STAGE(name, arg) \
+         SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
+         \
+         SI void Exec_##name(const void* ctx, STAGE_PARAMS(&)) { \
+             Exec_##name##_k(Ctx{ctx}, src, dst, r, g, b, a, i); \
+         } \
+         \
+         SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
+
+     #define STAGE(name, arg) DECLARE_STAGE(name, arg)
+     #define FINAL_STAGE(name, arg) DECLARE_STAGE(name, arg)
+
+ #endif
+
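This is the core of the rewrite: the Op switch interpreter removed above becomes a chain of stage functions, and with SKCMS_HAS_MUSTTAIL each stage jumps straight into the next via a guaranteed tail call, so the whole pipeline runs in constant stack space with no dispatch loop. A minimal self-contained model of the mechanism (a hypothetical toy with pipeline state boiled down to one int; [[clang::musttail]] is Clang-only, which is why the real code keeps the plain-call fallback above):

    #include <cstdio>

    struct StageList;
    using StageFn = void (*)(StageList list, int x);
    struct StageList { const StageFn* fn; };

    static void add_one(StageList list, int x) {
        ++list.fn;                                           // advance to the next stage...
        [[clang::musttail]] return (*list.fn)(list, x + 1);  // ...and jump, don't call
    }
    static void print_and_stop(StageList list, int x) {  // a FINAL_STAGE: no tail call
        (void)list;
        std::printf("%d\n", x);
    }

    int main() {
        const StageFn program[] = { add_one, add_one, print_and_stop };
        StageList list = { program };
        (*list.fn)(list, 0);  // prints 2
    }

The Ctx/NoCtx pair plays the role of the old *args++: Ctx wraps the current stage's argument slot and implicitly converts either to a typed pointer or to the empty NoCtx, so stages that take no argument and stages that take, say, a skcms_Matrix3x3* can share one declaration macro.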
851
+ STAGE(load_a8, NoCtx) {
852
+ a = F_from_U8(load<U8>(src + 1*i));
853
+ }
854
+
855
+ STAGE(load_g8, NoCtx) {
856
+ r = g = b = F_from_U8(load<U8>(src + 1*i));
857
+ }
858
+
859
+ STAGE(load_4444, NoCtx) {
860
+ U16 abgr = load<U16>(src + 2*i);
861
+
862
+ r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
863
+ g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
864
+ b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
865
+ a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
866
+ }
867
+
868
+ STAGE(load_565, NoCtx) {
869
+ U16 rgb = load<U16>(src + 2*i);
870
+
871
+ r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
872
+ g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
873
+ b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
874
+ }
875
+
876
+ STAGE(load_888, NoCtx) {
877
+ const uint8_t* rgb = (const uint8_t*)(src + 3*i);
878
+ #if defined(USING_NEON)
879
+ // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
880
+ // a time. Since we're doing that, we might as well load them into 16-bit lanes.
881
+ // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
882
+ uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
883
+ v = vld3_lane_u8(rgb+0, v, 0);
884
+ v = vld3_lane_u8(rgb+3, v, 2);
885
+ v = vld3_lane_u8(rgb+6, v, 4);
886
+ v = vld3_lane_u8(rgb+9, v, 6);
887
+
888
+ // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
889
+ // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
890
+ // ARMv7 and ARMv8 impls.)
891
+ r = cast<F>((U16)v.val[0]) * (1/255.0f);
892
+ g = cast<F>((U16)v.val[1]) * (1/255.0f);
893
+ b = cast<F>((U16)v.val[2]) * (1/255.0f);
894
+ #else
895
+ r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
896
+ g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
897
+ b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
898
+ #endif
899
+ }
900
+
901
+ STAGE(load_8888, NoCtx) {
902
+ U32 rgba = load<U32>(src + 4*i);
903
+
904
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
905
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
906
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
907
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
908
+ }
909
+
910
+ STAGE(load_1010102, NoCtx) {
911
+ U32 rgba = load<U32>(src + 4*i);
912
+
913
+ r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
914
+ g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
915
+ b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
916
+ a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
917
+ }
918
+
919
+ STAGE(load_101010x_XR, NoCtx) {
920
+ static constexpr float min = -0.752941f;
921
+ static constexpr float max = 1.25098f;
922
+ static constexpr float range = max - min;
923
+ U32 rgba = load<U32>(src + 4*i);
924
+ r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f) * range + min;
925
+ g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
926
+ b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
927
+ }
928
+
929
+ STAGE(load_161616LE, NoCtx) {
930
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
931
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
932
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
933
+ #if defined(USING_NEON)
934
+ uint16x4x3_t v = vld3_u16(rgb);
935
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
936
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
937
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
938
+ #else
939
+ r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
940
+ g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
941
+ b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
942
+ #endif
943
+ }
944
+
945
+ STAGE(load_16161616LE, NoCtx) {
946
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
947
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
948
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
949
+ #if defined(USING_NEON)
950
+ uint16x4x4_t v = vld4_u16(rgba);
951
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
952
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
953
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
954
+ a = cast<F>((U16)v.val[3]) * (1/65535.0f);
955
+ #else
956
+ U64 px = load<U64>(rgba);
957
+
958
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
959
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
960
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
961
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
962
+ #endif
963
+ }
964
+
965
+ STAGE(load_161616BE, NoCtx) {
966
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
967
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
968
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
969
+ #if defined(USING_NEON)
970
+ uint16x4x3_t v = vld3_u16(rgb);
971
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
972
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
973
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
974
+ #else
975
+ U32 R = load_3<U32>(rgb+0),
976
+ G = load_3<U32>(rgb+1),
977
+ B = load_3<U32>(rgb+2);
978
+ // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
979
+ r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
980
+ g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
981
+ b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
982
+ #endif
983
+ }
984
+
985
+ STAGE(load_16161616BE, NoCtx) {
986
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
987
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
988
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
989
+ #if defined(USING_NEON)
990
+ uint16x4x4_t v = vld4_u16(rgba);
991
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
992
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
993
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
994
+ a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
995
+ #else
996
+ U64 px = swap_endian_16x4(load<U64>(rgba));
997
+
998
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
999
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
1000
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
1001
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
1002
+ #endif
1003
+ }
1004
+
1005
+ STAGE(load_hhh, NoCtx) {
1006
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
1007
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1008
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1009
+ #if defined(USING_NEON)
1010
+ uint16x4x3_t v = vld3_u16(rgb);
1011
+ U16 R = (U16)v.val[0],
1012
+ G = (U16)v.val[1],
1013
+ B = (U16)v.val[2];
1014
+ #else
1015
+ U16 R = load_3<U16>(rgb+0),
1016
+ G = load_3<U16>(rgb+1),
1017
+ B = load_3<U16>(rgb+2);
1018
+ #endif
1019
+ r = F_from_Half(R);
1020
+ g = F_from_Half(G);
1021
+ b = F_from_Half(B);
1022
+ }
1023
+
1024
+ STAGE(load_hhhh, NoCtx) {
1025
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
1026
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1027
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1028
+ #if defined(USING_NEON)
1029
+ uint16x4x4_t v = vld4_u16(rgba);
1030
+ U16 R = (U16)v.val[0],
1031
+ G = (U16)v.val[1],
1032
+ B = (U16)v.val[2],
1033
+ A = (U16)v.val[3];
1034
+ #else
1035
+ U64 px = load<U64>(rgba);
1036
+ U16 R = cast<U16>((px >> 0) & 0xffff),
1037
+ G = cast<U16>((px >> 16) & 0xffff),
1038
+ B = cast<U16>((px >> 32) & 0xffff),
1039
+ A = cast<U16>((px >> 48) & 0xffff);
1040
+ #endif
1041
+ r = F_from_Half(R);
1042
+ g = F_from_Half(G);
1043
+ b = F_from_Half(B);
1044
+ a = F_from_Half(A);
1045
+ }
1046
+
1047
+ STAGE(load_fff, NoCtx) {
1048
+ uintptr_t ptr = (uintptr_t)(src + 12*i);
1049
+ assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1050
+ const float* rgb = (const float*)ptr; // cast to const float* to be safe.
1051
+ #if defined(USING_NEON)
1052
+ float32x4x3_t v = vld3q_f32(rgb);
1053
+ r = (F)v.val[0];
1054
+ g = (F)v.val[1];
1055
+ b = (F)v.val[2];
1056
+ #else
1057
+ r = load_3<F>(rgb+0);
1058
+ g = load_3<F>(rgb+1);
1059
+ b = load_3<F>(rgb+2);
1060
+ #endif
1061
+ }
1062
+
1063
+ STAGE(load_ffff, NoCtx) {
1064
+ uintptr_t ptr = (uintptr_t)(src + 16*i);
1065
+ assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1066
+ const float* rgba = (const float*)ptr; // cast to const float* to be safe.
1067
+ #if defined(USING_NEON)
1068
+ float32x4x4_t v = vld4q_f32(rgba);
1069
+ r = (F)v.val[0];
1070
+ g = (F)v.val[1];
1071
+ b = (F)v.val[2];
1072
+ a = (F)v.val[3];
1073
+ #else
1074
+ r = load_4<F>(rgba+0);
1075
+ g = load_4<F>(rgba+1);
1076
+ b = load_4<F>(rgba+2);
1077
+ a = load_4<F>(rgba+3);
1078
+ #endif
1079
+ }
1080
+
1081
+ STAGE(swap_rb, NoCtx) {
1082
+ F t = r;
1083
+ r = b;
1084
+ b = t;
1085
+ }
1086
+
1087
+ STAGE(clamp, NoCtx) {
1088
+ r = max_(F0, min_(r, F1));
1089
+ g = max_(F0, min_(g, F1));
1090
+ b = max_(F0, min_(b, F1));
1091
+ a = max_(F0, min_(a, F1));
1092
+ }
1093
+
1094
+ STAGE(invert, NoCtx) {
1095
+ r = F1 - r;
1096
+ g = F1 - g;
1097
+ b = F1 - b;
1098
+ a = F1 - a;
1099
+ }
1100
+
1101
+ STAGE(force_opaque, NoCtx) {
1102
+ a = F1;
1103
+ }
1104
+
1105
+ STAGE(premul, NoCtx) {
1106
+ r *= a;
1107
+ g *= a;
1108
+ b *= a;
1109
+ }
1110
+
1111
+ STAGE(unpremul, NoCtx) {
1112
+ F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1113
+ r *= scale;
1114
+ g *= scale;
1115
+ b *= scale;
1116
+ }
1117
+
1118
+ STAGE(matrix_3x3, const skcms_Matrix3x3* matrix) {
1119
+ const float* m = &matrix->vals[0][0];
1120
+
1121
+ F R = m[0]*r + m[1]*g + m[2]*b,
1122
+ G = m[3]*r + m[4]*g + m[5]*b,
1123
+ B = m[6]*r + m[7]*g + m[8]*b;
1124
+
1125
+ r = R;
1126
+ g = G;
1127
+ b = B;
1128
+ }
1129
+
1130
+ STAGE(matrix_3x4, const skcms_Matrix3x4* matrix) {
1131
+ const float* m = &matrix->vals[0][0];
1132
+
1133
+ F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1134
+ G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1135
+ B = m[8]*r + m[9]*g + m[10]*b + m[11];
1136
+
1137
+ r = R;
1138
+ g = G;
1139
+ b = B;
1140
+ }
+
+ STAGE(lab_to_xyz, NoCtx) {
+     // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
+     F L = r * 100.0f,
+       A = g * 255.0f - 128.0f,
+       B = b * 255.0f - 128.0f;
+
+     // Convert to CIE XYZ.
+     F Y = (L + 16.0f) * (1/116.0f),
+       X = Y + A*(1/500.0f),
+       Z = Y - B*(1/200.0f);
+
+     X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
+     Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
+     Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
+
+     // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
+     r = X * 0.9642f;
+     g = Y           ;
+     b = Z * 0.8249f;
+ }
+
+ // As above, in reverse.
+ STAGE(xyz_to_lab, NoCtx) {
+     F X = r * (1/0.9642f),
+       Y = g,
+       Z = b * (1/0.8249f);
+
+     X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
+     Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
+     Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
+
+     F L = Y*116.0f - 16.0f,
+       A = (X-Y)*500.0f,
+       B = (Y-Z)*200.0f;
+
+     r = L * (1/100.f);
+     g = (A + 128.0f) * (1/255.0f);
+     b = (B + 128.0f) * (1/255.0f);
+ }
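The magic numbers here are the standard CIELAB constants: 0.008856 is (6/29)^3, 7.787 is 1/(3*(6/29)^2), and 16/116 equals 4/29, while 0.9642 and 0.8249 are the X and Z of the D50 white point. A scalar sketch of the CIELAB "f" function pair these stages vectorize (illustrative C++, not skcms API):

    #include <cmath>

    // Forward f() as used by xyz_to_lab, and the inverse used by lab_to_xyz.
    static float lab_f(float t) {
        return t > 0.008856f ? cbrtf(t)                     // t > (6/29)^3: cube root
                             : t * 7.787f + 16.0f/116.0f;   // linear toe near zero
    }

    static float lab_f_inv(float t) {
        const float t3 = t * t * t;
        return t3 > 0.008856f ? t3
                              : (t - 16.0f/116.0f) * (1.0f/7.787f);
    }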
+
+ STAGE(gamma_r, const skcms_TransferFunction* tf) { r = apply_gamma(tf, r); }
+ STAGE(gamma_g, const skcms_TransferFunction* tf) { g = apply_gamma(tf, g); }
+ STAGE(gamma_b, const skcms_TransferFunction* tf) { b = apply_gamma(tf, b); }
+ STAGE(gamma_a, const skcms_TransferFunction* tf) { a = apply_gamma(tf, a); }
+
+ STAGE(gamma_rgb, const skcms_TransferFunction* tf) {
+     r = apply_gamma(tf, r);
+     g = apply_gamma(tf, g);
+     b = apply_gamma(tf, b);
+ }
+
+ STAGE(tf_r, const skcms_TransferFunction* tf) { r = apply_tf(tf, r); }
+ STAGE(tf_g, const skcms_TransferFunction* tf) { g = apply_tf(tf, g); }
+ STAGE(tf_b, const skcms_TransferFunction* tf) { b = apply_tf(tf, b); }
+ STAGE(tf_a, const skcms_TransferFunction* tf) { a = apply_tf(tf, a); }
+
+ STAGE(tf_rgb, const skcms_TransferFunction* tf) {
+     r = apply_tf(tf, r);
+     g = apply_tf(tf, g);
+     b = apply_tf(tf, b);
+ }
+
+ STAGE(pq_r, const skcms_TransferFunction* tf) { r = apply_pq(tf, r); }
+ STAGE(pq_g, const skcms_TransferFunction* tf) { g = apply_pq(tf, g); }
+ STAGE(pq_b, const skcms_TransferFunction* tf) { b = apply_pq(tf, b); }
+ STAGE(pq_a, const skcms_TransferFunction* tf) { a = apply_pq(tf, a); }
+
+ STAGE(pq_rgb, const skcms_TransferFunction* tf) {
+     r = apply_pq(tf, r);
+     g = apply_pq(tf, g);
+     b = apply_pq(tf, b);
+ }
+
+ STAGE(hlg_r, const skcms_TransferFunction* tf) { r = apply_hlg(tf, r); }
+ STAGE(hlg_g, const skcms_TransferFunction* tf) { g = apply_hlg(tf, g); }
+ STAGE(hlg_b, const skcms_TransferFunction* tf) { b = apply_hlg(tf, b); }
+ STAGE(hlg_a, const skcms_TransferFunction* tf) { a = apply_hlg(tf, a); }
+
+ STAGE(hlg_rgb, const skcms_TransferFunction* tf) {
+     r = apply_hlg(tf, r);
+     g = apply_hlg(tf, g);
+     b = apply_hlg(tf, b);
+ }
+
+ STAGE(hlginv_r, const skcms_TransferFunction* tf) { r = apply_hlginv(tf, r); }
+ STAGE(hlginv_g, const skcms_TransferFunction* tf) { g = apply_hlginv(tf, g); }
+ STAGE(hlginv_b, const skcms_TransferFunction* tf) { b = apply_hlginv(tf, b); }
+ STAGE(hlginv_a, const skcms_TransferFunction* tf) { a = apply_hlginv(tf, a); }
+
+ STAGE(hlginv_rgb, const skcms_TransferFunction* tf) {
+     r = apply_hlginv(tf, r);
+     g = apply_hlginv(tf, g);
+     b = apply_hlginv(tf, b);
+ }
+
+ STAGE(table_r, const skcms_Curve* curve) { r = table(curve, r); }
+ STAGE(table_g, const skcms_Curve* curve) { g = table(curve, g); }
+ STAGE(table_b, const skcms_Curve* curve) { b = table(curve, b); }
+ STAGE(table_a, const skcms_Curve* curve) { a = table(curve, a); }
+
+ STAGE(clut_A2B, const skcms_A2B* a2b) {
+     clut(a2b, &r,&g,&b,a);
+
+     if (a2b->input_channels == 4) {
+         // CMYK is opaque.
+         a = F1;
+     }
+ }
+
+ STAGE(clut_B2A, const skcms_B2A* b2a) {
+     clut(b2a, &r,&g,&b,&a);
+ }
+
+ // From here on down, the store_ ops are all "final stages," terminating processing of this group.
+
+ FINAL_STAGE(store_a8, NoCtx) {
+     store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
+ }
+
+ FINAL_STAGE(store_g8, NoCtx) {
+     // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
+     store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
+ }
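These stores round rather than truncate: to_fixed adds 0.5 before the float-to-int conversion, so a value just under a code boundary still lands on the nearest code. A scalar sketch of that helper's behavior (illustrative; the real to_fixed works on whole vector lanes):

    #include <cstdint>

    static uint32_t to_fixed_scalar(float f) {
        return (uint32_t)(int32_t)(f + 0.5f);   // round-to-nearest for f >= 0
    }
    // to_fixed_scalar(0.9f * 255) == 230, where plain truncation of 229.5 gives 229.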
+
+ FINAL_STAGE(store_4444, NoCtx) {
+     store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
+                         | cast<U16>(to_fixed(g * 15) <<  8)
+                         | cast<U16>(to_fixed(b * 15) <<  4)
+                         | cast<U16>(to_fixed(a * 15) <<  0));
+ }
+
+ FINAL_STAGE(store_565, NoCtx) {
+     store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) <<  0 )
+                         | cast<U16>(to_fixed(g * 63) <<  5 )
+                         | cast<U16>(to_fixed(b * 31) << 11 ));
+ }
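Note the 565 layout puts red in the low five bits and blue in the high five, matching the little-endian RGB565 loads earlier in this file. A worked scalar packing (illustrative C++; pack_565 is a stand-in, with +0.5f mimicking to_fixed's rounding):

    #include <cstdint>

    static uint16_t pack_565(float r, float g, float b) {
        return (uint16_t)((unsigned)(r * 31 + 0.5f) <<  0    // red:   bits 0-4
                        | (unsigned)(g * 63 + 0.5f) <<  5    // green: bits 5-10
                        | (unsigned)(b * 31 + 0.5f) << 11);  // blue:  bits 11-15
    }
    // pack_565(1,0,0) == 0x001F, pack_565(0,1,0) == 0x07E0, pack_565(0,0,1) == 0xF800.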
+
+ FINAL_STAGE(store_888, NoCtx) {
+     uint8_t* rgb = (uint8_t*)dst + 3*i;
+ #if defined(USING_NEON)
+     // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
+     // get there via U16 to save some instructions converting to float. And just
+     // like load_888, we'd prefer to go via U32 but for ARMv7 support.
+     U16 R = cast<U16>(to_fixed(r * 255)),
+         G = cast<U16>(to_fixed(g * 255)),
+         B = cast<U16>(to_fixed(b * 255));
+
+     uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
+     vst3_lane_u8(rgb+0, v, 0);
+     vst3_lane_u8(rgb+3, v, 2);
+     vst3_lane_u8(rgb+6, v, 4);
+     vst3_lane_u8(rgb+9, v, 6);
+ #else
+     store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
+     store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
+     store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
+ #endif
+ }
+
+ FINAL_STAGE(store_8888, NoCtx) {
+     store(dst + 4*i, cast<U32>(to_fixed(r * 255)) <<  0
+                    | cast<U32>(to_fixed(g * 255)) <<  8
+                    | cast<U32>(to_fixed(b * 255)) << 16
+                    | cast<U32>(to_fixed(a * 255)) << 24);
+ }
+
+ FINAL_STAGE(store_101010x_XR, NoCtx) {
+     static constexpr float min = -0.752941f;
+     static constexpr float max = 1.25098f;
+     static constexpr float range = max - min;
+     store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) <<  0
+                    | cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
+                    | cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
+     return;
+ }
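The XR constants match -384/510 and 638/510, which looks like Apple's extended-range 10-bit encoding: SDR [0,1] maps well inside the code range, so values below 0 and above 1 survive quantization. A quick check of the mapping (illustrative C++; encode_xr10 is a stand-in):

    #include <cmath>
    #include <cstdint>

    static uint32_t encode_xr10(float v) {
        const float lo = -384.0f/510, hi = 638.0f/510;   // ~-0.752941, ~1.25098
        return (uint32_t)lroundf((v - lo) / (hi - lo) * 1023);
    }
    // encode_xr10(0.0f) == 384 and encode_xr10(1.0f) == 895, leaving code room
    // both below SDR black and above SDR white.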
+
+ FINAL_STAGE(store_1010102, NoCtx) {
+     store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
+                    | cast<U32>(to_fixed(g * 1023)) << 10
+                    | cast<U32>(to_fixed(b * 1023)) << 20
+                    | cast<U32>(to_fixed(a *    3)) << 30);
+ }
+
+ FINAL_STAGE(store_161616LE, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 6*i);
+     assert( (ptr & 1) == 0 );          // The dst pointer must be 2-byte aligned
+     uint16_t* rgb = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON)
+     uint16x4x3_t v = {{
+         (uint16x4_t)U16_from_F(r),
+         (uint16x4_t)U16_from_F(g),
+         (uint16x4_t)U16_from_F(b),
+     }};
+     vst3_u16(rgb, v);
+ #else
+     store_3(rgb+0, U16_from_F(r));
+     store_3(rgb+1, U16_from_F(g));
+     store_3(rgb+2, U16_from_F(b));
+ #endif
+
+ }
+
+ FINAL_STAGE(store_16161616LE, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 8*i);
+     assert( (ptr & 1) == 0 );           // The dst pointer must be 2-byte aligned
+     uint16_t* rgba = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON)
+     uint16x4x4_t v = {{
+         (uint16x4_t)U16_from_F(r),
+         (uint16x4_t)U16_from_F(g),
+         (uint16x4_t)U16_from_F(b),
+         (uint16x4_t)U16_from_F(a),
+     }};
+     vst4_u16(rgba, v);
+ #else
+     U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
+            | cast<U64>(to_fixed(g * 65535)) << 16
+            | cast<U64>(to_fixed(b * 65535)) << 32
+            | cast<U64>(to_fixed(a * 65535)) << 48;
+     store(rgba, px);
+ #endif
+ }
+
+ FINAL_STAGE(store_161616BE, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 6*i);
+     assert( (ptr & 1) == 0 );          // The dst pointer must be 2-byte aligned
+     uint16_t* rgb = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON)
+     uint16x4x3_t v = {{
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
+     }};
+     vst3_u16(rgb, v);
+ #else
+     U32 R = to_fixed(r * 65535),
+         G = to_fixed(g * 65535),
+         B = to_fixed(b * 65535);
+     store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
+     store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
+     store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
+ #endif
+
+ }
+
+ FINAL_STAGE(store_16161616BE, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 8*i);
+     assert( (ptr & 1) == 0 );           // The dst pointer must be 2-byte aligned
+     uint16_t* rgba = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
+ #if defined(USING_NEON)
+     uint16x4x4_t v = {{
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
+         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
+     }};
+     vst4_u16(rgba, v);
+ #else
+     U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
+            | cast<U64>(to_fixed(g * 65535)) << 16
+            | cast<U64>(to_fixed(b * 65535)) << 32
+            | cast<U64>(to_fixed(a * 65535)) << 48;
+     store(rgba, swap_endian_16x4(px));
+ #endif
+ }
+
+ FINAL_STAGE(store_hhh, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 6*i);
+     assert( (ptr & 1) == 0 );          // The dst pointer must be 2-byte aligned
+     uint16_t* rgb = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
+
+     U16 R = Half_from_F(r),
+         G = Half_from_F(g),
+         B = Half_from_F(b);
+ #if defined(USING_NEON)
+     uint16x4x3_t v = {{
+         (uint16x4_t)R,
+         (uint16x4_t)G,
+         (uint16x4_t)B,
+     }};
+     vst3_u16(rgb, v);
+ #else
+     store_3(rgb+0, R);
+     store_3(rgb+1, G);
+     store_3(rgb+2, B);
+ #endif
+ }
+
+ FINAL_STAGE(store_hhhh, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 8*i);
+     assert( (ptr & 1) == 0 );           // The dst pointer must be 2-byte aligned
+     uint16_t* rgba = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
+
+     U16 R = Half_from_F(r),
+         G = Half_from_F(g),
+         B = Half_from_F(b),
+         A = Half_from_F(a);
+ #if defined(USING_NEON)
+     uint16x4x4_t v = {{
+         (uint16x4_t)R,
+         (uint16x4_t)G,
+         (uint16x4_t)B,
+         (uint16x4_t)A,
+     }};
+     vst4_u16(rgba, v);
+ #else
+     store(rgba, cast<U64>(R) <<  0
+               | cast<U64>(G) << 16
+               | cast<U64>(B) << 32
+               | cast<U64>(A) << 48);
+ #endif
+ }
+
+ FINAL_STAGE(store_fff, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 12*i);
+     assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
+     float* rgb = (float*)ptr;   // for this cast to float* to be safe.
+ #if defined(USING_NEON)
+     float32x4x3_t v = {{
+         (float32x4_t)r,
+         (float32x4_t)g,
+         (float32x4_t)b,
+     }};
+     vst3q_f32(rgb, v);
+ #else
+     store_3(rgb+0, r);
+     store_3(rgb+1, g);
+     store_3(rgb+2, b);
+ #endif
+ }
+
+ FINAL_STAGE(store_ffff, NoCtx) {
+     uintptr_t ptr = (uintptr_t)(dst + 16*i);
+     assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
+     float* rgba = (float*)ptr;  // for this cast to float* to be safe.
+ #if defined(USING_NEON)
+     float32x4x4_t v = {{
+         (float32x4_t)r,
+         (float32x4_t)g,
+         (float32x4_t)b,
+         (float32x4_t)a,
+     }};
+     vst4q_f32(rgba, v);
+ #else
+     store_4(rgba+0, r);
+     store_4(rgba+1, g);
+     store_4(rgba+2, b);
+     store_4(rgba+3, a);
+ #endif
+ }
+
+ #if SKCMS_HAS_MUSTTAIL
+
+ SI void exec_stages(StageFn* stages, const void** contexts, const char* src, char* dst, int i) {
+     (*stages)({stages}, contexts, src, dst, F0, F0, F0, F1, i);
+ }
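When SKCMS_HAS_MUSTTAIL is set, each Exec_ stage ends by tail-calling the next stage's function pointer, so the compiled pipeline becomes one flat chain of jumps: no dispatch loop and no stack growth, with exec_stages merely seeding the chain. A stripped-down sketch of the pattern, not skcms's actual macros, using the Clang-specific [[clang::musttail]] attribute:

    // A "program" is an array of stage pointers; each stage advances the
    // cursor and tail-calls whatever comes next, so no frames accumulate.
    struct Cursor;
    using Stage = void (*)(const Cursor*, float*);
    struct Cursor { Stage fn; };

    static void next(const Cursor* c, float* x) {
        if (c->fn == nullptr) return;               // end of program
        [[clang::musttail]] return c->fn(c, x);
    }

    static void double_it(const Cursor* c, float* x) {
        *x *= 2;
        [[clang::musttail]] return next(c + 1, x);  // a jump, not a call
    }

    // Usage: Cursor program[] = {{&double_it}, {&double_it}, {nullptr}};
    //        float v = 1; next(program, &v);       // v == 4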
+
+ #else
+
+ static void exec_stages(const Op* ops, const void** contexts,
+                         const char* src, char* dst, int i) {
+     F r = F0, g = F0, b = F0, a = F1;
+     while (true) {
+         switch (*ops++) {
+ #define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
+             SKCMS_WORK_OPS(M)
+ #undef M
+ #define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
+             SKCMS_STORE_OPS(M)
+ #undef M
          }
-             case Op_store_1010102: {
-                 store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
-                                | cast<U32>(to_fixed(g * 1023)) << 10
-                                | cast<U32>(to_fixed(b * 1023)) << 20
-                                | cast<U32>(to_fixed(a *    3)) << 30);
-             } return;
-
-             case Op_store_161616LE: {
-                 uintptr_t ptr = (uintptr_t)(dst + 6*i);
-                 assert( (ptr & 1) == 0 );          // The dst pointer must be 2-byte aligned
-                 uint16_t* rgb = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
-             #if defined(USING_NEON)
-                 uint16x4x3_t v = {{
-                     (uint16x4_t)U16_from_F(r),
-                     (uint16x4_t)U16_from_F(g),
-                     (uint16x4_t)U16_from_F(b),
-                 }};
-                 vst3_u16(rgb, v);
-             #else
-                 store_3(rgb+0, U16_from_F(r));
-                 store_3(rgb+1, U16_from_F(g));
-                 store_3(rgb+2, U16_from_F(b));
-             #endif
-
-             } return;
-
-             case Op_store_16161616LE: {
-                 uintptr_t ptr = (uintptr_t)(dst + 8*i);
-                 assert( (ptr & 1) == 0 );           // The dst pointer must be 2-byte aligned
-                 uint16_t* rgba = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
-             #if defined(USING_NEON)
-                 uint16x4x4_t v = {{
-                     (uint16x4_t)U16_from_F(r),
-                     (uint16x4_t)U16_from_F(g),
-                     (uint16x4_t)U16_from_F(b),
-                     (uint16x4_t)U16_from_F(a),
-                 }};
-                 vst4_u16(rgba, v);
-             #else
-                 U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
-                        | cast<U64>(to_fixed(g * 65535)) << 16
-                        | cast<U64>(to_fixed(b * 65535)) << 32
-                        | cast<U64>(to_fixed(a * 65535)) << 48;
-                 store(rgba, px);
-             #endif
-             } return;
-
-             case Op_store_161616BE: {
-                 uintptr_t ptr = (uintptr_t)(dst + 6*i);
-                 assert( (ptr & 1) == 0 );          // The dst pointer must be 2-byte aligned
-                 uint16_t* rgb = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
-             #if defined(USING_NEON)
-                 uint16x4x3_t v = {{
-                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
-                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
-                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
-                 }};
-                 vst3_u16(rgb, v);
-             #else
-                 U32 R = to_fixed(r * 65535),
-                     G = to_fixed(g * 65535),
-                     B = to_fixed(b * 65535);
-                 store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
-                 store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
-                 store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
-             #endif
-
-             } return;
-
-             case Op_store_16161616BE: {
-                 uintptr_t ptr = (uintptr_t)(dst + 8*i);
-                 assert( (ptr & 1) == 0 );           // The dst pointer must be 2-byte aligned
-                 uint16_t* rgba = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
-             #if defined(USING_NEON)
-                 uint16x4x4_t v = {{
-                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
-                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
-                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
-                     (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
-                 }};
-                 vst4_u16(rgba, v);
-             #else
-                 U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
-                        | cast<U64>(to_fixed(g * 65535)) << 16
-                        | cast<U64>(to_fixed(b * 65535)) << 32
-                        | cast<U64>(to_fixed(a * 65535)) << 48;
-                 store(rgba, swap_endian_16x4(px));
-             #endif
-             } return;
-
-             case Op_store_hhh: {
-                 uintptr_t ptr = (uintptr_t)(dst + 6*i);
-                 assert( (ptr & 1) == 0 );          // The dst pointer must be 2-byte aligned
-                 uint16_t* rgb = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
-
-                 U16 R = Half_from_F(r),
-                     G = Half_from_F(g),
-                     B = Half_from_F(b);
-             #if defined(USING_NEON)
-                 uint16x4x3_t v = {{
-                     (uint16x4_t)R,
-                     (uint16x4_t)G,
-                     (uint16x4_t)B,
-                 }};
-                 vst3_u16(rgb, v);
-             #else
-                 store_3(rgb+0, R);
-                 store_3(rgb+1, G);
-                 store_3(rgb+2, B);
-             #endif
-             } return;
-
-             case Op_store_hhhh: {
-                 uintptr_t ptr = (uintptr_t)(dst + 8*i);
-                 assert( (ptr & 1) == 0 );           // The dst pointer must be 2-byte aligned
-                 uint16_t* rgba = (uint16_t*)ptr;    // for this cast to uint16_t* to be safe.
-
-                 U16 R = Half_from_F(r),
-                     G = Half_from_F(g),
-                     B = Half_from_F(b),
-                     A = Half_from_F(a);
-             #if defined(USING_NEON)
-                 uint16x4x4_t v = {{
-                     (uint16x4_t)R,
-                     (uint16x4_t)G,
-                     (uint16x4_t)B,
-                     (uint16x4_t)A,
-                 }};
-                 vst4_u16(rgba, v);
-             #else
-                 store(rgba, cast<U64>(R) <<  0
-                           | cast<U64>(G) << 16
-                           | cast<U64>(B) << 32
-                           | cast<U64>(A) << 48);
-             #endif
-
-             } return;
-
-             case Op_store_fff: {
-                 uintptr_t ptr = (uintptr_t)(dst + 12*i);
-                 assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
-                 float* rgb = (float*)ptr;   // for this cast to float* to be safe.
-             #if defined(USING_NEON)
-                 float32x4x3_t v = {{
-                     (float32x4_t)r,
-                     (float32x4_t)g,
-                     (float32x4_t)b,
-                 }};
-                 vst3q_f32(rgb, v);
-             #else
-                 store_3(rgb+0, r);
-                 store_3(rgb+1, g);
-                 store_3(rgb+2, b);
-             #endif
-             } return;
-
-             case Op_store_ffff: {
-                 uintptr_t ptr = (uintptr_t)(dst + 16*i);
-                 assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
-                 float* rgba = (float*)ptr;  // for this cast to float* to be safe.
-             #if defined(USING_NEON)
-                 float32x4x4_t v = {{
-                     (float32x4_t)r,
-                     (float32x4_t)g,
-                     (float32x4_t)b,
-                     (float32x4_t)a,
-                 }};
-                 vst4q_f32(rgba, v);
-             #else
-                 store_4(rgba+0, r);
-                 store_4(rgba+1, g);
-                 store_4(rgba+2, b);
-                 store_4(rgba+3, a);
-             #endif
-             } return;
      }
  }
- }

+ #endif
+
+ // NOLINTNEXTLINE(misc-definitions-in-headers)
+ void run_program(const Op* program, const void** contexts, SKCMS_MAYBE_UNUSED ptrdiff_t programSize,
+                  const char* src, char* dst, int n,
+                  const size_t src_bpp, const size_t dst_bpp) {
+ #if SKCMS_HAS_MUSTTAIL
+     // Convert the program into an array of tailcall stages.
+     StageFn stages[32];
+     assert(programSize <= ARRAY_COUNT(stages));
+
+     static constexpr StageFn kStageFns[] = {
+ #define M(name) &Exec_##name,
+         SKCMS_WORK_OPS(M)
+         SKCMS_STORE_OPS(M)
+ #undef M
+     };
+
+     for (ptrdiff_t index = 0; index < programSize; ++index) {
+         stages[index] = kStageFns[(int)program[index]];
+     }
+ #else
+     // Use the op array as-is.
+     const Op* stages = program;
+ #endif

- static void run_program(const Op* program, const void** arguments,
-                         const char* src, char* dst, int n,
-                         const size_t src_bpp, const size_t dst_bpp) {
      int i = 0;
      while (n >= N) {
-         exec_ops(program, arguments, src, dst, i);
+         exec_stages(stages, contexts, src, dst, i);
          i += N;
          n -= N;
      }
@@ -1399,30 +1549,7 @@ static void run_program(const Op* program, const void** arguments,
          char tmp[4*4*N] = {0};

          memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
-         exec_ops(program, arguments, tmp, tmp, 0);
+         exec_stages(stages, contexts, tmp, tmp, 0);
          memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
      }
  }
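run_program walks the buffers in vector-width groups of N pixels; the final n < N stragglers are bounced through a zeroed scratch buffer sized for the fattest format (4 float channels, 16 bytes per pixel), so the stages can always load and store full groups without touching memory past the real src/dst. A standalone sketch of that shape (illustrative C++; process_group stands in for exec_stages, and N for the real vector width):

    #include <cstring>

    constexpr int N = 4;   // pretend vector width, in pixels

    // Stand-in for exec_stages: force alpha opaque on N RGBA8888 pixels.
    static void process_group(const char* src, char* dst) {
        for (int p = 0; p < N; ++p) {
            for (int c = 0; c < 4; ++c) dst[4*p + c] = src[4*p + c];
            dst[4*p + 3] = (char)0xFF;
        }
    }

    static void run_tail(const char* src, char* dst, int n,
                         size_t src_bpp, size_t dst_bpp) {
        char tmp[4*4*N] = {0};                  // 4 channels x 4 bytes x N pixels
        memcpy(tmp, src, (size_t)n * src_bpp);  // pad stragglers to a full group
        process_group(tmp, tmp);
        memcpy(dst, tmp, (size_t)n * dst_bpp);  // copy back only the real pixels
    }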
-
- // Clean up any #defines we may have set so that we can be #included again.
- #if defined(USING_AVX)
-     #undef USING_AVX
- #endif
- #if defined(USING_AVX_F16C)
-     #undef USING_AVX_F16C
- #endif
- #if defined(USING_AVX2)
-     #undef USING_AVX2
- #endif
- #if defined(USING_AVX512F)
-     #undef USING_AVX512F
- #endif
-
- #if defined(USING_NEON)
-     #undef USING_NEON
- #endif
- #if defined(USING_NEON_F16C)
-     #undef USING_NEON_F16C
- #endif
-
- #undef FALLTHROUGH