@shopify/react-native-skia 0.1.233 → 0.1.236
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- package/android/CMakeLists.txt +0 -1
- package/android/cpp/jni/JniSkiaManager.cpp +0 -1
- package/android/cpp/rnskia-android/RNSkAndroidView.h +0 -1
- package/android/cpp/rnskia-android/RNSkOpenGLCanvasProvider.cpp +1 -1
- package/android/cpp/rnskia-android/RNSkOpenGLCanvasProvider.h +1 -2
- package/cpp/api/JsiSkTypefaceFactory.h +2 -1
- package/cpp/skia/include/android/AHardwareBufferUtils.h +23 -0
- package/cpp/skia/include/android/GrAHardwareBufferUtils.h +2 -0
- package/cpp/skia/include/android/graphite/SurfaceAndroid.h +59 -0
- package/cpp/skia/include/codec/SkAvifDecoder.h +1 -1
- package/cpp/skia/include/codec/SkBmpDecoder.h +1 -1
- package/cpp/skia/include/codec/SkCodec.h +21 -3
- package/cpp/skia/include/codec/SkGifDecoder.h +1 -1
- package/cpp/skia/include/codec/SkIcoDecoder.h +1 -1
- package/cpp/skia/include/codec/SkJpegDecoder.h +1 -1
- package/cpp/skia/include/codec/SkJpegxlDecoder.h +1 -1
- package/cpp/skia/include/codec/SkPngDecoder.h +1 -1
- package/cpp/skia/include/codec/SkRawDecoder.h +1 -1
- package/cpp/skia/include/codec/SkWbmpDecoder.h +1 -1
- package/cpp/skia/include/codec/SkWebpDecoder.h +1 -1
- package/cpp/skia/include/config/SkUserConfig.h +3 -1
- package/cpp/skia/include/core/SkCanvas.h +66 -37
- package/cpp/skia/include/core/SkColorFilter.h +5 -2
- package/cpp/skia/include/core/SkContourMeasure.h +1 -0
- package/cpp/skia/include/core/SkDocument.h +1 -0
- package/cpp/skia/include/core/SkFont.h +14 -24
- package/cpp/skia/include/core/SkFontArguments.h +1 -1
- package/cpp/skia/include/core/SkFontMetrics.h +1 -1
- package/cpp/skia/include/core/SkFontMgr.h +0 -7
- package/cpp/skia/include/core/SkGraphics.h +13 -0
- package/cpp/skia/include/core/SkMesh.h +9 -13
- package/cpp/skia/include/core/SkMilestone.h +1 -1
- package/cpp/skia/include/core/SkPathMeasure.h +2 -0
- package/cpp/skia/include/core/SkSerialProcs.h +29 -11
- package/cpp/skia/include/core/SkSize.h +3 -3
- package/cpp/skia/include/core/SkStream.h +3 -13
- package/cpp/skia/include/core/SkSurface.h +6 -3
- package/cpp/skia/include/core/SkSurfaceProps.h +2 -4
- package/cpp/skia/include/core/SkTraceMemoryDump.h +15 -0
- package/cpp/skia/include/core/SkTypeface.h +8 -56
- package/cpp/skia/include/core/SkTypes.h +8 -0
- package/cpp/skia/include/core/SkVertices.h +1 -1
- package/cpp/skia/include/docs/SkMultiPictureDocument.h +53 -0
- package/cpp/skia/include/docs/SkPDFDocument.h +11 -0
- package/cpp/skia/include/effects/SkGradientShader.h +9 -0
- package/cpp/skia/include/effects/SkRuntimeEffect.h +3 -7
- package/cpp/skia/include/gpu/GrBackendSemaphore.h +33 -47
- package/cpp/skia/include/gpu/GrBackendSurface.h +2 -3
- package/cpp/skia/include/gpu/GrContextOptions.h +0 -6
- package/cpp/skia/include/gpu/GrContextThreadSafeProxy.h +44 -28
- package/cpp/skia/include/gpu/GrDirectContext.h +12 -31
- package/cpp/skia/include/gpu/GrTypes.h +1 -16
- package/cpp/skia/include/gpu/MutableTextureState.h +35 -80
- package/cpp/skia/include/gpu/ShaderErrorHandler.h +11 -1
- package/cpp/skia/include/gpu/ganesh/SkImageGanesh.h +2 -2
- package/cpp/skia/include/gpu/ganesh/SkSurfaceGanesh.h +1 -1
- package/cpp/skia/include/gpu/ganesh/gl/GrGLDirectContext.h +3 -2
- package/cpp/skia/include/gpu/ganesh/vk/GrVkBackendSemaphore.h +20 -0
- package/cpp/skia/include/gpu/ganesh/vk/GrVkDirectContext.h +30 -0
- package/cpp/skia/include/gpu/gl/GrGLFunctions.h +1 -1
- package/cpp/skia/include/gpu/gl/GrGLInterface.h +2 -0
- package/cpp/skia/include/gpu/gl/glx/GrGLMakeGLXInterface.h +6 -0
- package/cpp/skia/include/gpu/graphite/BackendSemaphore.h +3 -3
- package/cpp/skia/include/gpu/graphite/BackendTexture.h +39 -27
- package/cpp/skia/include/gpu/graphite/Context.h +39 -13
- package/cpp/skia/include/gpu/graphite/ContextOptions.h +2 -0
- package/cpp/skia/include/gpu/graphite/GraphiteTypes.h +2 -1
- package/cpp/skia/include/gpu/graphite/Image.h +106 -87
- package/cpp/skia/include/gpu/graphite/Recorder.h +24 -3
- package/cpp/skia/include/gpu/graphite/Surface.h +7 -2
- package/cpp/skia/include/gpu/graphite/dawn/DawnBackendContext.h +41 -2
- package/cpp/skia/include/gpu/graphite/dawn/DawnTypes.h +11 -6
- package/cpp/skia/include/gpu/graphite/mtl/MtlGraphiteTypes.h +1 -2
- package/cpp/skia/include/gpu/graphite/vk/VulkanGraphiteTypes.h +6 -6
- package/cpp/skia/include/gpu/mock/GrMockTypes.h +1 -0
- package/cpp/skia/include/gpu/vk/GrVkBackendContext.h +1 -1
- package/cpp/skia/include/gpu/vk/GrVkTypes.h +1 -44
- package/cpp/skia/include/gpu/vk/VulkanExtensions.h +1 -1
- package/cpp/skia/include/gpu/vk/VulkanMutableTextureState.h +25 -0
- package/cpp/skia/include/gpu/vk/VulkanTypes.h +44 -0
- package/cpp/skia/include/ports/SkFontConfigInterface.h +3 -6
- package/cpp/skia/include/private/SkEncodedInfo.h +5 -0
- package/cpp/skia/include/private/SkExif.h +102 -0
- package/cpp/skia/include/private/SkGainmapInfo.h +11 -1
- package/cpp/skia/include/private/base/SkAssert.h +16 -0
- package/cpp/skia/include/private/base/SkDeque.h +2 -7
- package/cpp/skia/include/private/base/SkLoadUserConfig.h +1 -1
- package/cpp/skia/include/private/base/SkTArray.h +69 -28
- package/cpp/skia/include/private/base/SkThreadAnnotations.h +18 -5
- package/cpp/skia/include/private/chromium/GrSurfaceCharacterization.h +26 -30
- package/cpp/skia/include/private/chromium/GrVkSecondaryCBDrawContext.h +4 -3
- package/cpp/skia/include/private/chromium/SkImageChromium.h +1 -1
- package/cpp/skia/include/private/gpu/ganesh/GrTypesPriv.h +8 -6
- package/cpp/skia/include/private/gpu/graphite/ContextOptionsPriv.h +29 -0
- package/cpp/skia/include/private/gpu/graphite/DawnTypesPriv.h +12 -9
- package/cpp/skia/include/private/gpu/graphite/VulkanGraphiteTypesPriv.h +16 -11
- package/cpp/skia/include/third_party/vulkan/vulkan/vk_platform.h +2 -2
- package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h264std.h +312 -0
- package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h264std_decode.h +77 -0
- package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h265std.h +446 -0
- package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codec_h265std_decode.h +67 -0
- package/cpp/skia/include/third_party/vulkan/vulkan/vk_video/vulkan_video_codecs_common.h +36 -0
- package/cpp/skia/include/third_party/vulkan/vulkan/vulkan.h +9 -2
- package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_android.h +31 -3
- package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_core.h +10624 -5716
- package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_ios.h +2 -1
- package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_macos.h +2 -1
- package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_win32.h +28 -1
- package/cpp/skia/include/third_party/vulkan/vulkan/vulkan_xcb.h +2 -1
- package/cpp/skia/include/utils/mac/SkCGUtils.h +23 -11
- package/cpp/skia/modules/skcms/skcms.h +2 -410
- package/cpp/skia/modules/skcms/src/Transform_inl.h +831 -704
- package/cpp/skia/modules/skcms/src/skcms_Transform.h +161 -0
- package/cpp/skia/modules/skcms/src/skcms_internals.h +136 -0
- package/cpp/skia/modules/skcms/src/skcms_public.h +404 -0
- package/cpp/skia/modules/skparagraph/include/FontArguments.h +1 -1
- package/cpp/skia/modules/skparagraph/include/FontCollection.h +2 -0
- package/cpp/skia/modules/skparagraph/include/Paragraph.h +2 -2
- package/cpp/skia/modules/skparagraph/include/TextStyle.h +4 -3
- package/cpp/skia/modules/skparagraph/include/TypefaceFontProvider.h +1 -3
- package/cpp/skia/modules/skresources/include/SkResources.h +28 -17
- package/cpp/skia/modules/skunicode/include/SkUnicode.h +12 -0
- package/cpp/skia/modules/svg/include/SkSVGDOM.h +4 -1
- package/cpp/skia/modules/svg/include/SkSVGRenderContext.h +4 -1
- package/cpp/skia/src/base/SkUTF.h +7 -0
- package/cpp/skia/src/core/SkTHash.h +20 -8
- package/lib/commonjs/dom/nodes/JsiSkDOM.d.ts +3 -2
- package/lib/commonjs/dom/nodes/JsiSkDOM.js +56 -57
- package/lib/commonjs/dom/nodes/JsiSkDOM.js.map +1 -1
- package/lib/commonjs/external/reanimated/index.d.ts +1 -0
- package/lib/commonjs/external/reanimated/index.js +13 -0
- package/lib/commonjs/external/reanimated/index.js.map +1 -1
- package/lib/commonjs/external/reanimated/interpolators.js +16 -2
- package/lib/commonjs/external/reanimated/interpolators.js.map +1 -1
- package/lib/commonjs/external/reanimated/moduleWrapper.d.ts +1 -0
- package/lib/commonjs/external/reanimated/moduleWrapper.js +5 -3
- package/lib/commonjs/external/reanimated/moduleWrapper.js.map +1 -1
- package/lib/commonjs/external/reanimated/textures.d.ts +5 -0
- package/lib/commonjs/external/reanimated/textures.js +52 -0
- package/lib/commonjs/external/reanimated/textures.js.map +1 -0
- package/lib/commonjs/headless/index.js +1 -1
- package/lib/commonjs/headless/index.js.map +1 -1
- package/lib/commonjs/mock/index.js +3 -0
- package/lib/commonjs/mock/index.js.map +1 -1
- package/lib/commonjs/renderer/Canvas.js +6 -5
- package/lib/commonjs/renderer/Canvas.js.map +1 -1
- package/lib/commonjs/renderer/Container.d.ts +1 -1
- package/lib/commonjs/renderer/Container.js +2 -1
- package/lib/commonjs/renderer/Container.js.map +1 -1
- package/lib/commonjs/renderer/Offscreen.d.ts +1 -0
- package/lib/commonjs/renderer/Offscreen.js +18 -5
- package/lib/commonjs/renderer/Offscreen.js.map +1 -1
- package/lib/commonjs/renderer/Reconciler.d.ts +1 -1
- package/lib/commonjs/renderer/Reconciler.js +7 -4
- package/lib/commonjs/renderer/Reconciler.js.map +1 -1
- package/lib/commonjs/skia/types/Matrix4.d.ts +2 -2
- package/lib/commonjs/skia/types/Matrix4.js.map +1 -1
- package/lib/commonjs/skia/types/Shader/Shader.d.ts +1 -1
- package/lib/commonjs/skia/types/Shader/Shader.js.map +1 -1
- package/lib/commonjs/skia/web/JsiSkPath.d.ts +2 -2
- package/lib/commonjs/skia/web/JsiSkPath.js +10 -2
- package/lib/commonjs/skia/web/JsiSkPath.js.map +1 -1
- package/lib/commonjs/views/SkiaJSDomView.d.ts +31 -0
- package/lib/commonjs/views/SkiaJSDomView.js +161 -0
- package/lib/commonjs/views/SkiaJSDomView.js.map +1 -0
- package/lib/commonjs/views/SkiaJSDomView.web.d.ts +1 -0
- package/lib/commonjs/views/SkiaJSDomView.web.js +14 -0
- package/lib/commonjs/views/SkiaJSDomView.web.js.map +1 -0
- package/lib/module/dom/nodes/JsiSkDOM.d.ts +3 -2
- package/lib/module/dom/nodes/JsiSkDOM.js +56 -56
- package/lib/module/dom/nodes/JsiSkDOM.js.map +1 -1
- package/lib/module/external/reanimated/index.d.ts +1 -0
- package/lib/module/external/reanimated/index.js +1 -0
- package/lib/module/external/reanimated/index.js.map +1 -1
- package/lib/module/external/reanimated/interpolators.js +15 -2
- package/lib/module/external/reanimated/interpolators.js.map +1 -1
- package/lib/module/external/reanimated/moduleWrapper.d.ts +1 -0
- package/lib/module/external/reanimated/moduleWrapper.js +3 -2
- package/lib/module/external/reanimated/moduleWrapper.js.map +1 -1
- package/lib/module/external/reanimated/textures.d.ts +5 -0
- package/lib/module/external/reanimated/textures.js +35 -0
- package/lib/module/external/reanimated/textures.js.map +1 -0
- package/lib/module/headless/index.js +1 -1
- package/lib/module/headless/index.js.map +1 -1
- package/lib/module/mock/index.js +3 -0
- package/lib/module/mock/index.js.map +1 -1
- package/lib/module/renderer/Canvas.js +5 -4
- package/lib/module/renderer/Canvas.js.map +1 -1
- package/lib/module/renderer/Container.d.ts +1 -1
- package/lib/module/renderer/Container.js +2 -1
- package/lib/module/renderer/Container.js.map +1 -1
- package/lib/module/renderer/Offscreen.d.ts +1 -0
- package/lib/module/renderer/Offscreen.js +11 -3
- package/lib/module/renderer/Offscreen.js.map +1 -1
- package/lib/module/renderer/Reconciler.d.ts +1 -1
- package/lib/module/renderer/Reconciler.js +7 -4
- package/lib/module/renderer/Reconciler.js.map +1 -1
- package/lib/module/skia/types/Matrix4.d.ts +2 -2
- package/lib/module/skia/types/Matrix4.js.map +1 -1
- package/lib/module/skia/types/Shader/Shader.d.ts +1 -1
- package/lib/module/skia/types/Shader/Shader.js.map +1 -1
- package/lib/module/skia/web/JsiSkPath.d.ts +2 -2
- package/lib/module/skia/web/JsiSkPath.js +10 -2
- package/lib/module/skia/web/JsiSkPath.js.map +1 -1
- package/lib/module/views/SkiaJSDomView.d.ts +31 -0
- package/lib/module/views/SkiaJSDomView.js +136 -0
- package/lib/module/views/SkiaJSDomView.js.map +1 -0
- package/lib/module/views/SkiaJSDomView.web.d.ts +1 -0
- package/lib/module/views/SkiaJSDomView.web.js +2 -0
- package/lib/module/views/SkiaJSDomView.web.js.map +1 -0
- package/lib/typescript/src/dom/nodes/JsiSkDOM.d.ts +3 -2
- package/lib/typescript/src/external/reanimated/index.d.ts +1 -0
- package/lib/typescript/src/external/reanimated/moduleWrapper.d.ts +1 -0
- package/lib/typescript/src/external/reanimated/textures.d.ts +5 -0
- package/lib/typescript/src/renderer/Container.d.ts +1 -1
- package/lib/typescript/src/renderer/Offscreen.d.ts +1 -0
- package/lib/typescript/src/renderer/Reconciler.d.ts +1 -1
- package/lib/typescript/src/skia/types/Matrix4.d.ts +2 -2
- package/lib/typescript/src/skia/types/Shader/Shader.d.ts +1 -1
- package/lib/typescript/src/skia/web/JsiSkPath.d.ts +2 -2
- package/lib/typescript/src/views/SkiaJSDomView.d.ts +31 -0
- package/lib/typescript/src/views/SkiaJSDomView.web.d.ts +1 -0
- package/libs/android/arm64-v8a/libskia.a +0 -0
- package/libs/android/arm64-v8a/libskottie.a +0 -0
- package/libs/android/arm64-v8a/libskparagraph.a +0 -0
- package/libs/android/arm64-v8a/libsksg.a +0 -0
- package/libs/android/arm64-v8a/libskshaper.a +0 -0
- package/libs/android/arm64-v8a/libskunicode.a +0 -0
- package/libs/android/arm64-v8a/libsvg.a +0 -0
- package/libs/android/armeabi-v7a/libskia.a +0 -0
- package/libs/android/armeabi-v7a/libskottie.a +0 -0
- package/libs/android/armeabi-v7a/libskparagraph.a +0 -0
- package/libs/android/armeabi-v7a/libsksg.a +0 -0
- package/libs/android/armeabi-v7a/libskshaper.a +0 -0
- package/libs/android/armeabi-v7a/libskunicode.a +0 -0
- package/libs/android/armeabi-v7a/libsvg.a +0 -0
- package/libs/android/x86/libskia.a +0 -0
- package/libs/android/x86/libskottie.a +0 -0
- package/libs/android/x86/libskparagraph.a +0 -0
- package/libs/android/x86/libsksg.a +0 -0
- package/libs/android/x86/libskshaper.a +0 -0
- package/libs/android/x86/libskunicode.a +0 -0
- package/libs/android/x86/libsvg.a +0 -0
- package/libs/android/x86_64/libskia.a +0 -0
- package/libs/android/x86_64/libskottie.a +0 -0
- package/libs/android/x86_64/libskparagraph.a +0 -0
- package/libs/android/x86_64/libsksg.a +0 -0
- package/libs/android/x86_64/libskshaper.a +0 -0
- package/libs/android/x86_64/libskunicode.a +0 -0
- package/libs/android/x86_64/libsvg.a +0 -0
- package/libs/ios/libskia.xcframework/ios-arm64_arm64e/libskia.a +0 -0
- package/libs/ios/libskia.xcframework/ios-arm64_arm64e_x86_64-simulator/libskia.a +0 -0
- package/libs/ios/libskottie.xcframework/ios-arm64_arm64e/libskottie.a +0 -0
- package/libs/ios/libskottie.xcframework/ios-arm64_arm64e_x86_64-simulator/libskottie.a +0 -0
- package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e/libskparagraph.a +0 -0
- package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e_x86_64-simulator/libskparagraph.a +0 -0
- package/libs/ios/libsksg.xcframework/ios-arm64_arm64e/libsksg.a +0 -0
- package/libs/ios/libsksg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsksg.a +0 -0
- package/libs/ios/libskshaper.xcframework/Info.plist +5 -5
- package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e/libskshaper.a +0 -0
- package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e_x86_64-simulator/libskshaper.a +0 -0
- package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e/libskunicode.a +0 -0
- package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e_x86_64-simulator/libskunicode.a +0 -0
- package/libs/ios/libsvg.xcframework/Info.plist +5 -5
- package/libs/ios/libsvg.xcframework/ios-arm64_arm64e/libsvg.a +0 -0
- package/libs/ios/libsvg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsvg.a +0 -0
- package/package.json +1 -1
- package/src/dom/nodes/JsiSkDOM.ts +55 -56
- package/src/external/reanimated/index.ts +1 -0
- package/src/external/reanimated/interpolators.ts +17 -3
- package/src/external/reanimated/moduleWrapper.ts +1 -0
- package/src/external/reanimated/textures.tsx +50 -0
- package/src/headless/index.ts +1 -1
- package/src/mock/index.ts +3 -0
- package/src/renderer/Canvas.tsx +4 -3
- package/src/renderer/Container.tsx +3 -2
- package/src/renderer/Offscreen.tsx +12 -3
- package/src/renderer/Reconciler.tsx +5 -2
- package/src/skia/types/Matrix4.ts +2 -2
- package/src/skia/types/Shader/Shader.ts +6 -1
- package/src/skia/web/JsiSkPath.ts +23 -3
- package/src/views/SkiaJSDomView.tsx +126 -0
- package/src/views/SkiaJSDomView.web.tsx +1 -0
- package/android/cpp/jni/include/JniSkiaDrawView.h +0 -90
- package/cpp/rnskia/RNSkJsView.cpp +0 -236
- package/cpp/rnskia/RNSkJsView.h +0 -121
- package/cpp/skia/include/gpu/GrSurfaceInfo.h +0 -142
- package/cpp/skia/include/private/gpu/ganesh/GrGLTypesPriv.h +0 -107
- package/cpp/skia/include/private/gpu/ganesh/GrMockTypesPriv.h +0 -32
- package/cpp/skia/include/private/gpu/ganesh/GrMtlTypesPriv.h +0 -83
- package/cpp/skia/include/private/gpu/ganesh/GrVkTypesPriv.h +0 -47
- package/cpp/skia/include/private/gpu/vk/VulkanTypesPriv.h +0 -57
- package/cpp/skia/include/utils/SkBase64.h +0 -53
- package/cpp/skia/modules/skcms/skcms_internal.h +0 -56
@@ -8,26 +8,29 @@
|
|
8
8
|
// Intentionally NO #pragma once... included multiple times.
|
9
9
|
|
10
10
|
// This file is included from skcms.cc in a namespace with some pre-defines:
|
11
|
-
// - N:
|
11
|
+
// - N: SIMD width of all vectors; 1, 4, 8 or 16 (preprocessor define)
|
12
12
|
// - V<T>: a template to create a vector of N T's.
|
13
13
|
|
14
|
-
using F = V<
|
14
|
+
using F = V<float>;
|
15
15
|
using I32 = V<int32_t>;
|
16
16
|
using U64 = V<uint64_t>;
|
17
17
|
using U32 = V<uint32_t>;
|
18
18
|
using U16 = V<uint16_t>;
|
19
19
|
using U8 = V<uint8_t>;
|
20
20
|
|
21
|
-
|
22
21
|
#if defined(__GNUC__) && !defined(__clang__)
|
23
|
-
//
|
22
|
+
// GCC is kind of weird, not allowing vector = scalar directly.
|
24
23
|
static constexpr F F0 = F() + 0.0f,
|
25
24
|
F1 = F() + 1.0f,
|
25
|
+
FHalf = F() + 0.5f,
|
26
26
|
FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
|
27
|
+
static constexpr I32 F16InfBits = I32() + 0x4780'0000;
|
27
28
|
#else
|
28
29
|
static constexpr F F0 = 0.0f,
|
29
30
|
F1 = 1.0f,
|
31
|
+
FHalf = 0.5f,
|
30
32
|
FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
|
33
|
+
static constexpr I32 F16InfBits = 0x4780'0000; // equals +Inf in half float, shifted to 32-bits
|
31
34
|
#endif
|
32
35
|
|
33
36
|
// Instead of checking __AVX__ below, we'll check USING_AVX.
|
@@ -84,19 +87,11 @@ using U8 = V<uint8_t>;
|
|
84
87
|
#endif
|
85
88
|
#endif
|
86
89
|
|
87
|
-
#if defined(__clang__)
|
88
|
-
#define FALLTHROUGH [[clang::fallthrough]]
|
89
|
-
#else
|
90
|
-
#define FALLTHROUGH
|
91
|
-
#endif
|
92
|
-
|
93
90
|
// We tag most helper functions as SI, to enforce good code generation
|
94
91
|
// but also work around what we think is a bug in GCC: when targeting 32-bit
|
95
92
|
// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
|
96
93
|
// MMX mm0 register, which seems to mess with unrelated code that later uses
|
97
94
|
// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
|
98
|
-
//
|
99
|
-
// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
|
100
95
|
#if defined(__clang__) || defined(__GNUC__)
|
101
96
|
#define SI static inline __attribute__((always_inline))
|
102
97
|
#else
|
@@ -106,12 +101,12 @@ using U8 = V<uint8_t>;
|
|
106
101
|
template <typename T, typename P>
|
107
102
|
SI T load(const P* ptr) {
|
108
103
|
T val;
|
109
|
-
|
104
|
+
memcpy(&val, ptr, sizeof(val));
|
110
105
|
return val;
|
111
106
|
}
|
112
107
|
template <typename T, typename P>
|
113
108
|
SI void store(P* ptr, const T& val) {
|
114
|
-
|
109
|
+
memcpy(ptr, &val, sizeof(val));
|
115
110
|
}
|
116
111
|
|
117
112
|
// (T)v is a cast when N == 1 and a bit-pun when N>1,
|
@@ -142,7 +137,6 @@ SI D bit_pun(const S& v) {
|
|
142
137
|
// To serve both those ends, we use this function to_fixed() instead of direct cast().
|
143
138
|
SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
|
144
139
|
|
145
|
-
|
146
140
|
// Sometimes we do something crazy on one branch of a conditional,
|
147
141
|
// like divide by zero or convert a huge float to an integer,
|
148
142
|
// but then harmlessly select the other side. That trips up N==1
|
@@ -159,7 +153,22 @@ SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
|
|
159
153
|
}
|
160
154
|
#endif
|
161
155
|
|
156
|
+
#if defined(USING_NEON)
|
157
|
+
SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
|
158
|
+
SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
|
159
|
+
|
160
|
+
SI I32 min_(I32 x, I32 y) { return (I32)vminq_s32((int32x4_t)x, (int32x4_t)y); }
|
161
|
+
SI I32 max_(I32 x, I32 y) { return (I32)vmaxq_s32((int32x4_t)x, (int32x4_t)y); }
|
162
|
+
#else
|
163
|
+
SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
|
164
|
+
SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
|
165
|
+
|
166
|
+
SI I32 min_(I32 x, I32 y) { return if_then_else(x > y, y, x); }
|
167
|
+
SI I32 max_(I32 x, I32 y) { return if_then_else(x < y, y, x); }
|
168
|
+
#endif
|
162
169
|
|
170
|
+
// KEEP IN SYNC with skvx::from_half to ensure that f16 colors are computed consistently in both
|
171
|
+
// skcms and skvx.
|
163
172
|
SI F F_from_Half(U16 half) {
|
164
173
|
#if defined(USING_NEON_F16C)
|
165
174
|
return vcvt_f32_f16((float16x4_t)half);
|
@@ -169,24 +178,27 @@ SI F F_from_Half(U16 half) {
|
|
169
178
|
typedef int16_t __attribute__((vector_size(16))) I16;
|
170
179
|
return __builtin_ia32_vcvtph2ps256((I16)half);
|
171
180
|
#else
|
172
|
-
|
181
|
+
I32 wide = cast<I32>(half);
|
173
182
|
// A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
183
|
+
// To match intrinsic behavior, this preserves denormal values, infinities, and NaNs, which
|
184
|
+
// helps improve consistency between architectures.
|
185
|
+
I32 s = wide & 0x8000,
|
186
|
+
em = wide ^ s,
|
187
|
+
inf_or_nan = (em >= (31 << 10)) & (255 << 23), // Expands exponent to fill 8 bits
|
188
|
+
is_norm = em > 0x3ff,
|
189
|
+
// denormalized f16's are 2^-14*0.[m0:9] == 2^-24*[m0:9].0
|
190
|
+
sub = bit_pun<I32>(cast<F>(em) * (1.f/(1<<24))),
|
191
|
+
norm = ((em<<13) + ((127-15)<<23)), // Shifts mantissa, shifts + re-biases exponent
|
192
|
+
finite = if_then_else(is_norm, norm, sub);
|
193
|
+
// If 'x' is f16 +/- infinity, inf_or_nan will be the filled 8-bit exponent but 'norm' will be
|
194
|
+
// all 0s since 'x's mantissa is 0. Thus norm | inf_or_nan becomes f32 infinity. However, if
|
195
|
+
// 'x' is an f16 NaN, some bits of 'norm' will be non-zero, so it stays an f32 NaN after the OR.
|
196
|
+
return bit_pun<F>((s<<16) | finite | inf_or_nan);
|
182
197
|
#endif
|
183
198
|
}
|
184
199
|
|
185
|
-
|
186
|
-
|
187
|
-
// we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
|
188
|
-
__attribute__((no_sanitize("unsigned-integer-overflow")))
|
189
|
-
#endif
|
200
|
+
// KEEP IN SYNC with skvx::to_half to ensure that f16 colors are computed consistently in both
|
201
|
+
// skcms and skvx.
|
190
202
|
SI U16 Half_from_F(F f) {
|
191
203
|
#if defined(USING_NEON_F16C)
|
192
204
|
return (U16)vcvt_f16_f32(f);
|
@@ -196,13 +208,23 @@ SI U16 Half_from_F(F f) {
|
|
196
208
|
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
|
197
209
|
#else
|
198
210
|
// A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
211
|
+
// To match intrinsic behavior, this implements round-to-nearest-even, converting floats to
|
212
|
+
// denormal f16 values, overflowing to infinity and preserving infinity. However, it does not
|
213
|
+
// handle NaN float values (they become infinity).
|
214
|
+
I32 sem = bit_pun<I32>(f),
|
215
|
+
s = sem & 0x8000'0000,
|
216
|
+
em = min_(sem ^ s, F16InfBits), // |x| clamped to f16 infinity
|
217
|
+
// F(em)*8192 increases the exponent by 13, which when added back to em will shift the
|
218
|
+
// mantissa bits 13 to the right. We clamp to 1/2 for subnormal values, which
|
219
|
+
// automatically shifts the mantissa to match 2^-14 expected for a subnorm f16.
|
220
|
+
magic = bit_pun<I32>(max_(bit_pun<F>(em) * 8192.f, FHalf)) & (255 << 23),
|
221
|
+
// Shift mantissa with automatic round-to-even
|
222
|
+
rounded = bit_pun<I32>((bit_pun<F>(em) + bit_pun<F>(magic))),
|
223
|
+
// Subtract 127 for f32 bias, subtract 13 to undo the *8192, subtract 1 to remove
|
224
|
+
// the implicit leading 1., and add 15 to get the f16 biased exponent.
|
225
|
+
exp = ((magic >> 13) - ((127-15+13+1)<<10)), // shift and re-bias exponent
|
226
|
+
f16 = rounded + exp; // use + if 'rounded' rolled over into first exponent bit
|
227
|
+
return cast<U16>((s>>16) | f16);
|
206
228
|
#endif
|
207
229
|
}
|
208
230
|
|
@@ -218,14 +240,6 @@ SI U64 swap_endian_16x4(const U64& rgba) {
|
|
218
240
|
| (rgba & 0xff00ff00ff00ff00) >> 8;
|
219
241
|
}
|
220
242
|
|
221
|
-
#if defined(USING_NEON)
|
222
|
-
SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
|
223
|
-
SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
|
224
|
-
#else
|
225
|
-
SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
|
226
|
-
SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
|
227
|
-
#endif
|
228
|
-
|
229
243
|
SI F floor_(F x) {
|
230
244
|
#if N == 1
|
231
245
|
return floorf_(x);
|
@@ -292,19 +306,35 @@ SI F approx_exp(F x) {
|
|
292
306
|
return approx_exp2(log2_e * x);
|
293
307
|
}
|
294
308
|
|
309
|
+
SI F strip_sign(F x, U32* sign) {
|
310
|
+
U32 bits = bit_pun<U32>(x);
|
311
|
+
*sign = bits & 0x80000000;
|
312
|
+
return bit_pun<F>(bits ^ *sign);
|
313
|
+
}
|
314
|
+
|
315
|
+
SI F apply_sign(F x, U32 sign) {
|
316
|
+
return bit_pun<F>(sign | bit_pun<U32>(x));
|
317
|
+
}
|
318
|
+
|
295
319
|
// Return tf(x).
|
296
320
|
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
|
297
321
|
// Peel off the sign bit and set x = |x|.
|
298
|
-
U32
|
299
|
-
|
300
|
-
x = bit_pun<F>(bits ^ sign);
|
322
|
+
U32 sign;
|
323
|
+
x = strip_sign(x, &sign);
|
301
324
|
|
302
325
|
// The transfer function has a linear part up to d, exponential at d and after.
|
303
326
|
F v = if_then_else(x < tf->d, tf->c*x + tf->f
|
304
327
|
, approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
|
305
328
|
|
306
329
|
// Tack the sign bit back on.
|
307
|
-
return
|
330
|
+
return apply_sign(v, sign);
|
331
|
+
}
|
332
|
+
|
333
|
+
// Return the gamma function (|x|^G with the original sign re-applied to x).
|
334
|
+
SI F apply_gamma(const skcms_TransferFunction* tf, F x) {
|
335
|
+
U32 sign;
|
336
|
+
x = strip_sign(x, &sign);
|
337
|
+
return apply_sign(approx_pow(x, tf->g), sign);
|
308
338
|
}
|
309
339
|
|
310
340
|
SI F apply_pq(const skcms_TransferFunction* tf, F x) {
|
@@ -717,12 +747,12 @@ static void clut(uint32_t input_channels, uint32_t output_channels,
|
|
717
747
|
switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
|
718
748
|
case 3: ix += index [3 + (combo&8)/2];
|
719
749
|
w *= weight[3 + (combo&8)/2];
|
720
|
-
|
750
|
+
SKCMS_FALLTHROUGH;
|
721
751
|
// fall through
|
722
752
|
|
723
753
|
case 2: ix += index [2 + (combo&4)*1];
|
724
754
|
w *= weight[2 + (combo&4)*1];
|
725
|
-
|
755
|
+
SKCMS_FALLTHROUGH;
|
726
756
|
// fall through
|
727
757
|
|
728
758
|
case 1: ix += index [1 + (combo&2)*2];
|
@@ -755,643 +785,763 @@ static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
|
|
755
785
|
r,g,b,a);
|
756
786
|
}
|
757
787
|
|
758
|
-
|
759
|
-
|
760
|
-
|
761
|
-
|
762
|
-
|
763
|
-
|
764
|
-
|
765
|
-
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
}
|
812
|
-
|
813
|
-
|
814
|
-
|
815
|
-
|
816
|
-
|
817
|
-
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
823
|
-
|
824
|
-
|
825
|
-
|
826
|
-
|
827
|
-
|
828
|
-
|
829
|
-
|
830
|
-
|
831
|
-
|
832
|
-
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|
849
|
-
|
850
|
-
|
851
|
-
|
852
|
-
|
853
|
-
|
854
|
-
|
855
|
-
|
856
|
-
|
857
|
-
|
858
|
-
|
859
|
-
|
860
|
-
|
861
|
-
|
862
|
-
|
863
|
-
|
864
|
-
|
865
|
-
|
866
|
-
|
867
|
-
|
868
|
-
|
869
|
-
|
870
|
-
|
871
|
-
|
872
|
-
|
873
|
-
|
874
|
-
|
875
|
-
|
876
|
-
|
877
|
-
|
878
|
-
|
879
|
-
|
880
|
-
|
881
|
-
|
882
|
-
|
883
|
-
|
884
|
-
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
|
916
|
-
|
917
|
-
|
918
|
-
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
|
923
|
-
|
924
|
-
|
925
|
-
|
926
|
-
|
927
|
-
|
928
|
-
|
929
|
-
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
|
938
|
-
|
939
|
-
|
940
|
-
|
941
|
-
|
942
|
-
|
943
|
-
|
944
|
-
|
945
|
-
|
946
|
-
|
947
|
-
|
948
|
-
|
949
|
-
|
950
|
-
|
951
|
-
|
952
|
-
|
953
|
-
|
954
|
-
|
955
|
-
|
956
|
-
|
957
|
-
|
958
|
-
|
959
|
-
|
960
|
-
|
961
|
-
|
962
|
-
|
963
|
-
|
964
|
-
|
965
|
-
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
-
|
970
|
-
|
971
|
-
|
972
|
-
|
973
|
-
|
974
|
-
|
975
|
-
|
976
|
-
|
977
|
-
|
978
|
-
|
979
|
-
|
980
|
-
|
981
|
-
|
982
|
-
|
983
|
-
|
984
|
-
|
985
|
-
|
986
|
-
|
987
|
-
|
988
|
-
|
989
|
-
|
990
|
-
|
991
|
-
|
992
|
-
|
993
|
-
|
994
|
-
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
|
1001
|
-
|
1002
|
-
|
1003
|
-
|
1004
|
-
|
1005
|
-
|
1006
|
-
|
1007
|
-
|
1008
|
-
|
1009
|
-
|
1010
|
-
|
1011
|
-
|
1012
|
-
|
1013
|
-
|
1014
|
-
|
1015
|
-
|
1016
|
-
|
1017
|
-
|
1018
|
-
|
1019
|
-
|
1020
|
-
|
1021
|
-
|
1022
|
-
|
1023
|
-
|
1024
|
-
|
1025
|
-
|
1026
|
-
|
1027
|
-
|
1028
|
-
|
1029
|
-
|
1030
|
-
|
1031
|
-
|
1032
|
-
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1062
|
-
|
1063
|
-
|
1064
|
-
|
1065
|
-
|
1066
|
-
|
1067
|
-
|
1068
|
-
|
1069
|
-
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1073
|
-
|
1074
|
-
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1078
|
-
|
1079
|
-
|
1080
|
-
|
1081
|
-
|
1082
|
-
|
1083
|
-
|
1084
|
-
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
|
1101
|
-
|
1102
|
-
|
1103
|
-
|
1104
|
-
|
1105
|
-
|
1106
|
-
|
1107
|
-
|
1108
|
-
|
1109
|
-
|
1110
|
-
|
1111
|
-
|
1112
|
-
|
1113
|
-
|
1114
|
-
|
1115
|
-
|
1116
|
-
|
1117
|
-
|
1118
|
-
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1131
|
-
|
1132
|
-
|
1133
|
-
|
1134
|
-
|
1135
|
-
|
1136
|
-
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1140
|
-
|
1141
|
-
|
1142
|
-
|
1143
|
-
|
1144
|
-
|
1145
|
-
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1153
|
-
|
1154
|
-
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1175
|
-
|
1176
|
-
|
1177
|
-
|
1178
|
-
|
1179
|
-
|
1180
|
-
|
1181
|
-
|
1182
|
-
|
1183
|
-
|
1184
|
-
|
1185
|
-
|
1186
|
-
|
1187
|
-
|
1188
|
-
|
1189
|
-
|
1190
|
-
|
1191
|
-
|
1192
|
-
|
1193
|
-
|
1194
|
-
|
1195
|
-
|
1196
|
-
|
1197
|
-
|
1198
|
-
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
788
|
+
struct NoCtx {};  // Empty tag type: used as the context parameter of stages that need no context data.
|
789
|
+
|
790
|
+
struct Ctx {  // Wraps the untyped per-stage context pointer so it can convert to whatever type a stage declares.
|
791
|
+
const void* fArg;  // Raw context pointer supplied by the stage wrapper.
|
792
|
+
operator NoCtx() { return NoCtx{}; }  // Context-free stages get an empty tag; fArg is ignored.
|
793
|
+
template <typename T> operator T*() { return (const T*)fArg; }  // Reinterprets fArg as a pointer to the requested type.
|
794
|
+
};
|
795
|
+
|
796
|
+
#define STAGE_PARAMS(MAYBE_REF) SKCMS_MAYBE_UNUSED const char* src, \
|
797
|
+
SKCMS_MAYBE_UNUSED char* dst, \
|
798
|
+
SKCMS_MAYBE_UNUSED F MAYBE_REF r, \
|
799
|
+
SKCMS_MAYBE_UNUSED F MAYBE_REF g, \
|
800
|
+
SKCMS_MAYBE_UNUSED F MAYBE_REF b, \
|
801
|
+
SKCMS_MAYBE_UNUSED F MAYBE_REF a, \
|
802
|
+
SKCMS_MAYBE_UNUSED int i
|
803
|
+
|
804
|
+
#if SKCMS_HAS_MUSTTAIL
|
805
|
+
|
806
|
+
// Stages take a stage list, and each stage is responsible for tail-calling the next one.
|
807
|
+
//
|
808
|
+
// Unfortunately, we can't declare a StageFn as a function pointer which takes a pointer to
|
809
|
+
// another StageFn; declaring this leads to a circular dependency. To avoid this, StageFn is
|
810
|
+
// wrapped in a single-element `struct StageList` which we are able to forward-declare.
|
811
|
+
struct StageList;  // Forward declaration so StageFn's signature can name it.
|
812
|
+
using StageFn = void (*)(StageList stages, const void** ctx, STAGE_PARAMS());  // Channels are passed by value here (STAGE_PARAMS with no '&').
|
813
|
+
struct StageList {
|
814
|
+
const StageFn* fn;  // Current position in the array of stage functions; stage wrappers advance it before calling on.
|
815
|
+
};
|
816
|
+
|
817
|
+
#define DECLARE_STAGE(name, arg, CALL_NEXT) \
|
818
|
+
SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
|
819
|
+
\
|
820
|
+
SI void Exec_##name(StageList list, const void** ctx, STAGE_PARAMS()) { \
|
821
|
+
Exec_##name##_k(Ctx{*ctx}, src, dst, r, g, b, a, i); \
|
822
|
+
++list.fn; ++ctx; \
|
823
|
+
CALL_NEXT; \
|
824
|
+
} \
|
825
|
+
\
|
826
|
+
SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
|
827
|
+
|
828
|
+
#define STAGE(name, arg) \
|
829
|
+
DECLARE_STAGE(name, arg, [[clang::musttail]] return (*list.fn)(list, ctx, src, dst, \
|
830
|
+
r, g, b, a, i))
|
831
|
+
|
832
|
+
#define FINAL_STAGE(name, arg) \
|
833
|
+
DECLARE_STAGE(name, arg, /* Stop executing stages and return to the caller. */)
|
834
|
+
|
835
|
+
#else
|
836
|
+
|
837
|
+
#define DECLARE_STAGE(name, arg) \
|
838
|
+
SI void Exec_##name##_k(arg, STAGE_PARAMS(&)); \
|
839
|
+
\
|
840
|
+
SI void Exec_##name(const void* ctx, STAGE_PARAMS(&)) { \
|
841
|
+
Exec_##name##_k(Ctx{ctx}, src, dst, r, g, b, a, i); \
|
842
|
+
} \
|
843
|
+
\
|
844
|
+
SI void Exec_##name##_k(arg, STAGE_PARAMS(&))
|
845
|
+
|
846
|
+
#define STAGE(name, arg) DECLARE_STAGE(name, arg)  // Fallback path: no tail call; exec_stages' switch drives the sequence.
|
847
|
+
#define FINAL_STAGE(name, arg) DECLARE_STAGE(name, arg)  // Declared identically to STAGE here; the switch returns after store ops.
|
848
|
+
|
849
|
+
#endif
|
850
|
+
|
851
|
+
STAGE(load_a8, NoCtx) {  // Loads only alpha; r, g and b are left as they were.
|
852
|
+
a = F_from_U8(load<U8>(src + 1*i));  // Source elements are 1 byte wide, so element i starts at byte offset 1*i.
|
853
|
+
}
|
854
|
+
|
855
|
+
STAGE(load_g8, NoCtx) {  // Loads one gray byte per element and replicates it into r, g and b; a is untouched.
|
856
|
+
r = g = b = F_from_U8(load<U8>(src + 1*i));  // Source elements are 1 byte wide, so element i starts at byte offset 1*i.
|
857
|
+
}
|
858
|
+
|
859
|
+
STAGE(load_4444, NoCtx) {  // Unpacks four 4-bit channels from each 16-bit element.
|
860
|
+
U16 abgr = load<U16>(src + 2*i);  // Named for the nibble order from low bits to high: a, b, g, r.
|
861
|
+
|
862
|
+
r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);  // Top nibble; 1/15 maps 0..15 onto 0..1.
|
863
|
+
g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
|
864
|
+
b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
|
865
|
+
a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);  // Bottom nibble.
|
866
|
+
}
|
867
|
+
|
868
|
+
STAGE(load_565, NoCtx) {  // Unpacks 5-6-5 bit fields into r, g, b; a is untouched.
|
869
|
+
U16 rgb = load<U16>(src + 2*i);
|
870
|
+
|
871
|
+
r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));  // Each field is masked in place (not shifted down),
|
872
|
+
g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));  // so the scale divides by the field's in-place maximum
|
873
|
+
b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));  // to map it onto 0..1.
|
874
|
+
}
|
875
|
+
|
876
|
+
STAGE(load_888, NoCtx) {
|
877
|
+
const uint8_t* rgb = (const uint8_t*)(src + 3*i);
|
878
|
+
#if defined(USING_NEON)
|
879
|
+
// There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
|
880
|
+
// a time. Since we're doing that, we might as well load them into 16-bit lanes.
|
881
|
+
// (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
|
882
|
+
uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
|
883
|
+
v = vld3_lane_u8(rgb+0, v, 0);
|
884
|
+
v = vld3_lane_u8(rgb+3, v, 2);
|
885
|
+
v = vld3_lane_u8(rgb+6, v, 4);
|
886
|
+
v = vld3_lane_u8(rgb+9, v, 6);
|
887
|
+
|
888
|
+
// Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
|
889
|
+
// convert to F. (Again, U32 would be even better here if we drop ARMv7 or split
|
890
|
+
// ARMv7 and ARMv8 impls.)
|
891
|
+
r = cast<F>((U16)v.val[0]) * (1/255.0f);
|
892
|
+
g = cast<F>((U16)v.val[1]) * (1/255.0f);
|
893
|
+
b = cast<F>((U16)v.val[2]) * (1/255.0f);
|
894
|
+
#else
|
895
|
+
r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
|
896
|
+
g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
|
897
|
+
b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
|
898
|
+
#endif
|
899
|
+
}
|
900
|
+
|
901
|
+
STAGE(load_8888, NoCtx) {
|
902
|
+
U32 rgba = load<U32>(src + 4*i);
|
903
|
+
|
904
|
+
r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
|
905
|
+
g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
|
906
|
+
b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
|
907
|
+
a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
|
908
|
+
}
|
909
|
+
|
910
|
+
STAGE(load_1010102, NoCtx) {
|
911
|
+
U32 rgba = load<U32>(src + 4*i);
|
912
|
+
|
913
|
+
r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
|
914
|
+
g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
|
915
|
+
b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
|
916
|
+
a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
|
917
|
+
}
|
918
|
+
|
919
|
+
STAGE(load_101010x_XR, NoCtx) {  // Unpacks three 10-bit fields and maps them linearly onto [min, max]; a is untouched.
|
920
|
+
static constexpr float min = -0.752941f;  // Value represented by code 0.
|
921
|
+
static constexpr float max = 1.25098f;  // Value represented by code 1023.
|
922
|
+
static constexpr float range = max - min;
|
923
|
+
U32 rgba = load<U32>(src + 4*i);
|
924
|
+
r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f) * range + min;  // Normalize to 0..1, then stretch and offset into [min, max].
|
925
|
+
g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
|
926
|
+
b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;  // The top 2 bits of rgba are not read.
|
927
|
+
}
|
928
|
+
|
929
|
+
STAGE(load_161616LE, NoCtx) {
|
930
|
+
uintptr_t ptr = (uintptr_t)(src + 6*i);
|
931
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
932
|
+
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
933
|
+
#if defined(USING_NEON)
|
934
|
+
uint16x4x3_t v = vld3_u16(rgb);
|
935
|
+
r = cast<F>((U16)v.val[0]) * (1/65535.0f);
|
936
|
+
g = cast<F>((U16)v.val[1]) * (1/65535.0f);
|
937
|
+
b = cast<F>((U16)v.val[2]) * (1/65535.0f);
|
938
|
+
#else
|
939
|
+
r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
|
940
|
+
g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
|
941
|
+
b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
|
942
|
+
#endif
|
943
|
+
}
|
944
|
+
|
945
|
+
STAGE(load_16161616LE, NoCtx) {
|
946
|
+
uintptr_t ptr = (uintptr_t)(src + 8*i);
|
947
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
948
|
+
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
949
|
+
#if defined(USING_NEON)
|
950
|
+
uint16x4x4_t v = vld4_u16(rgba);
|
951
|
+
r = cast<F>((U16)v.val[0]) * (1/65535.0f);
|
952
|
+
g = cast<F>((U16)v.val[1]) * (1/65535.0f);
|
953
|
+
b = cast<F>((U16)v.val[2]) * (1/65535.0f);
|
954
|
+
a = cast<F>((U16)v.val[3]) * (1/65535.0f);
|
955
|
+
#else
|
956
|
+
U64 px = load<U64>(rgba);
|
957
|
+
|
958
|
+
r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
|
959
|
+
g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
|
960
|
+
b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
|
961
|
+
a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
|
962
|
+
#endif
|
963
|
+
}
|
964
|
+
|
965
|
+
STAGE(load_161616BE, NoCtx) {
|
966
|
+
uintptr_t ptr = (uintptr_t)(src + 6*i);
|
967
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
968
|
+
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
969
|
+
#if defined(USING_NEON)
|
970
|
+
uint16x4x3_t v = vld3_u16(rgb);
|
971
|
+
r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
|
972
|
+
g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
|
973
|
+
b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
|
974
|
+
#else
|
975
|
+
U32 R = load_3<U32>(rgb+0),
|
976
|
+
G = load_3<U32>(rgb+1),
|
977
|
+
B = load_3<U32>(rgb+2);
|
978
|
+
// R,G,B are big-endian 16-bit, so byte swap them before converting to float.
|
979
|
+
r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
|
980
|
+
g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
|
981
|
+
b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
|
982
|
+
#endif
|
983
|
+
}
|
984
|
+
|
985
|
+
STAGE(load_16161616BE, NoCtx) {
|
986
|
+
uintptr_t ptr = (uintptr_t)(src + 8*i);
|
987
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
988
|
+
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
989
|
+
#if defined(USING_NEON)
|
990
|
+
uint16x4x4_t v = vld4_u16(rgba);
|
991
|
+
r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
|
992
|
+
g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
|
993
|
+
b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
|
994
|
+
a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
|
995
|
+
#else
|
996
|
+
U64 px = swap_endian_16x4(load<U64>(rgba));
|
997
|
+
|
998
|
+
r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
|
999
|
+
g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
|
1000
|
+
b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
|
1001
|
+
a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
|
1002
|
+
#endif
|
1003
|
+
}
|
1004
|
+
|
1005
|
+
STAGE(load_hhh, NoCtx) {
|
1006
|
+
uintptr_t ptr = (uintptr_t)(src + 6*i);
|
1007
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
1008
|
+
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
1009
|
+
#if defined(USING_NEON)
|
1010
|
+
uint16x4x3_t v = vld3_u16(rgb);
|
1011
|
+
U16 R = (U16)v.val[0],
|
1012
|
+
G = (U16)v.val[1],
|
1013
|
+
B = (U16)v.val[2];
|
1014
|
+
#else
|
1015
|
+
U16 R = load_3<U16>(rgb+0),
|
1016
|
+
G = load_3<U16>(rgb+1),
|
1017
|
+
B = load_3<U16>(rgb+2);
|
1018
|
+
#endif
|
1019
|
+
r = F_from_Half(R);
|
1020
|
+
g = F_from_Half(G);
|
1021
|
+
b = F_from_Half(B);
|
1022
|
+
}
|
1023
|
+
|
1024
|
+
STAGE(load_hhhh, NoCtx) {
|
1025
|
+
uintptr_t ptr = (uintptr_t)(src + 8*i);
|
1026
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
1027
|
+
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
1028
|
+
#if defined(USING_NEON)
|
1029
|
+
uint16x4x4_t v = vld4_u16(rgba);
|
1030
|
+
U16 R = (U16)v.val[0],
|
1031
|
+
G = (U16)v.val[1],
|
1032
|
+
B = (U16)v.val[2],
|
1033
|
+
A = (U16)v.val[3];
|
1034
|
+
#else
|
1035
|
+
U64 px = load<U64>(rgba);
|
1036
|
+
U16 R = cast<U16>((px >> 0) & 0xffff),
|
1037
|
+
G = cast<U16>((px >> 16) & 0xffff),
|
1038
|
+
B = cast<U16>((px >> 32) & 0xffff),
|
1039
|
+
A = cast<U16>((px >> 48) & 0xffff);
|
1040
|
+
#endif
|
1041
|
+
r = F_from_Half(R);
|
1042
|
+
g = F_from_Half(G);
|
1043
|
+
b = F_from_Half(B);
|
1044
|
+
a = F_from_Half(A);
|
1045
|
+
}
|
1046
|
+
|
1047
|
+
STAGE(load_fff, NoCtx) {
|
1048
|
+
uintptr_t ptr = (uintptr_t)(src + 12*i);
|
1049
|
+
assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
|
1050
|
+
const float* rgb = (const float*)ptr; // cast to const float* to be safe.
|
1051
|
+
#if defined(USING_NEON)
|
1052
|
+
float32x4x3_t v = vld3q_f32(rgb);
|
1053
|
+
r = (F)v.val[0];
|
1054
|
+
g = (F)v.val[1];
|
1055
|
+
b = (F)v.val[2];
|
1056
|
+
#else
|
1057
|
+
r = load_3<F>(rgb+0);
|
1058
|
+
g = load_3<F>(rgb+1);
|
1059
|
+
b = load_3<F>(rgb+2);
|
1060
|
+
#endif
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
STAGE(load_ffff, NoCtx) {
|
1064
|
+
uintptr_t ptr = (uintptr_t)(src + 16*i);
|
1065
|
+
assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
|
1066
|
+
const float* rgba = (const float*)ptr; // cast to const float* to be safe.
|
1067
|
+
#if defined(USING_NEON)
|
1068
|
+
float32x4x4_t v = vld4q_f32(rgba);
|
1069
|
+
r = (F)v.val[0];
|
1070
|
+
g = (F)v.val[1];
|
1071
|
+
b = (F)v.val[2];
|
1072
|
+
a = (F)v.val[3];
|
1073
|
+
#else
|
1074
|
+
r = load_4<F>(rgba+0);
|
1075
|
+
g = load_4<F>(rgba+1);
|
1076
|
+
b = load_4<F>(rgba+2);
|
1077
|
+
a = load_4<F>(rgba+3);
|
1078
|
+
#endif
|
1079
|
+
}
|
1080
|
+
|
1081
|
+
STAGE(swap_rb, NoCtx) {  // Exchanges the r and b channels; g and a are untouched.
|
1082
|
+
F t = r;  // Temporary holding the old r.
|
1083
|
+
r = b;
|
1084
|
+
b = t;
|
1085
|
+
}
|
1086
|
+
|
1087
|
+
STAGE(clamp, NoCtx) {  // Limits all four channels to the range [F0, F1].
|
1088
|
+
r = max_(F0, min_(r, F1));  // Upper bound applied first, then lower bound.
|
1089
|
+
g = max_(F0, min_(g, F1));
|
1090
|
+
b = max_(F0, min_(b, F1));
|
1091
|
+
a = max_(F0, min_(a, F1));
|
1092
|
+
}
|
1093
|
+
|
1094
|
+
STAGE(invert, NoCtx) {  // Replaces every channel, alpha included, with F1 minus its value.
|
1095
|
+
r = F1 - r;
|
1096
|
+
g = F1 - g;
|
1097
|
+
b = F1 - b;
|
1098
|
+
a = F1 - a;
|
1099
|
+
}
|
1100
|
+
|
1101
|
+
STAGE(force_opaque, NoCtx) {  // Overwrites alpha with F1 regardless of its previous value; color channels are untouched.
|
1102
|
+
a = F1;
|
1103
|
+
}
|
1104
|
+
|
1105
|
+
STAGE(premul, NoCtx) {  // Multiplies the color channels by alpha; alpha itself is unchanged.
|
1106
|
+
r *= a;
|
1107
|
+
g *= a;
|
1108
|
+
b *= a;
|
1109
|
+
}
|
1110
|
+
|
1111
|
+
STAGE(unpremul, NoCtx) {  // Divides the color channels by alpha; alpha itself is unchanged.
|
1112
|
+
F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);  // Scale is 1/a unless that is not below INFINITY_ (e.g. a == 0), then 0 — assuming if_then_else(c, t, e) selects t where c holds.
|
1113
|
+
r *= scale;
|
1114
|
+
g *= scale;
|
1115
|
+
b *= scale;
|
1116
|
+
}
|
1117
|
+
|
1118
|
+
STAGE(matrix_3x3, const skcms_Matrix3x3* matrix) {  // Multiplies the (r, g, b) vector by a 3x3 matrix; a is untouched.
|
1119
|
+
const float* m = &matrix->vals[0][0];  // View the 2-D vals array as nine consecutive floats.
|
1120
|
+
|
1121
|
+
F R = m[0]*r + m[1]*g + m[2]*b,  // Each group of three consecutive entries produces one output channel.
|
1122
|
+
G = m[3]*r + m[4]*g + m[5]*b,
|
1123
|
+
B = m[6]*r + m[7]*g + m[8]*b;
|
1124
|
+
|
1125
|
+
r = R;  // Assigned only after all three sums, so each sum reads the original r, g, b.
|
1126
|
+
g = G;
|
1127
|
+
b = B;
|
1128
|
+
}
|
1129
|
+
|
1130
|
+
STAGE(matrix_3x4, const skcms_Matrix3x4* matrix) {
|
1131
|
+
const float* m = &matrix->vals[0][0];
|
1132
|
+
|
1133
|
+
F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
|
1134
|
+
G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
|
1135
|
+
B = m[8]*r + m[9]*g + m[10]*b + m[11];
|
1136
|
+
|
1137
|
+
r = R;
|
1138
|
+
g = G;
|
1139
|
+
b = B;
|
1140
|
+
}
|
1141
|
+
|
1142
|
+
STAGE(lab_to_xyz, NoCtx) {
|
1143
|
+
// The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
|
1144
|
+
F L = r * 100.0f,
|
1145
|
+
A = g * 255.0f - 128.0f,
|
1146
|
+
B = b * 255.0f - 128.0f;
|
1147
|
+
|
1148
|
+
// Convert to CIE XYZ.
|
1149
|
+
F Y = (L + 16.0f) * (1/116.0f),
|
1150
|
+
X = Y + A*(1/500.0f),
|
1151
|
+
Z = Y - B*(1/200.0f);
|
1152
|
+
|
1153
|
+
X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
|
1154
|
+
Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
|
1155
|
+
Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
|
1156
|
+
|
1157
|
+
// Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
|
1158
|
+
r = X * 0.9642f;
|
1159
|
+
g = Y ;
|
1160
|
+
b = Z * 0.8249f;
|
1161
|
+
}
|
1162
|
+
|
1163
|
+
// As above, in reverse.
|
1164
|
+
STAGE(xyz_to_lab, NoCtx) {
|
1165
|
+
F X = r * (1/0.9642f),
|
1166
|
+
Y = g,
|
1167
|
+
Z = b * (1/0.8249f);
|
1168
|
+
|
1169
|
+
X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
|
1170
|
+
Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
|
1171
|
+
Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
|
1172
|
+
|
1173
|
+
F L = Y*116.0f - 16.0f,
|
1174
|
+
A = (X-Y)*500.0f,
|
1175
|
+
B = (Y-Z)*200.0f;
|
1176
|
+
|
1177
|
+
r = L * (1/100.f);
|
1178
|
+
g = (A + 128.0f) * (1/255.0f);
|
1179
|
+
b = (B + 128.0f) * (1/255.0f);
|
1180
|
+
}
|
1181
|
+
|
1182
|
+
STAGE(gamma_r, const skcms_TransferFunction* tf) { r = apply_gamma(tf, r); }
|
1183
|
+
STAGE(gamma_g, const skcms_TransferFunction* tf) { g = apply_gamma(tf, g); }
|
1184
|
+
STAGE(gamma_b, const skcms_TransferFunction* tf) { b = apply_gamma(tf, b); }
|
1185
|
+
STAGE(gamma_a, const skcms_TransferFunction* tf) { a = apply_gamma(tf, a); }
|
1186
|
+
|
1187
|
+
STAGE(gamma_rgb, const skcms_TransferFunction* tf) {
|
1188
|
+
r = apply_gamma(tf, r);
|
1189
|
+
g = apply_gamma(tf, g);
|
1190
|
+
b = apply_gamma(tf, b);
|
1191
|
+
}
|
1192
|
+
|
1193
|
+
STAGE(tf_r, const skcms_TransferFunction* tf) { r = apply_tf(tf, r); }
|
1194
|
+
STAGE(tf_g, const skcms_TransferFunction* tf) { g = apply_tf(tf, g); }
|
1195
|
+
STAGE(tf_b, const skcms_TransferFunction* tf) { b = apply_tf(tf, b); }
|
1196
|
+
STAGE(tf_a, const skcms_TransferFunction* tf) { a = apply_tf(tf, a); }
|
1197
|
+
|
1198
|
+
STAGE(tf_rgb, const skcms_TransferFunction* tf) {
|
1199
|
+
r = apply_tf(tf, r);
|
1200
|
+
g = apply_tf(tf, g);
|
1201
|
+
b = apply_tf(tf, b);
|
1202
|
+
}
|
1203
|
+
|
1204
|
+
STAGE(pq_r, const skcms_TransferFunction* tf) { r = apply_pq(tf, r); }
|
1205
|
+
STAGE(pq_g, const skcms_TransferFunction* tf) { g = apply_pq(tf, g); }
|
1206
|
+
STAGE(pq_b, const skcms_TransferFunction* tf) { b = apply_pq(tf, b); }
|
1207
|
+
STAGE(pq_a, const skcms_TransferFunction* tf) { a = apply_pq(tf, a); }
|
1208
|
+
|
1209
|
+
STAGE(pq_rgb, const skcms_TransferFunction* tf) {
|
1210
|
+
r = apply_pq(tf, r);
|
1211
|
+
g = apply_pq(tf, g);
|
1212
|
+
b = apply_pq(tf, b);
|
1213
|
+
}
|
1214
|
+
|
1215
|
+
STAGE(hlg_r, const skcms_TransferFunction* tf) { r = apply_hlg(tf, r); }
|
1216
|
+
STAGE(hlg_g, const skcms_TransferFunction* tf) { g = apply_hlg(tf, g); }
|
1217
|
+
STAGE(hlg_b, const skcms_TransferFunction* tf) { b = apply_hlg(tf, b); }
|
1218
|
+
STAGE(hlg_a, const skcms_TransferFunction* tf) { a = apply_hlg(tf, a); }
|
1219
|
+
|
1220
|
+
STAGE(hlg_rgb, const skcms_TransferFunction* tf) {
|
1221
|
+
r = apply_hlg(tf, r);
|
1222
|
+
g = apply_hlg(tf, g);
|
1223
|
+
b = apply_hlg(tf, b);
|
1224
|
+
}
|
1225
|
+
|
1226
|
+
STAGE(hlginv_r, const skcms_TransferFunction* tf) { r = apply_hlginv(tf, r); }
|
1227
|
+
STAGE(hlginv_g, const skcms_TransferFunction* tf) { g = apply_hlginv(tf, g); }
|
1228
|
+
STAGE(hlginv_b, const skcms_TransferFunction* tf) { b = apply_hlginv(tf, b); }
|
1229
|
+
STAGE(hlginv_a, const skcms_TransferFunction* tf) { a = apply_hlginv(tf, a); }
|
1230
|
+
|
1231
|
+
STAGE(hlginv_rgb, const skcms_TransferFunction* tf) {
|
1232
|
+
r = apply_hlginv(tf, r);
|
1233
|
+
g = apply_hlginv(tf, g);
|
1234
|
+
b = apply_hlginv(tf, b);
|
1235
|
+
}
|
1236
|
+
|
1237
|
+
STAGE(table_r, const skcms_Curve* curve) { r = table(curve, r); }
|
1238
|
+
STAGE(table_g, const skcms_Curve* curve) { g = table(curve, g); }
|
1239
|
+
STAGE(table_b, const skcms_Curve* curve) { b = table(curve, b); }
|
1240
|
+
STAGE(table_a, const skcms_Curve* curve) { a = table(curve, a); }
|
1241
|
+
|
1242
|
+
STAGE(clut_A2B, const skcms_A2B* a2b) {  // Runs the A2B color lookup table on the current channels.
|
1243
|
+
clut(a2b, &r,&g,&b,a);  // r, g, b are passed by address; a is passed by value (clut_B2A passes &a instead).
|
1244
|
+
|
1245
|
+
if (a2b->input_channels == 4) {  // Four input channels are treated as CMYK here.
|
1246
|
+
// CMYK is opaque.
|
1247
|
+
a = F1;
|
1248
|
+
}
|
1249
|
+
}
|
1250
|
+
|
1251
|
+
STAGE(clut_B2A, const skcms_B2A* b2a) {
|
1252
|
+
clut(b2a, &r,&g,&b,&a);
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
// From here on down, the store_ ops are all "final stages," terminating processing of this group.
|
1256
|
+
|
1257
|
+
FINAL_STAGE(store_a8, NoCtx) {  // Writes only alpha, as one byte per element; declared as a final stage.
|
1258
|
+
store(dst + 1*i, cast<U8>(to_fixed(a * 255)));  // Scale to 0..255, convert with to_fixed (presumably rounds to integer — confirm), narrow to U8.
|
1259
|
+
}
|
1260
|
+
|
1261
|
+
FINAL_STAGE(store_g8, NoCtx) {  // Writes the g channel only, as one byte per element; declared as a final stage.
|
1262
|
+
// g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
|
1263
|
+
store(dst + 1*i, cast<U8>(to_fixed(g * 255)));  // r, b and a are not written.
|
1264
|
+
}
|
1265
|
+
|
1266
|
+
FINAL_STAGE(store_4444, NoCtx) {
|
1267
|
+
store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
|
1268
|
+
| cast<U16>(to_fixed(g * 15) << 8)
|
1269
|
+
| cast<U16>(to_fixed(b * 15) << 4)
|
1270
|
+
| cast<U16>(to_fixed(a * 15) << 0));
|
1271
|
+
}
|
1272
|
+
|
1273
|
+
FINAL_STAGE(store_565, NoCtx) {
|
1274
|
+
store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
|
1275
|
+
| cast<U16>(to_fixed(g * 63) << 5 )
|
1276
|
+
| cast<U16>(to_fixed(b * 31) << 11 ));
|
1277
|
+
}
|
1278
|
+
|
1279
|
+
FINAL_STAGE(store_888, NoCtx) {
|
1280
|
+
uint8_t* rgb = (uint8_t*)dst + 3*i;
|
1281
|
+
#if defined(USING_NEON)
|
1282
|
+
// Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
|
1283
|
+
// get there via U16 to save some instructions converting to float. And just
|
1284
|
+
// like load_888, we'd prefer to go via U32 but for ARMv7 support.
|
1285
|
+
U16 R = cast<U16>(to_fixed(r * 255)),
|
1286
|
+
G = cast<U16>(to_fixed(g * 255)),
|
1287
|
+
B = cast<U16>(to_fixed(b * 255));
|
1288
|
+
|
1289
|
+
uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
|
1290
|
+
vst3_lane_u8(rgb+0, v, 0);
|
1291
|
+
vst3_lane_u8(rgb+3, v, 2);
|
1292
|
+
vst3_lane_u8(rgb+6, v, 4);
|
1293
|
+
vst3_lane_u8(rgb+9, v, 6);
|
1294
|
+
#else
|
1295
|
+
store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
|
1296
|
+
store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
|
1297
|
+
store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
|
1298
|
+
#endif
|
1299
|
+
}
|
1300
|
+
|
1301
|
+
FINAL_STAGE(store_8888, NoCtx) {
|
1302
|
+
store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
|
1303
|
+
| cast<U32>(to_fixed(g * 255)) << 8
|
1304
|
+
| cast<U32>(to_fixed(b * 255)) << 16
|
1305
|
+
| cast<U32>(to_fixed(a * 255)) << 24);
|
1306
|
+
}
|
1307
|
+
|
1308
|
+
FINAL_STAGE(store_101010x_XR, NoCtx) {
|
1309
|
+
static constexpr float min = -0.752941f;
|
1310
|
+
static constexpr float max = 1.25098f;
|
1311
|
+
static constexpr float range = max - min;
|
1312
|
+
store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) << 0
|
1313
|
+
| cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
|
1314
|
+
| cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
|
1315
|
+
return;
|
1316
|
+
}
|
1317
|
+
|
1318
|
+
FINAL_STAGE(store_1010102, NoCtx) {
|
1319
|
+
store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
|
1320
|
+
| cast<U32>(to_fixed(g * 1023)) << 10
|
1321
|
+
| cast<U32>(to_fixed(b * 1023)) << 20
|
1322
|
+
| cast<U32>(to_fixed(a * 3)) << 30);
|
1323
|
+
}
|
1324
|
+
|
1325
|
+
FINAL_STAGE(store_161616LE, NoCtx) {
|
1326
|
+
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1327
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1328
|
+
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1329
|
+
#if defined(USING_NEON)
|
1330
|
+
uint16x4x3_t v = {{
|
1331
|
+
(uint16x4_t)U16_from_F(r),
|
1332
|
+
(uint16x4_t)U16_from_F(g),
|
1333
|
+
(uint16x4_t)U16_from_F(b),
|
1334
|
+
}};
|
1335
|
+
vst3_u16(rgb, v);
|
1336
|
+
#else
|
1337
|
+
store_3(rgb+0, U16_from_F(r));
|
1338
|
+
store_3(rgb+1, U16_from_F(g));
|
1339
|
+
store_3(rgb+2, U16_from_F(b));
|
1340
|
+
#endif
|
1341
|
+
|
1342
|
+
}
|
1343
|
+
|
1344
|
+
FINAL_STAGE(store_16161616LE, NoCtx) {
|
1345
|
+
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1346
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1347
|
+
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1348
|
+
#if defined(USING_NEON)
|
1349
|
+
uint16x4x4_t v = {{
|
1350
|
+
(uint16x4_t)U16_from_F(r),
|
1351
|
+
(uint16x4_t)U16_from_F(g),
|
1352
|
+
(uint16x4_t)U16_from_F(b),
|
1353
|
+
(uint16x4_t)U16_from_F(a),
|
1354
|
+
}};
|
1355
|
+
vst4_u16(rgba, v);
|
1356
|
+
#else
|
1357
|
+
U64 px = cast<U64>(to_fixed(r * 65535)) << 0
|
1358
|
+
| cast<U64>(to_fixed(g * 65535)) << 16
|
1359
|
+
| cast<U64>(to_fixed(b * 65535)) << 32
|
1360
|
+
| cast<U64>(to_fixed(a * 65535)) << 48;
|
1361
|
+
store(rgba, px);
|
1362
|
+
#endif
|
1363
|
+
}
|
1364
|
+
|
1365
|
+
FINAL_STAGE(store_161616BE, NoCtx) {
|
1366
|
+
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1367
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1368
|
+
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1369
|
+
#if defined(USING_NEON)
|
1370
|
+
uint16x4x3_t v = {{
|
1371
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
|
1372
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
|
1373
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
|
1374
|
+
}};
|
1375
|
+
vst3_u16(rgb, v);
|
1376
|
+
#else
|
1377
|
+
U32 R = to_fixed(r * 65535),
|
1378
|
+
G = to_fixed(g * 65535),
|
1379
|
+
B = to_fixed(b * 65535);
|
1380
|
+
store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
|
1381
|
+
store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
|
1382
|
+
store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
|
1383
|
+
#endif
|
1384
|
+
|
1385
|
+
}
|
1386
|
+
|
1387
|
+
FINAL_STAGE(store_16161616BE, NoCtx) {
|
1388
|
+
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1389
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1390
|
+
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1391
|
+
#if defined(USING_NEON)
|
1392
|
+
uint16x4x4_t v = {{
|
1393
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
|
1394
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
|
1395
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
|
1396
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
|
1397
|
+
}};
|
1398
|
+
vst4_u16(rgba, v);
|
1399
|
+
#else
|
1400
|
+
U64 px = cast<U64>(to_fixed(r * 65535)) << 0
|
1401
|
+
| cast<U64>(to_fixed(g * 65535)) << 16
|
1402
|
+
| cast<U64>(to_fixed(b * 65535)) << 32
|
1403
|
+
| cast<U64>(to_fixed(a * 65535)) << 48;
|
1404
|
+
store(rgba, swap_endian_16x4(px));
|
1405
|
+
#endif
|
1406
|
+
}
|
1407
|
+
|
1408
|
+
FINAL_STAGE(store_hhh, NoCtx) {
|
1409
|
+
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1410
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1411
|
+
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1412
|
+
|
1413
|
+
U16 R = Half_from_F(r),
|
1414
|
+
G = Half_from_F(g),
|
1415
|
+
B = Half_from_F(b);
|
1416
|
+
#if defined(USING_NEON)
|
1417
|
+
uint16x4x3_t v = {{
|
1418
|
+
(uint16x4_t)R,
|
1419
|
+
(uint16x4_t)G,
|
1420
|
+
(uint16x4_t)B,
|
1421
|
+
}};
|
1422
|
+
vst3_u16(rgb, v);
|
1423
|
+
#else
|
1424
|
+
store_3(rgb+0, R);
|
1425
|
+
store_3(rgb+1, G);
|
1426
|
+
store_3(rgb+2, B);
|
1427
|
+
#endif
|
1428
|
+
}
|
1429
|
+
|
1430
|
+
FINAL_STAGE(store_hhhh, NoCtx) {
|
1431
|
+
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1432
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1433
|
+
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1434
|
+
|
1435
|
+
U16 R = Half_from_F(r),
|
1436
|
+
G = Half_from_F(g),
|
1437
|
+
B = Half_from_F(b),
|
1438
|
+
A = Half_from_F(a);
|
1439
|
+
#if defined(USING_NEON)
|
1440
|
+
uint16x4x4_t v = {{
|
1441
|
+
(uint16x4_t)R,
|
1442
|
+
(uint16x4_t)G,
|
1443
|
+
(uint16x4_t)B,
|
1444
|
+
(uint16x4_t)A,
|
1445
|
+
}};
|
1446
|
+
vst4_u16(rgba, v);
|
1447
|
+
#else
|
1448
|
+
store(rgba, cast<U64>(R) << 0
|
1449
|
+
| cast<U64>(G) << 16
|
1450
|
+
| cast<U64>(B) << 32
|
1451
|
+
| cast<U64>(A) << 48);
|
1452
|
+
#endif
|
1453
|
+
}
|
1454
|
+
|
1455
|
+
FINAL_STAGE(store_fff, NoCtx) {
|
1456
|
+
uintptr_t ptr = (uintptr_t)(dst + 12*i);
|
1457
|
+
assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
|
1458
|
+
float* rgb = (float*)ptr; // for this cast to float* to be safe.
|
1459
|
+
#if defined(USING_NEON)
|
1460
|
+
float32x4x3_t v = {{
|
1461
|
+
(float32x4_t)r,
|
1462
|
+
(float32x4_t)g,
|
1463
|
+
(float32x4_t)b,
|
1464
|
+
}};
|
1465
|
+
vst3q_f32(rgb, v);
|
1466
|
+
#else
|
1467
|
+
store_3(rgb+0, r);
|
1468
|
+
store_3(rgb+1, g);
|
1469
|
+
store_3(rgb+2, b);
|
1470
|
+
#endif
|
1471
|
+
}
|
1472
|
+
|
1473
|
+
FINAL_STAGE(store_ffff, NoCtx) {
|
1474
|
+
uintptr_t ptr = (uintptr_t)(dst + 16*i);
|
1475
|
+
assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
|
1476
|
+
float* rgba = (float*)ptr; // for this cast to float* to be safe.
|
1477
|
+
#if defined(USING_NEON)
|
1478
|
+
float32x4x4_t v = {{
|
1479
|
+
(float32x4_t)r,
|
1480
|
+
(float32x4_t)g,
|
1481
|
+
(float32x4_t)b,
|
1482
|
+
(float32x4_t)a,
|
1483
|
+
}};
|
1484
|
+
vst4q_f32(rgba, v);
|
1485
|
+
#else
|
1486
|
+
store_4(rgba+0, r);
|
1487
|
+
store_4(rgba+1, g);
|
1488
|
+
store_4(rgba+2, b);
|
1489
|
+
store_4(rgba+3, a);
|
1490
|
+
#endif
|
1491
|
+
}
|
1492
|
+
|
1493
|
+
#if SKCMS_HAS_MUSTTAIL
|
1494
|
+
|
1495
|
+
SI void exec_stages(StageFn* stages, const void** contexts, const char* src, char* dst, int i) {  // Entry point for the musttail pipeline: starts the chain at the first stage.
|
1496
|
+
(*stages)({stages}, contexts, src, dst, F0, F0, F0, F1, i);  // Initial pixel state is r = g = b = F0, a = F1; each STAGE wrapper then calls the next stage itself.
|
1497
|
+
}
|
1498
|
+
|
1499
|
+
#else
|
1500
|
+
|
1501
|
+
static void exec_stages(const Op* ops, const void** contexts,
|
1502
|
+
const char* src, char* dst, int i) {
|
1503
|
+
F r = F0, g = F0, b = F0, a = F1;
|
1504
|
+
while (true) {
|
1505
|
+
switch (*ops++) {
|
1506
|
+
#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
|
1507
|
+
SKCMS_WORK_OPS(M)
|
1508
|
+
#undef M
|
1509
|
+
#define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
|
1510
|
+
SKCMS_STORE_OPS(M)
|
1511
|
+
#undef M
|
1208
1512
|
}
|
1209
|
-
case Op_store_1010102: {
|
1210
|
-
store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
|
1211
|
-
| cast<U32>(to_fixed(g * 1023)) << 10
|
1212
|
-
| cast<U32>(to_fixed(b * 1023)) << 20
|
1213
|
-
| cast<U32>(to_fixed(a * 3)) << 30);
|
1214
|
-
} return;
|
1215
|
-
|
1216
|
-
case Op_store_161616LE: {
|
1217
|
-
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1218
|
-
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1219
|
-
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1220
|
-
#if defined(USING_NEON)
|
1221
|
-
uint16x4x3_t v = {{
|
1222
|
-
(uint16x4_t)U16_from_F(r),
|
1223
|
-
(uint16x4_t)U16_from_F(g),
|
1224
|
-
(uint16x4_t)U16_from_F(b),
|
1225
|
-
}};
|
1226
|
-
vst3_u16(rgb, v);
|
1227
|
-
#else
|
1228
|
-
store_3(rgb+0, U16_from_F(r));
|
1229
|
-
store_3(rgb+1, U16_from_F(g));
|
1230
|
-
store_3(rgb+2, U16_from_F(b));
|
1231
|
-
#endif
|
1232
|
-
|
1233
|
-
} return;
|
1234
|
-
|
1235
|
-
case Op_store_16161616LE: {
|
1236
|
-
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1237
|
-
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1238
|
-
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1239
|
-
#if defined(USING_NEON)
|
1240
|
-
uint16x4x4_t v = {{
|
1241
|
-
(uint16x4_t)U16_from_F(r),
|
1242
|
-
(uint16x4_t)U16_from_F(g),
|
1243
|
-
(uint16x4_t)U16_from_F(b),
|
1244
|
-
(uint16x4_t)U16_from_F(a),
|
1245
|
-
}};
|
1246
|
-
vst4_u16(rgba, v);
|
1247
|
-
#else
|
1248
|
-
U64 px = cast<U64>(to_fixed(r * 65535)) << 0
|
1249
|
-
| cast<U64>(to_fixed(g * 65535)) << 16
|
1250
|
-
| cast<U64>(to_fixed(b * 65535)) << 32
|
1251
|
-
| cast<U64>(to_fixed(a * 65535)) << 48;
|
1252
|
-
store(rgba, px);
|
1253
|
-
#endif
|
1254
|
-
} return;
|
1255
|
-
|
1256
|
-
case Op_store_161616BE: {
|
1257
|
-
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1258
|
-
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1259
|
-
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1260
|
-
#if defined(USING_NEON)
|
1261
|
-
uint16x4x3_t v = {{
|
1262
|
-
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
|
1263
|
-
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
|
1264
|
-
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
|
1265
|
-
}};
|
1266
|
-
vst3_u16(rgb, v);
|
1267
|
-
#else
|
1268
|
-
U32 R = to_fixed(r * 65535),
|
1269
|
-
G = to_fixed(g * 65535),
|
1270
|
-
B = to_fixed(b * 65535);
|
1271
|
-
store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
|
1272
|
-
store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
|
1273
|
-
store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
|
1274
|
-
#endif
|
1275
|
-
|
1276
|
-
} return;
|
1277
|
-
|
1278
|
-
case Op_store_16161616BE: {
|
1279
|
-
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1280
|
-
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1281
|
-
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1282
|
-
#if defined(USING_NEON)
|
1283
|
-
uint16x4x4_t v = {{
|
1284
|
-
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
|
1285
|
-
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
|
1286
|
-
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
|
1287
|
-
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
|
1288
|
-
}};
|
1289
|
-
vst4_u16(rgba, v);
|
1290
|
-
#else
|
1291
|
-
U64 px = cast<U64>(to_fixed(r * 65535)) << 0
|
1292
|
-
| cast<U64>(to_fixed(g * 65535)) << 16
|
1293
|
-
| cast<U64>(to_fixed(b * 65535)) << 32
|
1294
|
-
| cast<U64>(to_fixed(a * 65535)) << 48;
|
1295
|
-
store(rgba, swap_endian_16x4(px));
|
1296
|
-
#endif
|
1297
|
-
} return;
|
1298
|
-
|
1299
|
-
case Op_store_hhh: {
|
1300
|
-
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1301
|
-
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1302
|
-
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1303
|
-
|
1304
|
-
U16 R = Half_from_F(r),
|
1305
|
-
G = Half_from_F(g),
|
1306
|
-
B = Half_from_F(b);
|
1307
|
-
#if defined(USING_NEON)
|
1308
|
-
uint16x4x3_t v = {{
|
1309
|
-
(uint16x4_t)R,
|
1310
|
-
(uint16x4_t)G,
|
1311
|
-
(uint16x4_t)B,
|
1312
|
-
}};
|
1313
|
-
vst3_u16(rgb, v);
|
1314
|
-
#else
|
1315
|
-
store_3(rgb+0, R);
|
1316
|
-
store_3(rgb+1, G);
|
1317
|
-
store_3(rgb+2, B);
|
1318
|
-
#endif
|
1319
|
-
} return;
|
1320
|
-
|
1321
|
-
case Op_store_hhhh: {
|
1322
|
-
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1323
|
-
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1324
|
-
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1325
|
-
|
1326
|
-
U16 R = Half_from_F(r),
|
1327
|
-
G = Half_from_F(g),
|
1328
|
-
B = Half_from_F(b),
|
1329
|
-
A = Half_from_F(a);
|
1330
|
-
#if defined(USING_NEON)
|
1331
|
-
uint16x4x4_t v = {{
|
1332
|
-
(uint16x4_t)R,
|
1333
|
-
(uint16x4_t)G,
|
1334
|
-
(uint16x4_t)B,
|
1335
|
-
(uint16x4_t)A,
|
1336
|
-
}};
|
1337
|
-
vst4_u16(rgba, v);
|
1338
|
-
#else
|
1339
|
-
store(rgba, cast<U64>(R) << 0
|
1340
|
-
| cast<U64>(G) << 16
|
1341
|
-
| cast<U64>(B) << 32
|
1342
|
-
| cast<U64>(A) << 48);
|
1343
|
-
#endif
|
1344
|
-
|
1345
|
-
} return;
|
1346
|
-
|
1347
|
-
case Op_store_fff: {
|
1348
|
-
uintptr_t ptr = (uintptr_t)(dst + 12*i);
|
1349
|
-
assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
|
1350
|
-
float* rgb = (float*)ptr; // for this cast to float* to be safe.
|
1351
|
-
#if defined(USING_NEON)
|
1352
|
-
float32x4x3_t v = {{
|
1353
|
-
(float32x4_t)r,
|
1354
|
-
(float32x4_t)g,
|
1355
|
-
(float32x4_t)b,
|
1356
|
-
}};
|
1357
|
-
vst3q_f32(rgb, v);
|
1358
|
-
#else
|
1359
|
-
store_3(rgb+0, r);
|
1360
|
-
store_3(rgb+1, g);
|
1361
|
-
store_3(rgb+2, b);
|
1362
|
-
#endif
|
1363
|
-
} return;
|
1364
|
-
|
1365
|
-
case Op_store_ffff: {
|
1366
|
-
uintptr_t ptr = (uintptr_t)(dst + 16*i);
|
1367
|
-
assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
|
1368
|
-
float* rgba = (float*)ptr; // for this cast to float* to be safe.
|
1369
|
-
#if defined(USING_NEON)
|
1370
|
-
float32x4x4_t v = {{
|
1371
|
-
(float32x4_t)r,
|
1372
|
-
(float32x4_t)g,
|
1373
|
-
(float32x4_t)b,
|
1374
|
-
(float32x4_t)a,
|
1375
|
-
}};
|
1376
|
-
vst4q_f32(rgba, v);
|
1377
|
-
#else
|
1378
|
-
store_4(rgba+0, r);
|
1379
|
-
store_4(rgba+1, g);
|
1380
|
-
store_4(rgba+2, b);
|
1381
|
-
store_4(rgba+3, a);
|
1382
|
-
#endif
|
1383
|
-
} return;
|
1384
1513
|
}
|
1385
1514
|
}
|
1386
|
-
}
|
1387
1515
|
|
1516
|
+
#endif
|
1517
|
+
|
1518
|
+
// NOLINTNEXTLINE(misc-definitions-in-headers)
|
1519
|
+
void run_program(const Op* program, const void** contexts, SKCMS_MAYBE_UNUSED ptrdiff_t programSize,
|
1520
|
+
const char* src, char* dst, int n,
|
1521
|
+
const size_t src_bpp, const size_t dst_bpp) {
|
1522
|
+
#if SKCMS_HAS_MUSTTAIL
|
1523
|
+
// Convert the program into an array of tailcall stages.
|
1524
|
+
StageFn stages[32];
|
1525
|
+
assert(programSize <= ARRAY_COUNT(stages));
|
1526
|
+
|
1527
|
+
static constexpr StageFn kStageFns[] = {
|
1528
|
+
#define M(name) &Exec_##name,
|
1529
|
+
SKCMS_WORK_OPS(M)
|
1530
|
+
SKCMS_STORE_OPS(M)
|
1531
|
+
#undef M
|
1532
|
+
};
|
1533
|
+
|
1534
|
+
for (ptrdiff_t index = 0; index < programSize; ++index) {
|
1535
|
+
stages[index] = kStageFns[(int)program[index]];
|
1536
|
+
}
|
1537
|
+
#else
|
1538
|
+
// Use the op array as-is.
|
1539
|
+
const Op* stages = program;
|
1540
|
+
#endif
|
1388
1541
|
|
1389
|
-
static void run_program(const Op* program, const void** arguments,
|
1390
|
-
const char* src, char* dst, int n,
|
1391
|
-
const size_t src_bpp, const size_t dst_bpp) {
|
1392
1542
|
int i = 0;
|
1393
1543
|
while (n >= N) {
|
1394
|
-
|
1544
|
+
exec_stages(stages, contexts, src, dst, i);
|
1395
1545
|
i += N;
|
1396
1546
|
n -= N;
|
1397
1547
|
}
|
@@ -1399,30 +1549,7 @@ static void run_program(const Op* program, const void** arguments,
|
|
1399
1549
|
char tmp[4*4*N] = {0};
|
1400
1550
|
|
1401
1551
|
memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
|
1402
|
-
|
1552
|
+
exec_stages(stages, contexts, tmp, tmp, 0);
|
1403
1553
|
memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
|
1404
1554
|
}
|
1405
1555
|
}
|
1406
|
-
|
1407
|
-
// Clean up any #defines we may have set so that we can be #included again.
|
1408
|
-
#if defined(USING_AVX)
|
1409
|
-
#undef USING_AVX
|
1410
|
-
#endif
|
1411
|
-
#if defined(USING_AVX_F16C)
|
1412
|
-
#undef USING_AVX_F16C
|
1413
|
-
#endif
|
1414
|
-
#if defined(USING_AVX2)
|
1415
|
-
#undef USING_AVX2
|
1416
|
-
#endif
|
1417
|
-
#if defined(USING_AVX512F)
|
1418
|
-
#undef USING_AVX512F
|
1419
|
-
#endif
|
1420
|
-
|
1421
|
-
#if defined(USING_NEON)
|
1422
|
-
#undef USING_NEON
|
1423
|
-
#endif
|
1424
|
-
#if defined(USING_NEON_F16C)
|
1425
|
-
#undef USING_NEON_F16C
|
1426
|
-
#endif
|
1427
|
-
|
1428
|
-
#undef FALLTHROUGH
|