@shopify/react-native-skia 0.1.158 → 0.1.159
- package/android/CMakeLists.txt +35 -11
- package/android/build.gradle +21 -25
- package/android/cpp/jni/JniLoad.cpp +2 -0
- package/android/cpp/jni/include/JniSkiaDomView.h +89 -0
- package/android/cpp/rnskia-android/SkiaOpenGLRenderer.cpp +4 -3
- package/android/cpp/rnskia-android/SkiaOpenGLRenderer.h +4 -3
- package/android/src/main/java/com/shopify/reactnative/skia/RNSkiaPackage.java +2 -1
- package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomView.java +45 -0
- package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomViewManager.java +64 -0
- package/cpp/api/JsiSkHostObjects.h +6 -0
- package/cpp/api/JsiSkImageFilterFactory.h +1 -1
- package/cpp/api/JsiSkPaint.h +9 -2
- package/cpp/api/JsiSkPath.h +1 -0
- package/cpp/api/JsiSkRuntimeEffect.h +36 -36
- package/cpp/jsi/JsiHostObject.cpp +16 -28
- package/cpp/jsi/JsiHostObject.h +127 -7
- package/cpp/jsi/JsiValue.cpp +346 -0
- package/cpp/jsi/JsiValue.h +222 -0
- package/cpp/jsi/JsiValueWrapper.h +33 -5
- package/cpp/rnskia/RNSkDomView.cpp +220 -0
- package/cpp/rnskia/RNSkDomView.h +140 -0
- package/cpp/rnskia/RNSkJsView.cpp +0 -4
- package/cpp/rnskia/RNSkJsView.h +6 -4
- package/cpp/rnskia/RNSkManager.cpp +7 -0
- package/cpp/rnskia/RNSkPictureView.h +5 -8
- package/cpp/rnskia/RNSkView.h +113 -5
- package/cpp/rnskia/dom/JsiDomApi.h +167 -0
- package/cpp/rnskia/dom/base/BaseNodeProp.h +72 -0
- package/cpp/rnskia/dom/base/DerivedNodeProp.h +187 -0
- package/cpp/rnskia/dom/base/DrawingContext.cpp +227 -0
- package/cpp/rnskia/dom/base/DrawingContext.h +136 -0
- package/cpp/rnskia/dom/base/JsiDependencyManager.h +294 -0
- package/cpp/rnskia/dom/base/JsiDomDeclarationNode.h +176 -0
- package/cpp/rnskia/dom/base/JsiDomDrawingNode.h +50 -0
- package/cpp/rnskia/dom/base/JsiDomNode.h +361 -0
- package/cpp/rnskia/dom/base/JsiDomRenderNode.h +267 -0
- package/cpp/rnskia/dom/base/NodeProp.h +130 -0
- package/cpp/rnskia/dom/base/NodePropsContainer.h +119 -0
- package/cpp/rnskia/dom/nodes/JsiBackdropFilterNode.h +38 -0
- package/cpp/rnskia/dom/nodes/JsiBlendNode.h +112 -0
- package/cpp/rnskia/dom/nodes/JsiBlurMaskNode.h +78 -0
- package/cpp/rnskia/dom/nodes/JsiBoxNode.h +104 -0
- package/cpp/rnskia/dom/nodes/JsiBoxShadowNode.h +33 -0
- package/cpp/rnskia/dom/nodes/JsiCircleNode.h +38 -0
- package/cpp/rnskia/dom/nodes/JsiColorFilterNodes.h +192 -0
- package/cpp/rnskia/dom/nodes/JsiCustomDrawingNode.h +123 -0
- package/cpp/rnskia/dom/nodes/JsiDiffRectNode.h +42 -0
- package/cpp/rnskia/dom/nodes/JsiFillNode.h +22 -0
- package/cpp/rnskia/dom/nodes/JsiGlyphsNode.h +56 -0
- package/cpp/rnskia/dom/nodes/JsiGroupNode.h +26 -0
- package/cpp/rnskia/dom/nodes/JsiImageFilterNodes.h +415 -0
- package/cpp/rnskia/dom/nodes/JsiImageNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiImageSvgNode.h +44 -0
- package/cpp/rnskia/dom/nodes/JsiLayerNode.h +64 -0
- package/cpp/rnskia/dom/nodes/JsiLineNode.h +43 -0
- package/cpp/rnskia/dom/nodes/JsiOvalNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiPaintNode.h +77 -0
- package/cpp/rnskia/dom/nodes/JsiPatchNode.h +54 -0
- package/cpp/rnskia/dom/nodes/JsiPathEffectNodes.h +315 -0
- package/cpp/rnskia/dom/nodes/JsiPathNode.h +181 -0
- package/cpp/rnskia/dom/nodes/JsiPictureNode.h +32 -0
- package/cpp/rnskia/dom/nodes/JsiPointsNode.h +51 -0
- package/cpp/rnskia/dom/nodes/JsiRRectNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiRectNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiShaderNodes.h +517 -0
- package/cpp/rnskia/dom/nodes/JsiTextBlobNode.h +47 -0
- package/cpp/rnskia/dom/nodes/JsiTextNode.h +54 -0
- package/cpp/rnskia/dom/nodes/JsiTextPathNode.h +32 -0
- package/cpp/rnskia/dom/nodes/JsiVerticesNode.h +43 -0
- package/cpp/rnskia/dom/props/BezierProps.h +63 -0
- package/cpp/rnskia/dom/props/BlendModeProp.h +101 -0
- package/cpp/rnskia/dom/props/BoxShadowProps.h +61 -0
- package/cpp/rnskia/dom/props/CircleProp.h +46 -0
- package/cpp/rnskia/dom/props/ClipProp.h +62 -0
- package/cpp/rnskia/dom/props/ColorProp.h +80 -0
- package/cpp/rnskia/dom/props/DrawingProp.h +33 -0
- package/cpp/rnskia/dom/props/FontProp.h +34 -0
- package/cpp/rnskia/dom/props/GlyphsProp.h +53 -0
- package/cpp/rnskia/dom/props/ImageProps.h +173 -0
- package/cpp/rnskia/dom/props/LayerProp.h +50 -0
- package/cpp/rnskia/dom/props/MatrixProp.h +33 -0
- package/cpp/rnskia/dom/props/NumbersProp.h +63 -0
- package/cpp/rnskia/dom/props/PaintProps.h +172 -0
- package/cpp/rnskia/dom/props/PathProp.h +55 -0
- package/cpp/rnskia/dom/props/PictureProp.h +38 -0
- package/cpp/rnskia/dom/props/PointProp.h +72 -0
- package/cpp/rnskia/dom/props/PointsProp.h +83 -0
- package/cpp/rnskia/dom/props/RRectProp.h +134 -0
- package/cpp/rnskia/dom/props/RadiusProp.h +43 -0
- package/cpp/rnskia/dom/props/RectProp.h +118 -0
- package/cpp/rnskia/dom/props/StrokeProps.h +75 -0
- package/cpp/rnskia/dom/props/SvgProp.h +37 -0
- package/cpp/rnskia/dom/props/TextBlobProp.h +128 -0
- package/cpp/rnskia/dom/props/TileModeProp.h +50 -0
- package/cpp/rnskia/dom/props/TransformProp.h +80 -0
- package/cpp/rnskia/dom/props/TransformsProps.h +68 -0
- package/cpp/rnskia/dom/props/UniformsProp.h +194 -0
- package/cpp/rnskia/dom/props/VertexModeProp.h +47 -0
- package/cpp/rnskia/dom/props/VerticesProps.h +67 -0
- package/cpp/rnskia/values/RNSkReadonlyValue.h +13 -4
- package/cpp/skia/include/android/SkAndroidFrameworkUtils.h +35 -1
- package/cpp/skia/include/codec/SkAndroidCodec.h +17 -1
- package/cpp/skia/include/codec/SkCodec.h +8 -5
- package/cpp/skia/include/core/SkAnnotation.h +2 -0
- package/cpp/skia/include/core/SkBitmap.h +52 -1
- package/cpp/skia/include/core/SkBlendMode.h +2 -0
- package/cpp/skia/include/core/SkCanvas.h +52 -31
- package/cpp/skia/include/core/SkCapabilities.h +44 -0
- package/cpp/skia/include/core/SkColor.h +7 -0
- package/cpp/skia/include/core/SkColorFilter.h +37 -0
- package/cpp/skia/include/core/SkColorSpace.h +1 -1
- package/cpp/skia/include/core/SkFont.h +4 -0
- package/cpp/skia/include/core/SkFontMgr.h +3 -0
- package/cpp/skia/include/core/SkGraphics.h +9 -0
- package/cpp/skia/include/core/SkImage.h +77 -17
- package/cpp/skia/include/core/SkImageEncoder.h +5 -3
- package/cpp/skia/include/core/SkImageGenerator.h +27 -17
- package/cpp/skia/include/core/SkM44.h +1 -0
- package/cpp/skia/include/core/SkMesh.h +120 -34
- package/cpp/skia/include/core/SkMilestone.h +1 -1
- package/cpp/skia/include/core/SkOverdrawCanvas.h +2 -1
- package/cpp/skia/include/core/SkPaint.h +15 -2
- package/cpp/skia/include/core/SkPath.h +4 -0
- package/cpp/skia/include/core/SkPathBuilder.h +1 -1
- package/cpp/skia/include/core/SkPicture.h +0 -3
- package/cpp/skia/include/core/SkPictureRecorder.h +0 -2
- package/cpp/skia/include/core/SkPixmap.h +19 -0
- package/cpp/skia/include/core/SkRasterHandleAllocator.h +3 -1
- package/cpp/skia/include/core/SkRect.h +11 -4
- package/cpp/skia/include/core/SkRefCnt.h +13 -1
- package/cpp/skia/include/core/SkRegion.h +6 -0
- package/cpp/skia/include/core/SkSamplingOptions.h +8 -6
- package/cpp/skia/include/core/SkScalar.h +6 -25
- package/cpp/skia/include/core/SkShader.h +20 -12
- package/cpp/skia/include/core/SkSpan.h +51 -19
- package/cpp/skia/include/core/SkStream.h +2 -2
- package/cpp/skia/include/core/SkString.h +11 -3
- package/cpp/skia/include/core/SkSurface.h +85 -8
- package/cpp/skia/include/core/SkTextBlob.h +5 -2
- package/cpp/skia/include/core/SkTypes.h +11 -10
- package/cpp/skia/include/docs/SkPDFDocument.h +0 -5
- package/cpp/skia/include/effects/Sk1DPathEffect.h +6 -1
- package/cpp/skia/include/effects/Sk2DPathEffect.h +4 -1
- package/cpp/skia/include/effects/SkColorMatrix.h +1 -0
- package/cpp/skia/include/effects/SkColorMatrixFilter.h +5 -8
- package/cpp/skia/include/effects/SkCornerPathEffect.h +5 -1
- package/cpp/skia/include/effects/SkDashPathEffect.h +5 -1
- package/cpp/skia/include/effects/SkGradientShader.h +68 -38
- package/cpp/skia/include/effects/SkHighContrastFilter.h +5 -1
- package/cpp/skia/include/effects/SkImageFilters.h +5 -4
- package/cpp/skia/include/effects/SkLumaColorFilter.h +4 -1
- package/cpp/skia/include/effects/SkOpPathEffect.h +6 -2
- package/cpp/skia/include/effects/SkOverdrawColorFilter.h +5 -2
- package/cpp/skia/include/effects/SkRuntimeEffect.h +54 -62
- package/cpp/skia/include/effects/SkShaderMaskFilter.h +3 -1
- package/cpp/skia/include/effects/SkTableColorFilter.h +8 -21
- package/cpp/skia/include/effects/SkTableMaskFilter.h +5 -1
- package/cpp/skia/include/effects/SkTrimPathEffect.h +5 -1
- package/cpp/skia/include/encode/SkEncoder.h +17 -0
- package/cpp/skia/include/encode/SkWebpEncoder.h +17 -0
- package/cpp/skia/include/gpu/GpuTypes.h +18 -0
- package/cpp/skia/include/gpu/GrBackendSurface.h +38 -17
- package/cpp/skia/include/gpu/GrBackendSurfaceMutableState.h +6 -71
- package/cpp/skia/include/gpu/GrContextOptions.h +1 -1
- package/cpp/skia/include/gpu/GrContextThreadSafeProxy.h +10 -9
- package/cpp/skia/include/gpu/GrDirectContext.h +42 -22
- package/cpp/skia/include/gpu/GrRecordingContext.h +6 -3
- package/cpp/skia/include/gpu/GrTypes.h +11 -11
- package/cpp/skia/include/gpu/MutableTextureState.h +122 -0
- package/cpp/skia/include/gpu/gl/GrGLFunctions.h +1 -0
- package/cpp/skia/include/gpu/gl/GrGLInterface.h +1 -0
- package/cpp/skia/include/gpu/graphite/BackendTexture.h +7 -0
- package/cpp/skia/include/gpu/graphite/CombinationBuilder.h +195 -0
- package/cpp/skia/include/gpu/graphite/Context.h +47 -55
- package/cpp/skia/include/gpu/graphite/ContextOptions.h +85 -0
- package/cpp/skia/include/gpu/graphite/GraphiteTypes.h +1 -17
- package/cpp/skia/include/gpu/graphite/ImageProvider.h +61 -0
- package/cpp/skia/include/gpu/graphite/Recorder.h +87 -8
- package/cpp/skia/include/gpu/graphite/Recording.h +19 -9
- package/cpp/skia/include/gpu/graphite/TextureInfo.h +40 -8
- package/cpp/skia/include/gpu/graphite/dawn/DawnBackendContext.h +25 -0
- package/cpp/skia/include/gpu/graphite/mtl/MtlBackendContext.h +3 -2
- package/cpp/skia/include/gpu/graphite/vk/VulkanGraphiteTypes.h +69 -0
- package/cpp/skia/include/gpu/mtl/MtlMemoryAllocator.h +39 -0
- package/cpp/skia/include/gpu/vk/GrVkBackendContext.h +21 -19
- package/cpp/skia/include/gpu/vk/GrVkExtensions.h +2 -50
- package/cpp/skia/include/gpu/vk/GrVkMemoryAllocator.h +2 -127
- package/cpp/skia/include/gpu/vk/GrVkTypes.h +5 -43
- package/cpp/skia/include/gpu/vk/VulkanBackendContext.h +46 -0
- package/cpp/skia/include/gpu/vk/VulkanExtensions.h +67 -0
- package/cpp/skia/include/gpu/vk/VulkanMemoryAllocator.h +116 -0
- package/cpp/skia/include/gpu/vk/VulkanTypes.h +59 -0
- package/cpp/skia/include/pathops/SkPathOps.h +1 -1
- package/cpp/skia/include/private/SkColorData.h +10 -40
- package/cpp/skia/include/private/SkEncodedInfo.h +9 -3
- package/cpp/skia/include/private/SkFloatingPoint.h +9 -6
- package/cpp/skia/include/private/SkHalf.h +5 -52
- package/cpp/skia/include/private/SkMacros.h +1 -1
- package/cpp/skia/include/private/SkMalloc.h +4 -0
- package/cpp/skia/include/private/SkPathRef.h +10 -10
- package/cpp/skia/include/private/SkSLModifiers.h +59 -23
- package/cpp/skia/include/private/SkSLProgramKind.h +1 -0
- package/cpp/skia/include/private/SkSLSymbol.h +7 -3
- package/cpp/skia/include/private/SkStringView.h +4 -0
- package/cpp/skia/include/private/SkTArray.h +21 -7
- package/cpp/skia/include/private/SkTDArray.h +173 -285
- package/cpp/skia/include/private/SkTHash.h +33 -32
- package/cpp/skia/include/private/SkTemplates.h +24 -26
- package/cpp/skia/include/private/SkVx.h +218 -135
- package/cpp/skia/include/private/chromium/GrSlug.h +3 -65
- package/cpp/skia/include/private/chromium/SkChromeRemoteGlyphCache.h +6 -3
- package/cpp/skia/include/private/chromium/Slug.h +76 -0
- package/cpp/skia/include/private/gpu/ganesh/GrTypesPriv.h +6 -1
- package/cpp/skia/include/private/gpu/ganesh/GrVkTypesPriv.h +5 -39
- package/cpp/skia/include/private/gpu/graphite/VulkanGraphiteTypesPriv.h +63 -0
- package/cpp/skia/include/{gpu/vk/GrVkVulkan.h → private/gpu/vk/SkiaVulkan.h} +2 -2
- package/cpp/skia/include/private/gpu/vk/VulkanTypesPriv.h +57 -0
- package/cpp/skia/include/sksl/DSL.h +0 -1
- package/cpp/skia/include/sksl/DSLBlock.h +4 -18
- package/cpp/skia/include/sksl/DSLCase.h +2 -8
- package/cpp/skia/include/sksl/DSLCore.h +8 -15
- package/cpp/skia/include/sksl/DSLExpression.h +51 -142
- package/cpp/skia/include/sksl/DSLFunction.h +7 -15
- package/cpp/skia/include/sksl/DSLModifiers.h +5 -2
- package/cpp/skia/include/sksl/DSLStatement.h +4 -39
- package/cpp/skia/include/sksl/DSLSymbols.h +1 -11
- package/cpp/skia/include/sksl/DSLType.h +20 -12
- package/cpp/skia/include/sksl/DSLVar.h +56 -146
- package/cpp/skia/include/sksl/SkSLErrorReporter.h +2 -15
- package/cpp/skia/include/sksl/SkSLOperator.h +62 -59
- package/cpp/skia/include/sksl/SkSLPosition.h +2 -0
- package/cpp/skia/include/sksl/SkSLVersion.h +27 -0
- package/cpp/skia/include/svg/SkSVGCanvas.h +1 -0
- package/cpp/skia/include/utils/SkAnimCodecPlayer.h +1 -1
- package/cpp/skia/include/utils/SkBase64.h +2 -0
- package/cpp/skia/include/utils/SkCustomTypeface.h +24 -11
- package/cpp/skia/include/utils/SkEventTracer.h +12 -1
- package/cpp/skia/include/utils/SkNWayCanvas.h +11 -4
- package/cpp/skia/include/utils/SkPaintFilterCanvas.h +9 -4
- package/cpp/skia/include/utils/SkParse.h +3 -0
- package/cpp/skia/include/utils/SkShadowUtils.h +2 -0
- package/cpp/skia/include/utils/SkTextUtils.h +2 -1
- package/cpp/skia/{include/third_party → modules}/skcms/skcms.h +10 -0
- package/cpp/skia/modules/skcms/skcms_internal.h +56 -0
- package/cpp/skia/modules/skcms/src/Transform_inl.h +1609 -0
- package/cpp/skia/modules/skparagraph/include/DartTypes.h +153 -0
- package/cpp/skia/modules/skparagraph/include/FontArguments.h +46 -0
- package/cpp/skia/modules/skparagraph/include/FontCollection.h +84 -0
- package/cpp/skia/modules/skparagraph/include/Metrics.h +98 -0
- package/cpp/skia/modules/skparagraph/include/Paragraph.h +111 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphBuilder.h +69 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphCache.h +77 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphStyle.h +143 -0
- package/cpp/skia/modules/skparagraph/include/TextShadow.h +30 -0
- package/cpp/skia/modules/skparagraph/include/TextStyle.h +352 -0
- package/cpp/skia/modules/skparagraph/include/TypefaceFontProvider.h +81 -0
- package/cpp/skia/modules/svg/include/SkSVGAttributeParser.h +1 -1
- package/cpp/skia/modules/svg/include/SkSVGTypes.h +3 -3
- package/cpp/skia/src/core/SkLRUCache.h +126 -0
- package/cpp/skia/src/core/SkTInternalLList.h +302 -0
- package/cpp/utils/RNSkTimingInfo.h +1 -0
- package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.h +15 -4
- package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.mm +40 -54
- package/ios/RNSkia-iOS/SkiaDomViewManager.h +8 -0
- package/ios/RNSkia-iOS/SkiaDomViewManager.mm +51 -0
- package/lib/commonjs/dom/nodes/JsiSkDOM.js +56 -56
- package/lib/commonjs/dom/nodes/JsiSkDOM.js.map +1 -1
- package/lib/commonjs/dom/nodes/RenderNode.js +1 -1
- package/lib/commonjs/dom/nodes/RenderNode.js.map +1 -1
- package/lib/commonjs/renderer/Canvas.js +19 -63
- package/lib/commonjs/renderer/Canvas.js.map +1 -1
- package/lib/commonjs/renderer/DependencyManager.js +0 -5
- package/lib/commonjs/renderer/DependencyManager.js.map +1 -1
- package/lib/commonjs/renderer/HostComponents.js.map +1 -1
- package/lib/commonjs/renderer/useCanvas.js +4 -18
- package/lib/commonjs/renderer/useCanvas.js.map +1 -1
- package/lib/commonjs/views/SkiaBaseWebView.js +7 -0
- package/lib/commonjs/views/SkiaBaseWebView.js.map +1 -1
- package/lib/commonjs/views/SkiaDomView.js +152 -0
- package/lib/commonjs/views/SkiaDomView.js.map +1 -0
- package/lib/commonjs/views/SkiaDomView.web.js +55 -0
- package/lib/commonjs/views/SkiaDomView.web.js.map +1 -0
- package/lib/commonjs/views/SkiaPictureView.js +16 -2
- package/lib/commonjs/views/SkiaPictureView.js.map +1 -1
- package/lib/commonjs/views/SkiaView.js +17 -2
- package/lib/commonjs/views/SkiaView.js.map +1 -1
- package/lib/commonjs/views/index.js +13 -0
- package/lib/commonjs/views/index.js.map +1 -1
- package/lib/commonjs/views/types.js.map +1 -1
- package/lib/module/dom/nodes/JsiSkDOM.js +56 -56
- package/lib/module/dom/nodes/JsiSkDOM.js.map +1 -1
- package/lib/module/dom/nodes/RenderNode.js +1 -1
- package/lib/module/dom/nodes/RenderNode.js.map +1 -1
- package/lib/module/renderer/Canvas.js +20 -64
- package/lib/module/renderer/Canvas.js.map +1 -1
- package/lib/module/renderer/DependencyManager.js +0 -5
- package/lib/module/renderer/DependencyManager.js.map +1 -1
- package/lib/module/renderer/HostComponents.js.map +1 -1
- package/lib/module/renderer/useCanvas.js +2 -13
- package/lib/module/renderer/useCanvas.js.map +1 -1
- package/lib/module/views/SkiaBaseWebView.js +7 -0
- package/lib/module/views/SkiaBaseWebView.js.map +1 -1
- package/lib/module/views/SkiaDomView.js +128 -0
- package/lib/module/views/SkiaDomView.js.map +1 -0
- package/lib/module/views/SkiaDomView.web.js +41 -0
- package/lib/module/views/SkiaDomView.web.js.map +1 -0
- package/lib/module/views/SkiaPictureView.js +14 -2
- package/lib/module/views/SkiaPictureView.js.map +1 -1
- package/lib/module/views/SkiaView.js +15 -2
- package/lib/module/views/SkiaView.js.map +1 -1
- package/lib/module/views/index.js +1 -0
- package/lib/module/views/index.js.map +1 -1
- package/lib/module/views/types.js.map +1 -1
- package/lib/typescript/src/dom/nodes/JsiSkDOM.d.ts +57 -64
- package/lib/typescript/src/renderer/Canvas.d.ts +5 -8
- package/lib/typescript/src/renderer/DependencyManager.d.ts +0 -2
- package/lib/typescript/src/renderer/HostComponents.d.ts +66 -3
- package/lib/typescript/src/renderer/useCanvas.d.ts +0 -6
- package/lib/typescript/src/views/SkiaBaseWebView.d.ts +4 -0
- package/lib/typescript/src/views/SkiaDomView.d.ts +31 -0
- package/lib/typescript/src/views/SkiaDomView.web.d.ts +7 -0
- package/lib/typescript/src/views/index.d.ts +1 -0
- package/lib/typescript/src/views/types.d.ts +12 -2
- package/libs/android/arm64-v8a/libskia.a +0 -0
- package/libs/android/arm64-v8a/libskottie.a +0 -0
- package/libs/android/arm64-v8a/libskparagraph.a +0 -0
- package/libs/android/arm64-v8a/libsksg.a +0 -0
- package/libs/android/arm64-v8a/libskshaper.a +0 -0
- package/libs/android/arm64-v8a/libskunicode.a +0 -0
- package/libs/android/arm64-v8a/libsvg.a +0 -0
- package/libs/android/armeabi-v7a/libskia.a +0 -0
- package/libs/android/armeabi-v7a/libskottie.a +0 -0
- package/libs/android/armeabi-v7a/libskparagraph.a +0 -0
- package/libs/android/armeabi-v7a/libsksg.a +0 -0
- package/libs/android/armeabi-v7a/libskshaper.a +0 -0
- package/libs/android/armeabi-v7a/libskunicode.a +0 -0
- package/libs/android/armeabi-v7a/libsvg.a +0 -0
- package/libs/android/x86/libskia.a +0 -0
- package/libs/android/x86/libskottie.a +0 -0
- package/libs/android/x86/libskparagraph.a +0 -0
- package/libs/android/x86/libsksg.a +0 -0
- package/libs/android/x86/libskshaper.a +0 -0
- package/libs/android/x86/libskunicode.a +0 -0
- package/libs/android/x86/libsvg.a +0 -0
- package/libs/android/x86_64/libskia.a +0 -0
- package/libs/android/x86_64/libskottie.a +0 -0
- package/libs/android/x86_64/libskparagraph.a +0 -0
- package/libs/android/x86_64/libsksg.a +0 -0
- package/libs/android/x86_64/libskshaper.a +0 -0
- package/libs/android/x86_64/libskunicode.a +0 -0
- package/libs/android/x86_64/libsvg.a +0 -0
- package/libs/ios/libskia.xcframework/ios-arm64_arm64e/libskia.a +0 -0
- package/libs/ios/libskia.xcframework/ios-arm64_arm64e_x86_64-simulator/libskia.a +0 -0
- package/libs/ios/libskottie.xcframework/ios-arm64_arm64e/libskottie.a +0 -0
- package/libs/ios/libskottie.xcframework/ios-arm64_arm64e_x86_64-simulator/libskottie.a +0 -0
- package/libs/ios/libskparagraph.xcframework/Info.plist +42 -0
- package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e/libskparagraph.a +0 -0
- package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e_x86_64-simulator/libskparagraph.a +0 -0
- package/libs/ios/libsksg.xcframework/Info.plist +5 -5
- package/libs/ios/libsksg.xcframework/ios-arm64_arm64e/libsksg.a +0 -0
- package/libs/ios/libsksg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsksg.a +0 -0
- package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e/libskshaper.a +0 -0
- package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e_x86_64-simulator/libskshaper.a +0 -0
- package/libs/ios/libskunicode.xcframework/Info.plist +42 -0
- package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e/libskunicode.a +0 -0
- package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e_x86_64-simulator/libskunicode.a +0 -0
- package/libs/ios/libsvg.xcframework/ios-arm64_arm64e/libsvg.a +0 -0
- package/libs/ios/libsvg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsvg.a +0 -0
- package/package.json +5 -3
- package/react-native-skia.podspec +3 -1
- package/src/dom/nodes/JsiSkDOM.ts +170 -56
- package/src/dom/nodes/RenderNode.ts +1 -1
- package/src/renderer/Canvas.tsx +32 -56
- package/src/renderer/DependencyManager.tsx +0 -5
- package/src/renderer/HostComponents.ts +152 -1
- package/src/renderer/useCanvas.ts +1 -15
- package/src/views/SkiaBaseWebView.tsx +4 -0
- package/src/views/SkiaDomView.tsx +120 -0
- package/src/views/SkiaDomView.web.tsx +37 -0
- package/src/views/SkiaPictureView.tsx +10 -2
- package/src/views/SkiaView.tsx +11 -3
- package/src/views/index.ts +1 -0
- package/src/views/types.ts +19 -2
- package/cpp/jsi/JsiSimpleValueWrapper.h +0 -99
- package/cpp/skia/include/c/sk_canvas.h +0 -159
- package/cpp/skia/include/c/sk_colorspace.h +0 -25
- package/cpp/skia/include/c/sk_data.h +0 -65
- package/cpp/skia/include/c/sk_image.h +0 -71
- package/cpp/skia/include/c/sk_imageinfo.h +0 -62
- package/cpp/skia/include/c/sk_maskfilter.h +0 -47
- package/cpp/skia/include/c/sk_matrix.h +0 -49
- package/cpp/skia/include/c/sk_paint.h +0 -145
- package/cpp/skia/include/c/sk_path.h +0 -102
- package/cpp/skia/include/c/sk_picture.h +0 -70
- package/cpp/skia/include/c/sk_shader.h +0 -143
- package/cpp/skia/include/c/sk_surface.h +0 -73
- package/cpp/skia/include/c/sk_types.h +0 -278
- package/cpp/skia/include/gpu/graphite/SkStuff.h +0 -47
- package/cpp/skia/include/private/SkNx.h +0 -430
- package/cpp/skia/include/private/SkNx_neon.h +0 -713
- package/cpp/skia/include/private/SkNx_sse.h +0 -823
- package/cpp/skia/include/sksl/DSLRuntimeEffects.h +0 -32
- package/cpp/skia/include/sksl/DSLWrapper.h +0 -77
@@ -0,0 +1,1609 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright 2018 Google Inc.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license that can be
|
5
|
+
* found in the LICENSE file.
|
6
|
+
*/
|
7
|
+
|
8
|
+
// Intentionally NO #pragma once... included multiple times.
|
9
|
+
|
10
|
+
// This file is included from skcms.cc in a namespace with some pre-defines:
|
11
|
+
// - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
|
12
|
+
// - V<T>: a template to create a vector of N T's.
|
13
|
+
|
14
|
+
using F = V<Color>; // Called F for historic reasons... maybe rename C?
|
15
|
+
using I32 = V<int32_t>;
|
16
|
+
using U64 = V<uint64_t>;
|
17
|
+
using U32 = V<uint32_t>;
|
18
|
+
using U16 = V<uint16_t>;
|
19
|
+
using U8 = V<uint8_t>;
|
20
|
+
|
21
|
+
|
22
|
+
#if defined(__GNUC__) && !defined(__clang__)
|
23
|
+
// Once again, GCC is kind of weird, not allowing vector = scalar directly.
|
24
|
+
static constexpr F F0 = F() + 0.0f,
|
25
|
+
F1 = F() + 1.0f,
|
26
|
+
FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
|
27
|
+
#else
|
28
|
+
static constexpr F F0 = 0.0f,
|
29
|
+
F1 = 1.0f,
|
30
|
+
FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
|
31
|
+
#endif
|
32
|
+
|
33
|
+
// Instead of checking __AVX__ below, we'll check USING_AVX.
|
34
|
+
// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
|
35
|
+
// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
|
36
|
+
|
37
|
+
#if !defined(USING_AVX) && N == 8 && defined(__AVX__)
|
38
|
+
#define USING_AVX
|
39
|
+
#endif
|
40
|
+
#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
|
41
|
+
#define USING AVX_F16C
|
42
|
+
#endif
|
43
|
+
#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
|
44
|
+
#define USING_AVX2
|
45
|
+
#endif
|
46
|
+
#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
|
47
|
+
#define USING_AVX512F
|
48
|
+
#endif
|
49
|
+
|
50
|
+
// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
|
51
|
+
// This is more for organizational clarity... skcms.cc doesn't force these.
|
52
|
+
#if N > 1 && defined(__ARM_NEON)
|
53
|
+
#define USING_NEON
|
54
|
+
#if __ARM_FP & 2
|
55
|
+
#define USING_NEON_F16C
|
56
|
+
#endif
|
57
|
+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
|
58
|
+
#define USING_NEON_FP16
|
59
|
+
#endif
|
60
|
+
#endif
|
61
|
+
|
62
|
+
// These -Wvector-conversion warnings seem to trigger in very bogus situations,
|
63
|
+
// like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
|
64
|
+
#if defined(USING_NEON) && defined(__clang__)
|
65
|
+
#pragma clang diagnostic ignored "-Wvector-conversion"
|
66
|
+
#endif
|
67
|
+
|
68
|
+
// GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
|
69
|
+
// You'd see warnings like, "using AVX even though AVX is not enabled".
|
70
|
+
// We stifle these warnings; our helpers that return U64 are always inlined.
|
71
|
+
#if defined(__SSE__) && defined(__GNUC__)
|
72
|
+
#if !defined(__has_warning)
|
73
|
+
#pragma GCC diagnostic ignored "-Wpsabi"
|
74
|
+
#elif __has_warning("-Wpsabi")
|
75
|
+
#pragma GCC diagnostic ignored "-Wpsabi"
|
76
|
+
#endif
|
77
|
+
#endif
|
78
|
+
|
79
|
+
#if defined(__clang__)
|
80
|
+
#define FALLTHROUGH [[clang::fallthrough]]
|
81
|
+
#else
|
82
|
+
#define FALLTHROUGH
|
83
|
+
#endif
|
84
|
+
|
85
|
+
// We tag most helper functions as SI, to enforce good code generation
|
86
|
+
// but also work around what we think is a bug in GCC: when targeting 32-bit
|
87
|
+
// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
|
88
|
+
// MMX mm0 register, which seems to mess with unrelated code that later uses
|
89
|
+
// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
|
90
|
+
//
|
91
|
+
// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
|
92
|
+
#if defined(__clang__) || defined(__GNUC__)
|
93
|
+
#define SI static inline __attribute__((always_inline))
|
94
|
+
#else
|
95
|
+
#define SI static inline
|
96
|
+
#endif
|
97
|
+
|
98
|
+
template <typename T, typename P>
|
99
|
+
SI T load(const P* ptr) {
|
100
|
+
T val;
|
101
|
+
small_memcpy(&val, ptr, sizeof(val));
|
102
|
+
return val;
|
103
|
+
}
|
104
|
+
template <typename T, typename P>
|
105
|
+
SI void store(P* ptr, const T& val) {
|
106
|
+
small_memcpy(ptr, &val, sizeof(val));
|
107
|
+
}
|
108
|
+
|
109
|
+
// (T)v is a cast when N == 1 and a bit-pun when N>1,
|
110
|
+
// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
|
111
|
+
template <typename D, typename S>
|
112
|
+
SI D cast(const S& v) {
|
113
|
+
#if N == 1
|
114
|
+
return (D)v;
|
115
|
+
#elif defined(__clang__)
|
116
|
+
return __builtin_convertvector(v, D);
|
117
|
+
#else
|
118
|
+
D d;
|
119
|
+
for (int i = 0; i < N; i++) {
|
120
|
+
d[i] = v[i];
|
121
|
+
}
|
122
|
+
return d;
|
123
|
+
#endif
|
124
|
+
}
|
125
|
+
|
126
|
+
template <typename D, typename S>
|
127
|
+
SI D bit_pun(const S& v) {
|
128
|
+
static_assert(sizeof(D) == sizeof(v), "");
|
129
|
+
return load<D>(&v);
|
130
|
+
}
|
131
|
+
|
132
|
+
// When we convert from float to fixed point, it's very common to want to round,
|
133
|
+
// and for some reason compilers generate better code when converting to int32_t.
|
134
|
+
// To serve both those ends, we use this function to_fixed() instead of direct cast().
|
135
|
+
#if defined(USING_NEON_FP16)
|
136
|
+
// NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
|
137
|
+
SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
|
138
|
+
#else
|
139
|
+
SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
|
140
|
+
#endif
|
141
|
+
|
142
|
+
|
143
|
+
// Sometimes we do something crazy on one branch of a conditonal,
|
144
|
+
// like divide by zero or convert a huge float to an integer,
|
145
|
+
// but then harmlessly select the other side. That trips up N==1
|
146
|
+
// sanitizer builds, so we make if_then_else() a macro to avoid
|
147
|
+
// evaluating the unused side.
|
148
|
+
|
149
|
+
#if N == 1
|
150
|
+
#define if_then_else(cond, t, e) ((cond) ? (t) : (e))
|
151
|
+
#else
|
152
|
+
template <typename C, typename T>
|
153
|
+
SI T if_then_else(C cond, T t, T e) {
|
154
|
+
return bit_pun<T>( ( cond & bit_pun<C>(t)) |
|
155
|
+
(~cond & bit_pun<C>(e)) );
|
156
|
+
}
|
157
|
+
#endif
|
158
|
+
|
159
|
+
|
160
|
+
SI F F_from_Half(U16 half) {
|
161
|
+
#if defined(USING_NEON_FP16)
|
162
|
+
return bit_pun<F>(half);
|
163
|
+
#elif defined(USING_NEON_F16C)
|
164
|
+
return vcvt_f32_f16((float16x4_t)half);
|
165
|
+
#elif defined(USING_AVX512F)
|
166
|
+
return (F)_mm512_cvtph_ps((__m256i)half);
|
167
|
+
#elif defined(USING_AVX_F16C)
|
168
|
+
typedef int16_t __attribute__((vector_size(16))) I16;
|
169
|
+
return __builtin_ia32_vcvtph2ps256((I16)half);
|
170
|
+
#else
|
171
|
+
U32 wide = cast<U32>(half);
|
172
|
+
// A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
|
173
|
+
U32 s = wide & 0x8000,
|
174
|
+
em = wide ^ s;
|
175
|
+
|
176
|
+
// Constructing the float is easy if the half is not denormalized.
|
177
|
+
F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
|
178
|
+
|
179
|
+
// Simply flush all denorm half floats to zero.
|
180
|
+
return if_then_else(em < 0x0400, F0, norm);
|
181
|
+
#endif
|
182
|
+
}
|
183
|
+
|
184
|
+
#if defined(__clang__)
|
185
|
+
// The -((127-15)<<10) underflows that side of the math when
|
186
|
+
// we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
|
187
|
+
__attribute__((no_sanitize("unsigned-integer-overflow")))
|
188
|
+
#endif
|
189
|
+
SI U16 Half_from_F(F f) {
|
190
|
+
#if defined(USING_NEON_FP16)
|
191
|
+
return bit_pun<U16>(f);
|
192
|
+
#elif defined(USING_NEON_F16C)
|
193
|
+
return (U16)vcvt_f16_f32(f);
|
194
|
+
#elif defined(USING_AVX512F)
|
195
|
+
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
|
196
|
+
#elif defined(USING_AVX_F16C)
|
197
|
+
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
|
198
|
+
#else
|
199
|
+
// A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
|
200
|
+
U32 sem = bit_pun<U32>(f),
|
201
|
+
s = sem & 0x80000000,
|
202
|
+
em = sem ^ s;
|
203
|
+
|
204
|
+
// For simplicity we flush denorm half floats (including all denorm floats) to zero.
|
205
|
+
return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
|
206
|
+
, (s>>16) + (em>>13) - ((127-15)<<10)));
|
207
|
+
#endif
|
208
|
+
}
|
209
|
+
|
210
|
+
// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
|
211
|
+
#if defined(USING_NEON_FP16)
|
212
|
+
SI U16 swap_endian_16(U16 v) {
|
213
|
+
return (U16)vrev16q_u8((uint8x16_t) v);
|
214
|
+
}
|
215
|
+
#elif defined(USING_NEON)
|
216
|
+
SI U16 swap_endian_16(U16 v) {
|
217
|
+
return (U16)vrev16_u8((uint8x8_t) v);
|
218
|
+
}
|
219
|
+
#endif
|
220
|
+
|
221
|
+
SI U64 swap_endian_16x4(const U64& rgba) {
|
222
|
+
return (rgba & 0x00ff00ff00ff00ff) << 8
|
223
|
+
| (rgba & 0xff00ff00ff00ff00) >> 8;
|
224
|
+
}
|
225
|
+
|
226
|
+
#if defined(USING_NEON_FP16)
|
227
|
+
SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
|
228
|
+
SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
|
229
|
+
#elif defined(USING_NEON)
|
230
|
+
SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
|
231
|
+
SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
|
232
|
+
#else
|
233
|
+
SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
|
234
|
+
SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
|
235
|
+
#endif
|
236
|
+
|
237
|
+
SI F floor_(F x) {
|
238
|
+
#if N == 1
|
239
|
+
return floorf_(x);
|
240
|
+
#elif defined(USING_NEON_FP16)
|
241
|
+
return vrndmq_f16(x);
|
242
|
+
#elif defined(__aarch64__)
|
243
|
+
return vrndmq_f32(x);
|
244
|
+
#elif defined(USING_AVX512F)
|
245
|
+
// Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
|
246
|
+
// and integer santizer catches that this implicit cast changes the
|
247
|
+
// value from -1 to 65535. We'll cast manually to work around it.
|
248
|
+
// Read this as `return _mm512_floor_ps(x)`.
|
249
|
+
return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
|
250
|
+
#elif defined(USING_AVX)
|
251
|
+
return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
|
252
|
+
#elif defined(__SSE4_1__)
|
253
|
+
return _mm_floor_ps(x);
|
254
|
+
#else
|
255
|
+
// Round trip through integers with a truncating cast.
|
256
|
+
F roundtrip = cast<F>(cast<I32>(x));
|
257
|
+
// If x is negative, truncating gives the ceiling instead of the floor.
|
258
|
+
return roundtrip - if_then_else(roundtrip > x, F1, F0);
|
259
|
+
|
260
|
+
// This implementation fails for values of x that are outside
|
261
|
+
// the range an integer can represent. We expect most x to be small.
|
262
|
+
#endif
|
263
|
+
}
|
264
|
+
|
265
|
+
SI F approx_log2(F x) {
|
266
|
+
#if defined(USING_NEON_FP16)
|
267
|
+
// TODO(mtklein)
|
268
|
+
return x;
|
269
|
+
#else
|
270
|
+
// The first approximation of log2(x) is its exponent 'e', minus 127.
|
271
|
+
I32 bits = bit_pun<I32>(x);
|
272
|
+
|
273
|
+
F e = cast<F>(bits) * (1.0f / (1<<23));
|
274
|
+
|
275
|
+
// If we use the mantissa too we can refine the error signficantly.
|
276
|
+
F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
|
277
|
+
|
278
|
+
return e - 124.225514990f
|
279
|
+
- 1.498030302f*m
|
280
|
+
- 1.725879990f/(0.3520887068f + m);
|
281
|
+
#endif
|
282
|
+
}
|
283
|
+
|
284
|
+
SI F approx_log(F x) {
|
285
|
+
const float ln2 = 0.69314718f;
|
286
|
+
return ln2 * approx_log2(x);
|
287
|
+
}
|
288
|
+
|
289
|
+
SI F approx_exp2(F x) {
|
290
|
+
#if defined(USING_NEON_FP16)
|
291
|
+
// TODO(mtklein)
|
292
|
+
return x;
|
293
|
+
#else
|
294
|
+
F fract = x - floor_(x);
|
295
|
+
|
296
|
+
F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
|
297
|
+
- 1.490129070f*fract
|
298
|
+
+ 27.728023300f/(4.84252568f - fract));
|
299
|
+
I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
|
300
|
+
|
301
|
+
return bit_pun<F>(bits);
|
302
|
+
#endif
|
303
|
+
}
|
304
|
+
|
305
|
+
SI F approx_pow(F x, float y) {
|
306
|
+
return if_then_else((x == F0) | (x == F1), x
|
307
|
+
, approx_exp2(approx_log2(x) * y));
|
308
|
+
}
|
309
|
+
|
310
|
+
SI F approx_exp(F x) {
|
311
|
+
const float log2_e = 1.4426950408889634074f;
|
312
|
+
return approx_exp2(log2_e * x);
|
313
|
+
}
|
314
|
+
|
315
|
+
// Return tf(x).
|
316
|
+
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
|
317
|
+
#if defined(USING_NEON_FP16)
|
318
|
+
// TODO(mtklein)
|
319
|
+
(void)tf;
|
320
|
+
return x;
|
321
|
+
#else
|
322
|
+
// Peel off the sign bit and set x = |x|.
|
323
|
+
U32 bits = bit_pun<U32>(x),
|
324
|
+
sign = bits & 0x80000000;
|
325
|
+
x = bit_pun<F>(bits ^ sign);
|
326
|
+
|
327
|
+
// The transfer function has a linear part up to d, exponential at d and after.
|
328
|
+
F v = if_then_else(x < tf->d, tf->c*x + tf->f
|
329
|
+
, approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
|
330
|
+
|
331
|
+
// Tack the sign bit back on.
|
332
|
+
return bit_pun<F>(sign | bit_pun<U32>(v));
|
333
|
+
#endif
|
334
|
+
}
|
335
|
+
|
336
|
+
SI F apply_pq(const skcms_TransferFunction* tf, F x) {
|
337
|
+
#if defined(USING_NEON_FP16)
|
338
|
+
// TODO(mtklein)
|
339
|
+
(void)tf;
|
340
|
+
return x;
|
341
|
+
#else
|
342
|
+
U32 bits = bit_pun<U32>(x),
|
343
|
+
sign = bits & 0x80000000;
|
344
|
+
x = bit_pun<F>(bits ^ sign);
|
345
|
+
|
346
|
+
F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
|
347
|
+
/ (tf->d + tf->e * approx_pow(x, tf->c)),
|
348
|
+
tf->f);
|
349
|
+
|
350
|
+
return bit_pun<F>(sign | bit_pun<U32>(v));
|
351
|
+
#endif
|
352
|
+
}
|
353
|
+
|
354
|
+
SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
|
355
|
+
#if defined(USING_NEON_FP16)
|
356
|
+
// TODO(mtklein)
|
357
|
+
(void)tf;
|
358
|
+
return x;
|
359
|
+
#else
|
360
|
+
const float R = tf->a, G = tf->b,
|
361
|
+
a = tf->c, b = tf->d, c = tf->e,
|
362
|
+
K = tf->f + 1;
|
363
|
+
U32 bits = bit_pun<U32>(x),
|
364
|
+
sign = bits & 0x80000000;
|
365
|
+
x = bit_pun<F>(bits ^ sign);
|
366
|
+
|
367
|
+
F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
|
368
|
+
, approx_exp((x-c)*a) + b);
|
369
|
+
|
370
|
+
return K*bit_pun<F>(sign | bit_pun<U32>(v));
|
371
|
+
#endif
|
372
|
+
}
|
373
|
+
|
374
|
+
SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
|
375
|
+
#if defined(USING_NEON_FP16)
|
376
|
+
// TODO(mtklein)
|
377
|
+
(void)tf;
|
378
|
+
return x;
|
379
|
+
#else
|
380
|
+
const float R = tf->a, G = tf->b,
|
381
|
+
a = tf->c, b = tf->d, c = tf->e,
|
382
|
+
K = tf->f + 1;
|
383
|
+
U32 bits = bit_pun<U32>(x),
|
384
|
+
sign = bits & 0x80000000;
|
385
|
+
x = bit_pun<F>(bits ^ sign);
|
386
|
+
x /= K;
|
387
|
+
|
388
|
+
F v = if_then_else(x <= 1, R * approx_pow(x, G)
|
389
|
+
, a * approx_log(x - b) + c);
|
390
|
+
|
391
|
+
return bit_pun<F>(sign | bit_pun<U32>(v));
|
392
|
+
#endif
|
393
|
+
}
|
394
|
+
|
395
|
+
|
396
|
+
// Strided loads and stores of N values, starting from p.
|
397
|
+
template <typename T, typename P>
|
398
|
+
SI T load_3(const P* p) {
|
399
|
+
#if N == 1
|
400
|
+
return (T)p[0];
|
401
|
+
#elif N == 4
|
402
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9]};
|
403
|
+
#elif N == 8
|
404
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
|
405
|
+
#elif N == 16
|
406
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
|
407
|
+
p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
|
408
|
+
#endif
|
409
|
+
}
|
410
|
+
|
411
|
+
template <typename T, typename P>
|
412
|
+
SI T load_4(const P* p) {
|
413
|
+
#if N == 1
|
414
|
+
return (T)p[0];
|
415
|
+
#elif N == 4
|
416
|
+
return T{p[ 0],p[ 4],p[ 8],p[12]};
|
417
|
+
#elif N == 8
|
418
|
+
return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
|
419
|
+
#elif N == 16
|
420
|
+
return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
|
421
|
+
p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
|
422
|
+
#endif
|
423
|
+
}
|
424
|
+
|
425
|
+
template <typename T, typename P>
|
426
|
+
SI void store_3(P* p, const T& v) {
|
427
|
+
#if N == 1
|
428
|
+
p[0] = v;
|
429
|
+
#elif N == 4
|
430
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
431
|
+
#elif N == 8
|
432
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
433
|
+
p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
|
434
|
+
#elif N == 16
|
435
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
436
|
+
p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
|
437
|
+
p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
|
438
|
+
p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
|
439
|
+
#endif
|
440
|
+
}
|
441
|
+
|
442
|
+
template <typename T, typename P>
|
443
|
+
SI void store_4(P* p, const T& v) {
|
444
|
+
#if N == 1
|
445
|
+
p[0] = v;
|
446
|
+
#elif N == 4
|
447
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
448
|
+
#elif N == 8
|
449
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
450
|
+
p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
|
451
|
+
#elif N == 16
|
452
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
453
|
+
p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
|
454
|
+
p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
|
455
|
+
p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
|
456
|
+
#endif
|
457
|
+
}
|
458
|
+
|
459
|
+
|
460
|
+
SI U8 gather_8(const uint8_t* p, I32 ix) {
|
461
|
+
#if N == 1
|
462
|
+
U8 v = p[ix];
|
463
|
+
#elif N == 4
|
464
|
+
U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
|
465
|
+
#elif N == 8
|
466
|
+
U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
|
467
|
+
p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
|
468
|
+
#elif N == 16
|
469
|
+
U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
|
470
|
+
p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
|
471
|
+
p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
|
472
|
+
p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
|
473
|
+
#endif
|
474
|
+
return v;
|
475
|
+
}
|
476
|
+
|
477
|
+
SI U16 gather_16(const uint8_t* p, I32 ix) {
|
478
|
+
// Load the i'th 16-bit value from p.
|
479
|
+
auto load_16 = [p](int i) {
|
480
|
+
return load<uint16_t>(p + 2*i);
|
481
|
+
};
|
482
|
+
#if N == 1
|
483
|
+
U16 v = load_16(ix);
|
484
|
+
#elif N == 4
|
485
|
+
U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
|
486
|
+
#elif N == 8
|
487
|
+
U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
|
488
|
+
load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
|
489
|
+
#elif N == 16
|
490
|
+
U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
|
491
|
+
load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
|
492
|
+
load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
|
493
|
+
load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
|
494
|
+
#endif
|
495
|
+
return v;
|
496
|
+
}
|
497
|
+
|
498
|
+
SI U32 gather_32(const uint8_t* p, I32 ix) {
|
499
|
+
// Load the i'th 32-bit value from p.
|
500
|
+
auto load_32 = [p](int i) {
|
501
|
+
return load<uint32_t>(p + 4*i);
|
502
|
+
};
|
503
|
+
#if N == 1
|
504
|
+
U32 v = load_32(ix);
|
505
|
+
#elif N == 4
|
506
|
+
U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
|
507
|
+
#elif N == 8
|
508
|
+
U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
|
509
|
+
load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
|
510
|
+
#elif N == 16
|
511
|
+
U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
|
512
|
+
load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
|
513
|
+
load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
|
514
|
+
load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
|
515
|
+
#endif
|
516
|
+
// TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
|
517
|
+
return v;
|
518
|
+
}
|
519
|
+
|
520
|
+
SI U32 gather_24(const uint8_t* p, I32 ix) {
|
521
|
+
// First, back up a byte. Any place we're gathering from has a safe junk byte to read
|
522
|
+
// in front of it, either a previous table value, or some tag metadata.
|
523
|
+
p -= 1;
|
524
|
+
|
525
|
+
// Load the i'th 24-bit value from p, and 1 extra byte.
|
526
|
+
auto load_24_32 = [p](int i) {
|
527
|
+
return load<uint32_t>(p + 3*i);
|
528
|
+
};
|
529
|
+
|
530
|
+
// Now load multiples of 4 bytes (a junk byte, then r,g,b).
|
531
|
+
#if N == 1
|
532
|
+
U32 v = load_24_32(ix);
|
533
|
+
#elif N == 4
|
534
|
+
U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
|
535
|
+
#elif N == 8 && !defined(USING_AVX2)
|
536
|
+
U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
|
537
|
+
load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
|
538
|
+
#elif N == 8
|
539
|
+
(void)load_24_32;
|
540
|
+
// The gather instruction here doesn't need any particular alignment,
|
541
|
+
// but the intrinsic takes a const int*.
|
542
|
+
const int* p4 = bit_pun<const int*>(p);
|
543
|
+
I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
|
544
|
+
mask = {-1,-1,-1,-1, -1,-1,-1,-1};
|
545
|
+
#if defined(__clang__)
|
546
|
+
U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
|
547
|
+
#elif defined(__GNUC__)
|
548
|
+
U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
|
549
|
+
#endif
|
550
|
+
#elif N == 16
|
551
|
+
(void)load_24_32;
|
552
|
+
// The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
|
553
|
+
// And AVX-512 swapped the order of arguments. :/
|
554
|
+
const int* p4 = bit_pun<const int*>(p);
|
555
|
+
U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
|
556
|
+
#endif
|
557
|
+
|
558
|
+
// Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
|
559
|
+
return v >> 8;
|
560
|
+
}
|
561
|
+
|
562
|
+
#if !defined(__arm__)
|
563
|
+
SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
|
564
|
+
// As in gather_24(), with everything doubled.
|
565
|
+
p -= 2;
|
566
|
+
|
567
|
+
// Load the i'th 48-bit value from p, and 2 extra bytes.
|
568
|
+
auto load_48_64 = [p](int i) {
|
569
|
+
return load<uint64_t>(p + 6*i);
|
570
|
+
};
|
571
|
+
|
572
|
+
#if N == 1
|
573
|
+
*v = load_48_64(ix);
|
574
|
+
#elif N == 4
|
575
|
+
*v = U64{
|
576
|
+
load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
|
577
|
+
};
|
578
|
+
#elif N == 8 && !defined(USING_AVX2)
|
579
|
+
*v = U64{
|
580
|
+
load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
|
581
|
+
load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
|
582
|
+
};
|
583
|
+
#elif N == 8
|
584
|
+
(void)load_48_64;
|
585
|
+
typedef int32_t __attribute__((vector_size(16))) Half_I32;
|
586
|
+
typedef long long __attribute__((vector_size(32))) Half_I64;
|
587
|
+
|
588
|
+
// The gather instruction here doesn't need any particular alignment,
|
589
|
+
// but the intrinsic takes a const long long*.
|
590
|
+
const long long int* p8 = bit_pun<const long long int*>(p);
|
591
|
+
|
592
|
+
Half_I64 zero = { 0, 0, 0, 0},
|
593
|
+
mask = {-1,-1,-1,-1};
|
594
|
+
|
595
|
+
ix *= 6;
|
596
|
+
Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
|
597
|
+
ix_hi = { ix[4], ix[5], ix[6], ix[7] };
|
598
|
+
|
599
|
+
#if defined(__clang__)
|
600
|
+
Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
|
601
|
+
hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
|
602
|
+
#elif defined(__GNUC__)
|
603
|
+
Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
|
604
|
+
hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
|
605
|
+
#endif
|
606
|
+
store((char*)v + 0, lo);
|
607
|
+
store((char*)v + 32, hi);
|
608
|
+
#elif N == 16
|
609
|
+
(void)load_48_64;
|
610
|
+
const long long int* p8 = bit_pun<const long long int*>(p);
|
611
|
+
__m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
|
612
|
+
hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
|
613
|
+
store((char*)v + 0, lo);
|
614
|
+
store((char*)v + 64, hi);
|
615
|
+
#endif
|
616
|
+
|
617
|
+
*v >>= 16;
|
618
|
+
}
|
619
|
+
#endif
|
620
|
+
|
621
|
+
SI F F_from_U8(U8 v) {
|
622
|
+
return cast<F>(v) * (1/255.0f);
|
623
|
+
}
|
624
|
+
|
625
|
+
SI F F_from_U16_BE(U16 v) {
|
626
|
+
// All 16-bit ICC values are big-endian, so we byte swap before converting to float.
|
627
|
+
// MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
|
628
|
+
U16 lo = (v >> 8),
|
629
|
+
hi = (v << 8) & 0xffff;
|
630
|
+
return cast<F>(lo|hi) * (1/65535.0f);
|
631
|
+
}
|
632
|
+
|
633
|
+
SI U16 U16_from_F(F v) {
|
634
|
+
// 65535 == inf in FP16, so promote to FP32 before converting.
|
635
|
+
return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
|
636
|
+
}
|
637
|
+
|
638
|
+
SI F minus_1_ulp(F v) {
|
639
|
+
#if defined(USING_NEON_FP16)
|
640
|
+
return bit_pun<F>( bit_pun<U16>(v) - 1 );
|
641
|
+
#else
|
642
|
+
return bit_pun<F>( bit_pun<U32>(v) - 1 );
|
643
|
+
#endif
|
644
|
+
}
|
645
|
+
|
646
|
+
SI F table(const skcms_Curve* curve, F v) {
|
647
|
+
// Clamp the input to [0,1], then scale to a table index.
|
648
|
+
F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
|
649
|
+
|
650
|
+
// We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
|
651
|
+
I32 lo = cast<I32>( ix ),
|
652
|
+
hi = cast<I32>(minus_1_ulp(ix+1.0f));
|
653
|
+
F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
|
654
|
+
|
655
|
+
// TODO: can we load l and h simultaneously? Each entry in 'h' is either
|
656
|
+
// the same as in 'l' or adjacent. We have a rough idea that's it'd always be safe
|
657
|
+
// to read adjacent entries and perhaps underflow the table by a byte or two
|
658
|
+
// (it'd be junk, but always safe to read). Not sure how to lerp yet.
|
659
|
+
F l,h;
|
660
|
+
if (curve->table_8) {
|
661
|
+
l = F_from_U8(gather_8(curve->table_8, lo));
|
662
|
+
h = F_from_U8(gather_8(curve->table_8, hi));
|
663
|
+
} else {
|
664
|
+
l = F_from_U16_BE(gather_16(curve->table_16, lo));
|
665
|
+
h = F_from_U16_BE(gather_16(curve->table_16, hi));
|
666
|
+
}
|
667
|
+
return l + (h-l)*t;
|
668
|
+
}
|
669
|
+
|
670
|
+
SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
|
671
|
+
U32 rgb = gather_24(grid_8, ix);
|
672
|
+
|
673
|
+
*r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
|
674
|
+
*g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
|
675
|
+
*b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
|
676
|
+
}
|
677
|
+
|
678
|
+
SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
|
679
|
+
// TODO: don't forget to optimize gather_32().
|
680
|
+
U32 rgba = gather_32(grid_8, ix);
|
681
|
+
|
682
|
+
*r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
|
683
|
+
*g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
|
684
|
+
*b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
|
685
|
+
*a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
|
686
|
+
}
|
687
|
+
|
688
|
+
SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
|
689
|
+
#if defined(__arm__)
|
690
|
+
// This is up to 2x faster on 32-bit ARM than the #else-case fast path.
|
691
|
+
*r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
|
692
|
+
*g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
|
693
|
+
*b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
|
694
|
+
#else
|
695
|
+
// This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
|
696
|
+
U64 rgb;
|
697
|
+
gather_48(grid_16, ix, &rgb);
|
698
|
+
rgb = swap_endian_16x4(rgb);
|
699
|
+
|
700
|
+
*r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
|
701
|
+
*g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
|
702
|
+
*b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
|
703
|
+
#endif
|
704
|
+
}
|
705
|
+
|
706
|
+
SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
|
707
|
+
// TODO: gather_64()-based fast path?
|
708
|
+
*r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
|
709
|
+
*g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
|
710
|
+
*b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
|
711
|
+
*a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
|
712
|
+
}
|
713
|
+
|
714
|
+
static void clut(uint32_t input_channels, uint32_t output_channels,
|
715
|
+
const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
|
716
|
+
F* r, F* g, F* b, F* a) {
|
717
|
+
|
718
|
+
const int dim = (int)input_channels;
|
719
|
+
assert (0 < dim && dim <= 4);
|
720
|
+
assert (output_channels == 3 ||
|
721
|
+
output_channels == 4);
|
722
|
+
|
723
|
+
// For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
|
724
|
+
I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
|
725
|
+
F weight[8]; // Weight for each contribution, again first low, then high.
|
726
|
+
|
727
|
+
// O(dim) work first: calculate index,weight from r,g,b,a.
|
728
|
+
const F inputs[] = { *r,*g,*b,*a };
|
729
|
+
for (int i = dim-1, stride = 1; i >= 0; i--) {
|
730
|
+
// x is where we logically want to sample the grid in the i-th dimension.
|
731
|
+
F x = inputs[i] * (float)(grid_points[i] - 1);
|
732
|
+
|
733
|
+
// But we can't index at floats. lo and hi are the two integer grid points surrounding x.
|
734
|
+
I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
|
735
|
+
hi = cast<I32>(minus_1_ulp(x+1.0f));
|
736
|
+
// Notice how we fold in the accumulated stride across previous dimensions here.
|
737
|
+
index[i+0] = lo * stride;
|
738
|
+
index[i+4] = hi * stride;
|
739
|
+
stride *= grid_points[i];
|
740
|
+
|
741
|
+
// We'll interpolate between those two integer grid points by t.
|
742
|
+
F t = x - cast<F>(lo); // i.e. fract(x)
|
743
|
+
weight[i+0] = 1-t;
|
744
|
+
weight[i+4] = t;
|
745
|
+
}
|
746
|
+
|
747
|
+
*r = *g = *b = F0;
|
748
|
+
if (output_channels == 4) {
|
749
|
+
*a = F0;
|
750
|
+
}
|
751
|
+
|
752
|
+
// We'll sample 2^dim == 1<<dim table entries per pixel,
|
753
|
+
// in all combinations of low and high in each dimension.
|
754
|
+
for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
|
755
|
+
|
756
|
+
// Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
|
757
|
+
// where 0 selects the low index contribution and its weight 1-t,
|
758
|
+
// or 4 the high index contribution and its weight t.
|
759
|
+
|
760
|
+
// Since 0<dim≤4, we can always just start off with the 0-th channel,
|
761
|
+
// then handle the others conditionally.
|
762
|
+
I32 ix = index [0 + (combo&1)*4];
|
763
|
+
F w = weight[0 + (combo&1)*4];
|
764
|
+
|
765
|
+
switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
|
766
|
+
case 3: ix += index [3 + (combo&8)/2];
|
767
|
+
w *= weight[3 + (combo&8)/2];
|
768
|
+
FALLTHROUGH;
|
769
|
+
// fall through
|
770
|
+
|
771
|
+
case 2: ix += index [2 + (combo&4)*1];
|
772
|
+
w *= weight[2 + (combo&4)*1];
|
773
|
+
FALLTHROUGH;
|
774
|
+
// fall through
|
775
|
+
|
776
|
+
case 1: ix += index [1 + (combo&2)*2];
|
777
|
+
w *= weight[1 + (combo&2)*2];
|
778
|
+
}
|
779
|
+
|
780
|
+
F R,G,B,A=F0;
|
781
|
+
if (output_channels == 3) {
|
782
|
+
if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
|
783
|
+
else { sample_clut_16(grid_16,ix, &R,&G,&B); }
|
784
|
+
} else {
|
785
|
+
if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
|
786
|
+
else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
|
787
|
+
}
|
788
|
+
*r += w*R;
|
789
|
+
*g += w*G;
|
790
|
+
*b += w*B;
|
791
|
+
*a += w*A;
|
792
|
+
}
|
793
|
+
}
|
794
|
+
|
795
|
+
static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
|
796
|
+
clut(a2b->input_channels, a2b->output_channels,
|
797
|
+
a2b->grid_points, a2b->grid_8, a2b->grid_16,
|
798
|
+
r,g,b,&a);
|
799
|
+
}
|
800
|
+
static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
|
801
|
+
clut(b2a->input_channels, b2a->output_channels,
|
802
|
+
b2a->grid_points, b2a->grid_8, b2a->grid_16,
|
803
|
+
r,g,b,a);
|
804
|
+
}
|
805
|
+
|
806
|
+
static void exec_ops(const Op* ops, const void** args,
|
807
|
+
const char* src, char* dst, int i) {
|
808
|
+
F r = F0, g = F0, b = F0, a = F1;
|
809
|
+
while (true) {
|
810
|
+
switch (*ops++) {
|
811
|
+
case Op_load_a8:{
|
812
|
+
a = F_from_U8(load<U8>(src + 1*i));
|
813
|
+
} break;
|
814
|
+
|
815
|
+
case Op_load_g8:{
|
816
|
+
r = g = b = F_from_U8(load<U8>(src + 1*i));
|
817
|
+
} break;
|
818
|
+
|
819
|
+
case Op_load_4444:{
|
820
|
+
U16 abgr = load<U16>(src + 2*i);
|
821
|
+
|
822
|
+
r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
|
823
|
+
g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
|
824
|
+
b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
|
825
|
+
a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
|
826
|
+
} break;
|
827
|
+
|
828
|
+
case Op_load_565:{
|
829
|
+
U16 rgb = load<U16>(src + 2*i);
|
830
|
+
|
831
|
+
r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
|
832
|
+
g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
|
833
|
+
b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
|
834
|
+
} break;
|
835
|
+

            case Op_load_888:{
                const uint8_t* rgb = (const uint8_t*)(src + 3*i);
            #if defined(USING_NEON_FP16)
                // See the explanation under USING_NEON below. This is that doubled up.
                uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
                v = vld3q_lane_u8(rgb+ 0, v,  0);
                v = vld3q_lane_u8(rgb+ 3, v,  2);
                v = vld3q_lane_u8(rgb+ 6, v,  4);
                v = vld3q_lane_u8(rgb+ 9, v,  6);

                v = vld3q_lane_u8(rgb+12, v,  8);
                v = vld3q_lane_u8(rgb+15, v, 10);
                v = vld3q_lane_u8(rgb+18, v, 12);
                v = vld3q_lane_u8(rgb+21, v, 14);

                r = cast<F>((U16)v.val[0]) * (1/255.0f);
                g = cast<F>((U16)v.val[1]) * (1/255.0f);
                b = cast<F>((U16)v.val[2]) * (1/255.0f);
            #elif defined(USING_NEON)
                // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
                // a time. Since we're doing that, we might as well load them into 16-bit lanes.
                // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
                uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
                v = vld3_lane_u8(rgb+0, v, 0);
                v = vld3_lane_u8(rgb+3, v, 2);
                v = vld3_lane_u8(rgb+6, v, 4);
                v = vld3_lane_u8(rgb+9, v, 6);

                // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
                // convert to F. (Again, U32 would be even better here if we drop ARMv7 or split
                // the ARMv7 and ARMv8 impls.)
                r = cast<F>((U16)v.val[0]) * (1/255.0f);
                g = cast<F>((U16)v.val[1]) * (1/255.0f);
                b = cast<F>((U16)v.val[2]) * (1/255.0f);
            #else
                r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
                g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
                b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
            #endif
            } break;

            case Op_load_8888:{
                U32 rgba = load<U32>(src + 4*i);

                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
            } break;

            case Op_load_8888_palette8:{
                const uint8_t* palette = (const uint8_t*) *args++;
                I32 ix = cast<I32>(load<U8>(src + 1*i));
                U32 rgba = gather_32(palette, ix);

                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
            } break;
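
Op_load_8888_palette8 reads one byte per pixel and uses it to index a 256-entry RGBA palette via gather_32. Per lane it behaves like the scalar lookup below (an illustrative sketch, not the skcms API):

    // Scalar equivalent of one lane of Op_load_8888_palette8.
    #include <cstdint>
    #include <cstring>
    static uint32_t palette_lookup(const uint8_t* palette, const uint8_t* src, int i) {
        uint8_t  ix = src[i];                   // one palette index per pixel
        uint32_t rgba;
        std::memcpy(&rgba, palette + 4*ix, 4);  // gather the ix-th 32-bit entry
        return rgba;                            // unpacked into r,g,b,a afterwards
    }
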

            case Op_load_1010102:{
                U32 rgba = load<U32>(src + 4*i);

                r = cast<F>((rgba >>  0) & 0x3ff) * (1/1023.0f);
                g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
                b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
                a = cast<F>((rgba >> 30) & 0x3  ) * (1/   3.0f);
            } break;

            case Op_load_161616LE:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
            #else
                r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
                g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
                b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
            #endif
            } break;

            case Op_load_16161616LE:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
            #else
                U64 px = load<U64>(rgba);

                r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
                g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
                b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
                a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
            #endif
            } break;

            case Op_load_161616BE:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
            #else
                U32 R = load_3<U32>(rgb+0),
                    G = load_3<U32>(rgb+1),
                    B = load_3<U32>(rgb+2);
                // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
                r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
                g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
                b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
            #endif
            } break;

            case Op_load_16161616BE:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
            #else
                U64 px = swap_endian_16x4(load<U64>(rgba));

                r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
                g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
                b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
                a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
            #endif
            } break;

            case Op_load_hhh:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2];
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2];
            #else
                U16 R = load_3<U16>(rgb+0),
                    G = load_3<U16>(rgb+1),
                    B = load_3<U16>(rgb+2);
            #endif
                r = F_from_Half(R);
                g = F_from_Half(G);
                b = F_from_Half(B);
            } break;

            case Op_load_hhhh:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2],
                    A = (U16)v.val[3];
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2],
                    A = (U16)v.val[3];
            #else
                U64 px = load<U64>(rgba);
                U16 R = cast<U16>((px >>  0) & 0xffff),
                    G = cast<U16>((px >> 16) & 0xffff),
                    B = cast<U16>((px >> 32) & 0xffff),
                    A = cast<U16>((px >> 48) & 0xffff);
            #endif
                r = F_from_Half(R);
                g = F_from_Half(G);
                b = F_from_Half(B);
                a = F_from_Half(A);
            } break;

            case Op_load_fff:{
                uintptr_t ptr = (uintptr_t)(src + 12*i);
                assert( (ptr & 3) == 0 );             // src must be 4-byte aligned for this
                const float* rgb = (const float*)ptr; // cast to const float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x3_t lo = vld3q_f32(rgb +  0),
                              hi = vld3q_f32(rgb + 12);
                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
            #elif defined(USING_NEON)
                float32x4x3_t v = vld3q_f32(rgb);
                r = (F)v.val[0];
                g = (F)v.val[1];
                b = (F)v.val[2];
            #else
                r = load_3<F>(rgb+0);
                g = load_3<F>(rgb+1);
                b = load_3<F>(rgb+2);
            #endif
            } break;

            case Op_load_ffff:{
                uintptr_t ptr = (uintptr_t)(src + 16*i);
                assert( (ptr & 3) == 0 );              // src must be 4-byte aligned for this
                const float* rgba = (const float*)ptr; // cast to const float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x4_t lo = vld4q_f32(rgba +  0),
                              hi = vld4q_f32(rgba + 16);
                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
                a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
            #elif defined(USING_NEON)
                float32x4x4_t v = vld4q_f32(rgba);
                r = (F)v.val[0];
                g = (F)v.val[1];
                b = (F)v.val[2];
                a = (F)v.val[3];
            #else
                r = load_4<F>(rgba+0);
                g = load_4<F>(rgba+1);
                b = load_4<F>(rgba+2);
                a = load_4<F>(rgba+3);
            #endif
            } break;

            case Op_swap_rb:{
                F t = r;
                r = b;
                b = t;
            } break;

            case Op_clamp:{
                r = max_(F0, min_(r, F1));
                g = max_(F0, min_(g, F1));
                b = max_(F0, min_(b, F1));
                a = max_(F0, min_(a, F1));
            } break;

            case Op_invert:{
                r = F1 - r;
                g = F1 - g;
                b = F1 - b;
                a = F1 - a;
            } break;

            case Op_force_opaque:{
                a = F1;
            } break;

            case Op_premul:{
                r *= a;
                g *= a;
                b *= a;
            } break;

            case Op_unpremul:{
                F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
                r *= scale;
                g *= scale;
                b *= scale;
            } break;
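
Op_unpremul guards the divide: when a is zero, 1/a is infinite, so if_then_else substitutes a scale of zero instead of propagating infinities into r, g, and b. Roughly the following scalar logic (illustrative only):

    // Scalar equivalent of the Op_unpremul guard above.
    #include <cmath>
    static float unpremul_scale(float a) {
        float inv = 1.0f / a;                // +inf when a == 0
        return std::isinf(inv) ? 0.0f : inv; // zero out fully transparent pixels
    }
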

            case Op_matrix_3x3:{
                const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
                const float* m = &matrix->vals[0][0];

                F R = m[0]*r + m[1]*g + m[2]*b,
                  G = m[3]*r + m[4]*g + m[5]*b,
                  B = m[6]*r + m[7]*g + m[8]*b;

                r = R;
                g = G;
                b = B;
            } break;

            case Op_matrix_3x4:{
                const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
                const float* m = &matrix->vals[0][0];

                F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
                  G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
                  B = m[8]*r + m[9]*g + m[10]*b + m[11];

                r = R;
                g = G;
                b = B;
            } break;
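
Op_matrix_3x4 is the affine variant of Op_matrix_3x3: the fourth entry of each row-major row is a constant offset, so the op computes M·(r,g,b)ᵀ + t in one pass. One row of it, written out as a scalar sketch (illustrative only):

    // Illustrative: one row of the 3x4 affine transform above.
    // m points at row-major float vals[3][4], so row k is m[4k .. 4k+3].
    static float affine_row(const float m[4], float r, float g, float b) {
        return m[0]*r + m[1]*g + m[2]*b + m[3]; // 3x3 part plus translation
    }
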

            case Op_lab_to_xyz:{
                // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
                F L = r * 100.0f,
                  A = g * 255.0f - 128.0f,
                  B = b * 255.0f - 128.0f;

                // Convert to CIE XYZ.
                F Y = (L + 16.0f) * (1/116.0f),
                  X = Y + A*(1/500.0f),
                  Z = Y - B*(1/200.0f);

                X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
                Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
                Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));

                // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
                r = X * 0.9642f;
                g = Y;
                b = Z * 0.8249f;
            } break;

            // As above, in reverse.
            case Op_xyz_to_lab:{
                F X = r * (1/0.9642f),
                  Y = g,
                  Z = b * (1/0.8249f);

                X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
                Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
                Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));

                F L = Y*116.0f - 16.0f,
                  A = (X-Y)*500.0f,
                  B = (Y-Z)*200.0f;

                r = L * (1/100.f);
                g = (A + 128.0f) * (1/255.0f);
                b = (B + 128.0f) * (1/255.0f);
            } break;
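
The magic numbers in both Lab cases are the standard CIE constants: 0.008856 ≈ (6/29)³ is the crossover between the cube-root and linear segments, 7.787 ≈ 1/(3·(6/29)²) is the linear segment's slope, and 16/116 = 4/29 is its offset. Spelled out with the constants named (illustrative only):

    // Illustrative: the CIE Lab forward function f(t), as used by Op_xyz_to_lab.
    #include <cmath>
    static float cie_f(float t) {
        const float delta = 6.0f/29.0f;          // segment crossover
        if (t > delta*delta*delta)               // 0.008856...
            return std::cbrt(t);
        return t / (3*delta*delta) + 4.0f/29.0f; // 7.787...*t + 16/116
    }
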

            case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
            case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
            case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
            case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;

            case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
            case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
            case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
            case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;

            case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
            case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
            case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
            case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;

            case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
            case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
            case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
            case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;

            case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
            case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
            case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
            case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;

            case Op_clut_A2B: {
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
                clut(a2b, &r,&g,&b,a);

                if (a2b->input_channels == 4) {
                    // CMYK is opaque.
                    a = F1;
                }
            } break;

            case Op_clut_B2A: {
                const skcms_B2A* b2a = (const skcms_B2A*) *args++;
                clut(b2a, &r,&g,&b,&a);
            } break;

            // Notice, from here on down the store_ ops all return, ending the loop.

            case Op_store_a8: {
                store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
            } return;

            case Op_store_g8: {
                // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
                store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
            } return;

            case Op_store_4444: {
                store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
                                    | cast<U16>(to_fixed(g * 15) <<  8)
                                    | cast<U16>(to_fixed(b * 15) <<  4)
                                    | cast<U16>(to_fixed(a * 15) <<  0));
            } return;

            case Op_store_565: {
                store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) <<  0)
                                    | cast<U16>(to_fixed(g * 63) <<  5)
                                    | cast<U16>(to_fixed(b * 31) << 11));
            } return;

            case Op_store_888: {
                uint8_t* rgb = (uint8_t*)dst + 3*i;
            #if defined(USING_NEON_FP16)
                // See the explanation under USING_NEON below. This is that doubled up.
                U16 R = to_fixed(r * 255),
                    G = to_fixed(g * 255),
                    B = to_fixed(b * 255);

                uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
                vst3q_lane_u8(rgb+ 0, v,  0);
                vst3q_lane_u8(rgb+ 3, v,  2);
                vst3q_lane_u8(rgb+ 6, v,  4);
                vst3q_lane_u8(rgb+ 9, v,  6);

                vst3q_lane_u8(rgb+12, v,  8);
                vst3q_lane_u8(rgb+15, v, 10);
                vst3q_lane_u8(rgb+18, v, 12);
                vst3q_lane_u8(rgb+21, v, 14);
            #elif defined(USING_NEON)
                // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
                // get there via U16 to save some instructions converting to float. And just
                // like load_888, we'd prefer to go via U32 but for ARMv7 support.
                U16 R = cast<U16>(to_fixed(r * 255)),
                    G = cast<U16>(to_fixed(g * 255)),
                    B = cast<U16>(to_fixed(b * 255));

                uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
                vst3_lane_u8(rgb+0, v, 0);
                vst3_lane_u8(rgb+3, v, 2);
                vst3_lane_u8(rgb+6, v, 4);
                vst3_lane_u8(rgb+9, v, 6);
            #else
                store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
                store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
                store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
            #endif
            } return;

            case Op_store_8888: {
                store(dst + 4*i, cast<U32>(to_fixed(r * 255)) <<  0
                               | cast<U32>(to_fixed(g * 255)) <<  8
                               | cast<U32>(to_fixed(b * 255)) << 16
                               | cast<U32>(to_fixed(a * 255)) << 24);
            } return;

            case Op_store_1010102: {
                store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
                               | cast<U32>(to_fixed(g * 1023)) << 10
                               | cast<U32>(to_fixed(b * 1023)) << 20
                               | cast<U32>(to_fixed(a *    3)) << 30);
            } return;

            case Op_store_161616LE: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );       // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)U16_from_F(r),
                    (uint16x8_t)U16_from_F(g),
                    (uint16x8_t)U16_from_F(b),
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)U16_from_F(r),
                    (uint16x4_t)U16_from_F(g),
                    (uint16x4_t)U16_from_F(b),
                }};
                vst3_u16(rgb, v);
            #else
                store_3(rgb+0, U16_from_F(r));
                store_3(rgb+1, U16_from_F(g));
                store_3(rgb+2, U16_from_F(b));
            #endif
            } return;

            case Op_store_16161616LE: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)U16_from_F(r),
                    (uint16x8_t)U16_from_F(g),
                    (uint16x8_t)U16_from_F(b),
                    (uint16x8_t)U16_from_F(a),
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)U16_from_F(r),
                    (uint16x4_t)U16_from_F(g),
                    (uint16x4_t)U16_from_F(b),
                    (uint16x4_t)U16_from_F(a),
                }};
                vst4_u16(rgba, v);
            #else
                U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
                       | cast<U64>(to_fixed(g * 65535)) << 16
                       | cast<U64>(to_fixed(b * 65535)) << 32
                       | cast<U64>(to_fixed(a * 65535)) << 48;
                store(rgba, px);
            #endif
            } return;

            case Op_store_161616BE: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );       // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                }};
                vst3_u16(rgb, v);
            #else
                U32 R = to_fixed(r * 65535),
                    G = to_fixed(g * 65535),
                    B = to_fixed(b * 65535);
                store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
                store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
                store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
            #endif
            } return;

            case Op_store_16161616BE: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                    (uint16x8_t)swap_endian_16(U16_from_F(a)),
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
                }};
                vst4_u16(rgba, v);
            #else
                U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
                       | cast<U64>(to_fixed(g * 65535)) << 16
                       | cast<U64>(to_fixed(b * 65535)) << 32
                       | cast<U64>(to_fixed(a * 65535)) << 48;
                store(rgba, swap_endian_16x4(px));
            #endif
            } return;

            case Op_store_hhh: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );       // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.

                U16 R = Half_from_F(r),
                    G = Half_from_F(g),
                    B = Half_from_F(b);
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)R,
                    (uint16x8_t)G,
                    (uint16x8_t)B,
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)R,
                    (uint16x4_t)G,
                    (uint16x4_t)B,
                }};
                vst3_u16(rgb, v);
            #else
                store_3(rgb+0, R);
                store_3(rgb+1, G);
                store_3(rgb+2, B);
            #endif
            } return;

            case Op_store_hhhh: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.

                U16 R = Half_from_F(r),
                    G = Half_from_F(g),
                    B = Half_from_F(b),
                    A = Half_from_F(a);
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)R,
                    (uint16x8_t)G,
                    (uint16x8_t)B,
                    (uint16x8_t)A,
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)R,
                    (uint16x4_t)G,
                    (uint16x4_t)B,
                    (uint16x4_t)A,
                }};
                vst4_u16(rgba, v);
            #else
                store(rgba, cast<U64>(R) <<  0
                          | cast<U64>(G) << 16
                          | cast<U64>(B) << 32
                          | cast<U64>(A) << 48);
            #endif
            } return;

            case Op_store_fff: {
                uintptr_t ptr = (uintptr_t)(dst + 12*i);
                assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
                float* rgb = (float*)ptr; // for this cast to float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x3_t lo = {{
                    vcvt_f32_f16(vget_low_f16(r)),
                    vcvt_f32_f16(vget_low_f16(g)),
                    vcvt_f32_f16(vget_low_f16(b)),
                }}, hi = {{
                    vcvt_f32_f16(vget_high_f16(r)),
                    vcvt_f32_f16(vget_high_f16(g)),
                    vcvt_f32_f16(vget_high_f16(b)),
                }};
                vst3q_f32(rgb +  0, lo);
                vst3q_f32(rgb + 12, hi);
            #elif defined(USING_NEON)
                float32x4x3_t v = {{
                    (float32x4_t)r,
                    (float32x4_t)g,
                    (float32x4_t)b,
                }};
                vst3q_f32(rgb, v);
            #else
                store_3(rgb+0, r);
                store_3(rgb+1, g);
                store_3(rgb+2, b);
            #endif
            } return;

            case Op_store_ffff: {
                uintptr_t ptr = (uintptr_t)(dst + 16*i);
                assert( (ptr & 3) == 0 );  // The dst pointer must be 4-byte aligned
                float* rgba = (float*)ptr; // for this cast to float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x4_t lo = {{
                    vcvt_f32_f16(vget_low_f16(r)),
                    vcvt_f32_f16(vget_low_f16(g)),
                    vcvt_f32_f16(vget_low_f16(b)),
                    vcvt_f32_f16(vget_low_f16(a)),
                }}, hi = {{
                    vcvt_f32_f16(vget_high_f16(r)),
                    vcvt_f32_f16(vget_high_f16(g)),
                    vcvt_f32_f16(vget_high_f16(b)),
                    vcvt_f32_f16(vget_high_f16(a)),
                }};
                vst4q_f32(rgba +  0, lo);
                vst4q_f32(rgba + 16, hi);
            #elif defined(USING_NEON)
                float32x4x4_t v = {{
                    (float32x4_t)r,
                    (float32x4_t)g,
                    (float32x4_t)b,
                    (float32x4_t)a,
                }};
                vst4q_f32(rgba, v);
            #else
                store_4(rgba+0, r);
                store_4(rgba+1, g);
                store_4(rgba+2, b);
                store_4(rgba+3, a);
            #endif
            } return;
        }
    }
}
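
exec_ops is a straight-line interpreter: each op consumes the next entry of ops (and, when it has an operand, the next entry of args), transforming one batch of pixels held in the r,g,b,a registers. Every program ends with a store op, which writes dst and returns. A hypothetical program illustrating the shape, using op names from this file (not a real skcms entry point; real programs are built internally by skcms_Transform):

    // Hypothetical example: unpack 8888, premultiply, repack.
    const Op    program[]   = { Op_load_8888, Op_premul, Op_store_8888 };
    const void* arguments[] = { nullptr }; // none of these ops pull from args
    // exec_ops(program, arguments, src, dst, /*i=*/0) would then process one
    // batch of pixels starting at src into dst.
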


static void run_program(const Op* program, const void** arguments,
                        const char* src, char* dst, int n,
                        const size_t src_bpp, const size_t dst_bpp) {
    int i = 0;
    while (n >= N) {
        exec_ops(program, arguments, src, dst, i);
        i += N;
        n -= N;
    }
    if (n > 0) {
        char tmp[4*4*N] = {0};

        memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
        exec_ops(program, arguments, tmp, tmp, 0);
        memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
    }
}
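
run_program handles the ragged tail by staging it through tmp: 4*4*N bytes is N pixels at the widest format this file supports (four 4-byte floats per pixel, as in Op_load_ffff/Op_store_ffff), so one more full-width exec_ops over the zeroed scratch buffer is always in bounds, and only n*dst_bpp valid bytes are copied back. The same pattern in isolation (a sketch under those assumptions, not the skcms API):

    // Sketch of the batch-plus-scratch-tail pattern used by run_program above.
    // N is the batch width; 16 bytes/pixel covers the widest supported format.
    #include <cstddef>
    #include <cstring>
    template <int N, typename Kernel>
    void for_each_batch(const char* src, char* dst, int n,
                        size_t src_bpp, size_t dst_bpp, Kernel kernel) {
        int i = 0;
        for (; n >= N; i += N, n -= N) {
            kernel(src, dst, i);             // full batches, in place
        }
        if (n > 0) {                         // ragged tail: 0 < n < N pixels
            char tmp[16*N] = {0};
            std::memcpy(tmp, src + size_t(i)*src_bpp, size_t(n)*src_bpp);
            kernel(tmp, tmp, 0);             // one padded batch in scratch
            std::memcpy(dst + size_t(i)*dst_bpp, tmp, size_t(n)*dst_bpp);
        }
    }
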

// Clean up any #defines we may have set so that we can be #included again.
#if defined(USING_AVX)
    #undef USING_AVX
#endif
#if defined(USING_AVX_F16C)
    #undef USING_AVX_F16C
#endif
#if defined(USING_AVX2)
    #undef USING_AVX2
#endif
#if defined(USING_AVX512F)
    #undef USING_AVX512F
#endif

#if defined(USING_NEON)
    #undef USING_NEON
#endif
#if defined(USING_NEON_F16C)
    #undef USING_NEON_F16C
#endif
#if defined(USING_NEON_FP16)
    #undef USING_NEON_FP16
#endif

#undef FALLTHROUGH