@shopify/react-native-skia 0.1.157 → 0.1.159
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/CMakeLists.txt +35 -11
- package/android/build.gradle +31 -30
- package/android/cpp/jni/JniLoad.cpp +2 -0
- package/android/cpp/jni/include/JniSkiaDomView.h +89 -0
- package/android/cpp/rnskia-android/SkiaOpenGLRenderer.cpp +4 -3
- package/android/cpp/rnskia-android/SkiaOpenGLRenderer.h +4 -3
- package/android/src/main/java/com/shopify/reactnative/skia/RNSkiaPackage.java +2 -1
- package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomView.java +45 -0
- package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomViewManager.java +64 -0
- package/cpp/api/JsiSkContourMeasure.h +7 -5
- package/cpp/api/JsiSkHostObjects.h +6 -0
- package/cpp/api/JsiSkImageFilterFactory.h +1 -1
- package/cpp/api/JsiSkPaint.h +9 -2
- package/cpp/api/JsiSkPath.h +1 -0
- package/cpp/api/JsiSkRuntimeEffect.h +36 -36
- package/cpp/jsi/JsiHostObject.cpp +16 -28
- package/cpp/jsi/JsiHostObject.h +127 -7
- package/cpp/jsi/JsiValue.cpp +346 -0
- package/cpp/jsi/JsiValue.h +222 -0
- package/cpp/jsi/JsiValueWrapper.h +33 -5
- package/cpp/rnskia/RNSkDomView.cpp +220 -0
- package/cpp/rnskia/RNSkDomView.h +140 -0
- package/cpp/rnskia/RNSkJsView.cpp +0 -4
- package/cpp/rnskia/RNSkJsView.h +6 -4
- package/cpp/rnskia/RNSkManager.cpp +7 -0
- package/cpp/rnskia/RNSkPictureView.h +5 -8
- package/cpp/rnskia/RNSkView.h +113 -5
- package/cpp/rnskia/dom/JsiDomApi.h +167 -0
- package/cpp/rnskia/dom/base/BaseNodeProp.h +72 -0
- package/cpp/rnskia/dom/base/DerivedNodeProp.h +187 -0
- package/cpp/rnskia/dom/base/DrawingContext.cpp +227 -0
- package/cpp/rnskia/dom/base/DrawingContext.h +136 -0
- package/cpp/rnskia/dom/base/JsiDependencyManager.h +294 -0
- package/cpp/rnskia/dom/base/JsiDomDeclarationNode.h +176 -0
- package/cpp/rnskia/dom/base/JsiDomDrawingNode.h +50 -0
- package/cpp/rnskia/dom/base/JsiDomNode.h +361 -0
- package/cpp/rnskia/dom/base/JsiDomRenderNode.h +267 -0
- package/cpp/rnskia/dom/base/NodeProp.h +130 -0
- package/cpp/rnskia/dom/base/NodePropsContainer.h +119 -0
- package/cpp/rnskia/dom/nodes/JsiBackdropFilterNode.h +38 -0
- package/cpp/rnskia/dom/nodes/JsiBlendNode.h +112 -0
- package/cpp/rnskia/dom/nodes/JsiBlurMaskNode.h +78 -0
- package/cpp/rnskia/dom/nodes/JsiBoxNode.h +104 -0
- package/cpp/rnskia/dom/nodes/JsiBoxShadowNode.h +33 -0
- package/cpp/rnskia/dom/nodes/JsiCircleNode.h +38 -0
- package/cpp/rnskia/dom/nodes/JsiColorFilterNodes.h +192 -0
- package/cpp/rnskia/dom/nodes/JsiCustomDrawingNode.h +123 -0
- package/cpp/rnskia/dom/nodes/JsiDiffRectNode.h +42 -0
- package/cpp/rnskia/dom/nodes/JsiFillNode.h +22 -0
- package/cpp/rnskia/dom/nodes/JsiGlyphsNode.h +56 -0
- package/cpp/rnskia/dom/nodes/JsiGroupNode.h +26 -0
- package/cpp/rnskia/dom/nodes/JsiImageFilterNodes.h +415 -0
- package/cpp/rnskia/dom/nodes/JsiImageNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiImageSvgNode.h +44 -0
- package/cpp/rnskia/dom/nodes/JsiLayerNode.h +64 -0
- package/cpp/rnskia/dom/nodes/JsiLineNode.h +43 -0
- package/cpp/rnskia/dom/nodes/JsiOvalNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiPaintNode.h +77 -0
- package/cpp/rnskia/dom/nodes/JsiPatchNode.h +54 -0
- package/cpp/rnskia/dom/nodes/JsiPathEffectNodes.h +315 -0
- package/cpp/rnskia/dom/nodes/JsiPathNode.h +181 -0
- package/cpp/rnskia/dom/nodes/JsiPictureNode.h +32 -0
- package/cpp/rnskia/dom/nodes/JsiPointsNode.h +51 -0
- package/cpp/rnskia/dom/nodes/JsiRRectNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiRectNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiShaderNodes.h +517 -0
- package/cpp/rnskia/dom/nodes/JsiTextBlobNode.h +47 -0
- package/cpp/rnskia/dom/nodes/JsiTextNode.h +54 -0
- package/cpp/rnskia/dom/nodes/JsiTextPathNode.h +32 -0
- package/cpp/rnskia/dom/nodes/JsiVerticesNode.h +43 -0
- package/cpp/rnskia/dom/props/BezierProps.h +63 -0
- package/cpp/rnskia/dom/props/BlendModeProp.h +101 -0
- package/cpp/rnskia/dom/props/BoxShadowProps.h +61 -0
- package/cpp/rnskia/dom/props/CircleProp.h +46 -0
- package/cpp/rnskia/dom/props/ClipProp.h +62 -0
- package/cpp/rnskia/dom/props/ColorProp.h +80 -0
- package/cpp/rnskia/dom/props/DrawingProp.h +33 -0
- package/cpp/rnskia/dom/props/FontProp.h +34 -0
- package/cpp/rnskia/dom/props/GlyphsProp.h +53 -0
- package/cpp/rnskia/dom/props/ImageProps.h +173 -0
- package/cpp/rnskia/dom/props/LayerProp.h +50 -0
- package/cpp/rnskia/dom/props/MatrixProp.h +33 -0
- package/cpp/rnskia/dom/props/NumbersProp.h +63 -0
- package/cpp/rnskia/dom/props/PaintProps.h +172 -0
- package/cpp/rnskia/dom/props/PathProp.h +55 -0
- package/cpp/rnskia/dom/props/PictureProp.h +38 -0
- package/cpp/rnskia/dom/props/PointProp.h +72 -0
- package/cpp/rnskia/dom/props/PointsProp.h +83 -0
- package/cpp/rnskia/dom/props/RRectProp.h +134 -0
- package/cpp/rnskia/dom/props/RadiusProp.h +43 -0
- package/cpp/rnskia/dom/props/RectProp.h +118 -0
- package/cpp/rnskia/dom/props/StrokeProps.h +75 -0
- package/cpp/rnskia/dom/props/SvgProp.h +37 -0
- package/cpp/rnskia/dom/props/TextBlobProp.h +128 -0
- package/cpp/rnskia/dom/props/TileModeProp.h +50 -0
- package/cpp/rnskia/dom/props/TransformProp.h +80 -0
- package/cpp/rnskia/dom/props/TransformsProps.h +68 -0
- package/cpp/rnskia/dom/props/UniformsProp.h +194 -0
- package/cpp/rnskia/dom/props/VertexModeProp.h +47 -0
- package/cpp/rnskia/dom/props/VerticesProps.h +67 -0
- package/cpp/rnskia/values/RNSkReadonlyValue.h +13 -4
- package/cpp/skia/include/android/SkAndroidFrameworkUtils.h +35 -1
- package/cpp/skia/include/codec/SkAndroidCodec.h +17 -1
- package/cpp/skia/include/codec/SkCodec.h +8 -5
- package/cpp/skia/include/core/SkAnnotation.h +2 -0
- package/cpp/skia/include/core/SkBitmap.h +52 -1
- package/cpp/skia/include/core/SkBlendMode.h +2 -0
- package/cpp/skia/include/core/SkCanvas.h +52 -31
- package/cpp/skia/include/core/SkCapabilities.h +44 -0
- package/cpp/skia/include/core/SkColor.h +7 -0
- package/cpp/skia/include/core/SkColorFilter.h +37 -0
- package/cpp/skia/include/core/SkColorSpace.h +1 -1
- package/cpp/skia/include/core/SkFont.h +4 -0
- package/cpp/skia/include/core/SkFontMgr.h +3 -0
- package/cpp/skia/include/core/SkGraphics.h +9 -0
- package/cpp/skia/include/core/SkImage.h +77 -17
- package/cpp/skia/include/core/SkImageEncoder.h +5 -3
- package/cpp/skia/include/core/SkImageGenerator.h +27 -17
- package/cpp/skia/include/core/SkM44.h +1 -0
- package/cpp/skia/include/core/SkMesh.h +120 -34
- package/cpp/skia/include/core/SkMilestone.h +1 -1
- package/cpp/skia/include/core/SkOverdrawCanvas.h +2 -1
- package/cpp/skia/include/core/SkPaint.h +15 -2
- package/cpp/skia/include/core/SkPath.h +4 -0
- package/cpp/skia/include/core/SkPathBuilder.h +1 -1
- package/cpp/skia/include/core/SkPicture.h +0 -3
- package/cpp/skia/include/core/SkPictureRecorder.h +0 -2
- package/cpp/skia/include/core/SkPixmap.h +19 -0
- package/cpp/skia/include/core/SkRasterHandleAllocator.h +3 -1
- package/cpp/skia/include/core/SkRect.h +11 -4
- package/cpp/skia/include/core/SkRefCnt.h +13 -1
- package/cpp/skia/include/core/SkRegion.h +6 -0
- package/cpp/skia/include/core/SkSamplingOptions.h +8 -6
- package/cpp/skia/include/core/SkScalar.h +6 -25
- package/cpp/skia/include/core/SkShader.h +20 -12
- package/cpp/skia/include/core/SkSpan.h +51 -19
- package/cpp/skia/include/core/SkStream.h +2 -2
- package/cpp/skia/include/core/SkString.h +11 -3
- package/cpp/skia/include/core/SkSurface.h +85 -8
- package/cpp/skia/include/core/SkTextBlob.h +5 -2
- package/cpp/skia/include/core/SkTypes.h +11 -10
- package/cpp/skia/include/docs/SkPDFDocument.h +0 -5
- package/cpp/skia/include/effects/Sk1DPathEffect.h +6 -1
- package/cpp/skia/include/effects/Sk2DPathEffect.h +4 -1
- package/cpp/skia/include/effects/SkColorMatrix.h +1 -0
- package/cpp/skia/include/effects/SkColorMatrixFilter.h +5 -8
- package/cpp/skia/include/effects/SkCornerPathEffect.h +5 -1
- package/cpp/skia/include/effects/SkDashPathEffect.h +5 -1
- package/cpp/skia/include/effects/SkGradientShader.h +68 -38
- package/cpp/skia/include/effects/SkHighContrastFilter.h +5 -1
- package/cpp/skia/include/effects/SkImageFilters.h +5 -4
- package/cpp/skia/include/effects/SkLumaColorFilter.h +4 -1
- package/cpp/skia/include/effects/SkOpPathEffect.h +6 -2
- package/cpp/skia/include/effects/SkOverdrawColorFilter.h +5 -2
- package/cpp/skia/include/effects/SkRuntimeEffect.h +54 -62
- package/cpp/skia/include/effects/SkShaderMaskFilter.h +3 -1
- package/cpp/skia/include/effects/SkTableColorFilter.h +8 -21
- package/cpp/skia/include/effects/SkTableMaskFilter.h +5 -1
- package/cpp/skia/include/effects/SkTrimPathEffect.h +5 -1
- package/cpp/skia/include/encode/SkEncoder.h +17 -0
- package/cpp/skia/include/encode/SkWebpEncoder.h +17 -0
- package/cpp/skia/include/gpu/GpuTypes.h +18 -0
- package/cpp/skia/include/gpu/GrBackendSurface.h +38 -17
- package/cpp/skia/include/gpu/GrBackendSurfaceMutableState.h +6 -71
- package/cpp/skia/include/gpu/GrContextOptions.h +1 -1
- package/cpp/skia/include/gpu/GrContextThreadSafeProxy.h +10 -9
- package/cpp/skia/include/gpu/GrDirectContext.h +42 -22
- package/cpp/skia/include/gpu/GrRecordingContext.h +6 -3
- package/cpp/skia/include/gpu/GrTypes.h +11 -11
- package/cpp/skia/include/gpu/MutableTextureState.h +122 -0
- package/cpp/skia/include/gpu/gl/GrGLFunctions.h +1 -0
- package/cpp/skia/include/gpu/gl/GrGLInterface.h +1 -0
- package/cpp/skia/include/gpu/graphite/BackendTexture.h +7 -0
- package/cpp/skia/include/gpu/graphite/CombinationBuilder.h +195 -0
- package/cpp/skia/include/gpu/graphite/Context.h +47 -55
- package/cpp/skia/include/gpu/graphite/ContextOptions.h +85 -0
- package/cpp/skia/include/gpu/graphite/GraphiteTypes.h +1 -17
- package/cpp/skia/include/gpu/graphite/ImageProvider.h +61 -0
- package/cpp/skia/include/gpu/graphite/Recorder.h +87 -8
- package/cpp/skia/include/gpu/graphite/Recording.h +19 -9
- package/cpp/skia/include/gpu/graphite/TextureInfo.h +40 -8
- package/cpp/skia/include/gpu/graphite/dawn/DawnBackendContext.h +25 -0
- package/cpp/skia/include/gpu/graphite/mtl/MtlBackendContext.h +3 -2
- package/cpp/skia/include/gpu/graphite/vk/VulkanGraphiteTypes.h +69 -0
- package/cpp/skia/include/gpu/mtl/MtlMemoryAllocator.h +39 -0
- package/cpp/skia/include/gpu/vk/GrVkBackendContext.h +21 -19
- package/cpp/skia/include/gpu/vk/GrVkExtensions.h +2 -50
- package/cpp/skia/include/gpu/vk/GrVkMemoryAllocator.h +2 -127
- package/cpp/skia/include/gpu/vk/GrVkTypes.h +5 -43
- package/cpp/skia/include/gpu/vk/VulkanBackendContext.h +46 -0
- package/cpp/skia/include/gpu/vk/VulkanExtensions.h +67 -0
- package/cpp/skia/include/gpu/vk/VulkanMemoryAllocator.h +116 -0
- package/cpp/skia/include/gpu/vk/VulkanTypes.h +59 -0
- package/cpp/skia/include/pathops/SkPathOps.h +1 -1
- package/cpp/skia/include/private/SkColorData.h +10 -40
- package/cpp/skia/include/private/SkEncodedInfo.h +9 -3
- package/cpp/skia/include/private/SkFloatingPoint.h +9 -6
- package/cpp/skia/include/private/SkHalf.h +5 -52
- package/cpp/skia/include/private/SkMacros.h +1 -1
- package/cpp/skia/include/private/SkMalloc.h +4 -0
- package/cpp/skia/include/private/SkPathRef.h +10 -10
- package/cpp/skia/include/private/SkSLModifiers.h +59 -23
- package/cpp/skia/include/private/SkSLProgramKind.h +1 -0
- package/cpp/skia/include/private/SkSLSymbol.h +7 -3
- package/cpp/skia/include/private/SkStringView.h +4 -0
- package/cpp/skia/include/private/SkTArray.h +21 -7
- package/cpp/skia/include/private/SkTDArray.h +173 -285
- package/cpp/skia/include/private/SkTHash.h +33 -32
- package/cpp/skia/include/private/SkTemplates.h +24 -26
- package/cpp/skia/include/private/SkVx.h +218 -135
- package/cpp/skia/include/private/chromium/GrSlug.h +3 -65
- package/cpp/skia/include/private/chromium/SkChromeRemoteGlyphCache.h +6 -3
- package/cpp/skia/include/private/chromium/Slug.h +76 -0
- package/cpp/skia/include/private/gpu/ganesh/GrTypesPriv.h +6 -1
- package/cpp/skia/include/private/gpu/ganesh/GrVkTypesPriv.h +5 -39
- package/cpp/skia/include/private/gpu/graphite/VulkanGraphiteTypesPriv.h +63 -0
- package/cpp/skia/include/{gpu/vk/GrVkVulkan.h → private/gpu/vk/SkiaVulkan.h} +2 -2
- package/cpp/skia/include/private/gpu/vk/VulkanTypesPriv.h +57 -0
- package/cpp/skia/include/sksl/DSL.h +0 -1
- package/cpp/skia/include/sksl/DSLBlock.h +4 -18
- package/cpp/skia/include/sksl/DSLCase.h +2 -8
- package/cpp/skia/include/sksl/DSLCore.h +8 -15
- package/cpp/skia/include/sksl/DSLExpression.h +51 -142
- package/cpp/skia/include/sksl/DSLFunction.h +7 -15
- package/cpp/skia/include/sksl/DSLModifiers.h +5 -2
- package/cpp/skia/include/sksl/DSLStatement.h +4 -39
- package/cpp/skia/include/sksl/DSLSymbols.h +1 -11
- package/cpp/skia/include/sksl/DSLType.h +20 -12
- package/cpp/skia/include/sksl/DSLVar.h +56 -146
- package/cpp/skia/include/sksl/SkSLErrorReporter.h +2 -15
- package/cpp/skia/include/sksl/SkSLOperator.h +62 -59
- package/cpp/skia/include/sksl/SkSLPosition.h +2 -0
- package/cpp/skia/include/sksl/SkSLVersion.h +27 -0
- package/cpp/skia/include/svg/SkSVGCanvas.h +1 -0
- package/cpp/skia/include/utils/SkAnimCodecPlayer.h +1 -1
- package/cpp/skia/include/utils/SkBase64.h +2 -0
- package/cpp/skia/include/utils/SkCustomTypeface.h +24 -11
- package/cpp/skia/include/utils/SkEventTracer.h +12 -1
- package/cpp/skia/include/utils/SkNWayCanvas.h +11 -4
- package/cpp/skia/include/utils/SkPaintFilterCanvas.h +9 -4
- package/cpp/skia/include/utils/SkParse.h +3 -0
- package/cpp/skia/include/utils/SkShadowUtils.h +2 -0
- package/cpp/skia/include/utils/SkTextUtils.h +2 -1
- package/cpp/skia/{include/third_party → modules}/skcms/skcms.h +10 -0
- package/cpp/skia/modules/skcms/skcms_internal.h +56 -0
- package/cpp/skia/modules/skcms/src/Transform_inl.h +1609 -0
- package/cpp/skia/modules/skparagraph/include/DartTypes.h +153 -0
- package/cpp/skia/modules/skparagraph/include/FontArguments.h +46 -0
- package/cpp/skia/modules/skparagraph/include/FontCollection.h +84 -0
- package/cpp/skia/modules/skparagraph/include/Metrics.h +98 -0
- package/cpp/skia/modules/skparagraph/include/Paragraph.h +111 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphBuilder.h +69 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphCache.h +77 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphStyle.h +143 -0
- package/cpp/skia/modules/skparagraph/include/TextShadow.h +30 -0
- package/cpp/skia/modules/skparagraph/include/TextStyle.h +352 -0
- package/cpp/skia/modules/skparagraph/include/TypefaceFontProvider.h +81 -0
- package/cpp/skia/modules/svg/include/SkSVGAttributeParser.h +1 -1
- package/cpp/skia/modules/svg/include/SkSVGTypes.h +3 -3
- package/cpp/skia/src/core/SkLRUCache.h +126 -0
- package/cpp/skia/src/core/SkTInternalLList.h +302 -0
- package/cpp/utils/RNSkTimingInfo.h +1 -0
- package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.h +15 -4
- package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.mm +40 -54
- package/ios/RNSkia-iOS/SkiaDomViewManager.h +8 -0
- package/ios/RNSkia-iOS/SkiaDomViewManager.mm +51 -0
- package/lib/commonjs/dom/nodes/DrawingNode.js +1 -5
- package/lib/commonjs/dom/nodes/DrawingNode.js.map +1 -1
- package/lib/commonjs/dom/nodes/JsiSkDOM.js +56 -56
- package/lib/commonjs/dom/nodes/JsiSkDOM.js.map +1 -1
- package/lib/commonjs/dom/nodes/RenderNode.js +3 -9
- package/lib/commonjs/dom/nodes/RenderNode.js.map +1 -1
- package/lib/commonjs/dom/nodes/drawings/Text.js +4 -9
- package/lib/commonjs/dom/nodes/drawings/Text.js.map +1 -1
- package/lib/commonjs/dom/types/Common.js.map +1 -1
- package/lib/commonjs/dom/types/Drawings.js.map +1 -1
- package/lib/commonjs/mock/index.js +0 -1
- package/lib/commonjs/mock/index.js.map +1 -1
- package/lib/commonjs/renderer/Canvas.js +21 -57
- package/lib/commonjs/renderer/Canvas.js.map +1 -1
- package/lib/commonjs/renderer/DependencyManager.js +0 -5
- package/lib/commonjs/renderer/DependencyManager.js.map +1 -1
- package/lib/commonjs/renderer/HostComponents.js.map +1 -1
- package/lib/commonjs/renderer/components/Mask.js +1 -3
- package/lib/commonjs/renderer/components/Mask.js.map +1 -1
- package/lib/commonjs/renderer/components/Paint.js +5 -18
- package/lib/commonjs/renderer/components/Paint.js.map +1 -1
- package/lib/commonjs/renderer/useCanvas.js +8 -6
- package/lib/commonjs/renderer/useCanvas.js.map +1 -1
- package/lib/commonjs/skia/core/Picture.js +1 -24
- package/lib/commonjs/skia/core/Picture.js.map +1 -1
- package/lib/commonjs/skia/core/Rect.js +1 -1
- package/lib/commonjs/skia/core/Rect.js.map +1 -1
- package/lib/commonjs/skia/types/ContourMeasure.js.map +1 -1
- package/lib/commonjs/skia/types/Size.js +2 -0
- package/lib/commonjs/skia/types/Size.js.map +1 -0
- package/lib/commonjs/skia/types/index.js +13 -0
- package/lib/commonjs/skia/types/index.js.map +1 -1
- package/lib/commonjs/skia/web/JsiSkColor.js +8 -0
- package/lib/commonjs/skia/web/JsiSkColor.js.map +1 -1
- package/lib/commonjs/skia/web/JsiSkContourMeasure.js +4 -7
- package/lib/commonjs/skia/web/JsiSkContourMeasure.js.map +1 -1
- package/lib/commonjs/values/web/RNSkReadonlyValue.js +4 -2
- package/lib/commonjs/values/web/RNSkReadonlyValue.js.map +1 -1
- package/lib/commonjs/views/SkiaBaseWebView.js +9 -2
- package/lib/commonjs/views/SkiaBaseWebView.js.map +1 -1
- package/lib/commonjs/views/SkiaDomView.js +152 -0
- package/lib/commonjs/views/SkiaDomView.js.map +1 -0
- package/lib/commonjs/views/SkiaDomView.web.js +55 -0
- package/lib/commonjs/views/SkiaDomView.web.js.map +1 -0
- package/lib/commonjs/views/SkiaPictureView.js +16 -2
- package/lib/commonjs/views/SkiaPictureView.js.map +1 -1
- package/lib/commonjs/views/SkiaView.js +17 -2
- package/lib/commonjs/views/SkiaView.js.map +1 -1
- package/lib/commonjs/views/index.js +13 -0
- package/lib/commonjs/views/index.js.map +1 -1
- package/lib/commonjs/views/types.js.map +1 -1
- package/lib/module/dom/nodes/DrawingNode.js +2 -6
- package/lib/module/dom/nodes/DrawingNode.js.map +1 -1
- package/lib/module/dom/nodes/JsiSkDOM.js +56 -56
- package/lib/module/dom/nodes/JsiSkDOM.js.map +1 -1
- package/lib/module/dom/nodes/RenderNode.js +2 -5
- package/lib/module/dom/nodes/RenderNode.js.map +1 -1
- package/lib/module/dom/nodes/drawings/Text.js +4 -9
- package/lib/module/dom/nodes/drawings/Text.js.map +1 -1
- package/lib/module/dom/types/Common.js.map +1 -1
- package/lib/module/dom/types/Drawings.js.map +1 -1
- package/lib/module/mock/index.js +0 -1
- package/lib/module/mock/index.js.map +1 -1
- package/lib/module/renderer/Canvas.js +22 -58
- package/lib/module/renderer/Canvas.js.map +1 -1
- package/lib/module/renderer/DependencyManager.js +0 -5
- package/lib/module/renderer/DependencyManager.js.map +1 -1
- package/lib/module/renderer/HostComponents.js.map +1 -1
- package/lib/module/renderer/components/Mask.js +2 -4
- package/lib/module/renderer/components/Mask.js.map +1 -1
- package/lib/module/renderer/components/Paint.js +3 -14
- package/lib/module/renderer/components/Paint.js.map +1 -1
- package/lib/module/renderer/useCanvas.js +6 -4
- package/lib/module/renderer/useCanvas.js.map +1 -1
- package/lib/module/skia/core/Picture.js +0 -18
- package/lib/module/skia/core/Picture.js.map +1 -1
- package/lib/module/skia/core/Rect.js +1 -1
- package/lib/module/skia/core/Rect.js.map +1 -1
- package/lib/module/skia/types/ContourMeasure.js.map +1 -1
- package/lib/module/skia/types/Size.js +2 -0
- package/lib/module/skia/types/Size.js.map +1 -0
- package/lib/module/skia/types/index.js +1 -0
- package/lib/module/skia/types/index.js.map +1 -1
- package/lib/module/skia/web/JsiSkColor.js +8 -0
- package/lib/module/skia/web/JsiSkColor.js.map +1 -1
- package/lib/module/skia/web/JsiSkContourMeasure.js +3 -7
- package/lib/module/skia/web/JsiSkContourMeasure.js.map +1 -1
- package/lib/module/values/web/RNSkReadonlyValue.js +4 -2
- package/lib/module/values/web/RNSkReadonlyValue.js.map +1 -1
- package/lib/module/views/SkiaBaseWebView.js +9 -2
- package/lib/module/views/SkiaBaseWebView.js.map +1 -1
- package/lib/module/views/SkiaDomView.js +128 -0
- package/lib/module/views/SkiaDomView.js.map +1 -0
- package/lib/module/views/SkiaDomView.web.js +41 -0
- package/lib/module/views/SkiaDomView.web.js.map +1 -0
- package/lib/module/views/SkiaPictureView.js +14 -2
- package/lib/module/views/SkiaPictureView.js.map +1 -1
- package/lib/module/views/SkiaView.js +15 -2
- package/lib/module/views/SkiaView.js.map +1 -1
- package/lib/module/views/index.js +1 -0
- package/lib/module/views/index.js.map +1 -1
- package/lib/module/views/types.js.map +1 -1
- package/lib/typescript/src/dom/nodes/JsiSkDOM.d.ts +57 -64
- package/lib/typescript/src/dom/nodes/RenderNode.d.ts +1 -3
- package/lib/typescript/src/dom/types/Common.d.ts +2 -3
- package/lib/typescript/src/dom/types/Drawings.d.ts +1 -3
- package/lib/typescript/src/renderer/Canvas.d.ts +5 -5
- package/lib/typescript/src/renderer/DependencyManager.d.ts +0 -2
- package/lib/typescript/src/renderer/HostComponents.d.ts +67 -8
- package/lib/typescript/src/renderer/components/Paint.d.ts +3 -4
- package/lib/typescript/src/renderer/useCanvas.d.ts +1 -6
- package/lib/typescript/src/skia/core/Picture.d.ts +0 -8
- package/lib/typescript/src/skia/types/ContourMeasure.d.ts +3 -2
- package/lib/typescript/src/skia/types/Size.d.ts +4 -0
- package/lib/typescript/src/skia/types/index.d.ts +1 -0
- package/lib/typescript/src/skia/web/JsiSkContourMeasure.d.ts +2 -6
- package/lib/typescript/src/views/SkiaBaseWebView.d.ts +4 -0
- package/lib/typescript/src/views/SkiaDomView.d.ts +31 -0
- package/lib/typescript/src/views/SkiaDomView.web.d.ts +7 -0
- package/lib/typescript/src/views/index.d.ts +1 -0
- package/lib/typescript/src/views/types.d.ts +12 -2
- package/libs/android/arm64-v8a/libskia.a +0 -0
- package/libs/android/arm64-v8a/libskottie.a +0 -0
- package/libs/android/arm64-v8a/libskparagraph.a +0 -0
- package/libs/android/arm64-v8a/libsksg.a +0 -0
- package/libs/android/arm64-v8a/libskshaper.a +0 -0
- package/libs/android/arm64-v8a/libskunicode.a +0 -0
- package/libs/android/arm64-v8a/libsvg.a +0 -0
- package/libs/android/armeabi-v7a/libskia.a +0 -0
- package/libs/android/armeabi-v7a/libskottie.a +0 -0
- package/libs/android/armeabi-v7a/libskparagraph.a +0 -0
- package/libs/android/armeabi-v7a/libsksg.a +0 -0
- package/libs/android/armeabi-v7a/libskshaper.a +0 -0
- package/libs/android/armeabi-v7a/libskunicode.a +0 -0
- package/libs/android/armeabi-v7a/libsvg.a +0 -0
- package/libs/android/x86/libskia.a +0 -0
- package/libs/android/x86/libskottie.a +0 -0
- package/libs/android/x86/libskparagraph.a +0 -0
- package/libs/android/x86/libsksg.a +0 -0
- package/libs/android/x86/libskshaper.a +0 -0
- package/libs/android/x86/libskunicode.a +0 -0
- package/libs/android/x86/libsvg.a +0 -0
- package/libs/android/x86_64/libskia.a +0 -0
- package/libs/android/x86_64/libskottie.a +0 -0
- package/libs/android/x86_64/libskparagraph.a +0 -0
- package/libs/android/x86_64/libsksg.a +0 -0
- package/libs/android/x86_64/libskshaper.a +0 -0
- package/libs/android/x86_64/libskunicode.a +0 -0
- package/libs/android/x86_64/libsvg.a +0 -0
- package/libs/ios/libskia.xcframework/ios-arm64_arm64e/libskia.a +0 -0
- package/libs/ios/libskia.xcframework/ios-arm64_arm64e_x86_64-simulator/libskia.a +0 -0
- package/libs/ios/libskottie.xcframework/ios-arm64_arm64e/libskottie.a +0 -0
- package/libs/ios/libskottie.xcframework/ios-arm64_arm64e_x86_64-simulator/libskottie.a +0 -0
- package/libs/ios/libskparagraph.xcframework/Info.plist +42 -0
- package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e/libskparagraph.a +0 -0
- package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e_x86_64-simulator/libskparagraph.a +0 -0
- package/libs/ios/libsksg.xcframework/Info.plist +5 -5
- package/libs/ios/libsksg.xcframework/ios-arm64_arm64e/libsksg.a +0 -0
- package/libs/ios/libsksg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsksg.a +0 -0
- package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e/libskshaper.a +0 -0
- package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e_x86_64-simulator/libskshaper.a +0 -0
- package/libs/ios/libskunicode.xcframework/Info.plist +42 -0
- package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e/libskunicode.a +0 -0
- package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e_x86_64-simulator/libskunicode.a +0 -0
- package/libs/ios/libsvg.xcframework/ios-arm64_arm64e/libsvg.a +0 -0
- package/libs/ios/libsvg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsvg.a +0 -0
- package/package.json +6 -4
- package/react-native-skia.podspec +3 -1
- package/src/dom/nodes/DrawingNode.ts +2 -4
- package/src/dom/nodes/JsiSkDOM.ts +170 -56
- package/src/dom/nodes/RenderNode.ts +2 -14
- package/src/dom/nodes/drawings/Text.ts +4 -4
- package/src/dom/types/Common.ts +2 -4
- package/src/dom/types/Drawings.ts +1 -4
- package/src/mock/index.ts +0 -1
- package/src/renderer/Canvas.tsx +32 -51
- package/src/renderer/DependencyManager.tsx +0 -5
- package/src/renderer/HostComponents.ts +153 -5
- package/src/renderer/components/Mask.tsx +2 -2
- package/src/renderer/components/Paint.tsx +3 -14
- package/src/renderer/useCanvas.ts +5 -5
- package/src/skia/core/Picture.ts +0 -24
- package/src/skia/core/Rect.ts +1 -1
- package/src/skia/types/ContourMeasure.tsx +3 -2
- package/src/skia/types/Size.ts +4 -0
- package/src/skia/types/index.ts +1 -0
- package/src/skia/web/JsiSkColor.ts +11 -0
- package/src/skia/web/JsiSkContourMeasure.ts +7 -3
- package/src/values/web/RNSkReadonlyValue.ts +4 -2
- package/src/views/SkiaBaseWebView.tsx +6 -2
- package/src/views/SkiaDomView.tsx +120 -0
- package/src/views/SkiaDomView.web.tsx +37 -0
- package/src/views/SkiaPictureView.tsx +10 -2
- package/src/views/SkiaView.tsx +11 -3
- package/src/views/index.ts +1 -0
- package/src/views/types.ts +19 -2
- package/cpp/jsi/JsiSimpleValueWrapper.h +0 -99
- package/cpp/skia/include/c/sk_canvas.h +0 -159
- package/cpp/skia/include/c/sk_colorspace.h +0 -25
- package/cpp/skia/include/c/sk_data.h +0 -65
- package/cpp/skia/include/c/sk_image.h +0 -71
- package/cpp/skia/include/c/sk_imageinfo.h +0 -62
- package/cpp/skia/include/c/sk_maskfilter.h +0 -47
- package/cpp/skia/include/c/sk_matrix.h +0 -49
- package/cpp/skia/include/c/sk_paint.h +0 -145
- package/cpp/skia/include/c/sk_path.h +0 -102
- package/cpp/skia/include/c/sk_picture.h +0 -70
- package/cpp/skia/include/c/sk_shader.h +0 -143
- package/cpp/skia/include/c/sk_surface.h +0 -73
- package/cpp/skia/include/c/sk_types.h +0 -278
- package/cpp/skia/include/gpu/graphite/SkStuff.h +0 -47
- package/cpp/skia/include/private/SkNx.h +0 -430
- package/cpp/skia/include/private/SkNx_neon.h +0 -713
- package/cpp/skia/include/private/SkNx_sse.h +0 -823
- package/cpp/skia/include/sksl/DSLRuntimeEffects.h +0 -32
- package/cpp/skia/include/sksl/DSLWrapper.h +0 -77
@@ -0,0 +1,1609 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright 2018 Google Inc.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license that can be
|
5
|
+
* found in the LICENSE file.
|
6
|
+
*/
|
7
|
+
|
8
|
+
// Intentionally NO #pragma once... included multiple times.
|
9
|
+
|
10
|
+
// This file is included from skcms.cc in a namespace with some pre-defines:
|
11
|
+
// - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
|
12
|
+
// - V<T>: a template to create a vector of N T's.
|
13
|
+
|
14
|
+
using F = V<Color>; // Called F for historic reasons... maybe rename C?
|
15
|
+
using I32 = V<int32_t>;
|
16
|
+
using U64 = V<uint64_t>;
|
17
|
+
using U32 = V<uint32_t>;
|
18
|
+
using U16 = V<uint16_t>;
|
19
|
+
using U8 = V<uint8_t>;
|
20
|
+
|
21
|
+
|
22
|
+
#if defined(__GNUC__) && !defined(__clang__)
|
23
|
+
// Once again, GCC is kind of weird, not allowing vector = scalar directly.
|
24
|
+
static constexpr F F0 = F() + 0.0f,
|
25
|
+
F1 = F() + 1.0f,
|
26
|
+
FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
|
27
|
+
#else
|
28
|
+
static constexpr F F0 = 0.0f,
|
29
|
+
F1 = 1.0f,
|
30
|
+
FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
|
31
|
+
#endif
|
32
|
+
|
33
|
+
// Instead of checking __AVX__ below, we'll check USING_AVX.
|
34
|
+
// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
|
35
|
+
// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
|
36
|
+
|
37
|
+
#if !defined(USING_AVX) && N == 8 && defined(__AVX__)
|
38
|
+
#define USING_AVX
|
39
|
+
#endif
|
40
|
+
#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
|
41
|
+
#define USING_AVX_F16C
|
42
|
+
#endif
|
43
|
+
#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
|
44
|
+
#define USING_AVX2
|
45
|
+
#endif
|
46
|
+
#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
|
47
|
+
#define USING_AVX512F
|
48
|
+
#endif
|
49
|
+
|
50
|
+
// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
|
51
|
+
// This is more for organizational clarity... skcms.cc doesn't force these.
|
52
|
+
#if N > 1 && defined(__ARM_NEON)
|
53
|
+
#define USING_NEON
|
54
|
+
#if __ARM_FP & 2
|
55
|
+
#define USING_NEON_F16C
|
56
|
+
#endif
|
57
|
+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
|
58
|
+
#define USING_NEON_FP16
|
59
|
+
#endif
|
60
|
+
#endif
|
61
|
+
|
62
|
+
// These -Wvector-conversion warnings seem to trigger in very bogus situations,
|
63
|
+
// like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
|
64
|
+
#if defined(USING_NEON) && defined(__clang__)
|
65
|
+
#pragma clang diagnostic ignored "-Wvector-conversion"
|
66
|
+
#endif
|
67
|
+
|
68
|
+
// GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
|
69
|
+
// You'd see warnings like, "using AVX even though AVX is not enabled".
|
70
|
+
// We stifle these warnings; our helpers that return U64 are always inlined.
|
71
|
+
#if defined(__SSE__) && defined(__GNUC__)
|
72
|
+
#if !defined(__has_warning)
|
73
|
+
#pragma GCC diagnostic ignored "-Wpsabi"
|
74
|
+
#elif __has_warning("-Wpsabi")
|
75
|
+
#pragma GCC diagnostic ignored "-Wpsabi"
|
76
|
+
#endif
|
77
|
+
#endif
|
78
|
+
|
79
|
+
#if defined(__clang__)
|
80
|
+
#define FALLTHROUGH [[clang::fallthrough]]
|
81
|
+
#else
|
82
|
+
#define FALLTHROUGH
|
83
|
+
#endif
|
84
|
+
|
85
|
+
// We tag most helper functions as SI, to enforce good code generation
|
86
|
+
// but also work around what we think is a bug in GCC: when targeting 32-bit
|
87
|
+
// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
|
88
|
+
// MMX mm0 register, which seems to mess with unrelated code that later uses
|
89
|
+
// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
|
90
|
+
//
|
91
|
+
// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
|
92
|
+
#if defined(__clang__) || defined(__GNUC__)
|
93
|
+
#define SI static inline __attribute__((always_inline))
|
94
|
+
#else
|
95
|
+
#define SI static inline
|
96
|
+
#endif
|
97
|
+
|
98
|
+
template <typename T, typename P>
|
99
|
+
SI T load(const P* ptr) {
|
100
|
+
T val;
|
101
|
+
small_memcpy(&val, ptr, sizeof(val));
|
102
|
+
return val;
|
103
|
+
}
|
104
|
+
template <typename T, typename P>
|
105
|
+
SI void store(P* ptr, const T& val) {
|
106
|
+
small_memcpy(ptr, &val, sizeof(val));
|
107
|
+
}
|
108
|
+
|
109
|
+
// (T)v is a cast when N == 1 and a bit-pun when N>1,
|
110
|
+
// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
|
111
|
+
template <typename D, typename S>
|
112
|
+
SI D cast(const S& v) {
|
113
|
+
#if N == 1
|
114
|
+
return (D)v;
|
115
|
+
#elif defined(__clang__)
|
116
|
+
return __builtin_convertvector(v, D);
|
117
|
+
#else
|
118
|
+
D d;
|
119
|
+
for (int i = 0; i < N; i++) {
|
120
|
+
d[i] = v[i];
|
121
|
+
}
|
122
|
+
return d;
|
123
|
+
#endif
|
124
|
+
}
|
125
|
+
|
126
|
+
template <typename D, typename S>
|
127
|
+
SI D bit_pun(const S& v) {
|
128
|
+
static_assert(sizeof(D) == sizeof(v), "");
|
129
|
+
return load<D>(&v);
|
130
|
+
}
|
131
|
+
|
132
|
+
// When we convert from float to fixed point, it's very common to want to round,
|
133
|
+
// and for some reason compilers generate better code when converting to int32_t.
|
134
|
+
// To serve both those ends, we use this function to_fixed() instead of direct cast().
|
135
|
+
#if defined(USING_NEON_FP16)
|
136
|
+
// NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
|
137
|
+
SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
|
138
|
+
#else
|
139
|
+
SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
|
140
|
+
#endif
|
141
|
+
|
142
|
+
|
143
|
+
// Sometimes we do something crazy on one branch of a conditonal,
|
144
|
+
// like divide by zero or convert a huge float to an integer,
|
145
|
+
// but then harmlessly select the other side. That trips up N==1
|
146
|
+
// sanitizer builds, so we make if_then_else() a macro to avoid
|
147
|
+
// evaluating the unused side.
|
148
|
+
|
149
|
+
#if N == 1
|
150
|
+
#define if_then_else(cond, t, e) ((cond) ? (t) : (e))
|
151
|
+
#else
|
152
|
+
template <typename C, typename T>
|
153
|
+
SI T if_then_else(C cond, T t, T e) {
|
154
|
+
return bit_pun<T>( ( cond & bit_pun<C>(t)) |
|
155
|
+
(~cond & bit_pun<C>(e)) );
|
156
|
+
}
|
157
|
+
#endif
|
158
|
+
|
159
|
+
|
160
|
+
// Widen a vector of half-precision floats to F, using hardware
// conversions where available and bit arithmetic otherwise.
SI F F_from_Half(U16 half) {
#if defined(USING_NEON_FP16)
    return bit_pun<F>(half);
#elif defined(USING_NEON_F16C)
    return vcvt_f32_f16((float16x4_t)half);
#elif defined(USING_AVX512F)
    return (F)_mm512_cvtph_ps((__m256i)half);
#elif defined(USING_AVX_F16C)
    typedef int16_t __attribute__((vector_size(16))) I16;
    return __builtin_ia32_vcvtph2ps256((I16)half);
#else
    U32 wide = cast<U32>(half);
    // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
    U32 s  = wide & 0x8000,
        em = wide ^ s;

    // Constructing the float is easy if the half is not denormalized.
    F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );

    // Simply flush all denorm half floats to zero.
    return if_then_else(em < 0x0400, F0, norm);
#endif
}
|
183
|
+
|
184
|
+
#if defined(__clang__)
|
185
|
+
// The -((127-15)<<10) underflows that side of the math when
|
186
|
+
// we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
|
187
|
+
__attribute__((no_sanitize("unsigned-integer-overflow")))
|
188
|
+
#endif
|
189
|
+
SI U16 Half_from_F(F f) {
|
190
|
+
#if defined(USING_NEON_FP16)
|
191
|
+
return bit_pun<U16>(f);
|
192
|
+
#elif defined(USING_NEON_F16C)
|
193
|
+
return (U16)vcvt_f16_f32(f);
|
194
|
+
#elif defined(USING_AVX512F)
|
195
|
+
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
|
196
|
+
#elif defined(USING_AVX_F16C)
|
197
|
+
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
|
198
|
+
#else
|
199
|
+
// A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
|
200
|
+
U32 sem = bit_pun<U32>(f),
|
201
|
+
s = sem & 0x80000000,
|
202
|
+
em = sem ^ s;
|
203
|
+
|
204
|
+
// For simplicity we flush denorm half floats (including all denorm floats) to zero.
|
205
|
+
return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
|
206
|
+
, (s>>16) + (em>>13) - ((127-15)<<10)));
|
207
|
+
#endif
|
208
|
+
}
|
209
|
+
|
210
|
+
// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
#if defined(USING_NEON_FP16)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16q_u8((uint8x16_t) v);
    }
#elif defined(USING_NEON)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16_u8((uint8x8_t) v);
    }
#endif
|
220
|
+
|
221
|
+
// Byte-swap each of the four 16-bit fields packed in a 64-bit lane.
SI U64 swap_endian_16x4(const U64& rgba) {
    return (rgba & 0x00ff00ff00ff00ff) << 8
         | (rgba & 0xff00ff00ff00ff00) >> 8;
}
|
225
|
+
|
226
|
+
#if defined(USING_NEON_FP16)
|
227
|
+
SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
|
228
|
+
SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
|
229
|
+
#elif defined(USING_NEON)
|
230
|
+
SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
|
231
|
+
SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
|
232
|
+
#else
|
233
|
+
SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
|
234
|
+
SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
|
235
|
+
#endif
|
236
|
+
|
237
|
+
// Lane-wise floor(), preferring a single hardware instruction per platform.
SI F floor_(F x) {
#if N == 1
    return floorf_(x);
#elif defined(USING_NEON_FP16)
    return vrndmq_f16(x);
#elif defined(__aarch64__)
    return vrndmq_f32(x);
#elif defined(USING_AVX512F)
    // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
    // and integer santizer catches that this implicit cast changes the
    // value from -1 to 65535.  We'll cast manually to work around it.
    // Read this as `return _mm512_floor_ps(x)`.
    return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
#elif defined(USING_AVX)
    return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
#elif defined(__SSE4_1__)
    return _mm_floor_ps(x);
#else
    // Round trip through integers with a truncating cast.
    F roundtrip = cast<F>(cast<I32>(x));
    // If x is negative, truncating gives the ceiling instead of the floor.
    return roundtrip - if_then_else(roundtrip > x, F1, F0);

    // This implementation fails for values of x that are outside
    // the range an integer can represent.  We expect most x to be small.
#endif
}
|
264
|
+
|
265
|
+
// Cheap log2 approximation: exponent bits give the integer part,
// a rational term in the mantissa refines the fraction.
SI F approx_log2(F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    return x;
#else
    // The first approximation of log2(x) is its exponent 'e', minus 127.
    I32 bits = bit_pun<I32>(x);

    F e = cast<F>(bits) * (1.0f / (1<<23));

    // If we use the mantissa too we can refine the error signficantly.
    F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );

    return e - 124.225514990f
             -   1.498030302f*m
             -   1.725879990f/(0.3520887068f + m);
#endif
}
|
283
|
+
|
284
|
+
// Natural log via log2: ln(x) = ln(2) * log2(x).
SI F approx_log(F x) {
    const float ln2 = 0.69314718f;
    return ln2 * approx_log2(x);
}
|
288
|
+
|
289
|
+
// Cheap 2^x approximation: build the float's bit pattern directly from x,
// refining with a rational term in the fractional part.
SI F approx_exp2(F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    return x;
#else
    F fract = x - floor_(x);

    F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
                                    -   1.490129070f*fract
                                    +  27.728023300f/(4.84252568f - fract));
    // Clamp to [0, +inf bits] before converting, so the pun below is sane.
    I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));

    return bit_pun<F>(bits);
#endif
}
|
304
|
+
|
305
|
+
// x^y via exp2(log2(x)*y).  0 and 1 are passed through exactly, which also
// sidesteps log2(0).
SI F approx_pow(F x, float y) {
    return if_then_else((x == F0) | (x == F1), x
                                             , approx_exp2(approx_log2(x) * y));
}
|
309
|
+
|
310
|
+
// e^x via exp2: e^x = 2^(x * log2(e)).
SI F approx_exp(F x) {
    const float log2_e = 1.4426950408889634074f;
    return approx_exp2(log2_e * x);
}
|
314
|
+
|
315
|
+
// Return tf(x).
|
316
|
+
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
|
317
|
+
#if defined(USING_NEON_FP16)
|
318
|
+
// TODO(mtklein)
|
319
|
+
(void)tf;
|
320
|
+
return x;
|
321
|
+
#else
|
322
|
+
// Peel off the sign bit and set x = |x|.
|
323
|
+
U32 bits = bit_pun<U32>(x),
|
324
|
+
sign = bits & 0x80000000;
|
325
|
+
x = bit_pun<F>(bits ^ sign);
|
326
|
+
|
327
|
+
// The transfer function has a linear part up to d, exponential at d and after.
|
328
|
+
F v = if_then_else(x < tf->d, tf->c*x + tf->f
|
329
|
+
, approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
|
330
|
+
|
331
|
+
// Tack the sign bit back on.
|
332
|
+
return bit_pun<F>(sign | bit_pun<U32>(v));
|
333
|
+
#endif
|
334
|
+
}
|
335
|
+
|
336
|
+
// Evaluate a PQ (perceptual quantizer) curve on |x|, sign-mirrored.
SI F apply_pq(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
                        / (tf->d + tf->e * approx_pow(x, tf->c)),
                     tf->f);

    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}
|
353
|
+
|
354
|
+
// Evaluate an HLG (hybrid log-gamma) OETF-inverse on |x|, sign-mirrored,
// scaled by K = tf->f + 1.
SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    const float R = tf->a, G = tf->b,
                a = tf->c, b = tf->d, c = tf->e,
                K = tf->f + 1;
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
                               , approx_exp((x-c)*a) + b);

    return K*bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}
|
373
|
+
|
374
|
+
// Evaluate the HLG forward curve (inverse of apply_hlg) on |x|/K,
// sign-mirrored.
SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
#if defined(USING_NEON_FP16)
    // TODO(mtklein)
    (void)tf;
    return x;
#else
    const float R = tf->a, G = tf->b,
                a = tf->c, b = tf->d, c = tf->e,
                K = tf->f + 1;
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);
    x /= K;

    F v = if_then_else(x <= 1, R * approx_pow(x, G)
                             , a * approx_log(x - b) + c);

    return bit_pun<F>(sign | bit_pun<U32>(v));
#endif
}
|
394
|
+
|
395
|
+
|
396
|
+
// Strided loads and stores of N values, starting from p.
|
397
|
+
template <typename T, typename P>
|
398
|
+
SI T load_3(const P* p) {
|
399
|
+
#if N == 1
|
400
|
+
return (T)p[0];
|
401
|
+
#elif N == 4
|
402
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9]};
|
403
|
+
#elif N == 8
|
404
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
|
405
|
+
#elif N == 16
|
406
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
|
407
|
+
p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
|
408
|
+
#endif
|
409
|
+
}
|
410
|
+
|
411
|
+
template <typename T, typename P>
|
412
|
+
SI T load_4(const P* p) {
|
413
|
+
#if N == 1
|
414
|
+
return (T)p[0];
|
415
|
+
#elif N == 4
|
416
|
+
return T{p[ 0],p[ 4],p[ 8],p[12]};
|
417
|
+
#elif N == 8
|
418
|
+
return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
|
419
|
+
#elif N == 16
|
420
|
+
return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
|
421
|
+
p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
|
422
|
+
#endif
|
423
|
+
}
|
424
|
+
|
425
|
+
template <typename T, typename P>
|
426
|
+
SI void store_3(P* p, const T& v) {
|
427
|
+
#if N == 1
|
428
|
+
p[0] = v;
|
429
|
+
#elif N == 4
|
430
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
431
|
+
#elif N == 8
|
432
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
433
|
+
p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
|
434
|
+
#elif N == 16
|
435
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
436
|
+
p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
|
437
|
+
p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
|
438
|
+
p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
|
439
|
+
#endif
|
440
|
+
}
|
441
|
+
|
442
|
+
template <typename T, typename P>
|
443
|
+
SI void store_4(P* p, const T& v) {
|
444
|
+
#if N == 1
|
445
|
+
p[0] = v;
|
446
|
+
#elif N == 4
|
447
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
448
|
+
#elif N == 8
|
449
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
450
|
+
p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
|
451
|
+
#elif N == 16
|
452
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
453
|
+
p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
|
454
|
+
p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
|
455
|
+
p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
|
456
|
+
#endif
|
457
|
+
}
|
458
|
+
|
459
|
+
|
460
|
+
// Gather N bytes from p at the (already scaled) indices in ix.
SI U8 gather_8(const uint8_t* p, I32 ix) {
#if N == 1
    U8 v = p[ix];
#elif N == 4
    U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
#elif N == 8
    U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
             p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
#elif N == 16
    U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
             p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
             p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
             p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
#endif
    return v;
}
|
476
|
+
|
477
|
+
// Gather N 16-bit values from p, indexed in units of uint16_t.
SI U16 gather_16(const uint8_t* p, I32 ix) {
    // Load the i'th 16-bit value from p.
    auto load_16 = [p](int i) {
        return load<uint16_t>(p + 2*i);
    };
#if N == 1
    U16 v = load_16(ix);
#elif N == 4
    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
#elif N == 8
    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
              load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
#elif N == 16
    U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
              load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
              load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
              load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
#endif
    return v;
}
|
497
|
+
|
498
|
+
// Gather N 32-bit values from p, indexed in units of uint32_t.
SI U32 gather_32(const uint8_t* p, I32 ix) {
    // Load the i'th 32-bit value from p.
    auto load_32 = [p](int i) {
        return load<uint32_t>(p + 4*i);
    };
#if N == 1
    U32 v = load_32(ix);
#elif N == 4
    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
#elif N == 8
    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
              load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
#elif N == 16
    U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
              load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
              load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
              load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
#endif
    // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
    return v;
}
|
519
|
+
|
520
|
+
// Gather N 24-bit (r,g,b) values, returning each in the low 24 bits of a
// 32-bit lane.  Reads one junk byte before each value, which callers
// guarantee is safe.
SI U32 gather_24(const uint8_t* p, I32 ix) {
    // First, back up a byte.  Any place we're gathering from has a safe junk byte to read
    // in front of it, either a previous table value, or some tag metadata.
    p -= 1;

    // Load the i'th 24-bit value from p, and 1 extra byte.
    auto load_24_32 = [p](int i) {
        return load<uint32_t>(p + 3*i);
    };

    // Now load multiples of 4 bytes (a junk byte, then r,g,b).
#if N == 1
    U32 v = load_24_32(ix);
#elif N == 4
    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
#elif N == 8 && !defined(USING_AVX2)
    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
              load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
#elif N == 8
    (void)load_24_32;
    // The gather instruction here doesn't need any particular alignment,
    // but the intrinsic takes a const int*.
    const int* p4 = bit_pun<const int*>(p);
    I32 zero = { 0, 0, 0, 0,  0, 0, 0, 0},
        mask = {-1,-1,-1,-1, -1,-1,-1,-1};
    #if defined(__clang__)
        U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
    #elif defined(__GNUC__)
        U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
    #endif
#elif N == 16
    (void)load_24_32;
    // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
    // And AVX-512 swapped the order of arguments.  :/
    const int* p4 = bit_pun<const int*>(p);
    U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
#endif

    // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
    return v >> 8;
}
|
561
|
+
|
562
|
+
#if !defined(__arm__)
|
563
|
+
SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
|
564
|
+
// As in gather_24(), with everything doubled.
|
565
|
+
p -= 2;
|
566
|
+
|
567
|
+
// Load the i'th 48-bit value from p, and 2 extra bytes.
|
568
|
+
auto load_48_64 = [p](int i) {
|
569
|
+
return load<uint64_t>(p + 6*i);
|
570
|
+
};
|
571
|
+
|
572
|
+
#if N == 1
|
573
|
+
*v = load_48_64(ix);
|
574
|
+
#elif N == 4
|
575
|
+
*v = U64{
|
576
|
+
load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
|
577
|
+
};
|
578
|
+
#elif N == 8 && !defined(USING_AVX2)
|
579
|
+
*v = U64{
|
580
|
+
load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
|
581
|
+
load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
|
582
|
+
};
|
583
|
+
#elif N == 8
|
584
|
+
(void)load_48_64;
|
585
|
+
typedef int32_t __attribute__((vector_size(16))) Half_I32;
|
586
|
+
typedef long long __attribute__((vector_size(32))) Half_I64;
|
587
|
+
|
588
|
+
// The gather instruction here doesn't need any particular alignment,
|
589
|
+
// but the intrinsic takes a const long long*.
|
590
|
+
const long long int* p8 = bit_pun<const long long int*>(p);
|
591
|
+
|
592
|
+
Half_I64 zero = { 0, 0, 0, 0},
|
593
|
+
mask = {-1,-1,-1,-1};
|
594
|
+
|
595
|
+
ix *= 6;
|
596
|
+
Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
|
597
|
+
ix_hi = { ix[4], ix[5], ix[6], ix[7] };
|
598
|
+
|
599
|
+
#if defined(__clang__)
|
600
|
+
Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
|
601
|
+
hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
|
602
|
+
#elif defined(__GNUC__)
|
603
|
+
Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
|
604
|
+
hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
|
605
|
+
#endif
|
606
|
+
store((char*)v + 0, lo);
|
607
|
+
store((char*)v + 32, hi);
|
608
|
+
#elif N == 16
|
609
|
+
(void)load_48_64;
|
610
|
+
const long long int* p8 = bit_pun<const long long int*>(p);
|
611
|
+
__m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
|
612
|
+
hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
|
613
|
+
store((char*)v + 0, lo);
|
614
|
+
store((char*)v + 64, hi);
|
615
|
+
#endif
|
616
|
+
|
617
|
+
*v >>= 16;
|
618
|
+
}
|
619
|
+
#endif
|
620
|
+
|
621
|
+
// Normalize 8-bit unorm values to float in [0,1].
SI F F_from_U8(U8 v) {
    return cast<F>(v) * (1/255.0f);
}
|
624
|
+
|
625
|
+
// Normalize big-endian 16-bit unorm values to float in [0,1].
SI F F_from_U16_BE(U16 v) {
    // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
    // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
    U16 lo = (v >> 8),
        hi = (v << 8) & 0xffff;
    return cast<F>(lo|hi) * (1/65535.0f);
}
|
632
|
+
|
633
|
+
// Convert float in [0,1] to rounded 16-bit unorm.
SI U16 U16_from_F(F v) {
    // 65535 == inf in FP16, so promote to FP32 before converting.
    return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
}
|
637
|
+
|
638
|
+
// Step v down by exactly one ULP (bit-level decrement of the float pattern).
SI F minus_1_ulp(F v) {
#if defined(USING_NEON_FP16)
    return bit_pun<F>( bit_pun<U16>(v) - 1 );
#else
    return bit_pun<F>( bit_pun<U32>(v) - 1 );
#endif
}
|
645
|
+
|
646
|
+
// Evaluate a sampled curve at v by linear interpolation between the two
// surrounding table entries (8- or 16-bit tables).
SI F table(const skcms_Curve* curve, F v) {
    // Clamp the input to [0,1], then scale to a table index.
    F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);

    // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
    I32 lo = cast<I32>(            ix      ),
        hi = cast<I32>(minus_1_ulp(ix+1.0f));
    F t = ix - cast<F>(lo);  // i.e. the fractional part of ix.

    // TODO: can we load l and h simultaneously?  Each entry in 'h' is either
    // the same as in 'l' or adjacent.  We have a rough idea that's it'd always be safe
    // to read adjacent entries and perhaps underflow the table by a byte or two
    // (it'd be junk, but always safe to read).  Not sure how to lerp yet.
    F l,h;
    if (curve->table_8) {
        l = F_from_U8(gather_8(curve->table_8, lo));
        h = F_from_U8(gather_8(curve->table_8, hi));
    } else {
        l = F_from_U16_BE(gather_16(curve->table_16, lo));
        h = F_from_U16_BE(gather_16(curve->table_16, hi));
    }
    return l + (h-l)*t;
}
|
669
|
+
|
670
|
+
// Fetch one packed 8-bit RGB CLUT entry per lane and normalize to [0,1].
SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
    U32 rgb = gather_24(grid_8, ix);

    *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
    *g = cast<F>((rgb >>  8) & 0xff) * (1/255.0f);
    *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
}
|
677
|
+
|
678
|
+
// Fetch one packed 8-bit RGBA CLUT entry per lane and normalize to [0,1].
SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
    // TODO: don't forget to optimize gather_32().
    U32 rgba = gather_32(grid_8, ix);

    *r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
    *g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
    *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
    *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
}
|
687
|
+
|
688
|
+
// Fetch one big-endian 16-bit RGB CLUT entry per lane and normalize to [0,1].
SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
#if defined(__arm__)
    // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
    *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
    *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
    *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
#else
    // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
    U64 rgb;
    gather_48(grid_16, ix, &rgb);
    rgb = swap_endian_16x4(rgb);

    *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
    *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
    *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
#endif
}
|
705
|
+
|
706
|
+
// Fetch one big-endian 16-bit RGBA CLUT entry per lane and normalize to [0,1].
SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
    // TODO: gather_64()-based fast path?
    *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
    *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
    *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
    *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
}
|
713
|
+
|
714
|
+
static void clut(uint32_t input_channels, uint32_t output_channels,
|
715
|
+
const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
|
716
|
+
F* r, F* g, F* b, F* a) {
|
717
|
+
|
718
|
+
const int dim = (int)input_channels;
|
719
|
+
assert (0 < dim && dim <= 4);
|
720
|
+
assert (output_channels == 3 ||
|
721
|
+
output_channels == 4);
|
722
|
+
|
723
|
+
// For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
|
724
|
+
I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
|
725
|
+
F weight[8]; // Weight for each contribution, again first low, then high.
|
726
|
+
|
727
|
+
// O(dim) work first: calculate index,weight from r,g,b,a.
|
728
|
+
const F inputs[] = { *r,*g,*b,*a };
|
729
|
+
for (int i = dim-1, stride = 1; i >= 0; i--) {
|
730
|
+
// x is where we logically want to sample the grid in the i-th dimension.
|
731
|
+
F x = inputs[i] * (float)(grid_points[i] - 1);
|
732
|
+
|
733
|
+
// But we can't index at floats. lo and hi are the two integer grid points surrounding x.
|
734
|
+
I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
|
735
|
+
hi = cast<I32>(minus_1_ulp(x+1.0f));
|
736
|
+
// Notice how we fold in the accumulated stride across previous dimensions here.
|
737
|
+
index[i+0] = lo * stride;
|
738
|
+
index[i+4] = hi * stride;
|
739
|
+
stride *= grid_points[i];
|
740
|
+
|
741
|
+
// We'll interpolate between those two integer grid points by t.
|
742
|
+
F t = x - cast<F>(lo); // i.e. fract(x)
|
743
|
+
weight[i+0] = 1-t;
|
744
|
+
weight[i+4] = t;
|
745
|
+
}
|
746
|
+
|
747
|
+
*r = *g = *b = F0;
|
748
|
+
if (output_channels == 4) {
|
749
|
+
*a = F0;
|
750
|
+
}
|
751
|
+
|
752
|
+
// We'll sample 2^dim == 1<<dim table entries per pixel,
|
753
|
+
// in all combinations of low and high in each dimension.
|
754
|
+
for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
|
755
|
+
|
756
|
+
// Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
|
757
|
+
// where 0 selects the low index contribution and its weight 1-t,
|
758
|
+
// or 4 the high index contribution and its weight t.
|
759
|
+
|
760
|
+
// Since 0<dim≤4, we can always just start off with the 0-th channel,
|
761
|
+
// then handle the others conditionally.
|
762
|
+
I32 ix = index [0 + (combo&1)*4];
|
763
|
+
F w = weight[0 + (combo&1)*4];
|
764
|
+
|
765
|
+
switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
|
766
|
+
case 3: ix += index [3 + (combo&8)/2];
|
767
|
+
w *= weight[3 + (combo&8)/2];
|
768
|
+
FALLTHROUGH;
|
769
|
+
// fall through
|
770
|
+
|
771
|
+
case 2: ix += index [2 + (combo&4)*1];
|
772
|
+
w *= weight[2 + (combo&4)*1];
|
773
|
+
FALLTHROUGH;
|
774
|
+
// fall through
|
775
|
+
|
776
|
+
case 1: ix += index [1 + (combo&2)*2];
|
777
|
+
w *= weight[1 + (combo&2)*2];
|
778
|
+
}
|
779
|
+
|
780
|
+
F R,G,B,A=F0;
|
781
|
+
if (output_channels == 3) {
|
782
|
+
if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
|
783
|
+
else { sample_clut_16(grid_16,ix, &R,&G,&B); }
|
784
|
+
} else {
|
785
|
+
if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
|
786
|
+
else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
|
787
|
+
}
|
788
|
+
*r += w*R;
|
789
|
+
*g += w*G;
|
790
|
+
*b += w*B;
|
791
|
+
*a += w*A;
|
792
|
+
}
|
793
|
+
}
|
794
|
+
|
795
|
+
// A2B convenience wrapper: alpha is carried through by value (CLUT output
// for A2B is 3-channel, so a is not written back meaningfully).
static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
    clut(a2b->input_channels, a2b->output_channels,
         a2b->grid_points, a2b->grid_8, a2b->grid_16,
         r,g,b,&a);
}
|
800
|
+
// B2A convenience wrapper: all four channels, including alpha, may be
// produced by the table.
static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
    clut(b2a->input_channels, b2a->output_channels,
         b2a->grid_points, b2a->grid_8, b2a->grid_16,
         r,g,b,a);
}
|
805
|
+
|
806
|
+
static void exec_ops(const Op* ops, const void** args,
|
807
|
+
const char* src, char* dst, int i) {
|
808
|
+
F r = F0, g = F0, b = F0, a = F1;
|
809
|
+
while (true) {
|
810
|
+
switch (*ops++) {
|
811
|
+
case Op_load_a8:{
|
812
|
+
a = F_from_U8(load<U8>(src + 1*i));
|
813
|
+
} break;
|
814
|
+
|
815
|
+
case Op_load_g8:{
|
816
|
+
r = g = b = F_from_U8(load<U8>(src + 1*i));
|
817
|
+
} break;
|
818
|
+
|
819
|
+
case Op_load_4444:{
|
820
|
+
U16 abgr = load<U16>(src + 2*i);
|
821
|
+
|
822
|
+
r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
|
823
|
+
g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
|
824
|
+
b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
|
825
|
+
a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
|
826
|
+
} break;
|
827
|
+
|
828
|
+
case Op_load_565:{
|
829
|
+
U16 rgb = load<U16>(src + 2*i);
|
830
|
+
|
831
|
+
r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
|
832
|
+
g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
|
833
|
+
b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
|
834
|
+
} break;
|
835
|
+
|
836
|
+
case Op_load_888:{
|
837
|
+
const uint8_t* rgb = (const uint8_t*)(src + 3*i);
|
838
|
+
#if defined(USING_NEON_FP16)
|
839
|
+
// See the explanation under USING_NEON below. This is that doubled up.
|
840
|
+
uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
|
841
|
+
v = vld3q_lane_u8(rgb+ 0, v, 0);
|
842
|
+
v = vld3q_lane_u8(rgb+ 3, v, 2);
|
843
|
+
v = vld3q_lane_u8(rgb+ 6, v, 4);
|
844
|
+
v = vld3q_lane_u8(rgb+ 9, v, 6);
|
845
|
+
|
846
|
+
v = vld3q_lane_u8(rgb+12, v, 8);
|
847
|
+
v = vld3q_lane_u8(rgb+15, v, 10);
|
848
|
+
v = vld3q_lane_u8(rgb+18, v, 12);
|
849
|
+
v = vld3q_lane_u8(rgb+21, v, 14);
|
850
|
+
|
851
|
+
r = cast<F>((U16)v.val[0]) * (1/255.0f);
|
852
|
+
g = cast<F>((U16)v.val[1]) * (1/255.0f);
|
853
|
+
b = cast<F>((U16)v.val[2]) * (1/255.0f);
|
854
|
+
#elif defined(USING_NEON)
|
855
|
+
// There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
|
856
|
+
// a time. Since we're doing that, we might as well load them into 16-bit lanes.
|
857
|
+
// (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
|
858
|
+
uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
|
859
|
+
v = vld3_lane_u8(rgb+0, v, 0);
|
860
|
+
v = vld3_lane_u8(rgb+3, v, 2);
|
861
|
+
v = vld3_lane_u8(rgb+6, v, 4);
|
862
|
+
v = vld3_lane_u8(rgb+9, v, 6);
|
863
|
+
|
864
|
+
// Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
|
865
|
+
// convert to F. (Again, U32 would be even better here if drop ARMv7 or split
|
866
|
+
// ARMv7 and ARMv8 impls.)
|
867
|
+
r = cast<F>((U16)v.val[0]) * (1/255.0f);
|
868
|
+
g = cast<F>((U16)v.val[1]) * (1/255.0f);
|
869
|
+
b = cast<F>((U16)v.val[2]) * (1/255.0f);
|
870
|
+
#else
|
871
|
+
r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
|
872
|
+
g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
|
873
|
+
b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
|
874
|
+
#endif
|
875
|
+
} break;
|
876
|
+
|
877
|
+
case Op_load_8888:{
|
878
|
+
U32 rgba = load<U32>(src + 4*i);
|
879
|
+
|
880
|
+
r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
|
881
|
+
g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
|
882
|
+
b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
|
883
|
+
a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
|
884
|
+
} break;
|
885
|
+
|
886
|
+
case Op_load_8888_palette8:{
|
887
|
+
const uint8_t* palette = (const uint8_t*) *args++;
|
888
|
+
I32 ix = cast<I32>(load<U8>(src + 1*i));
|
889
|
+
U32 rgba = gather_32(palette, ix);
|
890
|
+
|
891
|
+
r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
|
892
|
+
g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
|
893
|
+
b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
|
894
|
+
a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
|
895
|
+
} break;
|
896
|
+
|
897
|
+
case Op_load_1010102:{
|
898
|
+
U32 rgba = load<U32>(src + 4*i);
|
899
|
+
|
900
|
+
r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
|
901
|
+
g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
|
902
|
+
b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
|
903
|
+
a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
|
904
|
+
} break;
|
905
|
+
|
906
|
+
case Op_load_161616LE:{
|
907
|
+
uintptr_t ptr = (uintptr_t)(src + 6*i);
|
908
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
909
|
+
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
910
|
+
#if defined(USING_NEON_FP16)
|
911
|
+
uint16x8x3_t v = vld3q_u16(rgb);
|
912
|
+
r = cast<F>((U16)v.val[0]) * (1/65535.0f);
|
913
|
+
g = cast<F>((U16)v.val[1]) * (1/65535.0f);
|
914
|
+
b = cast<F>((U16)v.val[2]) * (1/65535.0f);
|
915
|
+
#elif defined(USING_NEON)
|
916
|
+
uint16x4x3_t v = vld3_u16(rgb);
|
917
|
+
r = cast<F>((U16)v.val[0]) * (1/65535.0f);
|
918
|
+
g = cast<F>((U16)v.val[1]) * (1/65535.0f);
|
919
|
+
b = cast<F>((U16)v.val[2]) * (1/65535.0f);
|
920
|
+
#else
|
921
|
+
r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
|
922
|
+
g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
|
923
|
+
b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
|
924
|
+
#endif
|
925
|
+
} break;
|
926
|
+
|
927
|
+
case Op_load_16161616LE:{
|
928
|
+
uintptr_t ptr = (uintptr_t)(src + 8*i);
|
929
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
930
|
+
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
931
|
+
#if defined(USING_NEON_FP16)
|
932
|
+
uint16x8x4_t v = vld4q_u16(rgba);
|
933
|
+
r = cast<F>((U16)v.val[0]) * (1/65535.0f);
|
934
|
+
g = cast<F>((U16)v.val[1]) * (1/65535.0f);
|
935
|
+
b = cast<F>((U16)v.val[2]) * (1/65535.0f);
|
936
|
+
a = cast<F>((U16)v.val[3]) * (1/65535.0f);
|
937
|
+
#elif defined(USING_NEON)
|
938
|
+
uint16x4x4_t v = vld4_u16(rgba);
|
939
|
+
r = cast<F>((U16)v.val[0]) * (1/65535.0f);
|
940
|
+
g = cast<F>((U16)v.val[1]) * (1/65535.0f);
|
941
|
+
b = cast<F>((U16)v.val[2]) * (1/65535.0f);
|
942
|
+
a = cast<F>((U16)v.val[3]) * (1/65535.0f);
|
943
|
+
#else
|
944
|
+
U64 px = load<U64>(rgba);
|
945
|
+
|
946
|
+
r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
|
947
|
+
g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
|
948
|
+
b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
|
949
|
+
a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
|
950
|
+
#endif
|
951
|
+
} break;
|
952
|
+
|
953
|
+
case Op_load_161616BE:{
|
954
|
+
uintptr_t ptr = (uintptr_t)(src + 6*i);
|
955
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
956
|
+
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
957
|
+
#if defined(USING_NEON_FP16)
|
958
|
+
uint16x8x3_t v = vld3q_u16(rgb);
|
959
|
+
r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
|
960
|
+
g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
|
961
|
+
b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
|
962
|
+
#elif defined(USING_NEON)
|
963
|
+
uint16x4x3_t v = vld3_u16(rgb);
|
964
|
+
r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
|
965
|
+
g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
|
966
|
+
b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
|
967
|
+
#else
|
968
|
+
U32 R = load_3<U32>(rgb+0),
|
969
|
+
G = load_3<U32>(rgb+1),
|
970
|
+
B = load_3<U32>(rgb+2);
|
971
|
+
// R,G,B are big-endian 16-bit, so byte swap them before converting to float.
|
972
|
+
r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
|
973
|
+
g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
|
974
|
+
b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
|
975
|
+
#endif
|
976
|
+
} break;
|
977
|
+
|
978
|
+
case Op_load_16161616BE:{
|
979
|
+
uintptr_t ptr = (uintptr_t)(src + 8*i);
|
980
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
981
|
+
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
982
|
+
#if defined(USING_NEON_FP16)
|
983
|
+
uint16x8x4_t v = vld4q_u16(rgba);
|
984
|
+
r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
|
985
|
+
g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
|
986
|
+
b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
|
987
|
+
a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
|
988
|
+
#elif defined(USING_NEON)
|
989
|
+
uint16x4x4_t v = vld4_u16(rgba);
|
990
|
+
r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
|
991
|
+
g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
|
992
|
+
b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
|
993
|
+
a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
|
994
|
+
#else
|
995
|
+
U64 px = swap_endian_16x4(load<U64>(rgba));
|
996
|
+
|
997
|
+
r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
|
998
|
+
g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
|
999
|
+
b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
|
1000
|
+
a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
|
1001
|
+
#endif
|
1002
|
+
} break;
|
1003
|
+
|
1004
|
+
case Op_load_hhh:{
|
1005
|
+
uintptr_t ptr = (uintptr_t)(src + 6*i);
|
1006
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
1007
|
+
const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
1008
|
+
#if defined(USING_NEON_FP16)
|
1009
|
+
uint16x8x3_t v = vld3q_u16(rgb);
|
1010
|
+
U16 R = (U16)v.val[0],
|
1011
|
+
G = (U16)v.val[1],
|
1012
|
+
B = (U16)v.val[2];
|
1013
|
+
#elif defined(USING_NEON)
|
1014
|
+
uint16x4x3_t v = vld3_u16(rgb);
|
1015
|
+
U16 R = (U16)v.val[0],
|
1016
|
+
G = (U16)v.val[1],
|
1017
|
+
B = (U16)v.val[2];
|
1018
|
+
#else
|
1019
|
+
U16 R = load_3<U16>(rgb+0),
|
1020
|
+
G = load_3<U16>(rgb+1),
|
1021
|
+
B = load_3<U16>(rgb+2);
|
1022
|
+
#endif
|
1023
|
+
r = F_from_Half(R);
|
1024
|
+
g = F_from_Half(G);
|
1025
|
+
b = F_from_Half(B);
|
1026
|
+
} break;
|
1027
|
+
|
1028
|
+
case Op_load_hhhh:{
|
1029
|
+
uintptr_t ptr = (uintptr_t)(src + 8*i);
|
1030
|
+
assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
|
1031
|
+
const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
|
1032
|
+
#if defined(USING_NEON_FP16)
|
1033
|
+
uint16x8x4_t v = vld4q_u16(rgba);
|
1034
|
+
U16 R = (U16)v.val[0],
|
1035
|
+
G = (U16)v.val[1],
|
1036
|
+
B = (U16)v.val[2],
|
1037
|
+
A = (U16)v.val[3];
|
1038
|
+
#elif defined(USING_NEON)
|
1039
|
+
uint16x4x4_t v = vld4_u16(rgba);
|
1040
|
+
U16 R = (U16)v.val[0],
|
1041
|
+
G = (U16)v.val[1],
|
1042
|
+
B = (U16)v.val[2],
|
1043
|
+
A = (U16)v.val[3];
|
1044
|
+
#else
|
1045
|
+
U64 px = load<U64>(rgba);
|
1046
|
+
U16 R = cast<U16>((px >> 0) & 0xffff),
|
1047
|
+
G = cast<U16>((px >> 16) & 0xffff),
|
1048
|
+
B = cast<U16>((px >> 32) & 0xffff),
|
1049
|
+
A = cast<U16>((px >> 48) & 0xffff);
|
1050
|
+
#endif
|
1051
|
+
r = F_from_Half(R);
|
1052
|
+
g = F_from_Half(G);
|
1053
|
+
b = F_from_Half(B);
|
1054
|
+
a = F_from_Half(A);
|
1055
|
+
} break;
|
1056
|
+
|
1057
|
+
case Op_load_fff:{
|
1058
|
+
uintptr_t ptr = (uintptr_t)(src + 12*i);
|
1059
|
+
assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
|
1060
|
+
const float* rgb = (const float*)ptr; // cast to const float* to be safe.
|
1061
|
+
#if defined(USING_NEON_FP16)
|
1062
|
+
float32x4x3_t lo = vld3q_f32(rgb + 0),
|
1063
|
+
hi = vld3q_f32(rgb + 12);
|
1064
|
+
r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
|
1065
|
+
g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
|
1066
|
+
b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
|
1067
|
+
#elif defined(USING_NEON)
|
1068
|
+
float32x4x3_t v = vld3q_f32(rgb);
|
1069
|
+
r = (F)v.val[0];
|
1070
|
+
g = (F)v.val[1];
|
1071
|
+
b = (F)v.val[2];
|
1072
|
+
#else
|
1073
|
+
r = load_3<F>(rgb+0);
|
1074
|
+
g = load_3<F>(rgb+1);
|
1075
|
+
b = load_3<F>(rgb+2);
|
1076
|
+
#endif
|
1077
|
+
} break;
|
1078
|
+
|
1079
|
+
case Op_load_ffff:{
|
1080
|
+
uintptr_t ptr = (uintptr_t)(src + 16*i);
|
1081
|
+
assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
|
1082
|
+
const float* rgba = (const float*)ptr; // cast to const float* to be safe.
|
1083
|
+
#if defined(USING_NEON_FP16)
|
1084
|
+
float32x4x4_t lo = vld4q_f32(rgba + 0),
|
1085
|
+
hi = vld4q_f32(rgba + 16);
|
1086
|
+
r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
|
1087
|
+
g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
|
1088
|
+
b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
|
1089
|
+
a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
|
1090
|
+
#elif defined(USING_NEON)
|
1091
|
+
float32x4x4_t v = vld4q_f32(rgba);
|
1092
|
+
r = (F)v.val[0];
|
1093
|
+
g = (F)v.val[1];
|
1094
|
+
b = (F)v.val[2];
|
1095
|
+
a = (F)v.val[3];
|
1096
|
+
#else
|
1097
|
+
r = load_4<F>(rgba+0);
|
1098
|
+
g = load_4<F>(rgba+1);
|
1099
|
+
b = load_4<F>(rgba+2);
|
1100
|
+
a = load_4<F>(rgba+3);
|
1101
|
+
#endif
|
1102
|
+
} break;
|
1103
|
+
|
1104
|
+
case Op_swap_rb:{
|
1105
|
+
F t = r;
|
1106
|
+
r = b;
|
1107
|
+
b = t;
|
1108
|
+
} break;
|
1109
|
+
|
1110
|
+
case Op_clamp:{
|
1111
|
+
r = max_(F0, min_(r, F1));
|
1112
|
+
g = max_(F0, min_(g, F1));
|
1113
|
+
b = max_(F0, min_(b, F1));
|
1114
|
+
a = max_(F0, min_(a, F1));
|
1115
|
+
} break;
|
1116
|
+
|
1117
|
+
case Op_invert:{
|
1118
|
+
r = F1 - r;
|
1119
|
+
g = F1 - g;
|
1120
|
+
b = F1 - b;
|
1121
|
+
a = F1 - a;
|
1122
|
+
} break;
|
1123
|
+
|
1124
|
+
case Op_force_opaque:{
|
1125
|
+
a = F1;
|
1126
|
+
} break;
|
1127
|
+
|
1128
|
+
case Op_premul:{
|
1129
|
+
r *= a;
|
1130
|
+
g *= a;
|
1131
|
+
b *= a;
|
1132
|
+
} break;
|
1133
|
+
|
1134
|
+
case Op_unpremul:{
|
1135
|
+
F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
|
1136
|
+
r *= scale;
|
1137
|
+
g *= scale;
|
1138
|
+
b *= scale;
|
1139
|
+
} break;
|
1140
|
+
|
1141
|
+
case Op_matrix_3x3:{
|
1142
|
+
const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
|
1143
|
+
const float* m = &matrix->vals[0][0];
|
1144
|
+
|
1145
|
+
F R = m[0]*r + m[1]*g + m[2]*b,
|
1146
|
+
G = m[3]*r + m[4]*g + m[5]*b,
|
1147
|
+
B = m[6]*r + m[7]*g + m[8]*b;
|
1148
|
+
|
1149
|
+
r = R;
|
1150
|
+
g = G;
|
1151
|
+
b = B;
|
1152
|
+
} break;
|
1153
|
+
|
1154
|
+
case Op_matrix_3x4:{
|
1155
|
+
const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
|
1156
|
+
const float* m = &matrix->vals[0][0];
|
1157
|
+
|
1158
|
+
F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
|
1159
|
+
G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
|
1160
|
+
B = m[8]*r + m[9]*g + m[10]*b + m[11];
|
1161
|
+
|
1162
|
+
r = R;
|
1163
|
+
g = G;
|
1164
|
+
b = B;
|
1165
|
+
} break;
|
1166
|
+
|
1167
|
+
case Op_lab_to_xyz:{
|
1168
|
+
// The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
|
1169
|
+
F L = r * 100.0f,
|
1170
|
+
A = g * 255.0f - 128.0f,
|
1171
|
+
B = b * 255.0f - 128.0f;
|
1172
|
+
|
1173
|
+
// Convert to CIE XYZ.
|
1174
|
+
F Y = (L + 16.0f) * (1/116.0f),
|
1175
|
+
X = Y + A*(1/500.0f),
|
1176
|
+
Z = Y - B*(1/200.0f);
|
1177
|
+
|
1178
|
+
X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
|
1179
|
+
Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
|
1180
|
+
Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
|
1181
|
+
|
1182
|
+
// Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
|
1183
|
+
r = X * 0.9642f;
|
1184
|
+
g = Y ;
|
1185
|
+
b = Z * 0.8249f;
|
1186
|
+
} break;
|
1187
|
+
|
1188
|
+
// As above, in reverse.
|
1189
|
+
case Op_xyz_to_lab:{
|
1190
|
+
F X = r * (1/0.9642f),
|
1191
|
+
Y = g,
|
1192
|
+
Z = b * (1/0.8249f);
|
1193
|
+
|
1194
|
+
X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
|
1195
|
+
Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
|
1196
|
+
Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
|
1197
|
+
|
1198
|
+
F L = Y*116.0f - 16.0f,
|
1199
|
+
A = (X-Y)*500.0f,
|
1200
|
+
B = (Y-Z)*200.0f;
|
1201
|
+
|
1202
|
+
r = L * (1/100.f);
|
1203
|
+
g = (A + 128.0f) * (1/255.0f);
|
1204
|
+
b = (B + 128.0f) * (1/255.0f);
|
1205
|
+
} break;
|
1206
|
+
|
1207
|
+
case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
|
1208
|
+
case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
|
1209
|
+
case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
|
1210
|
+
case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
|
1211
|
+
|
1212
|
+
case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
|
1213
|
+
case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
|
1214
|
+
case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
|
1215
|
+
case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
|
1216
|
+
|
1217
|
+
case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
|
1218
|
+
case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
|
1219
|
+
case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
|
1220
|
+
case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
|
1221
|
+
|
1222
|
+
case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
|
1223
|
+
case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
|
1224
|
+
case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
|
1225
|
+
case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
|
1226
|
+
|
1227
|
+
case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
|
1228
|
+
case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
|
1229
|
+
case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
|
1230
|
+
case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
|
1231
|
+
|
1232
|
+
case Op_clut_A2B: {
|
1233
|
+
const skcms_A2B* a2b = (const skcms_A2B*) *args++;
|
1234
|
+
clut(a2b, &r,&g,&b,a);
|
1235
|
+
|
1236
|
+
if (a2b->input_channels == 4) {
|
1237
|
+
// CMYK is opaque.
|
1238
|
+
a = F1;
|
1239
|
+
}
|
1240
|
+
} break;
|
1241
|
+
|
1242
|
+
case Op_clut_B2A: {
|
1243
|
+
const skcms_B2A* b2a = (const skcms_B2A*) *args++;
|
1244
|
+
clut(b2a, &r,&g,&b,&a);
|
1245
|
+
} break;
|
1246
|
+
|
1247
|
+
// Notice, from here on down the store_ ops all return, ending the loop.
|
1248
|
+
|
1249
|
+
case Op_store_a8: {
|
1250
|
+
store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
|
1251
|
+
} return;
|
1252
|
+
|
1253
|
+
case Op_store_g8: {
|
1254
|
+
// g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
|
1255
|
+
store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
|
1256
|
+
} return;
|
1257
|
+
|
1258
|
+
case Op_store_4444: {
|
1259
|
+
store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
|
1260
|
+
| cast<U16>(to_fixed(g * 15) << 8)
|
1261
|
+
| cast<U16>(to_fixed(b * 15) << 4)
|
1262
|
+
| cast<U16>(to_fixed(a * 15) << 0));
|
1263
|
+
} return;
|
1264
|
+
|
1265
|
+
case Op_store_565: {
|
1266
|
+
store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
|
1267
|
+
| cast<U16>(to_fixed(g * 63) << 5 )
|
1268
|
+
| cast<U16>(to_fixed(b * 31) << 11 ));
|
1269
|
+
} return;
|
1270
|
+
|
1271
|
+
case Op_store_888: {
|
1272
|
+
uint8_t* rgb = (uint8_t*)dst + 3*i;
|
1273
|
+
#if defined(USING_NEON_FP16)
|
1274
|
+
// See the explanation under USING_NEON below. This is that doubled up.
|
1275
|
+
U16 R = to_fixed(r * 255),
|
1276
|
+
G = to_fixed(g * 255),
|
1277
|
+
B = to_fixed(b * 255);
|
1278
|
+
|
1279
|
+
uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
|
1280
|
+
vst3q_lane_u8(rgb+ 0, v, 0);
|
1281
|
+
vst3q_lane_u8(rgb+ 3, v, 2);
|
1282
|
+
vst3q_lane_u8(rgb+ 6, v, 4);
|
1283
|
+
vst3q_lane_u8(rgb+ 9, v, 6);
|
1284
|
+
|
1285
|
+
vst3q_lane_u8(rgb+12, v, 8);
|
1286
|
+
vst3q_lane_u8(rgb+15, v, 10);
|
1287
|
+
vst3q_lane_u8(rgb+18, v, 12);
|
1288
|
+
vst3q_lane_u8(rgb+21, v, 14);
|
1289
|
+
#elif defined(USING_NEON)
|
1290
|
+
// Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
|
1291
|
+
// get there via U16 to save some instructions converting to float. And just
|
1292
|
+
// like load_888, we'd prefer to go via U32 but for ARMv7 support.
|
1293
|
+
U16 R = cast<U16>(to_fixed(r * 255)),
|
1294
|
+
G = cast<U16>(to_fixed(g * 255)),
|
1295
|
+
B = cast<U16>(to_fixed(b * 255));
|
1296
|
+
|
1297
|
+
uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
|
1298
|
+
vst3_lane_u8(rgb+0, v, 0);
|
1299
|
+
vst3_lane_u8(rgb+3, v, 2);
|
1300
|
+
vst3_lane_u8(rgb+6, v, 4);
|
1301
|
+
vst3_lane_u8(rgb+9, v, 6);
|
1302
|
+
#else
|
1303
|
+
store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
|
1304
|
+
store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
|
1305
|
+
store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
|
1306
|
+
#endif
|
1307
|
+
} return;
|
1308
|
+
|
1309
|
+
case Op_store_8888: {
|
1310
|
+
store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
|
1311
|
+
| cast<U32>(to_fixed(g * 255)) << 8
|
1312
|
+
| cast<U32>(to_fixed(b * 255)) << 16
|
1313
|
+
| cast<U32>(to_fixed(a * 255)) << 24);
|
1314
|
+
} return;
|
1315
|
+
|
1316
|
+
case Op_store_1010102: {
|
1317
|
+
store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
|
1318
|
+
| cast<U32>(to_fixed(g * 1023)) << 10
|
1319
|
+
| cast<U32>(to_fixed(b * 1023)) << 20
|
1320
|
+
| cast<U32>(to_fixed(a * 3)) << 30);
|
1321
|
+
} return;
|
1322
|
+
|
1323
|
+
case Op_store_161616LE: {
|
1324
|
+
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1325
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1326
|
+
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1327
|
+
#if defined(USING_NEON_FP16)
|
1328
|
+
uint16x8x3_t v = {{
|
1329
|
+
(uint16x8_t)U16_from_F(r),
|
1330
|
+
(uint16x8_t)U16_from_F(g),
|
1331
|
+
(uint16x8_t)U16_from_F(b),
|
1332
|
+
}};
|
1333
|
+
vst3q_u16(rgb, v);
|
1334
|
+
#elif defined(USING_NEON)
|
1335
|
+
uint16x4x3_t v = {{
|
1336
|
+
(uint16x4_t)U16_from_F(r),
|
1337
|
+
(uint16x4_t)U16_from_F(g),
|
1338
|
+
(uint16x4_t)U16_from_F(b),
|
1339
|
+
}};
|
1340
|
+
vst3_u16(rgb, v);
|
1341
|
+
#else
|
1342
|
+
store_3(rgb+0, U16_from_F(r));
|
1343
|
+
store_3(rgb+1, U16_from_F(g));
|
1344
|
+
store_3(rgb+2, U16_from_F(b));
|
1345
|
+
#endif
|
1346
|
+
|
1347
|
+
} return;
|
1348
|
+
|
1349
|
+
case Op_store_16161616LE: {
|
1350
|
+
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1351
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1352
|
+
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1353
|
+
#if defined(USING_NEON_FP16)
|
1354
|
+
uint16x8x4_t v = {{
|
1355
|
+
(uint16x8_t)U16_from_F(r),
|
1356
|
+
(uint16x8_t)U16_from_F(g),
|
1357
|
+
(uint16x8_t)U16_from_F(b),
|
1358
|
+
(uint16x8_t)U16_from_F(a),
|
1359
|
+
}};
|
1360
|
+
vst4q_u16(rgba, v);
|
1361
|
+
#elif defined(USING_NEON)
|
1362
|
+
uint16x4x4_t v = {{
|
1363
|
+
(uint16x4_t)U16_from_F(r),
|
1364
|
+
(uint16x4_t)U16_from_F(g),
|
1365
|
+
(uint16x4_t)U16_from_F(b),
|
1366
|
+
(uint16x4_t)U16_from_F(a),
|
1367
|
+
}};
|
1368
|
+
vst4_u16(rgba, v);
|
1369
|
+
#else
|
1370
|
+
U64 px = cast<U64>(to_fixed(r * 65535)) << 0
|
1371
|
+
| cast<U64>(to_fixed(g * 65535)) << 16
|
1372
|
+
| cast<U64>(to_fixed(b * 65535)) << 32
|
1373
|
+
| cast<U64>(to_fixed(a * 65535)) << 48;
|
1374
|
+
store(rgba, px);
|
1375
|
+
#endif
|
1376
|
+
} return;
|
1377
|
+
|
1378
|
+
case Op_store_161616BE: {
|
1379
|
+
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1380
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1381
|
+
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1382
|
+
#if defined(USING_NEON_FP16)
|
1383
|
+
uint16x8x3_t v = {{
|
1384
|
+
(uint16x8_t)swap_endian_16(U16_from_F(r)),
|
1385
|
+
(uint16x8_t)swap_endian_16(U16_from_F(g)),
|
1386
|
+
(uint16x8_t)swap_endian_16(U16_from_F(b)),
|
1387
|
+
}};
|
1388
|
+
vst3q_u16(rgb, v);
|
1389
|
+
#elif defined(USING_NEON)
|
1390
|
+
uint16x4x3_t v = {{
|
1391
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
|
1392
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
|
1393
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
|
1394
|
+
}};
|
1395
|
+
vst3_u16(rgb, v);
|
1396
|
+
#else
|
1397
|
+
U32 R = to_fixed(r * 65535),
|
1398
|
+
G = to_fixed(g * 65535),
|
1399
|
+
B = to_fixed(b * 65535);
|
1400
|
+
store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
|
1401
|
+
store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
|
1402
|
+
store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
|
1403
|
+
#endif
|
1404
|
+
|
1405
|
+
} return;
|
1406
|
+
|
1407
|
+
case Op_store_16161616BE: {
|
1408
|
+
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1409
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1410
|
+
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1411
|
+
#if defined(USING_NEON_FP16)
|
1412
|
+
uint16x8x4_t v = {{
|
1413
|
+
(uint16x8_t)swap_endian_16(U16_from_F(r)),
|
1414
|
+
(uint16x8_t)swap_endian_16(U16_from_F(g)),
|
1415
|
+
(uint16x8_t)swap_endian_16(U16_from_F(b)),
|
1416
|
+
(uint16x8_t)swap_endian_16(U16_from_F(a)),
|
1417
|
+
}};
|
1418
|
+
vst4q_u16(rgba, v);
|
1419
|
+
#elif defined(USING_NEON)
|
1420
|
+
uint16x4x4_t v = {{
|
1421
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
|
1422
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
|
1423
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
|
1424
|
+
(uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
|
1425
|
+
}};
|
1426
|
+
vst4_u16(rgba, v);
|
1427
|
+
#else
|
1428
|
+
U64 px = cast<U64>(to_fixed(r * 65535)) << 0
|
1429
|
+
| cast<U64>(to_fixed(g * 65535)) << 16
|
1430
|
+
| cast<U64>(to_fixed(b * 65535)) << 32
|
1431
|
+
| cast<U64>(to_fixed(a * 65535)) << 48;
|
1432
|
+
store(rgba, swap_endian_16x4(px));
|
1433
|
+
#endif
|
1434
|
+
} return;
|
1435
|
+
|
1436
|
+
case Op_store_hhh: {
|
1437
|
+
uintptr_t ptr = (uintptr_t)(dst + 6*i);
|
1438
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1439
|
+
uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1440
|
+
|
1441
|
+
U16 R = Half_from_F(r),
|
1442
|
+
G = Half_from_F(g),
|
1443
|
+
B = Half_from_F(b);
|
1444
|
+
#if defined(USING_NEON_FP16)
|
1445
|
+
uint16x8x3_t v = {{
|
1446
|
+
(uint16x8_t)R,
|
1447
|
+
(uint16x8_t)G,
|
1448
|
+
(uint16x8_t)B,
|
1449
|
+
}};
|
1450
|
+
vst3q_u16(rgb, v);
|
1451
|
+
#elif defined(USING_NEON)
|
1452
|
+
uint16x4x3_t v = {{
|
1453
|
+
(uint16x4_t)R,
|
1454
|
+
(uint16x4_t)G,
|
1455
|
+
(uint16x4_t)B,
|
1456
|
+
}};
|
1457
|
+
vst3_u16(rgb, v);
|
1458
|
+
#else
|
1459
|
+
store_3(rgb+0, R);
|
1460
|
+
store_3(rgb+1, G);
|
1461
|
+
store_3(rgb+2, B);
|
1462
|
+
#endif
|
1463
|
+
} return;
|
1464
|
+
|
1465
|
+
case Op_store_hhhh: {
|
1466
|
+
uintptr_t ptr = (uintptr_t)(dst + 8*i);
|
1467
|
+
assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
|
1468
|
+
uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
|
1469
|
+
|
1470
|
+
U16 R = Half_from_F(r),
|
1471
|
+
G = Half_from_F(g),
|
1472
|
+
B = Half_from_F(b),
|
1473
|
+
A = Half_from_F(a);
|
1474
|
+
#if defined(USING_NEON_FP16)
|
1475
|
+
uint16x8x4_t v = {{
|
1476
|
+
(uint16x8_t)R,
|
1477
|
+
(uint16x8_t)G,
|
1478
|
+
(uint16x8_t)B,
|
1479
|
+
(uint16x8_t)A,
|
1480
|
+
}};
|
1481
|
+
vst4q_u16(rgba, v);
|
1482
|
+
#elif defined(USING_NEON)
|
1483
|
+
uint16x4x4_t v = {{
|
1484
|
+
(uint16x4_t)R,
|
1485
|
+
(uint16x4_t)G,
|
1486
|
+
(uint16x4_t)B,
|
1487
|
+
(uint16x4_t)A,
|
1488
|
+
}};
|
1489
|
+
vst4_u16(rgba, v);
|
1490
|
+
#else
|
1491
|
+
store(rgba, cast<U64>(R) << 0
|
1492
|
+
| cast<U64>(G) << 16
|
1493
|
+
| cast<U64>(B) << 32
|
1494
|
+
| cast<U64>(A) << 48);
|
1495
|
+
#endif
|
1496
|
+
|
1497
|
+
} return;
|
1498
|
+
|
1499
|
+
case Op_store_fff: {
|
1500
|
+
uintptr_t ptr = (uintptr_t)(dst + 12*i);
|
1501
|
+
assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
|
1502
|
+
float* rgb = (float*)ptr; // for this cast to float* to be safe.
|
1503
|
+
#if defined(USING_NEON_FP16)
|
1504
|
+
float32x4x3_t lo = {{
|
1505
|
+
vcvt_f32_f16(vget_low_f16(r)),
|
1506
|
+
vcvt_f32_f16(vget_low_f16(g)),
|
1507
|
+
vcvt_f32_f16(vget_low_f16(b)),
|
1508
|
+
}}, hi = {{
|
1509
|
+
vcvt_f32_f16(vget_high_f16(r)),
|
1510
|
+
vcvt_f32_f16(vget_high_f16(g)),
|
1511
|
+
vcvt_f32_f16(vget_high_f16(b)),
|
1512
|
+
}};
|
1513
|
+
vst3q_f32(rgb + 0, lo);
|
1514
|
+
vst3q_f32(rgb + 12, hi);
|
1515
|
+
#elif defined(USING_NEON)
|
1516
|
+
float32x4x3_t v = {{
|
1517
|
+
(float32x4_t)r,
|
1518
|
+
(float32x4_t)g,
|
1519
|
+
(float32x4_t)b,
|
1520
|
+
}};
|
1521
|
+
vst3q_f32(rgb, v);
|
1522
|
+
#else
|
1523
|
+
store_3(rgb+0, r);
|
1524
|
+
store_3(rgb+1, g);
|
1525
|
+
store_3(rgb+2, b);
|
1526
|
+
#endif
|
1527
|
+
} return;
|
1528
|
+
|
1529
|
+
case Op_store_ffff: {
|
1530
|
+
uintptr_t ptr = (uintptr_t)(dst + 16*i);
|
1531
|
+
assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
|
1532
|
+
float* rgba = (float*)ptr; // for this cast to float* to be safe.
|
1533
|
+
#if defined(USING_NEON_FP16)
|
1534
|
+
float32x4x4_t lo = {{
|
1535
|
+
vcvt_f32_f16(vget_low_f16(r)),
|
1536
|
+
vcvt_f32_f16(vget_low_f16(g)),
|
1537
|
+
vcvt_f32_f16(vget_low_f16(b)),
|
1538
|
+
vcvt_f32_f16(vget_low_f16(a)),
|
1539
|
+
}}, hi = {{
|
1540
|
+
vcvt_f32_f16(vget_high_f16(r)),
|
1541
|
+
vcvt_f32_f16(vget_high_f16(g)),
|
1542
|
+
vcvt_f32_f16(vget_high_f16(b)),
|
1543
|
+
vcvt_f32_f16(vget_high_f16(a)),
|
1544
|
+
}};
|
1545
|
+
vst4q_f32(rgba + 0, lo);
|
1546
|
+
vst4q_f32(rgba + 16, hi);
|
1547
|
+
#elif defined(USING_NEON)
|
1548
|
+
float32x4x4_t v = {{
|
1549
|
+
(float32x4_t)r,
|
1550
|
+
(float32x4_t)g,
|
1551
|
+
(float32x4_t)b,
|
1552
|
+
(float32x4_t)a,
|
1553
|
+
}};
|
1554
|
+
vst4q_f32(rgba, v);
|
1555
|
+
#else
|
1556
|
+
store_4(rgba+0, r);
|
1557
|
+
store_4(rgba+1, g);
|
1558
|
+
store_4(rgba+2, b);
|
1559
|
+
store_4(rgba+3, a);
|
1560
|
+
#endif
|
1561
|
+
} return;
|
1562
|
+
}
|
1563
|
+
}
|
1564
|
+
}
|
1565
|
+
|
1566
|
+
|
1567
|
+
static void run_program(const Op* program, const void** arguments,
|
1568
|
+
const char* src, char* dst, int n,
|
1569
|
+
const size_t src_bpp, const size_t dst_bpp) {
|
1570
|
+
int i = 0;
|
1571
|
+
while (n >= N) {
|
1572
|
+
exec_ops(program, arguments, src, dst, i);
|
1573
|
+
i += N;
|
1574
|
+
n -= N;
|
1575
|
+
}
|
1576
|
+
if (n > 0) {
|
1577
|
+
char tmp[4*4*N] = {0};
|
1578
|
+
|
1579
|
+
memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
|
1580
|
+
exec_ops(program, arguments, tmp, tmp, 0);
|
1581
|
+
memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
|
1582
|
+
}
|
1583
|
+
}
|
1584
|
+
|
1585
|
+
// Clean up any #defines we may have set so that we can be #included again.
|
1586
|
+
#if defined(USING_AVX)
|
1587
|
+
#undef USING_AVX
|
1588
|
+
#endif
|
1589
|
+
#if defined(USING_AVX_F16C)
|
1590
|
+
#undef USING_AVX_F16C
|
1591
|
+
#endif
|
1592
|
+
#if defined(USING_AVX2)
|
1593
|
+
#undef USING_AVX2
|
1594
|
+
#endif
|
1595
|
+
#if defined(USING_AVX512F)
|
1596
|
+
#undef USING_AVX512F
|
1597
|
+
#endif
|
1598
|
+
|
1599
|
+
#if defined(USING_NEON)
|
1600
|
+
#undef USING_NEON
|
1601
|
+
#endif
|
1602
|
+
#if defined(USING_NEON_F16C)
|
1603
|
+
#undef USING_NEON_F16C
|
1604
|
+
#endif
|
1605
|
+
#if defined(USING_NEON_FP16)
|
1606
|
+
#undef USING_NEON_FP16
|
1607
|
+
#endif
|
1608
|
+
|
1609
|
+
#undef FALLTHROUGH
|