@shopify/react-native-skia 0.1.158 → 0.1.160
- package/android/CMakeLists.txt +35 -11
- package/android/build.gradle +21 -25
- package/android/cpp/jni/JniLoad.cpp +2 -0
- package/android/cpp/jni/include/JniSkiaDomView.h +89 -0
- package/android/cpp/rnskia-android/SkiaOpenGLRenderer.cpp +4 -3
- package/android/cpp/rnskia-android/SkiaOpenGLRenderer.h +4 -3
- package/android/src/main/java/com/shopify/reactnative/skia/RNSkiaPackage.java +2 -1
- package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomView.java +45 -0
- package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomViewManager.java +64 -0
- package/cpp/api/JsiSkHostObjects.h +6 -0
- package/cpp/api/JsiSkImageFilterFactory.h +1 -1
- package/cpp/api/JsiSkPaint.h +9 -2
- package/cpp/api/JsiSkPath.h +1 -0
- package/cpp/api/JsiSkRuntimeEffect.h +36 -36
- package/cpp/jsi/JsiHostObject.cpp +16 -28
- package/cpp/jsi/JsiHostObject.h +127 -7
- package/cpp/jsi/JsiValue.cpp +346 -0
- package/cpp/jsi/JsiValue.h +222 -0
- package/cpp/jsi/JsiValueWrapper.h +33 -5
- package/cpp/rnskia/RNSkDomView.cpp +222 -0
- package/cpp/rnskia/RNSkDomView.h +140 -0
- package/cpp/rnskia/RNSkJsView.cpp +0 -4
- package/cpp/rnskia/RNSkJsView.h +6 -4
- package/cpp/rnskia/RNSkManager.cpp +7 -0
- package/cpp/rnskia/RNSkPictureView.h +5 -8
- package/cpp/rnskia/RNSkView.h +113 -5
- package/cpp/rnskia/dom/JsiDomApi.h +167 -0
- package/cpp/rnskia/dom/base/BaseNodeProp.h +72 -0
- package/cpp/rnskia/dom/base/DerivedNodeProp.h +187 -0
- package/cpp/rnskia/dom/base/DrawingContext.cpp +227 -0
- package/cpp/rnskia/dom/base/DrawingContext.h +136 -0
- package/cpp/rnskia/dom/base/JsiDependencyManager.h +293 -0
- package/cpp/rnskia/dom/base/JsiDomDeclarationNode.h +176 -0
- package/cpp/rnskia/dom/base/JsiDomDrawingNode.h +50 -0
- package/cpp/rnskia/dom/base/JsiDomNode.h +384 -0
- package/cpp/rnskia/dom/base/JsiDomRenderNode.h +267 -0
- package/cpp/rnskia/dom/base/NodeProp.h +130 -0
- package/cpp/rnskia/dom/base/NodePropsContainer.h +119 -0
- package/cpp/rnskia/dom/nodes/JsiBackdropFilterNode.h +38 -0
- package/cpp/rnskia/dom/nodes/JsiBlendNode.h +112 -0
- package/cpp/rnskia/dom/nodes/JsiBlurMaskNode.h +78 -0
- package/cpp/rnskia/dom/nodes/JsiBoxNode.h +104 -0
- package/cpp/rnskia/dom/nodes/JsiBoxShadowNode.h +33 -0
- package/cpp/rnskia/dom/nodes/JsiCircleNode.h +38 -0
- package/cpp/rnskia/dom/nodes/JsiColorFilterNodes.h +192 -0
- package/cpp/rnskia/dom/nodes/JsiCustomDrawingNode.h +123 -0
- package/cpp/rnskia/dom/nodes/JsiDiffRectNode.h +42 -0
- package/cpp/rnskia/dom/nodes/JsiFillNode.h +22 -0
- package/cpp/rnskia/dom/nodes/JsiGlyphsNode.h +56 -0
- package/cpp/rnskia/dom/nodes/JsiGroupNode.h +26 -0
- package/cpp/rnskia/dom/nodes/JsiImageFilterNodes.h +415 -0
- package/cpp/rnskia/dom/nodes/JsiImageNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiImageSvgNode.h +44 -0
- package/cpp/rnskia/dom/nodes/JsiLayerNode.h +64 -0
- package/cpp/rnskia/dom/nodes/JsiLineNode.h +43 -0
- package/cpp/rnskia/dom/nodes/JsiOvalNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiPaintNode.h +79 -0
- package/cpp/rnskia/dom/nodes/JsiPatchNode.h +54 -0
- package/cpp/rnskia/dom/nodes/JsiPathEffectNodes.h +315 -0
- package/cpp/rnskia/dom/nodes/JsiPathNode.h +181 -0
- package/cpp/rnskia/dom/nodes/JsiPictureNode.h +32 -0
- package/cpp/rnskia/dom/nodes/JsiPointsNode.h +51 -0
- package/cpp/rnskia/dom/nodes/JsiRRectNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiRectNode.h +34 -0
- package/cpp/rnskia/dom/nodes/JsiShaderNodes.h +517 -0
- package/cpp/rnskia/dom/nodes/JsiTextBlobNode.h +47 -0
- package/cpp/rnskia/dom/nodes/JsiTextNode.h +54 -0
- package/cpp/rnskia/dom/nodes/JsiTextPathNode.h +32 -0
- package/cpp/rnskia/dom/nodes/JsiVerticesNode.h +43 -0
- package/cpp/rnskia/dom/props/BezierProps.h +63 -0
- package/cpp/rnskia/dom/props/BlendModeProp.h +101 -0
- package/cpp/rnskia/dom/props/BoxShadowProps.h +62 -0
- package/cpp/rnskia/dom/props/CircleProp.h +46 -0
- package/cpp/rnskia/dom/props/ClipProp.h +62 -0
- package/cpp/rnskia/dom/props/ColorProp.h +80 -0
- package/cpp/rnskia/dom/props/DrawingProp.h +33 -0
- package/cpp/rnskia/dom/props/FontProp.h +34 -0
- package/cpp/rnskia/dom/props/GlyphsProp.h +53 -0
- package/cpp/rnskia/dom/props/ImageProps.h +173 -0
- package/cpp/rnskia/dom/props/LayerProp.h +50 -0
- package/cpp/rnskia/dom/props/MatrixProp.h +33 -0
- package/cpp/rnskia/dom/props/NumbersProp.h +63 -0
- package/cpp/rnskia/dom/props/PaintProps.h +171 -0
- package/cpp/rnskia/dom/props/PathProp.h +55 -0
- package/cpp/rnskia/dom/props/PictureProp.h +38 -0
- package/cpp/rnskia/dom/props/PointProp.h +72 -0
- package/cpp/rnskia/dom/props/PointsProp.h +83 -0
- package/cpp/rnskia/dom/props/RRectProp.h +134 -0
- package/cpp/rnskia/dom/props/RadiusProp.h +43 -0
- package/cpp/rnskia/dom/props/RectProp.h +118 -0
- package/cpp/rnskia/dom/props/StrokeProps.h +75 -0
- package/cpp/rnskia/dom/props/SvgProp.h +37 -0
- package/cpp/rnskia/dom/props/TextBlobProp.h +128 -0
- package/cpp/rnskia/dom/props/TileModeProp.h +50 -0
- package/cpp/rnskia/dom/props/TransformProp.h +80 -0
- package/cpp/rnskia/dom/props/TransformsProps.h +68 -0
- package/cpp/rnskia/dom/props/UniformsProp.h +194 -0
- package/cpp/rnskia/dom/props/VertexModeProp.h +47 -0
- package/cpp/rnskia/dom/props/VerticesProps.h +67 -0
- package/cpp/rnskia/values/RNSkReadonlyValue.h +13 -4
- package/cpp/skia/include/android/SkAndroidFrameworkUtils.h +35 -1
- package/cpp/skia/include/codec/SkAndroidCodec.h +17 -1
- package/cpp/skia/include/codec/SkCodec.h +8 -5
- package/cpp/skia/include/core/SkAnnotation.h +2 -0
- package/cpp/skia/include/core/SkBitmap.h +52 -1
- package/cpp/skia/include/core/SkBlendMode.h +2 -0
- package/cpp/skia/include/core/SkCanvas.h +52 -31
- package/cpp/skia/include/core/SkCapabilities.h +44 -0
- package/cpp/skia/include/core/SkColor.h +7 -0
- package/cpp/skia/include/core/SkColorFilter.h +37 -0
- package/cpp/skia/include/core/SkColorSpace.h +1 -1
- package/cpp/skia/include/core/SkFont.h +4 -0
- package/cpp/skia/include/core/SkFontMgr.h +3 -0
- package/cpp/skia/include/core/SkGraphics.h +9 -0
- package/cpp/skia/include/core/SkImage.h +77 -17
- package/cpp/skia/include/core/SkImageEncoder.h +5 -3
- package/cpp/skia/include/core/SkImageGenerator.h +27 -17
- package/cpp/skia/include/core/SkM44.h +1 -0
- package/cpp/skia/include/core/SkMesh.h +120 -34
- package/cpp/skia/include/core/SkMilestone.h +1 -1
- package/cpp/skia/include/core/SkOverdrawCanvas.h +2 -1
- package/cpp/skia/include/core/SkPaint.h +15 -2
- package/cpp/skia/include/core/SkPath.h +4 -0
- package/cpp/skia/include/core/SkPathBuilder.h +1 -1
- package/cpp/skia/include/core/SkPicture.h +0 -3
- package/cpp/skia/include/core/SkPictureRecorder.h +0 -2
- package/cpp/skia/include/core/SkPixmap.h +19 -0
- package/cpp/skia/include/core/SkRasterHandleAllocator.h +3 -1
- package/cpp/skia/include/core/SkRect.h +11 -4
- package/cpp/skia/include/core/SkRefCnt.h +13 -1
- package/cpp/skia/include/core/SkRegion.h +6 -0
- package/cpp/skia/include/core/SkSamplingOptions.h +8 -6
- package/cpp/skia/include/core/SkScalar.h +6 -25
- package/cpp/skia/include/core/SkShader.h +20 -12
- package/cpp/skia/include/core/SkSpan.h +51 -19
- package/cpp/skia/include/core/SkStream.h +2 -2
- package/cpp/skia/include/core/SkString.h +11 -3
- package/cpp/skia/include/core/SkSurface.h +85 -8
- package/cpp/skia/include/core/SkTextBlob.h +5 -2
- package/cpp/skia/include/core/SkTypes.h +11 -10
- package/cpp/skia/include/docs/SkPDFDocument.h +0 -5
- package/cpp/skia/include/effects/Sk1DPathEffect.h +6 -1
- package/cpp/skia/include/effects/Sk2DPathEffect.h +4 -1
- package/cpp/skia/include/effects/SkColorMatrix.h +1 -0
- package/cpp/skia/include/effects/SkColorMatrixFilter.h +5 -8
- package/cpp/skia/include/effects/SkCornerPathEffect.h +5 -1
- package/cpp/skia/include/effects/SkDashPathEffect.h +5 -1
- package/cpp/skia/include/effects/SkGradientShader.h +68 -38
- package/cpp/skia/include/effects/SkHighContrastFilter.h +5 -1
- package/cpp/skia/include/effects/SkImageFilters.h +5 -4
- package/cpp/skia/include/effects/SkLumaColorFilter.h +4 -1
- package/cpp/skia/include/effects/SkOpPathEffect.h +6 -2
- package/cpp/skia/include/effects/SkOverdrawColorFilter.h +5 -2
- package/cpp/skia/include/effects/SkRuntimeEffect.h +54 -62
- package/cpp/skia/include/effects/SkShaderMaskFilter.h +3 -1
- package/cpp/skia/include/effects/SkTableColorFilter.h +8 -21
- package/cpp/skia/include/effects/SkTableMaskFilter.h +5 -1
- package/cpp/skia/include/effects/SkTrimPathEffect.h +5 -1
- package/cpp/skia/include/encode/SkEncoder.h +17 -0
- package/cpp/skia/include/encode/SkWebpEncoder.h +17 -0
- package/cpp/skia/include/gpu/GpuTypes.h +18 -0
- package/cpp/skia/include/gpu/GrBackendSurface.h +38 -17
- package/cpp/skia/include/gpu/GrBackendSurfaceMutableState.h +6 -71
- package/cpp/skia/include/gpu/GrContextOptions.h +1 -1
- package/cpp/skia/include/gpu/GrContextThreadSafeProxy.h +10 -9
- package/cpp/skia/include/gpu/GrDirectContext.h +42 -22
- package/cpp/skia/include/gpu/GrRecordingContext.h +6 -3
- package/cpp/skia/include/gpu/GrTypes.h +11 -11
- package/cpp/skia/include/gpu/MutableTextureState.h +122 -0
- package/cpp/skia/include/gpu/gl/GrGLFunctions.h +1 -0
- package/cpp/skia/include/gpu/gl/GrGLInterface.h +1 -0
- package/cpp/skia/include/gpu/graphite/BackendTexture.h +7 -0
- package/cpp/skia/include/gpu/graphite/CombinationBuilder.h +195 -0
- package/cpp/skia/include/gpu/graphite/Context.h +47 -55
- package/cpp/skia/include/gpu/graphite/ContextOptions.h +85 -0
- package/cpp/skia/include/gpu/graphite/GraphiteTypes.h +1 -17
- package/cpp/skia/include/gpu/graphite/ImageProvider.h +61 -0
- package/cpp/skia/include/gpu/graphite/Recorder.h +87 -8
- package/cpp/skia/include/gpu/graphite/Recording.h +19 -9
- package/cpp/skia/include/gpu/graphite/TextureInfo.h +40 -8
- package/cpp/skia/include/gpu/graphite/dawn/DawnBackendContext.h +25 -0
- package/cpp/skia/include/gpu/graphite/mtl/MtlBackendContext.h +3 -2
- package/cpp/skia/include/gpu/graphite/vk/VulkanGraphiteTypes.h +69 -0
- package/cpp/skia/include/gpu/mtl/MtlMemoryAllocator.h +39 -0
- package/cpp/skia/include/gpu/vk/GrVkBackendContext.h +21 -19
- package/cpp/skia/include/gpu/vk/GrVkExtensions.h +2 -50
- package/cpp/skia/include/gpu/vk/GrVkMemoryAllocator.h +2 -127
- package/cpp/skia/include/gpu/vk/GrVkTypes.h +5 -43
- package/cpp/skia/include/gpu/vk/VulkanBackendContext.h +46 -0
- package/cpp/skia/include/gpu/vk/VulkanExtensions.h +67 -0
- package/cpp/skia/include/gpu/vk/VulkanMemoryAllocator.h +116 -0
- package/cpp/skia/include/gpu/vk/VulkanTypes.h +59 -0
- package/cpp/skia/include/pathops/SkPathOps.h +1 -1
- package/cpp/skia/include/private/SkColorData.h +10 -40
- package/cpp/skia/include/private/SkEncodedInfo.h +9 -3
- package/cpp/skia/include/private/SkFloatingPoint.h +9 -6
- package/cpp/skia/include/private/SkHalf.h +5 -52
- package/cpp/skia/include/private/SkMacros.h +1 -1
- package/cpp/skia/include/private/SkMalloc.h +4 -0
- package/cpp/skia/include/private/SkPathRef.h +10 -10
- package/cpp/skia/include/private/SkSLModifiers.h +59 -23
- package/cpp/skia/include/private/SkSLProgramKind.h +1 -0
- package/cpp/skia/include/private/SkSLSymbol.h +7 -3
- package/cpp/skia/include/private/SkStringView.h +4 -0
- package/cpp/skia/include/private/SkTArray.h +21 -7
- package/cpp/skia/include/private/SkTDArray.h +173 -285
- package/cpp/skia/include/private/SkTHash.h +33 -32
- package/cpp/skia/include/private/SkTemplates.h +24 -26
- package/cpp/skia/include/private/SkVx.h +218 -135
- package/cpp/skia/include/private/chromium/GrSlug.h +3 -65
- package/cpp/skia/include/private/chromium/SkChromeRemoteGlyphCache.h +6 -3
- package/cpp/skia/include/private/chromium/Slug.h +76 -0
- package/cpp/skia/include/private/gpu/ganesh/GrTypesPriv.h +6 -1
- package/cpp/skia/include/private/gpu/ganesh/GrVkTypesPriv.h +5 -39
- package/cpp/skia/include/private/gpu/graphite/VulkanGraphiteTypesPriv.h +63 -0
- package/cpp/skia/include/{gpu/vk/GrVkVulkan.h → private/gpu/vk/SkiaVulkan.h} +2 -2
- package/cpp/skia/include/private/gpu/vk/VulkanTypesPriv.h +57 -0
- package/cpp/skia/include/sksl/DSL.h +0 -1
- package/cpp/skia/include/sksl/DSLBlock.h +4 -18
- package/cpp/skia/include/sksl/DSLCase.h +2 -8
- package/cpp/skia/include/sksl/DSLCore.h +8 -15
- package/cpp/skia/include/sksl/DSLExpression.h +51 -142
- package/cpp/skia/include/sksl/DSLFunction.h +7 -15
- package/cpp/skia/include/sksl/DSLModifiers.h +5 -2
- package/cpp/skia/include/sksl/DSLStatement.h +4 -39
- package/cpp/skia/include/sksl/DSLSymbols.h +1 -11
- package/cpp/skia/include/sksl/DSLType.h +20 -12
- package/cpp/skia/include/sksl/DSLVar.h +56 -146
- package/cpp/skia/include/sksl/SkSLErrorReporter.h +2 -15
- package/cpp/skia/include/sksl/SkSLOperator.h +62 -59
- package/cpp/skia/include/sksl/SkSLPosition.h +2 -0
- package/cpp/skia/include/sksl/SkSLVersion.h +27 -0
- package/cpp/skia/include/svg/SkSVGCanvas.h +1 -0
- package/cpp/skia/include/utils/SkAnimCodecPlayer.h +1 -1
- package/cpp/skia/include/utils/SkBase64.h +2 -0
- package/cpp/skia/include/utils/SkCustomTypeface.h +24 -11
- package/cpp/skia/include/utils/SkEventTracer.h +12 -1
- package/cpp/skia/include/utils/SkNWayCanvas.h +11 -4
- package/cpp/skia/include/utils/SkPaintFilterCanvas.h +9 -4
- package/cpp/skia/include/utils/SkParse.h +3 -0
- package/cpp/skia/include/utils/SkShadowUtils.h +2 -0
- package/cpp/skia/include/utils/SkTextUtils.h +2 -1
- package/cpp/skia/{include/third_party → modules}/skcms/skcms.h +10 -0
- package/cpp/skia/modules/skcms/skcms_internal.h +56 -0
- package/cpp/skia/modules/skcms/src/Transform_inl.h +1609 -0
- package/cpp/skia/modules/skparagraph/include/DartTypes.h +153 -0
- package/cpp/skia/modules/skparagraph/include/FontArguments.h +46 -0
- package/cpp/skia/modules/skparagraph/include/FontCollection.h +84 -0
- package/cpp/skia/modules/skparagraph/include/Metrics.h +98 -0
- package/cpp/skia/modules/skparagraph/include/Paragraph.h +111 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphBuilder.h +69 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphCache.h +77 -0
- package/cpp/skia/modules/skparagraph/include/ParagraphStyle.h +143 -0
- package/cpp/skia/modules/skparagraph/include/TextShadow.h +30 -0
- package/cpp/skia/modules/skparagraph/include/TextStyle.h +352 -0
- package/cpp/skia/modules/skparagraph/include/TypefaceFontProvider.h +81 -0
- package/cpp/skia/modules/svg/include/SkSVGAttributeParser.h +1 -1
- package/cpp/skia/modules/svg/include/SkSVGTypes.h +3 -3
- package/cpp/skia/src/core/SkLRUCache.h +126 -0
- package/cpp/skia/src/core/SkTInternalLList.h +302 -0
- package/cpp/utils/RNSkTimingInfo.h +1 -0
- package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.h +15 -4
- package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.mm +41 -55
- package/ios/RNSkia-iOS/SkiaDomViewManager.h +8 -0
- package/ios/RNSkia-iOS/SkiaDomViewManager.mm +51 -0
- package/lib/commonjs/dom/nodes/JsiSkDOM.js +56 -56
- package/lib/commonjs/dom/nodes/JsiSkDOM.js.map +1 -1
- package/lib/commonjs/dom/nodes/RenderNode.js +1 -1
- package/lib/commonjs/dom/nodes/RenderNode.js.map +1 -1
- package/lib/commonjs/renderer/Canvas.js +24 -66
- package/lib/commonjs/renderer/Canvas.js.map +1 -1
- package/lib/commonjs/renderer/DependencyManager.js +0 -5
- package/lib/commonjs/renderer/DependencyManager.js.map +1 -1
- package/lib/commonjs/renderer/HostComponents.js.map +1 -1
- package/lib/commonjs/renderer/useCanvas.js +4 -18
- package/lib/commonjs/renderer/useCanvas.js.map +1 -1
- package/lib/commonjs/views/SkiaBaseWebView.js +15 -0
- package/lib/commonjs/views/SkiaBaseWebView.js.map +1 -1
- package/lib/commonjs/views/SkiaDomView.js +152 -0
- package/lib/commonjs/views/SkiaDomView.js.map +1 -0
- package/lib/commonjs/views/SkiaDomView.web.js +55 -0
- package/lib/commonjs/views/SkiaDomView.web.js.map +1 -0
- package/lib/commonjs/views/SkiaPictureView.js +16 -2
- package/lib/commonjs/views/SkiaPictureView.js.map +1 -1
- package/lib/commonjs/views/SkiaView.js +17 -2
- package/lib/commonjs/views/SkiaView.js.map +1 -1
- package/lib/commonjs/views/index.js +13 -0
- package/lib/commonjs/views/index.js.map +1 -1
- package/lib/commonjs/views/types.js.map +1 -1
- package/lib/module/dom/nodes/JsiSkDOM.js +56 -56
- package/lib/module/dom/nodes/JsiSkDOM.js.map +1 -1
- package/lib/module/dom/nodes/RenderNode.js +1 -1
- package/lib/module/dom/nodes/RenderNode.js.map +1 -1
- package/lib/module/renderer/Canvas.js +25 -66
- package/lib/module/renderer/Canvas.js.map +1 -1
- package/lib/module/renderer/DependencyManager.js +0 -5
- package/lib/module/renderer/DependencyManager.js.map +1 -1
- package/lib/module/renderer/HostComponents.js.map +1 -1
- package/lib/module/renderer/useCanvas.js +2 -13
- package/lib/module/renderer/useCanvas.js.map +1 -1
- package/lib/module/views/SkiaBaseWebView.js +15 -0
- package/lib/module/views/SkiaBaseWebView.js.map +1 -1
- package/lib/module/views/SkiaDomView.js +128 -0
- package/lib/module/views/SkiaDomView.js.map +1 -0
- package/lib/module/views/SkiaDomView.web.js +41 -0
- package/lib/module/views/SkiaDomView.web.js.map +1 -0
- package/lib/module/views/SkiaPictureView.js +14 -2
- package/lib/module/views/SkiaPictureView.js.map +1 -1
- package/lib/module/views/SkiaView.js +15 -2
- package/lib/module/views/SkiaView.js.map +1 -1
- package/lib/module/views/index.js +1 -0
- package/lib/module/views/index.js.map +1 -1
- package/lib/module/views/types.js.map +1 -1
- package/lib/typescript/src/dom/nodes/JsiSkDOM.d.ts +57 -64
- package/lib/typescript/src/renderer/Canvas.d.ts +5 -8
- package/lib/typescript/src/renderer/DependencyManager.d.ts +0 -2
- package/lib/typescript/src/renderer/HostComponents.d.ts +66 -3
- package/lib/typescript/src/renderer/useCanvas.d.ts +0 -6
- package/lib/typescript/src/views/SkiaBaseWebView.d.ts +4 -0
- package/lib/typescript/src/views/SkiaDomView.d.ts +31 -0
- package/lib/typescript/src/views/SkiaDomView.web.d.ts +7 -0
- package/lib/typescript/src/views/index.d.ts +1 -0
- package/lib/typescript/src/views/types.d.ts +12 -2
- package/libs/android/arm64-v8a/libskia.a +0 -0
- package/libs/android/arm64-v8a/libskottie.a +0 -0
- package/libs/android/arm64-v8a/libskparagraph.a +0 -0
- package/libs/android/arm64-v8a/libsksg.a +0 -0
- package/libs/android/arm64-v8a/libskshaper.a +0 -0
- package/libs/android/arm64-v8a/libskunicode.a +0 -0
- package/libs/android/arm64-v8a/libsvg.a +0 -0
- package/libs/android/armeabi-v7a/libskia.a +0 -0
- package/libs/android/armeabi-v7a/libskottie.a +0 -0
- package/libs/android/armeabi-v7a/libskparagraph.a +0 -0
- package/libs/android/armeabi-v7a/libsksg.a +0 -0
- package/libs/android/armeabi-v7a/libskshaper.a +0 -0
- package/libs/android/armeabi-v7a/libskunicode.a +0 -0
- package/libs/android/armeabi-v7a/libsvg.a +0 -0
- package/libs/android/x86/libskia.a +0 -0
- package/libs/android/x86/libskottie.a +0 -0
- package/libs/android/x86/libskparagraph.a +0 -0
- package/libs/android/x86/libsksg.a +0 -0
- package/libs/android/x86/libskshaper.a +0 -0
- package/libs/android/x86/libskunicode.a +0 -0
- package/libs/android/x86/libsvg.a +0 -0
- package/libs/android/x86_64/libskia.a +0 -0
- package/libs/android/x86_64/libskottie.a +0 -0
- package/libs/android/x86_64/libskparagraph.a +0 -0
- package/libs/android/x86_64/libsksg.a +0 -0
- package/libs/android/x86_64/libskshaper.a +0 -0
- package/libs/android/x86_64/libskunicode.a +0 -0
- package/libs/android/x86_64/libsvg.a +0 -0
- package/libs/ios/libskia.xcframework/ios-arm64_arm64e/libskia.a +0 -0
- package/libs/ios/libskia.xcframework/ios-arm64_arm64e_x86_64-simulator/libskia.a +0 -0
- package/libs/ios/libskottie.xcframework/ios-arm64_arm64e/libskottie.a +0 -0
- package/libs/ios/libskottie.xcframework/ios-arm64_arm64e_x86_64-simulator/libskottie.a +0 -0
- package/libs/ios/libskparagraph.xcframework/Info.plist +42 -0
- package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e/libskparagraph.a +0 -0
- package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e_x86_64-simulator/libskparagraph.a +0 -0
- package/libs/ios/libsksg.xcframework/Info.plist +5 -5
- package/libs/ios/libsksg.xcframework/ios-arm64_arm64e/libsksg.a +0 -0
- package/libs/ios/libsksg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsksg.a +0 -0
- package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e/libskshaper.a +0 -0
- package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e_x86_64-simulator/libskshaper.a +0 -0
- package/libs/ios/libskunicode.xcframework/Info.plist +42 -0
- package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e/libskunicode.a +0 -0
- package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e_x86_64-simulator/libskunicode.a +0 -0
- package/libs/ios/libsvg.xcframework/ios-arm64_arm64e/libsvg.a +0 -0
- package/libs/ios/libsvg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsvg.a +0 -0
- package/package.json +5 -3
- package/react-native-skia.podspec +5 -2
- package/src/dom/nodes/JsiSkDOM.ts +170 -56
- package/src/dom/nodes/RenderNode.ts +1 -1
- package/src/renderer/Canvas.tsx +37 -57
- package/src/renderer/DependencyManager.tsx +0 -5
- package/src/renderer/HostComponents.ts +152 -1
- package/src/renderer/useCanvas.ts +1 -15
- package/src/views/SkiaBaseWebView.tsx +9 -0
- package/src/views/SkiaDomView.tsx +120 -0
- package/src/views/SkiaDomView.web.tsx +37 -0
- package/src/views/SkiaPictureView.tsx +10 -2
- package/src/views/SkiaView.tsx +11 -3
- package/src/views/index.ts +1 -0
- package/src/views/types.ts +19 -2
- package/cpp/jsi/JsiSimpleValueWrapper.h +0 -99
- package/cpp/skia/include/c/sk_canvas.h +0 -159
- package/cpp/skia/include/c/sk_colorspace.h +0 -25
- package/cpp/skia/include/c/sk_data.h +0 -65
- package/cpp/skia/include/c/sk_image.h +0 -71
- package/cpp/skia/include/c/sk_imageinfo.h +0 -62
- package/cpp/skia/include/c/sk_maskfilter.h +0 -47
- package/cpp/skia/include/c/sk_matrix.h +0 -49
- package/cpp/skia/include/c/sk_paint.h +0 -145
- package/cpp/skia/include/c/sk_path.h +0 -102
- package/cpp/skia/include/c/sk_picture.h +0 -70
- package/cpp/skia/include/c/sk_shader.h +0 -143
- package/cpp/skia/include/c/sk_surface.h +0 -73
- package/cpp/skia/include/c/sk_types.h +0 -278
- package/cpp/skia/include/gpu/graphite/SkStuff.h +0 -47
- package/cpp/skia/include/private/SkNx.h +0 -430
- package/cpp/skia/include/private/SkNx_neon.h +0 -713
- package/cpp/skia/include/private/SkNx_sse.h +0 -823
- package/cpp/skia/include/sksl/DSLRuntimeEffects.h +0 -32
- package/cpp/skia/include/sksl/DSLWrapper.h +0 -77
@@ -0,0 +1,1609 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright 2018 Google Inc.
|
3
|
+
*
|
4
|
+
* Use of this source code is governed by a BSD-style license that can be
|
5
|
+
* found in the LICENSE file.
|
6
|
+
*/
|
7
|
+
|
8
|
+
// Intentionally NO #pragma once... included multiple times.
|
9
|
+
|
10
|
+
// This file is included from skcms.cc in a namespace with some pre-defines:
|
11
|
+
// - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
|
12
|
+
// - V<T>: a template to create a vector of N T's.
|
13
|
+
|
14
|
+
using F = V<Color>; // Called F for historic reasons... maybe rename C?
|
15
|
+
using I32 = V<int32_t>;
|
16
|
+
using U64 = V<uint64_t>;
|
17
|
+
using U32 = V<uint32_t>;
|
18
|
+
using U16 = V<uint16_t>;
|
19
|
+
using U8 = V<uint8_t>;
|
20
|
+
|
21
|
+
|
22
|
+
#if defined(__GNUC__) && !defined(__clang__)
|
23
|
+
// Once again, GCC is kind of weird, not allowing vector = scalar directly.
|
24
|
+
static constexpr F F0 = F() + 0.0f,
|
25
|
+
F1 = F() + 1.0f,
|
26
|
+
FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
|
27
|
+
#else
|
28
|
+
static constexpr F F0 = 0.0f,
|
29
|
+
F1 = 1.0f,
|
30
|
+
FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
|
31
|
+
#endif
|
32
|
+
|
33
|
+
// Instead of checking __AVX__ below, we'll check USING_AVX.
|
34
|
+
// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
|
35
|
+
// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
|
36
|
+
|
37
|
+
#if !defined(USING_AVX) && N == 8 && defined(__AVX__)
|
38
|
+
#define USING_AVX
|
39
|
+
#endif
|
40
|
+
#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
|
41
|
+
#define USING AVX_F16C
|
42
|
+
#endif
|
43
|
+
#if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
|
44
|
+
#define USING_AVX2
|
45
|
+
#endif
|
46
|
+
#if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
|
47
|
+
#define USING_AVX512F
|
48
|
+
#endif
|
49
|
+
|
50
|
+
// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
|
51
|
+
// This is more for organizational clarity... skcms.cc doesn't force these.
|
52
|
+
#if N > 1 && defined(__ARM_NEON)
|
53
|
+
#define USING_NEON
|
54
|
+
#if __ARM_FP & 2
|
55
|
+
#define USING_NEON_F16C
|
56
|
+
#endif
|
57
|
+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
|
58
|
+
#define USING_NEON_FP16
|
59
|
+
#endif
|
60
|
+
#endif
|
61
|
+
|
62
|
+
// These -Wvector-conversion warnings seem to trigger in very bogus situations,
|
63
|
+
// like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
|
64
|
+
#if defined(USING_NEON) && defined(__clang__)
|
65
|
+
#pragma clang diagnostic ignored "-Wvector-conversion"
|
66
|
+
#endif
|
67
|
+
|
68
|
+
// GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
|
69
|
+
// You'd see warnings like, "using AVX even though AVX is not enabled".
|
70
|
+
// We stifle these warnings; our helpers that return U64 are always inlined.
|
71
|
+
#if defined(__SSE__) && defined(__GNUC__)
|
72
|
+
#if !defined(__has_warning)
|
73
|
+
#pragma GCC diagnostic ignored "-Wpsabi"
|
74
|
+
#elif __has_warning("-Wpsabi")
|
75
|
+
#pragma GCC diagnostic ignored "-Wpsabi"
|
76
|
+
#endif
|
77
|
+
#endif
|
78
|
+
|
79
|
+
#if defined(__clang__)
|
80
|
+
#define FALLTHROUGH [[clang::fallthrough]]
|
81
|
+
#else
|
82
|
+
#define FALLTHROUGH
|
83
|
+
#endif
|
84
|
+
|
85
|
+
// We tag most helper functions as SI, to enforce good code generation
|
86
|
+
// but also work around what we think is a bug in GCC: when targeting 32-bit
|
87
|
+
// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
|
88
|
+
// MMX mm0 register, which seems to mess with unrelated code that later uses
|
89
|
+
// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
|
90
|
+
//
|
91
|
+
// It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
|
92
|
+
#if defined(__clang__) || defined(__GNUC__)
|
93
|
+
#define SI static inline __attribute__((always_inline))
|
94
|
+
#else
|
95
|
+
#define SI static inline
|
96
|
+
#endif
|
97
|
+
|
98
|
+
template <typename T, typename P>
|
99
|
+
SI T load(const P* ptr) {
|
100
|
+
T val;
|
101
|
+
small_memcpy(&val, ptr, sizeof(val));
|
102
|
+
return val;
|
103
|
+
}
|
104
|
+
template <typename T, typename P>
|
105
|
+
SI void store(P* ptr, const T& val) {
|
106
|
+
small_memcpy(ptr, &val, sizeof(val));
|
107
|
+
}
|
108
|
+
|
109
|
+
// (T)v is a cast when N == 1 and a bit-pun when N>1,
|
110
|
+
// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
|
111
|
+
template <typename D, typename S>
|
112
|
+
SI D cast(const S& v) {
|
113
|
+
#if N == 1
|
114
|
+
return (D)v;
|
115
|
+
#elif defined(__clang__)
|
116
|
+
return __builtin_convertvector(v, D);
|
117
|
+
#else
|
118
|
+
D d;
|
119
|
+
for (int i = 0; i < N; i++) {
|
120
|
+
d[i] = v[i];
|
121
|
+
}
|
122
|
+
return d;
|
123
|
+
#endif
|
124
|
+
}
|
125
|
+
|
126
|
+
template <typename D, typename S>
|
127
|
+
SI D bit_pun(const S& v) {
|
128
|
+
static_assert(sizeof(D) == sizeof(v), "");
|
129
|
+
return load<D>(&v);
|
130
|
+
}
|
131
|
+
|
132
|
+
// When we convert from float to fixed point, it's very common to want to round,
|
133
|
+
// and for some reason compilers generate better code when converting to int32_t.
|
134
|
+
// To serve both those ends, we use this function to_fixed() instead of direct cast().
|
135
|
+
#if defined(USING_NEON_FP16)
|
136
|
+
// NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
|
137
|
+
SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
|
138
|
+
#else
|
139
|
+
SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
|
140
|
+
#endif
|
141
|
+
|
142
|
+
|
143
|
+
// Sometimes we do something crazy on one branch of a conditonal,
|
144
|
+
// like divide by zero or convert a huge float to an integer,
|
145
|
+
// but then harmlessly select the other side. That trips up N==1
|
146
|
+
// sanitizer builds, so we make if_then_else() a macro to avoid
|
147
|
+
// evaluating the unused side.
|
148
|
+
|
149
|
+
#if N == 1
|
150
|
+
#define if_then_else(cond, t, e) ((cond) ? (t) : (e))
|
151
|
+
#else
|
152
|
+
template <typename C, typename T>
|
153
|
+
SI T if_then_else(C cond, T t, T e) {
|
154
|
+
return bit_pun<T>( ( cond & bit_pun<C>(t)) |
|
155
|
+
(~cond & bit_pun<C>(e)) );
|
156
|
+
}
|
157
|
+
#endif
|
158
|
+
|
159
|
+
|
160
|
+
SI F F_from_Half(U16 half) {
|
161
|
+
#if defined(USING_NEON_FP16)
|
162
|
+
return bit_pun<F>(half);
|
163
|
+
#elif defined(USING_NEON_F16C)
|
164
|
+
return vcvt_f32_f16((float16x4_t)half);
|
165
|
+
#elif defined(USING_AVX512F)
|
166
|
+
return (F)_mm512_cvtph_ps((__m256i)half);
|
167
|
+
#elif defined(USING_AVX_F16C)
|
168
|
+
typedef int16_t __attribute__((vector_size(16))) I16;
|
169
|
+
return __builtin_ia32_vcvtph2ps256((I16)half);
|
170
|
+
#else
|
171
|
+
U32 wide = cast<U32>(half);
|
172
|
+
// A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
|
173
|
+
U32 s = wide & 0x8000,
|
174
|
+
em = wide ^ s;
|
175
|
+
|
176
|
+
// Constructing the float is easy if the half is not denormalized.
|
177
|
+
F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
|
178
|
+
|
179
|
+
// Simply flush all denorm half floats to zero.
|
180
|
+
return if_then_else(em < 0x0400, F0, norm);
|
181
|
+
#endif
|
182
|
+
}
|
183
|
+
|
184
|
+
#if defined(__clang__)
|
185
|
+
// The -((127-15)<<10) underflows that side of the math when
|
186
|
+
// we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
|
187
|
+
__attribute__((no_sanitize("unsigned-integer-overflow")))
|
188
|
+
#endif
|
189
|
+
SI U16 Half_from_F(F f) {
|
190
|
+
#if defined(USING_NEON_FP16)
|
191
|
+
return bit_pun<U16>(f);
|
192
|
+
#elif defined(USING_NEON_F16C)
|
193
|
+
return (U16)vcvt_f16_f32(f);
|
194
|
+
#elif defined(USING_AVX512F)
|
195
|
+
return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
|
196
|
+
#elif defined(USING_AVX_F16C)
|
197
|
+
return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
|
198
|
+
#else
|
199
|
+
// A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
|
200
|
+
U32 sem = bit_pun<U32>(f),
|
201
|
+
s = sem & 0x80000000,
|
202
|
+
em = sem ^ s;
|
203
|
+
|
204
|
+
// For simplicity we flush denorm half floats (including all denorm floats) to zero.
|
205
|
+
return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
|
206
|
+
, (s>>16) + (em>>13) - ((127-15)<<10)));
|
207
|
+
#endif
|
208
|
+
}
|
209
|
+
|
210
|
+
// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
|
211
|
+
#if defined(USING_NEON_FP16)
|
212
|
+
SI U16 swap_endian_16(U16 v) {
|
213
|
+
return (U16)vrev16q_u8((uint8x16_t) v);
|
214
|
+
}
|
215
|
+
#elif defined(USING_NEON)
|
216
|
+
SI U16 swap_endian_16(U16 v) {
|
217
|
+
return (U16)vrev16_u8((uint8x8_t) v);
|
218
|
+
}
|
219
|
+
#endif
|
220
|
+
|
221
|
+
SI U64 swap_endian_16x4(const U64& rgba) {
|
222
|
+
return (rgba & 0x00ff00ff00ff00ff) << 8
|
223
|
+
| (rgba & 0xff00ff00ff00ff00) >> 8;
|
224
|
+
}
|
225
|
+
|
226
|
+
#if defined(USING_NEON_FP16)
|
227
|
+
SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
|
228
|
+
SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
|
229
|
+
#elif defined(USING_NEON)
|
230
|
+
SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
|
231
|
+
SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
|
232
|
+
#else
|
233
|
+
SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
|
234
|
+
SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
|
235
|
+
#endif
|
236
|
+
|
237
|
+
SI F floor_(F x) {
|
238
|
+
#if N == 1
|
239
|
+
return floorf_(x);
|
240
|
+
#elif defined(USING_NEON_FP16)
|
241
|
+
return vrndmq_f16(x);
|
242
|
+
#elif defined(__aarch64__)
|
243
|
+
return vrndmq_f32(x);
|
244
|
+
#elif defined(USING_AVX512F)
|
245
|
+
// Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
|
246
|
+
// and integer santizer catches that this implicit cast changes the
|
247
|
+
// value from -1 to 65535. We'll cast manually to work around it.
|
248
|
+
// Read this as `return _mm512_floor_ps(x)`.
|
249
|
+
return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
|
250
|
+
#elif defined(USING_AVX)
|
251
|
+
return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
|
252
|
+
#elif defined(__SSE4_1__)
|
253
|
+
return _mm_floor_ps(x);
|
254
|
+
#else
|
255
|
+
// Round trip through integers with a truncating cast.
|
256
|
+
F roundtrip = cast<F>(cast<I32>(x));
|
257
|
+
// If x is negative, truncating gives the ceiling instead of the floor.
|
258
|
+
return roundtrip - if_then_else(roundtrip > x, F1, F0);
|
259
|
+
|
260
|
+
// This implementation fails for values of x that are outside
|
261
|
+
// the range an integer can represent. We expect most x to be small.
|
262
|
+
#endif
|
263
|
+
}
|
264
|
+
|
265
|
+
SI F approx_log2(F x) {
|
266
|
+
#if defined(USING_NEON_FP16)
|
267
|
+
// TODO(mtklein)
|
268
|
+
return x;
|
269
|
+
#else
|
270
|
+
// The first approximation of log2(x) is its exponent 'e', minus 127.
|
271
|
+
I32 bits = bit_pun<I32>(x);
|
272
|
+
|
273
|
+
F e = cast<F>(bits) * (1.0f / (1<<23));
|
274
|
+
|
275
|
+
// If we use the mantissa too we can refine the error signficantly.
|
276
|
+
F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
|
277
|
+
|
278
|
+
return e - 124.225514990f
|
279
|
+
- 1.498030302f*m
|
280
|
+
- 1.725879990f/(0.3520887068f + m);
|
281
|
+
#endif
|
282
|
+
}
|
283
|
+
|
284
|
+
SI F approx_log(F x) {
|
285
|
+
const float ln2 = 0.69314718f;
|
286
|
+
return ln2 * approx_log2(x);
|
287
|
+
}
|
288
|
+
|
289
|
+
SI F approx_exp2(F x) {
|
290
|
+
#if defined(USING_NEON_FP16)
|
291
|
+
// TODO(mtklein)
|
292
|
+
return x;
|
293
|
+
#else
|
294
|
+
F fract = x - floor_(x);
|
295
|
+
|
296
|
+
F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
|
297
|
+
- 1.490129070f*fract
|
298
|
+
+ 27.728023300f/(4.84252568f - fract));
|
299
|
+
I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
|
300
|
+
|
301
|
+
return bit_pun<F>(bits);
|
302
|
+
#endif
|
303
|
+
}
|
304
|
+
|
305
|
+
SI F approx_pow(F x, float y) {
|
306
|
+
return if_then_else((x == F0) | (x == F1), x
|
307
|
+
, approx_exp2(approx_log2(x) * y));
|
308
|
+
}
|
309
|
+
|
310
|
+
SI F approx_exp(F x) {
|
311
|
+
const float log2_e = 1.4426950408889634074f;
|
312
|
+
return approx_exp2(log2_e * x);
|
313
|
+
}
|
314
|
+
|
315
|
+
// Return tf(x).
|
316
|
+
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
|
317
|
+
#if defined(USING_NEON_FP16)
|
318
|
+
// TODO(mtklein)
|
319
|
+
(void)tf;
|
320
|
+
return x;
|
321
|
+
#else
|
322
|
+
// Peel off the sign bit and set x = |x|.
|
323
|
+
U32 bits = bit_pun<U32>(x),
|
324
|
+
sign = bits & 0x80000000;
|
325
|
+
x = bit_pun<F>(bits ^ sign);
|
326
|
+
|
327
|
+
// The transfer function has a linear part up to d, exponential at d and after.
|
328
|
+
F v = if_then_else(x < tf->d, tf->c*x + tf->f
|
329
|
+
, approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
|
330
|
+
|
331
|
+
// Tack the sign bit back on.
|
332
|
+
return bit_pun<F>(sign | bit_pun<U32>(v));
|
333
|
+
#endif
|
334
|
+
}
|
335
|
+
|
336
|
+
SI F apply_pq(const skcms_TransferFunction* tf, F x) {
|
337
|
+
#if defined(USING_NEON_FP16)
|
338
|
+
// TODO(mtklein)
|
339
|
+
(void)tf;
|
340
|
+
return x;
|
341
|
+
#else
|
342
|
+
U32 bits = bit_pun<U32>(x),
|
343
|
+
sign = bits & 0x80000000;
|
344
|
+
x = bit_pun<F>(bits ^ sign);
|
345
|
+
|
346
|
+
F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
|
347
|
+
/ (tf->d + tf->e * approx_pow(x, tf->c)),
|
348
|
+
tf->f);
|
349
|
+
|
350
|
+
return bit_pun<F>(sign | bit_pun<U32>(v));
|
351
|
+
#endif
|
352
|
+
}
|
353
|
+
|
354
|
+
SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
|
355
|
+
#if defined(USING_NEON_FP16)
|
356
|
+
// TODO(mtklein)
|
357
|
+
(void)tf;
|
358
|
+
return x;
|
359
|
+
#else
|
360
|
+
const float R = tf->a, G = tf->b,
|
361
|
+
a = tf->c, b = tf->d, c = tf->e,
|
362
|
+
K = tf->f + 1;
|
363
|
+
U32 bits = bit_pun<U32>(x),
|
364
|
+
sign = bits & 0x80000000;
|
365
|
+
x = bit_pun<F>(bits ^ sign);
|
366
|
+
|
367
|
+
F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
|
368
|
+
, approx_exp((x-c)*a) + b);
|
369
|
+
|
370
|
+
return K*bit_pun<F>(sign | bit_pun<U32>(v));
|
371
|
+
#endif
|
372
|
+
}
|
373
|
+
|
374
|
+
SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
|
375
|
+
#if defined(USING_NEON_FP16)
|
376
|
+
// TODO(mtklein)
|
377
|
+
(void)tf;
|
378
|
+
return x;
|
379
|
+
#else
|
380
|
+
const float R = tf->a, G = tf->b,
|
381
|
+
a = tf->c, b = tf->d, c = tf->e,
|
382
|
+
K = tf->f + 1;
|
383
|
+
U32 bits = bit_pun<U32>(x),
|
384
|
+
sign = bits & 0x80000000;
|
385
|
+
x = bit_pun<F>(bits ^ sign);
|
386
|
+
x /= K;
|
387
|
+
|
388
|
+
F v = if_then_else(x <= 1, R * approx_pow(x, G)
|
389
|
+
, a * approx_log(x - b) + c);
|
390
|
+
|
391
|
+
return bit_pun<F>(sign | bit_pun<U32>(v));
|
392
|
+
#endif
|
393
|
+
}
|
394
|
+
|
395
|
+
|
396
|
+
// Strided loads and stores of N values, starting from p.
|
397
|
+
template <typename T, typename P>
|
398
|
+
SI T load_3(const P* p) {
|
399
|
+
#if N == 1
|
400
|
+
return (T)p[0];
|
401
|
+
#elif N == 4
|
402
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9]};
|
403
|
+
#elif N == 8
|
404
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
|
405
|
+
#elif N == 16
|
406
|
+
return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
|
407
|
+
p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
|
408
|
+
#endif
|
409
|
+
}
|
410
|
+
|
411
|
+
template <typename T, typename P>
|
412
|
+
SI T load_4(const P* p) {
|
413
|
+
#if N == 1
|
414
|
+
return (T)p[0];
|
415
|
+
#elif N == 4
|
416
|
+
return T{p[ 0],p[ 4],p[ 8],p[12]};
|
417
|
+
#elif N == 8
|
418
|
+
return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
|
419
|
+
#elif N == 16
|
420
|
+
return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
|
421
|
+
p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
|
422
|
+
#endif
|
423
|
+
}
|
424
|
+
|
425
|
+
template <typename T, typename P>
|
426
|
+
SI void store_3(P* p, const T& v) {
|
427
|
+
#if N == 1
|
428
|
+
p[0] = v;
|
429
|
+
#elif N == 4
|
430
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
431
|
+
#elif N == 8
|
432
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
433
|
+
p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
|
434
|
+
#elif N == 16
|
435
|
+
p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
|
436
|
+
p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
|
437
|
+
p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
|
438
|
+
p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
|
439
|
+
#endif
|
440
|
+
}
|
441
|
+
|
442
|
+
template <typename T, typename P>
|
443
|
+
SI void store_4(P* p, const T& v) {
|
444
|
+
#if N == 1
|
445
|
+
p[0] = v;
|
446
|
+
#elif N == 4
|
447
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
448
|
+
#elif N == 8
|
449
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
450
|
+
p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
|
451
|
+
#elif N == 16
|
452
|
+
p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
|
453
|
+
p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
|
454
|
+
p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
|
455
|
+
p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
|
456
|
+
#endif
|
457
|
+
}
|
458
|
+
|
459
|
+
|
460
|
+
SI U8 gather_8(const uint8_t* p, I32 ix) {
|
461
|
+
#if N == 1
|
462
|
+
U8 v = p[ix];
|
463
|
+
#elif N == 4
|
464
|
+
U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
|
465
|
+
#elif N == 8
|
466
|
+
U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
|
467
|
+
p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
|
468
|
+
#elif N == 16
|
469
|
+
U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
|
470
|
+
p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
|
471
|
+
p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
|
472
|
+
p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
|
473
|
+
#endif
|
474
|
+
return v;
|
475
|
+
}
|
476
|
+
|
477
|
+
SI U16 gather_16(const uint8_t* p, I32 ix) {
|
478
|
+
// Load the i'th 16-bit value from p.
|
479
|
+
auto load_16 = [p](int i) {
|
480
|
+
return load<uint16_t>(p + 2*i);
|
481
|
+
};
|
482
|
+
#if N == 1
|
483
|
+
U16 v = load_16(ix);
|
484
|
+
#elif N == 4
|
485
|
+
U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
|
486
|
+
#elif N == 8
|
487
|
+
U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
|
488
|
+
load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
|
489
|
+
#elif N == 16
|
490
|
+
U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
|
491
|
+
load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
|
492
|
+
load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
|
493
|
+
load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
|
494
|
+
#endif
|
495
|
+
return v;
|
496
|
+
}
|
497
|
+
|
498
|
+
SI U32 gather_32(const uint8_t* p, I32 ix) {
|
499
|
+
// Load the i'th 32-bit value from p.
|
500
|
+
auto load_32 = [p](int i) {
|
501
|
+
return load<uint32_t>(p + 4*i);
|
502
|
+
};
|
503
|
+
#if N == 1
|
504
|
+
U32 v = load_32(ix);
|
505
|
+
#elif N == 4
|
506
|
+
U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
|
507
|
+
#elif N == 8
|
508
|
+
U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
|
509
|
+
load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
|
510
|
+
#elif N == 16
|
511
|
+
U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
|
512
|
+
load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
|
513
|
+
load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
|
514
|
+
load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
|
515
|
+
#endif
|
516
|
+
// TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
|
517
|
+
return v;
|
518
|
+
}
|
519
|
+
|
520
|
+
SI U32 gather_24(const uint8_t* p, I32 ix) {
|
521
|
+
// First, back up a byte. Any place we're gathering from has a safe junk byte to read
|
522
|
+
// in front of it, either a previous table value, or some tag metadata.
|
523
|
+
p -= 1;
|
524
|
+
|
525
|
+
// Load the i'th 24-bit value from p, and 1 extra byte.
|
526
|
+
auto load_24_32 = [p](int i) {
|
527
|
+
return load<uint32_t>(p + 3*i);
|
528
|
+
};
|
529
|
+
|
530
|
+
// Now load multiples of 4 bytes (a junk byte, then r,g,b).
|
531
|
+
#if N == 1
|
532
|
+
U32 v = load_24_32(ix);
|
533
|
+
#elif N == 4
|
534
|
+
U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
|
535
|
+
#elif N == 8 && !defined(USING_AVX2)
|
536
|
+
U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
|
537
|
+
load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
|
538
|
+
#elif N == 8
|
539
|
+
(void)load_24_32;
|
540
|
+
// The gather instruction here doesn't need any particular alignment,
|
541
|
+
// but the intrinsic takes a const int*.
|
542
|
+
const int* p4 = bit_pun<const int*>(p);
|
543
|
+
I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
|
544
|
+
mask = {-1,-1,-1,-1, -1,-1,-1,-1};
|
545
|
+
#if defined(__clang__)
|
546
|
+
U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
|
547
|
+
#elif defined(__GNUC__)
|
548
|
+
U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
|
549
|
+
#endif
|
550
|
+
#elif N == 16
|
551
|
+
(void)load_24_32;
|
552
|
+
// The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
|
553
|
+
// And AVX-512 swapped the order of arguments. :/
|
554
|
+
const int* p4 = bit_pun<const int*>(p);
|
555
|
+
U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
|
556
|
+
#endif
|
557
|
+
|
558
|
+
// Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
|
559
|
+
return v >> 8;
|
560
|
+
}
|
561
|
+
|
562
|
+
#if !defined(__arm__)
|
563
|
+
SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
|
564
|
+
// As in gather_24(), with everything doubled.
|
565
|
+
p -= 2;
|
566
|
+
|
567
|
+
// Load the i'th 48-bit value from p, and 2 extra bytes.
|
568
|
+
auto load_48_64 = [p](int i) {
|
569
|
+
return load<uint64_t>(p + 6*i);
|
570
|
+
};
|
571
|
+
|
572
|
+
#if N == 1
|
573
|
+
*v = load_48_64(ix);
|
574
|
+
#elif N == 4
|
575
|
+
*v = U64{
|
576
|
+
load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
|
577
|
+
};
|
578
|
+
#elif N == 8 && !defined(USING_AVX2)
|
579
|
+
*v = U64{
|
580
|
+
load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
|
581
|
+
load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
|
582
|
+
};
|
583
|
+
#elif N == 8
|
584
|
+
(void)load_48_64;
|
585
|
+
typedef int32_t __attribute__((vector_size(16))) Half_I32;
|
586
|
+
typedef long long __attribute__((vector_size(32))) Half_I64;
|
587
|
+
|
588
|
+
// The gather instruction here doesn't need any particular alignment,
|
589
|
+
// but the intrinsic takes a const long long*.
|
590
|
+
const long long int* p8 = bit_pun<const long long int*>(p);
|
591
|
+
|
592
|
+
Half_I64 zero = { 0, 0, 0, 0},
|
593
|
+
mask = {-1,-1,-1,-1};
|
594
|
+
|
595
|
+
ix *= 6;
|
596
|
+
Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
|
597
|
+
ix_hi = { ix[4], ix[5], ix[6], ix[7] };
|
598
|
+
|
599
|
+
#if defined(__clang__)
|
600
|
+
Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
|
601
|
+
hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
|
602
|
+
#elif defined(__GNUC__)
|
603
|
+
Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
|
604
|
+
hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
|
605
|
+
#endif
|
606
|
+
store((char*)v + 0, lo);
|
607
|
+
store((char*)v + 32, hi);
|
608
|
+
#elif N == 16
|
609
|
+
(void)load_48_64;
|
610
|
+
const long long int* p8 = bit_pun<const long long int*>(p);
|
611
|
+
__m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
|
612
|
+
hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
|
613
|
+
store((char*)v + 0, lo);
|
614
|
+
store((char*)v + 64, hi);
|
615
|
+
#endif
|
616
|
+
|
617
|
+
*v >>= 16;
|
618
|
+
}
|
619
|
+
#endif
|
620
|
+
|
621
|
+
SI F F_from_U8(U8 v) {
|
622
|
+
return cast<F>(v) * (1/255.0f);
|
623
|
+
}
|
624
|
+
|
625
|
+
SI F F_from_U16_BE(U16 v) {
|
626
|
+
// All 16-bit ICC values are big-endian, so we byte swap before converting to float.
|
627
|
+
// MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
|
628
|
+
U16 lo = (v >> 8),
|
629
|
+
hi = (v << 8) & 0xffff;
|
630
|
+
return cast<F>(lo|hi) * (1/65535.0f);
|
631
|
+
}
|
632
|
+
|
633
|
+
SI U16 U16_from_F(F v) {
|
634
|
+
// 65535 == inf in FP16, so promote to FP32 before converting.
|
635
|
+
return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
|
636
|
+
}
|
637
|
+
|
638
|
+
SI F minus_1_ulp(F v) {
|
639
|
+
#if defined(USING_NEON_FP16)
|
640
|
+
return bit_pun<F>( bit_pun<U16>(v) - 1 );
|
641
|
+
#else
|
642
|
+
return bit_pun<F>( bit_pun<U32>(v) - 1 );
|
643
|
+
#endif
|
644
|
+
}
|
645
|
+
|
646
|
+
SI F table(const skcms_Curve* curve, F v) {
|
647
|
+
// Clamp the input to [0,1], then scale to a table index.
|
648
|
+
F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
|
649
|
+
|
650
|
+
// We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
|
651
|
+
I32 lo = cast<I32>( ix ),
|
652
|
+
hi = cast<I32>(minus_1_ulp(ix+1.0f));
|
653
|
+
F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
|
654
|
+
|
655
|
+
// TODO: can we load l and h simultaneously? Each entry in 'h' is either
|
656
|
+
// the same as in 'l' or adjacent. We have a rough idea that's it'd always be safe
|
657
|
+
// to read adjacent entries and perhaps underflow the table by a byte or two
|
658
|
+
// (it'd be junk, but always safe to read). Not sure how to lerp yet.
|
659
|
+
F l,h;
|
660
|
+
if (curve->table_8) {
|
661
|
+
l = F_from_U8(gather_8(curve->table_8, lo));
|
662
|
+
h = F_from_U8(gather_8(curve->table_8, hi));
|
663
|
+
} else {
|
664
|
+
l = F_from_U16_BE(gather_16(curve->table_16, lo));
|
665
|
+
h = F_from_U16_BE(gather_16(curve->table_16, hi));
|
666
|
+
}
|
667
|
+
return l + (h-l)*t;
|
668
|
+
}
|
669
|
+
|
670
|
+
SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
|
671
|
+
U32 rgb = gather_24(grid_8, ix);
|
672
|
+
|
673
|
+
*r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
|
674
|
+
*g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
|
675
|
+
*b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
|
676
|
+
}
|
677
|
+
|
678
|
+
SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
|
679
|
+
// TODO: don't forget to optimize gather_32().
|
680
|
+
U32 rgba = gather_32(grid_8, ix);
|
681
|
+
|
682
|
+
*r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
|
683
|
+
*g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
|
684
|
+
*b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
|
685
|
+
*a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
|
686
|
+
}
|
687
|
+
|
688
|
+
SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
|
689
|
+
#if defined(__arm__)
|
690
|
+
// This is up to 2x faster on 32-bit ARM than the #else-case fast path.
|
691
|
+
*r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
|
692
|
+
*g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
|
693
|
+
*b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
|
694
|
+
#else
|
695
|
+
// This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
|
696
|
+
U64 rgb;
|
697
|
+
gather_48(grid_16, ix, &rgb);
|
698
|
+
rgb = swap_endian_16x4(rgb);
|
699
|
+
|
700
|
+
*r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
|
701
|
+
*g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
|
702
|
+
*b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
|
703
|
+
#endif
|
704
|
+
}
|
705
|
+
|
706
|
+
SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
|
707
|
+
// TODO: gather_64()-based fast path?
|
708
|
+
*r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
|
709
|
+
*g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
|
710
|
+
*b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
|
711
|
+
*a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
|
712
|
+
}
|
713
|
+
|
714
|
+
static void clut(uint32_t input_channels, uint32_t output_channels,
|
715
|
+
const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
|
716
|
+
F* r, F* g, F* b, F* a) {
|
717
|
+
|
718
|
+
const int dim = (int)input_channels;
|
719
|
+
assert (0 < dim && dim <= 4);
|
720
|
+
assert (output_channels == 3 ||
|
721
|
+
output_channels == 4);
|
722
|
+
|
723
|
+
// For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
|
724
|
+
I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
|
725
|
+
F weight[8]; // Weight for each contribution, again first low, then high.
|
726
|
+
|
727
|
+
// O(dim) work first: calculate index,weight from r,g,b,a.
|
728
|
+
const F inputs[] = { *r,*g,*b,*a };
|
729
|
+
for (int i = dim-1, stride = 1; i >= 0; i--) {
|
730
|
+
// x is where we logically want to sample the grid in the i-th dimension.
|
731
|
+
F x = inputs[i] * (float)(grid_points[i] - 1);
|
732
|
+
|
733
|
+
// But we can't index at floats. lo and hi are the two integer grid points surrounding x.
|
734
|
+
I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
|
735
|
+
hi = cast<I32>(minus_1_ulp(x+1.0f));
|
736
|
+
// Notice how we fold in the accumulated stride across previous dimensions here.
|
737
|
+
index[i+0] = lo * stride;
|
738
|
+
index[i+4] = hi * stride;
|
739
|
+
stride *= grid_points[i];
|
740
|
+
|
741
|
+
// We'll interpolate between those two integer grid points by t.
|
742
|
+
F t = x - cast<F>(lo); // i.e. fract(x)
|
743
|
+
weight[i+0] = 1-t;
|
744
|
+
weight[i+4] = t;
|
745
|
+
}
|
746
|
+
|
747
|
+
*r = *g = *b = F0;
|
748
|
+
if (output_channels == 4) {
|
749
|
+
*a = F0;
|
750
|
+
}
|
751
|
+
|
752
|
+
// We'll sample 2^dim == 1<<dim table entries per pixel,
|
753
|
+
// in all combinations of low and high in each dimension.
|
754
|
+
for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
|
755
|
+
|
756
|
+
// Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
|
757
|
+
// where 0 selects the low index contribution and its weight 1-t,
|
758
|
+
// or 4 the high index contribution and its weight t.
|
759
|
+
|
760
|
+
// Since 0<dim≤4, we can always just start off with the 0-th channel,
|
761
|
+
// then handle the others conditionally.
|
762
|
+
I32 ix = index [0 + (combo&1)*4];
|
763
|
+
F w = weight[0 + (combo&1)*4];
|
764
|
+
|
765
|
+
switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
|
766
|
+
case 3: ix += index [3 + (combo&8)/2];
|
767
|
+
w *= weight[3 + (combo&8)/2];
|
768
|
+
FALLTHROUGH;
|
769
|
+
// fall through
|
770
|
+
|
771
|
+
case 2: ix += index [2 + (combo&4)*1];
|
772
|
+
w *= weight[2 + (combo&4)*1];
|
773
|
+
FALLTHROUGH;
|
774
|
+
// fall through
|
775
|
+
|
776
|
+
case 1: ix += index [1 + (combo&2)*2];
|
777
|
+
w *= weight[1 + (combo&2)*2];
|
778
|
+
}
|
779
|
+
|
780
|
+
F R,G,B,A=F0;
|
781
|
+
if (output_channels == 3) {
|
782
|
+
if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
|
783
|
+
else { sample_clut_16(grid_16,ix, &R,&G,&B); }
|
784
|
+
} else {
|
785
|
+
if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
|
786
|
+
else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
|
787
|
+
}
|
788
|
+
*r += w*R;
|
789
|
+
*g += w*G;
|
790
|
+
*b += w*B;
|
791
|
+
*a += w*A;
|
792
|
+
}
|
793
|
+
}
|
794
|
+
|
795
|
+
static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
|
796
|
+
clut(a2b->input_channels, a2b->output_channels,
|
797
|
+
a2b->grid_points, a2b->grid_8, a2b->grid_16,
|
798
|
+
r,g,b,&a);
|
799
|
+
}
|
800
|
+
static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
|
801
|
+
clut(b2a->input_channels, b2a->output_channels,
|
802
|
+
b2a->grid_points, b2a->grid_8, b2a->grid_16,
|
803
|
+
r,g,b,a);
|
804
|
+
}
|
805
|
+
|
806
|
+
static void exec_ops(const Op* ops, const void** args,
|
807
|
+
const char* src, char* dst, int i) {
|
808
|
+
F r = F0, g = F0, b = F0, a = F1;
|
809
|
+
while (true) {
|
810
|
+
switch (*ops++) {
|
811
|
+
case Op_load_a8:{
|
812
|
+
a = F_from_U8(load<U8>(src + 1*i));
|
813
|
+
} break;
|
814
|
+
|
815
|
+
case Op_load_g8:{
|
816
|
+
r = g = b = F_from_U8(load<U8>(src + 1*i));
|
817
|
+
} break;
|
818
|
+
|
819
|
+
case Op_load_4444:{
|
820
|
+
U16 abgr = load<U16>(src + 2*i);
|
821
|
+
|
822
|
+
r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
|
823
|
+
g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
|
824
|
+
b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
|
825
|
+
a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
|
826
|
+
} break;
|
827
|
+
|
828
|
+
case Op_load_565:{
|
829
|
+
U16 rgb = load<U16>(src + 2*i);
|
830
|
+
|
831
|
+
r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
|
832
|
+
g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
|
833
|
+
b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
|
834
|
+
} break;
|
835
|
+
|
836
|
+
            case Op_load_888:{
                const uint8_t* rgb = (const uint8_t*)(src + 3*i);
            #if defined(USING_NEON_FP16)
                // See the explanation under USING_NEON below. This is that doubled up.
                uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
                v = vld3q_lane_u8(rgb+ 0, v, 0);
                v = vld3q_lane_u8(rgb+ 3, v, 2);
                v = vld3q_lane_u8(rgb+ 6, v, 4);
                v = vld3q_lane_u8(rgb+ 9, v, 6);

                v = vld3q_lane_u8(rgb+12, v, 8);
                v = vld3q_lane_u8(rgb+15, v, 10);
                v = vld3q_lane_u8(rgb+18, v, 12);
                v = vld3q_lane_u8(rgb+21, v, 14);

                r = cast<F>((U16)v.val[0]) * (1/255.0f);
                g = cast<F>((U16)v.val[1]) * (1/255.0f);
                b = cast<F>((U16)v.val[2]) * (1/255.0f);
            #elif defined(USING_NEON)
                // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
                // a time. Since we're doing that, we might as well load them into 16-bit lanes.
                // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
                uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
                v = vld3_lane_u8(rgb+0, v, 0);
                v = vld3_lane_u8(rgb+3, v, 2);
                v = vld3_lane_u8(rgb+6, v, 4);
                v = vld3_lane_u8(rgb+9, v, 6);

                // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
                // convert to F. (Again, U32 would be even better here if we drop ARMv7 or split
                // ARMv7 and ARMv8 impls.)
                r = cast<F>((U16)v.val[0]) * (1/255.0f);
                g = cast<F>((U16)v.val[1]) * (1/255.0f);
                b = cast<F>((U16)v.val[2]) * (1/255.0f);
            #else
                r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
                g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
                b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
            #endif
            } break;

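            // [Editor's note — illustrative, not part of skcms] The lane trick above:
            // each pixel's bytes land in the even lanes of zeroed uint8x8 registers,
            // so the odd (high) bytes stay zero and reinterpreting as uint16x4 gives
            // zero-extended 16-bit values for free on a little-endian CPU:
            //
            //     bytes: [r0, 0, r1, 0, r2, 0, r3, 0]  ==  uint16s: [r0, r1, r2, r3]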
            case Op_load_8888:{
                U32 rgba = load<U32>(src + 4*i);

                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
            } break;

            case Op_load_8888_palette8:{
                const uint8_t* palette = (const uint8_t*) *args++;
                I32 ix = cast<I32>(load<U8>(src + 1*i));
                U32 rgba = gather_32(palette, ix);

                r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
                g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
                b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
                a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
            } break;

            case Op_load_1010102:{
                U32 rgba = load<U32>(src + 4*i);

                r = cast<F>((rgba >>  0) & 0x3ff) * (1/1023.0f);
                g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
                b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
                a = cast<F>((rgba >> 30) & 0x3  ) * (1/   3.0f);
            } break;

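            // [Editor's note — illustrative sketch, not part of skcms] A scalar model
            // of the palette path above: each source byte indexes a 256-entry
            // RGBA8888 table, and the gathered word is unpacked just like
            // Op_load_8888. One lane of the gather amounts to:
            //
            //     uint32_t entry;
            //     memcpy(&entry, palette + 4*src_byte, 4);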
            case Op_load_161616LE:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
            #else
                r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
                g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
                b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
            #endif
            } break;

            case Op_load_16161616LE:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                     // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                r = cast<F>((U16)v.val[0]) * (1/65535.0f);
                g = cast<F>((U16)v.val[1]) * (1/65535.0f);
                b = cast<F>((U16)v.val[2]) * (1/65535.0f);
                a = cast<F>((U16)v.val[3]) * (1/65535.0f);
            #else
                U64 px = load<U64>(rgba);

                r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
                g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
                b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
                a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
            #endif
            } break;

            case Op_load_161616BE:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
            #else
                U32 R = load_3<U32>(rgb+0),
                    G = load_3<U32>(rgb+1),
                    B = load_3<U32>(rgb+2);
                // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
                r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
                g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
                b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
            #endif
            } break;

            case Op_load_16161616BE:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                     // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
                g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
                b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
                a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
            #else
                U64 px = swap_endian_16x4(load<U64>(rgba));

                r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
                g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
                b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
                a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
            #endif
            } break;

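            // [Editor's note — illustrative, not part of skcms] The scalar byte swap
            // used in the BE fallbacks above, on one 16-bit sample:
            //
            //     uint16_t swap16(uint16_t v) {
            //         return (uint16_t)((v & 0x00ff) << 8 | (v & 0xff00) >> 8);
            //     }
            //
            // e.g. 0x1234 -> 0x3412, putting big-endian samples into native order
            // before the (1/65535.0f) scale to [0,1].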
            case Op_load_hhh:{
                uintptr_t ptr = (uintptr_t)(src + 6*i);
                assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
                const uint16_t* rgb = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = vld3q_u16(rgb);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2];
            #elif defined(USING_NEON)
                uint16x4x3_t v = vld3_u16(rgb);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2];
            #else
                U16 R = load_3<U16>(rgb+0),
                    G = load_3<U16>(rgb+1),
                    B = load_3<U16>(rgb+2);
            #endif
                r = F_from_Half(R);
                g = F_from_Half(G);
                b = F_from_Half(B);
            } break;

            case Op_load_hhhh:{
                uintptr_t ptr = (uintptr_t)(src + 8*i);
                assert( (ptr & 1) == 0 );                     // src must be 2-byte aligned for this
                const uint16_t* rgba = (const uint16_t*)ptr;  // cast to const uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = vld4q_u16(rgba);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2],
                    A = (U16)v.val[3];
            #elif defined(USING_NEON)
                uint16x4x4_t v = vld4_u16(rgba);
                U16 R = (U16)v.val[0],
                    G = (U16)v.val[1],
                    B = (U16)v.val[2],
                    A = (U16)v.val[3];
            #else
                U64 px = load<U64>(rgba);
                U16 R = cast<U16>((px >>  0) & 0xffff),
                    G = cast<U16>((px >> 16) & 0xffff),
                    B = cast<U16>((px >> 32) & 0xffff),
                    A = cast<U16>((px >> 48) & 0xffff);
            #endif
                r = F_from_Half(R);
                g = F_from_Half(G);
                b = F_from_Half(B);
                a = F_from_Half(A);
            } break;

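            // [Editor's note — background, not skcms commentary] The hhh/hhhh formats
            // hold IEEE binary16 ("half") samples: 1 sign, 5 exponent, 10 mantissa
            // bits, so 1.0 is the bit pattern 0x3C00. F_from_Half widens each lane to
            // the pipeline's native float width before any color math runs.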
            case Op_load_fff:{
                uintptr_t ptr = (uintptr_t)(src + 12*i);
                assert( (ptr & 3) == 0 );              // src must be 4-byte aligned for this
                const float* rgb = (const float*)ptr;  // cast to const float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x3_t lo = vld3q_f32(rgb +  0),
                              hi = vld3q_f32(rgb + 12);
                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
            #elif defined(USING_NEON)
                float32x4x3_t v = vld3q_f32(rgb);
                r = (F)v.val[0];
                g = (F)v.val[1];
                b = (F)v.val[2];
            #else
                r = load_3<F>(rgb+0);
                g = load_3<F>(rgb+1);
                b = load_3<F>(rgb+2);
            #endif
            } break;

            case Op_load_ffff:{
                uintptr_t ptr = (uintptr_t)(src + 16*i);
                assert( (ptr & 3) == 0 );               // src must be 4-byte aligned for this
                const float* rgba = (const float*)ptr;  // cast to const float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x4_t lo = vld4q_f32(rgba +  0),
                              hi = vld4q_f32(rgba + 16);
                r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
                g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
                b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
                a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
            #elif defined(USING_NEON)
                float32x4x4_t v = vld4q_f32(rgba);
                r = (F)v.val[0];
                g = (F)v.val[1];
                b = (F)v.val[2];
                a = (F)v.val[3];
            #else
                r = load_4<F>(rgba+0);
                g = load_4<F>(rgba+1);
                b = load_4<F>(rgba+2);
                a = load_4<F>(rgba+3);
            #endif
            } break;

            case Op_swap_rb:{
                F t = r;
                r = b;
                b = t;
            } break;

            case Op_clamp:{
                r = max_(F0, min_(r, F1));
                g = max_(F0, min_(g, F1));
                b = max_(F0, min_(b, F1));
                a = max_(F0, min_(a, F1));
            } break;

            case Op_invert:{
                r = F1 - r;
                g = F1 - g;
                b = F1 - b;
                a = F1 - a;
            } break;

            case Op_force_opaque:{
                a = F1;
            } break;

            case Op_premul:{
                r *= a;
                g *= a;
                b *= a;
            } break;

            case Op_unpremul:{
                F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
                r *= scale;
                g *= scale;
                b *= scale;
            } break;

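            // [Editor's note — a reading of the guard above, not skcms commentary]
            // 1/a overflows to +inf when a == 0 (and for tiny subnormal a), so the
            // comparison against INFINITY_ fails there and scale becomes 0 instead of
            // propagating inf/NaN into r,g,b. Scalar model:
            //
            //     float scale = (1.0f/a < INFINITY) ? 1.0f/a : 0.0f;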
            case Op_matrix_3x3:{
                const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
                const float* m = &matrix->vals[0][0];

                F R = m[0]*r + m[1]*g + m[2]*b,
                  G = m[3]*r + m[4]*g + m[5]*b,
                  B = m[6]*r + m[7]*g + m[8]*b;

                r = R;
                g = G;
                b = B;
            } break;

            case Op_matrix_3x4:{
                const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
                const float* m = &matrix->vals[0][0];

                F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
                  G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
                  B = m[8]*r + m[9]*g + m[10]*b + m[11];

                r = R;
                g = G;
                b = B;
            } break;

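            // [Editor's note — illustrative, not part of skcms] A 3x4 matrix is the
            // 3x3 linear part plus a translation column, applied row-major. The R,G,B
            // temporaries matter: r must not be overwritten while G and B still read
            // it. Scalar model:
            //
            //     void apply_3x4(const float m[12], float v[3]) {
            //         float R = m[0]*v[0] + m[1]*v[1] + m[ 2]*v[2] + m[ 3],
            //               G = m[4]*v[0] + m[5]*v[1] + m[ 6]*v[2] + m[ 7],
            //               B = m[8]*v[0] + m[9]*v[1] + m[10]*v[2] + m[11];
            //         v[0] = R; v[1] = G; v[2] = B;
            //     }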
            case Op_lab_to_xyz:{
                // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
                F L = r * 100.0f,
                  A = g * 255.0f - 128.0f,
                  B = b * 255.0f - 128.0f;

                // Convert to CIE XYZ.
                F Y = (L + 16.0f) * (1/116.0f),
                  X = Y + A*(1/500.0f),
                  Z = Y - B*(1/200.0f);

                X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
                Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
                Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));

                // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
                r = X * 0.9642f;
                g = Y;
                b = Z * 0.8249f;
            } break;

            // As above, in reverse.
            case Op_xyz_to_lab:{
                F X = r * (1/0.9642f),
                  Y = g,
                  Z = b * (1/0.8249f);

                X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
                Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
                Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));

                F L = Y*116.0f - 16.0f,
                  A = (X-Y)*500.0f,
                  B = (Y-Z)*200.0f;

                r = L * (1/100.f);
                g = (A + 128.0f) * (1/255.0f);
                b = (B + 128.0f) * (1/255.0f);
            } break;

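            // [Editor's note — background, not skcms commentary] These are the
            // standard CIELAB constants in disguise: 0.008856 ≈ (6/29)^3 is the
            // threshold where the cube / cube-root branch hands off to the linear
            // segment, 7.787 ≈ (29/6)^2 / 3 is the linear slope, and 16/116 = 4/29 is
            // its offset, so the two ops invert each other up to rounding. The
            // 0.9642 and 0.8249 factors are the D50 white point (Xn and Zn, Yn = 1).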
            case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
            case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
            case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
            case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;

            case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
            case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
            case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
            case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;

            case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
            case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
            case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
            case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;

            case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
            case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
            case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
            case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;

            case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
            case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
            case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
            case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;

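            // [Editor's note — illustrative, not part of skcms] Each curve op above
            // consumes exactly one pointer from args, so a compiled program is two
            // parallel streams, e.g.:
            //
            //     ops:  Op_load_8888, Op_tf_r, Op_tf_g, Op_tf_b, Op_store_ffff
            //     args: &tf_r, &tf_g, &tf_b     // the load and store here take no arg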
            case Op_clut_A2B: {
                const skcms_A2B* a2b = (const skcms_A2B*) *args++;
                clut(a2b, &r,&g,&b,a);

                if (a2b->input_channels == 4) {
                    // CMYK is opaque.
                    a = F1;
                }
            } break;

            case Op_clut_B2A: {
                const skcms_B2A* b2a = (const skcms_B2A*) *args++;
                clut(b2a, &r,&g,&b,&a);
            } break;

            // Notice, from here on down the store_ ops all return, ending the loop.

            case Op_store_a8: {
                store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
            } return;

            case Op_store_g8: {
                // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
                store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
            } return;

            case Op_store_4444: {
                store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
                                    | cast<U16>(to_fixed(g * 15) <<  8)
                                    | cast<U16>(to_fixed(b * 15) <<  4)
                                    | cast<U16>(to_fixed(a * 15) <<  0));
            } return;

            case Op_store_565: {
                store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) <<  0 )
                                    | cast<U16>(to_fixed(g * 63) <<  5 )
                                    | cast<U16>(to_fixed(b * 31) << 11 ));
            } return;

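            // [Editor's sketch — assumes to_fixed rounds to nearest and that Op_clamp
            // has already pinned r,g,b to [0,1]; not part of skcms] A scalar model of
            // the 565 pack above:
            //
            //     uint16_t pack_565(float r, float g, float b) {
            //         uint16_t R = (uint16_t)(r * 31 + 0.5f),
            //                  G = (uint16_t)(g * 63 + 0.5f),
            //                  B = (uint16_t)(b * 31 + 0.5f);
            //         return (uint16_t)(R << 0 | G << 5 | B << 11);
            //     }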
            case Op_store_888: {
                uint8_t* rgb = (uint8_t*)dst + 3*i;
            #if defined(USING_NEON_FP16)
                // See the explanation under USING_NEON below. This is that doubled up.
                U16 R = to_fixed(r * 255),
                    G = to_fixed(g * 255),
                    B = to_fixed(b * 255);

                uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
                vst3q_lane_u8(rgb+ 0, v, 0);
                vst3q_lane_u8(rgb+ 3, v, 2);
                vst3q_lane_u8(rgb+ 6, v, 4);
                vst3q_lane_u8(rgb+ 9, v, 6);

                vst3q_lane_u8(rgb+12, v, 8);
                vst3q_lane_u8(rgb+15, v, 10);
                vst3q_lane_u8(rgb+18, v, 12);
                vst3q_lane_u8(rgb+21, v, 14);
            #elif defined(USING_NEON)
                // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
                // get there via U16 to save some instructions converting to float. And just
                // like load_888, we'd prefer to go via U32 but for ARMv7 support.
                U16 R = cast<U16>(to_fixed(r * 255)),
                    G = cast<U16>(to_fixed(g * 255)),
                    B = cast<U16>(to_fixed(b * 255));

                uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
                vst3_lane_u8(rgb+0, v, 0);
                vst3_lane_u8(rgb+3, v, 2);
                vst3_lane_u8(rgb+6, v, 4);
                vst3_lane_u8(rgb+9, v, 6);
            #else
                store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
                store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
                store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
            #endif
            } return;

            case Op_store_8888: {
                store(dst + 4*i, cast<U32>(to_fixed(r * 255)) <<  0
                               | cast<U32>(to_fixed(g * 255)) <<  8
                               | cast<U32>(to_fixed(b * 255)) << 16
                               | cast<U32>(to_fixed(a * 255)) << 24);
            } return;

            case Op_store_1010102: {
                store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
                               | cast<U32>(to_fixed(g * 1023)) << 10
                               | cast<U32>(to_fixed(b * 1023)) << 20
                               | cast<U32>(to_fixed(a *    3)) << 30);
            } return;

            case Op_store_161616LE: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)U16_from_F(r),
                    (uint16x8_t)U16_from_F(g),
                    (uint16x8_t)U16_from_F(b),
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)U16_from_F(r),
                    (uint16x4_t)U16_from_F(g),
                    (uint16x4_t)U16_from_F(b),
                }};
                vst3_u16(rgb, v);
            #else
                store_3(rgb+0, U16_from_F(r));
                store_3(rgb+1, U16_from_F(g));
                store_3(rgb+2, U16_from_F(b));
            #endif

            } return;

            case Op_store_16161616LE: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)U16_from_F(r),
                    (uint16x8_t)U16_from_F(g),
                    (uint16x8_t)U16_from_F(b),
                    (uint16x8_t)U16_from_F(a),
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)U16_from_F(r),
                    (uint16x4_t)U16_from_F(g),
                    (uint16x4_t)U16_from_F(b),
                    (uint16x4_t)U16_from_F(a),
                }};
                vst4_u16(rgba, v);
            #else
                U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
                       | cast<U64>(to_fixed(g * 65535)) << 16
                       | cast<U64>(to_fixed(b * 65535)) << 32
                       | cast<U64>(to_fixed(a * 65535)) << 48;
                store(rgba, px);
            #endif
            } return;

            case Op_store_161616BE: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                }};
                vst3_u16(rgb, v);
            #else
                U32 R = to_fixed(r * 65535),
                    G = to_fixed(g * 65535),
                    B = to_fixed(b * 65535);
                store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
                store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
                store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
            #endif

            } return;

            case Op_store_16161616BE: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)swap_endian_16(U16_from_F(r)),
                    (uint16x8_t)swap_endian_16(U16_from_F(g)),
                    (uint16x8_t)swap_endian_16(U16_from_F(b)),
                    (uint16x8_t)swap_endian_16(U16_from_F(a)),
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
                    (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
                }};
                vst4_u16(rgba, v);
            #else
                U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
                       | cast<U64>(to_fixed(g * 65535)) << 16
                       | cast<U64>(to_fixed(b * 65535)) << 32
                       | cast<U64>(to_fixed(a * 65535)) << 48;
                store(rgba, swap_endian_16x4(px));
            #endif
            } return;

            case Op_store_hhh: {
                uintptr_t ptr = (uintptr_t)(dst + 6*i);
                assert( (ptr & 1) == 0 );        // The dst pointer must be 2-byte aligned
                uint16_t* rgb = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.

                U16 R = Half_from_F(r),
                    G = Half_from_F(g),
                    B = Half_from_F(b);
            #if defined(USING_NEON_FP16)
                uint16x8x3_t v = {{
                    (uint16x8_t)R,
                    (uint16x8_t)G,
                    (uint16x8_t)B,
                }};
                vst3q_u16(rgb, v);
            #elif defined(USING_NEON)
                uint16x4x3_t v = {{
                    (uint16x4_t)R,
                    (uint16x4_t)G,
                    (uint16x4_t)B,
                }};
                vst3_u16(rgb, v);
            #else
                store_3(rgb+0, R);
                store_3(rgb+1, G);
                store_3(rgb+2, B);
            #endif
            } return;

            case Op_store_hhhh: {
                uintptr_t ptr = (uintptr_t)(dst + 8*i);
                assert( (ptr & 1) == 0 );         // The dst pointer must be 2-byte aligned
                uint16_t* rgba = (uint16_t*)ptr;  // for this cast to uint16_t* to be safe.

                U16 R = Half_from_F(r),
                    G = Half_from_F(g),
                    B = Half_from_F(b),
                    A = Half_from_F(a);
            #if defined(USING_NEON_FP16)
                uint16x8x4_t v = {{
                    (uint16x8_t)R,
                    (uint16x8_t)G,
                    (uint16x8_t)B,
                    (uint16x8_t)A,
                }};
                vst4q_u16(rgba, v);
            #elif defined(USING_NEON)
                uint16x4x4_t v = {{
                    (uint16x4_t)R,
                    (uint16x4_t)G,
                    (uint16x4_t)B,
                    (uint16x4_t)A,
                }};
                vst4_u16(rgba, v);
            #else
                store(rgba, cast<U64>(R) <<  0
                          | cast<U64>(G) << 16
                          | cast<U64>(B) << 32
                          | cast<U64>(A) << 48);
            #endif

            } return;

            case Op_store_fff: {
                uintptr_t ptr = (uintptr_t)(dst + 12*i);
                assert( (ptr & 3) == 0 );  // The dst pointer must be 4-byte aligned
                float* rgb = (float*)ptr;  // for this cast to float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x3_t lo = {{
                    vcvt_f32_f16(vget_low_f16(r)),
                    vcvt_f32_f16(vget_low_f16(g)),
                    vcvt_f32_f16(vget_low_f16(b)),
                }}, hi = {{
                    vcvt_f32_f16(vget_high_f16(r)),
                    vcvt_f32_f16(vget_high_f16(g)),
                    vcvt_f32_f16(vget_high_f16(b)),
                }};
                vst3q_f32(rgb +  0, lo);
                vst3q_f32(rgb + 12, hi);
            #elif defined(USING_NEON)
                float32x4x3_t v = {{
                    (float32x4_t)r,
                    (float32x4_t)g,
                    (float32x4_t)b,
                }};
                vst3q_f32(rgb, v);
            #else
                store_3(rgb+0, r);
                store_3(rgb+1, g);
                store_3(rgb+2, b);
            #endif
            } return;

            case Op_store_ffff: {
                uintptr_t ptr = (uintptr_t)(dst + 16*i);
                assert( (ptr & 3) == 0 );   // The dst pointer must be 4-byte aligned
                float* rgba = (float*)ptr;  // for this cast to float* to be safe.
            #if defined(USING_NEON_FP16)
                float32x4x4_t lo = {{
                    vcvt_f32_f16(vget_low_f16(r)),
                    vcvt_f32_f16(vget_low_f16(g)),
                    vcvt_f32_f16(vget_low_f16(b)),
                    vcvt_f32_f16(vget_low_f16(a)),
                }}, hi = {{
                    vcvt_f32_f16(vget_high_f16(r)),
                    vcvt_f32_f16(vget_high_f16(g)),
                    vcvt_f32_f16(vget_high_f16(b)),
                    vcvt_f32_f16(vget_high_f16(a)),
                }};
                vst4q_f32(rgba +  0, lo);
                vst4q_f32(rgba + 16, hi);
            #elif defined(USING_NEON)
                float32x4x4_t v = {{
                    (float32x4_t)r,
                    (float32x4_t)g,
                    (float32x4_t)b,
                    (float32x4_t)a,
                }};
                vst4q_f32(rgba, v);
            #else
                store_4(rgba+0, r);
                store_4(rgba+1, g);
                store_4(rgba+2, b);
                store_4(rgba+3, a);
            #endif
            } return;
        }
    }
}

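// [Editor's note — shape of the interpreter above, not skcms commentary]
// A compiled program is one load op, any number of in-register transform ops,
// and exactly one store op. Loads fill r,g,b,a from src; transforms rewrite
// them in place; the store writes dst and returns, so the op stream needs no
// explicit length or terminator.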
static void run_program(const Op* program, const void** arguments,
                        const char* src, char* dst, int n,
                        const size_t src_bpp, const size_t dst_bpp) {
    int i = 0;
    while (n >= N) {
        exec_ops(program, arguments, src, dst, i);
        i += N;
        n -= N;
    }
    if (n > 0) {
        char tmp[4*4*N] = {0};

        memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
        exec_ops(program, arguments, tmp, tmp, 0);
        memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
    }
}

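// [Editor's note — worked example, not part of skcms] exec_ops always touches
// N pixels at a time, so the tail copies the last n < N pixels into tmp (sized
// 4*4*N: up to 4 channels of 4 bytes for N pixels), transforms in place there,
// and copies the result back out, never reading or writing past either buffer.
// E.g. with N == 4 and 10 pixels: two full passes cover pixels 0..7, then
// pixels 8..9 round-trip through tmp.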
// Clean up any #defines we may have set so that we can be #included again.
#if defined(USING_AVX)
    #undef USING_AVX
#endif
#if defined(USING_AVX_F16C)
    #undef USING_AVX_F16C
#endif
#if defined(USING_AVX2)
    #undef USING_AVX2
#endif
#if defined(USING_AVX512F)
    #undef USING_AVX512F
#endif

#if defined(USING_NEON)
    #undef USING_NEON
#endif
#if defined(USING_NEON_F16C)
    #undef USING_NEON_F16C
#endif
#if defined(USING_NEON_FP16)
    #undef USING_NEON_FP16
#endif

#undef FALLTHROUGH