@shopify/react-native-skia 0.1.158 → 0.1.160

Sign up to get free protection for your applications and to get access to all the features.
Files changed (403) hide show
  1. package/android/CMakeLists.txt +35 -11
  2. package/android/build.gradle +21 -25
  3. package/android/cpp/jni/JniLoad.cpp +2 -0
  4. package/android/cpp/jni/include/JniSkiaDomView.h +89 -0
  5. package/android/cpp/rnskia-android/SkiaOpenGLRenderer.cpp +4 -3
  6. package/android/cpp/rnskia-android/SkiaOpenGLRenderer.h +4 -3
  7. package/android/src/main/java/com/shopify/reactnative/skia/RNSkiaPackage.java +2 -1
  8. package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomView.java +45 -0
  9. package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomViewManager.java +64 -0
  10. package/cpp/api/JsiSkHostObjects.h +6 -0
  11. package/cpp/api/JsiSkImageFilterFactory.h +1 -1
  12. package/cpp/api/JsiSkPaint.h +9 -2
  13. package/cpp/api/JsiSkPath.h +1 -0
  14. package/cpp/api/JsiSkRuntimeEffect.h +36 -36
  15. package/cpp/jsi/JsiHostObject.cpp +16 -28
  16. package/cpp/jsi/JsiHostObject.h +127 -7
  17. package/cpp/jsi/JsiValue.cpp +346 -0
  18. package/cpp/jsi/JsiValue.h +222 -0
  19. package/cpp/jsi/JsiValueWrapper.h +33 -5
  20. package/cpp/rnskia/RNSkDomView.cpp +222 -0
  21. package/cpp/rnskia/RNSkDomView.h +140 -0
  22. package/cpp/rnskia/RNSkJsView.cpp +0 -4
  23. package/cpp/rnskia/RNSkJsView.h +6 -4
  24. package/cpp/rnskia/RNSkManager.cpp +7 -0
  25. package/cpp/rnskia/RNSkPictureView.h +5 -8
  26. package/cpp/rnskia/RNSkView.h +113 -5
  27. package/cpp/rnskia/dom/JsiDomApi.h +167 -0
  28. package/cpp/rnskia/dom/base/BaseNodeProp.h +72 -0
  29. package/cpp/rnskia/dom/base/DerivedNodeProp.h +187 -0
  30. package/cpp/rnskia/dom/base/DrawingContext.cpp +227 -0
  31. package/cpp/rnskia/dom/base/DrawingContext.h +136 -0
  32. package/cpp/rnskia/dom/base/JsiDependencyManager.h +293 -0
  33. package/cpp/rnskia/dom/base/JsiDomDeclarationNode.h +176 -0
  34. package/cpp/rnskia/dom/base/JsiDomDrawingNode.h +50 -0
  35. package/cpp/rnskia/dom/base/JsiDomNode.h +384 -0
  36. package/cpp/rnskia/dom/base/JsiDomRenderNode.h +267 -0
  37. package/cpp/rnskia/dom/base/NodeProp.h +130 -0
  38. package/cpp/rnskia/dom/base/NodePropsContainer.h +119 -0
  39. package/cpp/rnskia/dom/nodes/JsiBackdropFilterNode.h +38 -0
  40. package/cpp/rnskia/dom/nodes/JsiBlendNode.h +112 -0
  41. package/cpp/rnskia/dom/nodes/JsiBlurMaskNode.h +78 -0
  42. package/cpp/rnskia/dom/nodes/JsiBoxNode.h +104 -0
  43. package/cpp/rnskia/dom/nodes/JsiBoxShadowNode.h +33 -0
  44. package/cpp/rnskia/dom/nodes/JsiCircleNode.h +38 -0
  45. package/cpp/rnskia/dom/nodes/JsiColorFilterNodes.h +192 -0
  46. package/cpp/rnskia/dom/nodes/JsiCustomDrawingNode.h +123 -0
  47. package/cpp/rnskia/dom/nodes/JsiDiffRectNode.h +42 -0
  48. package/cpp/rnskia/dom/nodes/JsiFillNode.h +22 -0
  49. package/cpp/rnskia/dom/nodes/JsiGlyphsNode.h +56 -0
  50. package/cpp/rnskia/dom/nodes/JsiGroupNode.h +26 -0
  51. package/cpp/rnskia/dom/nodes/JsiImageFilterNodes.h +415 -0
  52. package/cpp/rnskia/dom/nodes/JsiImageNode.h +34 -0
  53. package/cpp/rnskia/dom/nodes/JsiImageSvgNode.h +44 -0
  54. package/cpp/rnskia/dom/nodes/JsiLayerNode.h +64 -0
  55. package/cpp/rnskia/dom/nodes/JsiLineNode.h +43 -0
  56. package/cpp/rnskia/dom/nodes/JsiOvalNode.h +34 -0
  57. package/cpp/rnskia/dom/nodes/JsiPaintNode.h +79 -0
  58. package/cpp/rnskia/dom/nodes/JsiPatchNode.h +54 -0
  59. package/cpp/rnskia/dom/nodes/JsiPathEffectNodes.h +315 -0
  60. package/cpp/rnskia/dom/nodes/JsiPathNode.h +181 -0
  61. package/cpp/rnskia/dom/nodes/JsiPictureNode.h +32 -0
  62. package/cpp/rnskia/dom/nodes/JsiPointsNode.h +51 -0
  63. package/cpp/rnskia/dom/nodes/JsiRRectNode.h +34 -0
  64. package/cpp/rnskia/dom/nodes/JsiRectNode.h +34 -0
  65. package/cpp/rnskia/dom/nodes/JsiShaderNodes.h +517 -0
  66. package/cpp/rnskia/dom/nodes/JsiTextBlobNode.h +47 -0
  67. package/cpp/rnskia/dom/nodes/JsiTextNode.h +54 -0
  68. package/cpp/rnskia/dom/nodes/JsiTextPathNode.h +32 -0
  69. package/cpp/rnskia/dom/nodes/JsiVerticesNode.h +43 -0
  70. package/cpp/rnskia/dom/props/BezierProps.h +63 -0
  71. package/cpp/rnskia/dom/props/BlendModeProp.h +101 -0
  72. package/cpp/rnskia/dom/props/BoxShadowProps.h +62 -0
  73. package/cpp/rnskia/dom/props/CircleProp.h +46 -0
  74. package/cpp/rnskia/dom/props/ClipProp.h +62 -0
  75. package/cpp/rnskia/dom/props/ColorProp.h +80 -0
  76. package/cpp/rnskia/dom/props/DrawingProp.h +33 -0
  77. package/cpp/rnskia/dom/props/FontProp.h +34 -0
  78. package/cpp/rnskia/dom/props/GlyphsProp.h +53 -0
  79. package/cpp/rnskia/dom/props/ImageProps.h +173 -0
  80. package/cpp/rnskia/dom/props/LayerProp.h +50 -0
  81. package/cpp/rnskia/dom/props/MatrixProp.h +33 -0
  82. package/cpp/rnskia/dom/props/NumbersProp.h +63 -0
  83. package/cpp/rnskia/dom/props/PaintProps.h +171 -0
  84. package/cpp/rnskia/dom/props/PathProp.h +55 -0
  85. package/cpp/rnskia/dom/props/PictureProp.h +38 -0
  86. package/cpp/rnskia/dom/props/PointProp.h +72 -0
  87. package/cpp/rnskia/dom/props/PointsProp.h +83 -0
  88. package/cpp/rnskia/dom/props/RRectProp.h +134 -0
  89. package/cpp/rnskia/dom/props/RadiusProp.h +43 -0
  90. package/cpp/rnskia/dom/props/RectProp.h +118 -0
  91. package/cpp/rnskia/dom/props/StrokeProps.h +75 -0
  92. package/cpp/rnskia/dom/props/SvgProp.h +37 -0
  93. package/cpp/rnskia/dom/props/TextBlobProp.h +128 -0
  94. package/cpp/rnskia/dom/props/TileModeProp.h +50 -0
  95. package/cpp/rnskia/dom/props/TransformProp.h +80 -0
  96. package/cpp/rnskia/dom/props/TransformsProps.h +68 -0
  97. package/cpp/rnskia/dom/props/UniformsProp.h +194 -0
  98. package/cpp/rnskia/dom/props/VertexModeProp.h +47 -0
  99. package/cpp/rnskia/dom/props/VerticesProps.h +67 -0
  100. package/cpp/rnskia/values/RNSkReadonlyValue.h +13 -4
  101. package/cpp/skia/include/android/SkAndroidFrameworkUtils.h +35 -1
  102. package/cpp/skia/include/codec/SkAndroidCodec.h +17 -1
  103. package/cpp/skia/include/codec/SkCodec.h +8 -5
  104. package/cpp/skia/include/core/SkAnnotation.h +2 -0
  105. package/cpp/skia/include/core/SkBitmap.h +52 -1
  106. package/cpp/skia/include/core/SkBlendMode.h +2 -0
  107. package/cpp/skia/include/core/SkCanvas.h +52 -31
  108. package/cpp/skia/include/core/SkCapabilities.h +44 -0
  109. package/cpp/skia/include/core/SkColor.h +7 -0
  110. package/cpp/skia/include/core/SkColorFilter.h +37 -0
  111. package/cpp/skia/include/core/SkColorSpace.h +1 -1
  112. package/cpp/skia/include/core/SkFont.h +4 -0
  113. package/cpp/skia/include/core/SkFontMgr.h +3 -0
  114. package/cpp/skia/include/core/SkGraphics.h +9 -0
  115. package/cpp/skia/include/core/SkImage.h +77 -17
  116. package/cpp/skia/include/core/SkImageEncoder.h +5 -3
  117. package/cpp/skia/include/core/SkImageGenerator.h +27 -17
  118. package/cpp/skia/include/core/SkM44.h +1 -0
  119. package/cpp/skia/include/core/SkMesh.h +120 -34
  120. package/cpp/skia/include/core/SkMilestone.h +1 -1
  121. package/cpp/skia/include/core/SkOverdrawCanvas.h +2 -1
  122. package/cpp/skia/include/core/SkPaint.h +15 -2
  123. package/cpp/skia/include/core/SkPath.h +4 -0
  124. package/cpp/skia/include/core/SkPathBuilder.h +1 -1
  125. package/cpp/skia/include/core/SkPicture.h +0 -3
  126. package/cpp/skia/include/core/SkPictureRecorder.h +0 -2
  127. package/cpp/skia/include/core/SkPixmap.h +19 -0
  128. package/cpp/skia/include/core/SkRasterHandleAllocator.h +3 -1
  129. package/cpp/skia/include/core/SkRect.h +11 -4
  130. package/cpp/skia/include/core/SkRefCnt.h +13 -1
  131. package/cpp/skia/include/core/SkRegion.h +6 -0
  132. package/cpp/skia/include/core/SkSamplingOptions.h +8 -6
  133. package/cpp/skia/include/core/SkScalar.h +6 -25
  134. package/cpp/skia/include/core/SkShader.h +20 -12
  135. package/cpp/skia/include/core/SkSpan.h +51 -19
  136. package/cpp/skia/include/core/SkStream.h +2 -2
  137. package/cpp/skia/include/core/SkString.h +11 -3
  138. package/cpp/skia/include/core/SkSurface.h +85 -8
  139. package/cpp/skia/include/core/SkTextBlob.h +5 -2
  140. package/cpp/skia/include/core/SkTypes.h +11 -10
  141. package/cpp/skia/include/docs/SkPDFDocument.h +0 -5
  142. package/cpp/skia/include/effects/Sk1DPathEffect.h +6 -1
  143. package/cpp/skia/include/effects/Sk2DPathEffect.h +4 -1
  144. package/cpp/skia/include/effects/SkColorMatrix.h +1 -0
  145. package/cpp/skia/include/effects/SkColorMatrixFilter.h +5 -8
  146. package/cpp/skia/include/effects/SkCornerPathEffect.h +5 -1
  147. package/cpp/skia/include/effects/SkDashPathEffect.h +5 -1
  148. package/cpp/skia/include/effects/SkGradientShader.h +68 -38
  149. package/cpp/skia/include/effects/SkHighContrastFilter.h +5 -1
  150. package/cpp/skia/include/effects/SkImageFilters.h +5 -4
  151. package/cpp/skia/include/effects/SkLumaColorFilter.h +4 -1
  152. package/cpp/skia/include/effects/SkOpPathEffect.h +6 -2
  153. package/cpp/skia/include/effects/SkOverdrawColorFilter.h +5 -2
  154. package/cpp/skia/include/effects/SkRuntimeEffect.h +54 -62
  155. package/cpp/skia/include/effects/SkShaderMaskFilter.h +3 -1
  156. package/cpp/skia/include/effects/SkTableColorFilter.h +8 -21
  157. package/cpp/skia/include/effects/SkTableMaskFilter.h +5 -1
  158. package/cpp/skia/include/effects/SkTrimPathEffect.h +5 -1
  159. package/cpp/skia/include/encode/SkEncoder.h +17 -0
  160. package/cpp/skia/include/encode/SkWebpEncoder.h +17 -0
  161. package/cpp/skia/include/gpu/GpuTypes.h +18 -0
  162. package/cpp/skia/include/gpu/GrBackendSurface.h +38 -17
  163. package/cpp/skia/include/gpu/GrBackendSurfaceMutableState.h +6 -71
  164. package/cpp/skia/include/gpu/GrContextOptions.h +1 -1
  165. package/cpp/skia/include/gpu/GrContextThreadSafeProxy.h +10 -9
  166. package/cpp/skia/include/gpu/GrDirectContext.h +42 -22
  167. package/cpp/skia/include/gpu/GrRecordingContext.h +6 -3
  168. package/cpp/skia/include/gpu/GrTypes.h +11 -11
  169. package/cpp/skia/include/gpu/MutableTextureState.h +122 -0
  170. package/cpp/skia/include/gpu/gl/GrGLFunctions.h +1 -0
  171. package/cpp/skia/include/gpu/gl/GrGLInterface.h +1 -0
  172. package/cpp/skia/include/gpu/graphite/BackendTexture.h +7 -0
  173. package/cpp/skia/include/gpu/graphite/CombinationBuilder.h +195 -0
  174. package/cpp/skia/include/gpu/graphite/Context.h +47 -55
  175. package/cpp/skia/include/gpu/graphite/ContextOptions.h +85 -0
  176. package/cpp/skia/include/gpu/graphite/GraphiteTypes.h +1 -17
  177. package/cpp/skia/include/gpu/graphite/ImageProvider.h +61 -0
  178. package/cpp/skia/include/gpu/graphite/Recorder.h +87 -8
  179. package/cpp/skia/include/gpu/graphite/Recording.h +19 -9
  180. package/cpp/skia/include/gpu/graphite/TextureInfo.h +40 -8
  181. package/cpp/skia/include/gpu/graphite/dawn/DawnBackendContext.h +25 -0
  182. package/cpp/skia/include/gpu/graphite/mtl/MtlBackendContext.h +3 -2
  183. package/cpp/skia/include/gpu/graphite/vk/VulkanGraphiteTypes.h +69 -0
  184. package/cpp/skia/include/gpu/mtl/MtlMemoryAllocator.h +39 -0
  185. package/cpp/skia/include/gpu/vk/GrVkBackendContext.h +21 -19
  186. package/cpp/skia/include/gpu/vk/GrVkExtensions.h +2 -50
  187. package/cpp/skia/include/gpu/vk/GrVkMemoryAllocator.h +2 -127
  188. package/cpp/skia/include/gpu/vk/GrVkTypes.h +5 -43
  189. package/cpp/skia/include/gpu/vk/VulkanBackendContext.h +46 -0
  190. package/cpp/skia/include/gpu/vk/VulkanExtensions.h +67 -0
  191. package/cpp/skia/include/gpu/vk/VulkanMemoryAllocator.h +116 -0
  192. package/cpp/skia/include/gpu/vk/VulkanTypes.h +59 -0
  193. package/cpp/skia/include/pathops/SkPathOps.h +1 -1
  194. package/cpp/skia/include/private/SkColorData.h +10 -40
  195. package/cpp/skia/include/private/SkEncodedInfo.h +9 -3
  196. package/cpp/skia/include/private/SkFloatingPoint.h +9 -6
  197. package/cpp/skia/include/private/SkHalf.h +5 -52
  198. package/cpp/skia/include/private/SkMacros.h +1 -1
  199. package/cpp/skia/include/private/SkMalloc.h +4 -0
  200. package/cpp/skia/include/private/SkPathRef.h +10 -10
  201. package/cpp/skia/include/private/SkSLModifiers.h +59 -23
  202. package/cpp/skia/include/private/SkSLProgramKind.h +1 -0
  203. package/cpp/skia/include/private/SkSLSymbol.h +7 -3
  204. package/cpp/skia/include/private/SkStringView.h +4 -0
  205. package/cpp/skia/include/private/SkTArray.h +21 -7
  206. package/cpp/skia/include/private/SkTDArray.h +173 -285
  207. package/cpp/skia/include/private/SkTHash.h +33 -32
  208. package/cpp/skia/include/private/SkTemplates.h +24 -26
  209. package/cpp/skia/include/private/SkVx.h +218 -135
  210. package/cpp/skia/include/private/chromium/GrSlug.h +3 -65
  211. package/cpp/skia/include/private/chromium/SkChromeRemoteGlyphCache.h +6 -3
  212. package/cpp/skia/include/private/chromium/Slug.h +76 -0
  213. package/cpp/skia/include/private/gpu/ganesh/GrTypesPriv.h +6 -1
  214. package/cpp/skia/include/private/gpu/ganesh/GrVkTypesPriv.h +5 -39
  215. package/cpp/skia/include/private/gpu/graphite/VulkanGraphiteTypesPriv.h +63 -0
  216. package/cpp/skia/include/{gpu/vk/GrVkVulkan.h → private/gpu/vk/SkiaVulkan.h} +2 -2
  217. package/cpp/skia/include/private/gpu/vk/VulkanTypesPriv.h +57 -0
  218. package/cpp/skia/include/sksl/DSL.h +0 -1
  219. package/cpp/skia/include/sksl/DSLBlock.h +4 -18
  220. package/cpp/skia/include/sksl/DSLCase.h +2 -8
  221. package/cpp/skia/include/sksl/DSLCore.h +8 -15
  222. package/cpp/skia/include/sksl/DSLExpression.h +51 -142
  223. package/cpp/skia/include/sksl/DSLFunction.h +7 -15
  224. package/cpp/skia/include/sksl/DSLModifiers.h +5 -2
  225. package/cpp/skia/include/sksl/DSLStatement.h +4 -39
  226. package/cpp/skia/include/sksl/DSLSymbols.h +1 -11
  227. package/cpp/skia/include/sksl/DSLType.h +20 -12
  228. package/cpp/skia/include/sksl/DSLVar.h +56 -146
  229. package/cpp/skia/include/sksl/SkSLErrorReporter.h +2 -15
  230. package/cpp/skia/include/sksl/SkSLOperator.h +62 -59
  231. package/cpp/skia/include/sksl/SkSLPosition.h +2 -0
  232. package/cpp/skia/include/sksl/SkSLVersion.h +27 -0
  233. package/cpp/skia/include/svg/SkSVGCanvas.h +1 -0
  234. package/cpp/skia/include/utils/SkAnimCodecPlayer.h +1 -1
  235. package/cpp/skia/include/utils/SkBase64.h +2 -0
  236. package/cpp/skia/include/utils/SkCustomTypeface.h +24 -11
  237. package/cpp/skia/include/utils/SkEventTracer.h +12 -1
  238. package/cpp/skia/include/utils/SkNWayCanvas.h +11 -4
  239. package/cpp/skia/include/utils/SkPaintFilterCanvas.h +9 -4
  240. package/cpp/skia/include/utils/SkParse.h +3 -0
  241. package/cpp/skia/include/utils/SkShadowUtils.h +2 -0
  242. package/cpp/skia/include/utils/SkTextUtils.h +2 -1
  243. package/cpp/skia/{include/third_party → modules}/skcms/skcms.h +10 -0
  244. package/cpp/skia/modules/skcms/skcms_internal.h +56 -0
  245. package/cpp/skia/modules/skcms/src/Transform_inl.h +1609 -0
  246. package/cpp/skia/modules/skparagraph/include/DartTypes.h +153 -0
  247. package/cpp/skia/modules/skparagraph/include/FontArguments.h +46 -0
  248. package/cpp/skia/modules/skparagraph/include/FontCollection.h +84 -0
  249. package/cpp/skia/modules/skparagraph/include/Metrics.h +98 -0
  250. package/cpp/skia/modules/skparagraph/include/Paragraph.h +111 -0
  251. package/cpp/skia/modules/skparagraph/include/ParagraphBuilder.h +69 -0
  252. package/cpp/skia/modules/skparagraph/include/ParagraphCache.h +77 -0
  253. package/cpp/skia/modules/skparagraph/include/ParagraphStyle.h +143 -0
  254. package/cpp/skia/modules/skparagraph/include/TextShadow.h +30 -0
  255. package/cpp/skia/modules/skparagraph/include/TextStyle.h +352 -0
  256. package/cpp/skia/modules/skparagraph/include/TypefaceFontProvider.h +81 -0
  257. package/cpp/skia/modules/svg/include/SkSVGAttributeParser.h +1 -1
  258. package/cpp/skia/modules/svg/include/SkSVGTypes.h +3 -3
  259. package/cpp/skia/src/core/SkLRUCache.h +126 -0
  260. package/cpp/skia/src/core/SkTInternalLList.h +302 -0
  261. package/cpp/utils/RNSkTimingInfo.h +1 -0
  262. package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.h +15 -4
  263. package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.mm +41 -55
  264. package/ios/RNSkia-iOS/SkiaDomViewManager.h +8 -0
  265. package/ios/RNSkia-iOS/SkiaDomViewManager.mm +51 -0
  266. package/lib/commonjs/dom/nodes/JsiSkDOM.js +56 -56
  267. package/lib/commonjs/dom/nodes/JsiSkDOM.js.map +1 -1
  268. package/lib/commonjs/dom/nodes/RenderNode.js +1 -1
  269. package/lib/commonjs/dom/nodes/RenderNode.js.map +1 -1
  270. package/lib/commonjs/renderer/Canvas.js +24 -66
  271. package/lib/commonjs/renderer/Canvas.js.map +1 -1
  272. package/lib/commonjs/renderer/DependencyManager.js +0 -5
  273. package/lib/commonjs/renderer/DependencyManager.js.map +1 -1
  274. package/lib/commonjs/renderer/HostComponents.js.map +1 -1
  275. package/lib/commonjs/renderer/useCanvas.js +4 -18
  276. package/lib/commonjs/renderer/useCanvas.js.map +1 -1
  277. package/lib/commonjs/views/SkiaBaseWebView.js +15 -0
  278. package/lib/commonjs/views/SkiaBaseWebView.js.map +1 -1
  279. package/lib/commonjs/views/SkiaDomView.js +152 -0
  280. package/lib/commonjs/views/SkiaDomView.js.map +1 -0
  281. package/lib/commonjs/views/SkiaDomView.web.js +55 -0
  282. package/lib/commonjs/views/SkiaDomView.web.js.map +1 -0
  283. package/lib/commonjs/views/SkiaPictureView.js +16 -2
  284. package/lib/commonjs/views/SkiaPictureView.js.map +1 -1
  285. package/lib/commonjs/views/SkiaView.js +17 -2
  286. package/lib/commonjs/views/SkiaView.js.map +1 -1
  287. package/lib/commonjs/views/index.js +13 -0
  288. package/lib/commonjs/views/index.js.map +1 -1
  289. package/lib/commonjs/views/types.js.map +1 -1
  290. package/lib/module/dom/nodes/JsiSkDOM.js +56 -56
  291. package/lib/module/dom/nodes/JsiSkDOM.js.map +1 -1
  292. package/lib/module/dom/nodes/RenderNode.js +1 -1
  293. package/lib/module/dom/nodes/RenderNode.js.map +1 -1
  294. package/lib/module/renderer/Canvas.js +25 -66
  295. package/lib/module/renderer/Canvas.js.map +1 -1
  296. package/lib/module/renderer/DependencyManager.js +0 -5
  297. package/lib/module/renderer/DependencyManager.js.map +1 -1
  298. package/lib/module/renderer/HostComponents.js.map +1 -1
  299. package/lib/module/renderer/useCanvas.js +2 -13
  300. package/lib/module/renderer/useCanvas.js.map +1 -1
  301. package/lib/module/views/SkiaBaseWebView.js +15 -0
  302. package/lib/module/views/SkiaBaseWebView.js.map +1 -1
  303. package/lib/module/views/SkiaDomView.js +128 -0
  304. package/lib/module/views/SkiaDomView.js.map +1 -0
  305. package/lib/module/views/SkiaDomView.web.js +41 -0
  306. package/lib/module/views/SkiaDomView.web.js.map +1 -0
  307. package/lib/module/views/SkiaPictureView.js +14 -2
  308. package/lib/module/views/SkiaPictureView.js.map +1 -1
  309. package/lib/module/views/SkiaView.js +15 -2
  310. package/lib/module/views/SkiaView.js.map +1 -1
  311. package/lib/module/views/index.js +1 -0
  312. package/lib/module/views/index.js.map +1 -1
  313. package/lib/module/views/types.js.map +1 -1
  314. package/lib/typescript/src/dom/nodes/JsiSkDOM.d.ts +57 -64
  315. package/lib/typescript/src/renderer/Canvas.d.ts +5 -8
  316. package/lib/typescript/src/renderer/DependencyManager.d.ts +0 -2
  317. package/lib/typescript/src/renderer/HostComponents.d.ts +66 -3
  318. package/lib/typescript/src/renderer/useCanvas.d.ts +0 -6
  319. package/lib/typescript/src/views/SkiaBaseWebView.d.ts +4 -0
  320. package/lib/typescript/src/views/SkiaDomView.d.ts +31 -0
  321. package/lib/typescript/src/views/SkiaDomView.web.d.ts +7 -0
  322. package/lib/typescript/src/views/index.d.ts +1 -0
  323. package/lib/typescript/src/views/types.d.ts +12 -2
  324. package/libs/android/arm64-v8a/libskia.a +0 -0
  325. package/libs/android/arm64-v8a/libskottie.a +0 -0
  326. package/libs/android/arm64-v8a/libskparagraph.a +0 -0
  327. package/libs/android/arm64-v8a/libsksg.a +0 -0
  328. package/libs/android/arm64-v8a/libskshaper.a +0 -0
  329. package/libs/android/arm64-v8a/libskunicode.a +0 -0
  330. package/libs/android/arm64-v8a/libsvg.a +0 -0
  331. package/libs/android/armeabi-v7a/libskia.a +0 -0
  332. package/libs/android/armeabi-v7a/libskottie.a +0 -0
  333. package/libs/android/armeabi-v7a/libskparagraph.a +0 -0
  334. package/libs/android/armeabi-v7a/libsksg.a +0 -0
  335. package/libs/android/armeabi-v7a/libskshaper.a +0 -0
  336. package/libs/android/armeabi-v7a/libskunicode.a +0 -0
  337. package/libs/android/armeabi-v7a/libsvg.a +0 -0
  338. package/libs/android/x86/libskia.a +0 -0
  339. package/libs/android/x86/libskottie.a +0 -0
  340. package/libs/android/x86/libskparagraph.a +0 -0
  341. package/libs/android/x86/libsksg.a +0 -0
  342. package/libs/android/x86/libskshaper.a +0 -0
  343. package/libs/android/x86/libskunicode.a +0 -0
  344. package/libs/android/x86/libsvg.a +0 -0
  345. package/libs/android/x86_64/libskia.a +0 -0
  346. package/libs/android/x86_64/libskottie.a +0 -0
  347. package/libs/android/x86_64/libskparagraph.a +0 -0
  348. package/libs/android/x86_64/libsksg.a +0 -0
  349. package/libs/android/x86_64/libskshaper.a +0 -0
  350. package/libs/android/x86_64/libskunicode.a +0 -0
  351. package/libs/android/x86_64/libsvg.a +0 -0
  352. package/libs/ios/libskia.xcframework/ios-arm64_arm64e/libskia.a +0 -0
  353. package/libs/ios/libskia.xcframework/ios-arm64_arm64e_x86_64-simulator/libskia.a +0 -0
  354. package/libs/ios/libskottie.xcframework/ios-arm64_arm64e/libskottie.a +0 -0
  355. package/libs/ios/libskottie.xcframework/ios-arm64_arm64e_x86_64-simulator/libskottie.a +0 -0
  356. package/libs/ios/libskparagraph.xcframework/Info.plist +42 -0
  357. package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e/libskparagraph.a +0 -0
  358. package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e_x86_64-simulator/libskparagraph.a +0 -0
  359. package/libs/ios/libsksg.xcframework/Info.plist +5 -5
  360. package/libs/ios/libsksg.xcframework/ios-arm64_arm64e/libsksg.a +0 -0
  361. package/libs/ios/libsksg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsksg.a +0 -0
  362. package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e/libskshaper.a +0 -0
  363. package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e_x86_64-simulator/libskshaper.a +0 -0
  364. package/libs/ios/libskunicode.xcframework/Info.plist +42 -0
  365. package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e/libskunicode.a +0 -0
  366. package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e_x86_64-simulator/libskunicode.a +0 -0
  367. package/libs/ios/libsvg.xcframework/ios-arm64_arm64e/libsvg.a +0 -0
  368. package/libs/ios/libsvg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsvg.a +0 -0
  369. package/package.json +5 -3
  370. package/react-native-skia.podspec +5 -2
  371. package/src/dom/nodes/JsiSkDOM.ts +170 -56
  372. package/src/dom/nodes/RenderNode.ts +1 -1
  373. package/src/renderer/Canvas.tsx +37 -57
  374. package/src/renderer/DependencyManager.tsx +0 -5
  375. package/src/renderer/HostComponents.ts +152 -1
  376. package/src/renderer/useCanvas.ts +1 -15
  377. package/src/views/SkiaBaseWebView.tsx +9 -0
  378. package/src/views/SkiaDomView.tsx +120 -0
  379. package/src/views/SkiaDomView.web.tsx +37 -0
  380. package/src/views/SkiaPictureView.tsx +10 -2
  381. package/src/views/SkiaView.tsx +11 -3
  382. package/src/views/index.ts +1 -0
  383. package/src/views/types.ts +19 -2
  384. package/cpp/jsi/JsiSimpleValueWrapper.h +0 -99
  385. package/cpp/skia/include/c/sk_canvas.h +0 -159
  386. package/cpp/skia/include/c/sk_colorspace.h +0 -25
  387. package/cpp/skia/include/c/sk_data.h +0 -65
  388. package/cpp/skia/include/c/sk_image.h +0 -71
  389. package/cpp/skia/include/c/sk_imageinfo.h +0 -62
  390. package/cpp/skia/include/c/sk_maskfilter.h +0 -47
  391. package/cpp/skia/include/c/sk_matrix.h +0 -49
  392. package/cpp/skia/include/c/sk_paint.h +0 -145
  393. package/cpp/skia/include/c/sk_path.h +0 -102
  394. package/cpp/skia/include/c/sk_picture.h +0 -70
  395. package/cpp/skia/include/c/sk_shader.h +0 -143
  396. package/cpp/skia/include/c/sk_surface.h +0 -73
  397. package/cpp/skia/include/c/sk_types.h +0 -278
  398. package/cpp/skia/include/gpu/graphite/SkStuff.h +0 -47
  399. package/cpp/skia/include/private/SkNx.h +0 -430
  400. package/cpp/skia/include/private/SkNx_neon.h +0 -713
  401. package/cpp/skia/include/private/SkNx_sse.h +0 -823
  402. package/cpp/skia/include/sksl/DSLRuntimeEffects.h +0 -32
  403. package/cpp/skia/include/sksl/DSLWrapper.h +0 -77
@@ -0,0 +1,1609 @@
1
+ /*
2
+ * Copyright 2018 Google Inc.
3
+ *
4
+ * Use of this source code is governed by a BSD-style license that can be
5
+ * found in the LICENSE file.
6
+ */
7
+
8
+ // Intentionally NO #pragma once... included multiple times.
9
+
10
+ // This file is included from skcms.cc in a namespace with some pre-defines:
11
+ // - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
12
+ // - V<T>: a template to create a vector of N T's.
13
+
14
+ using F = V<Color>; // Called F for historic reasons... maybe rename C?
15
+ using I32 = V<int32_t>;
16
+ using U64 = V<uint64_t>;
17
+ using U32 = V<uint32_t>;
18
+ using U16 = V<uint16_t>;
19
+ using U8 = V<uint8_t>;
20
+
21
+
22
+ #if defined(__GNUC__) && !defined(__clang__)
23
+ // Once again, GCC is kind of weird, not allowing vector = scalar directly.
24
+ static constexpr F F0 = F() + 0.0f,
25
+ F1 = F() + 1.0f,
26
+ FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
27
+ #else
28
+ static constexpr F F0 = 0.0f,
29
+ F1 = 1.0f,
30
+ FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
31
+ #endif
32
+
33
+ // Instead of checking __AVX__ below, we'll check USING_AVX.
34
+ // This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
35
+ // Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
36
+
37
+ #if !defined(USING_AVX) && N == 8 && defined(__AVX__)
38
+ #define USING_AVX
39
+ #endif
40
+ #if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
41
+ #define USING_AVX_F16C
42
+ #endif
43
+ #if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
44
+ #define USING_AVX2
45
+ #endif
46
+ #if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
47
+ #define USING_AVX512F
48
+ #endif
49
+
50
+ // Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
51
+ // This is more for organizational clarity... skcms.cc doesn't force these.
52
+ #if N > 1 && defined(__ARM_NEON)
53
+ #define USING_NEON
54
+ #if __ARM_FP & 2
55
+ #define USING_NEON_F16C
56
+ #endif
57
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
58
+ #define USING_NEON_FP16
59
+ #endif
60
+ #endif
61
+
62
+ // These -Wvector-conversion warnings seem to trigger in very bogus situations,
63
+ // like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
64
+ #if defined(USING_NEON) && defined(__clang__)
65
+ #pragma clang diagnostic ignored "-Wvector-conversion"
66
+ #endif
67
+
68
+ // GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
69
+ // You'd see warnings like, "using AVX even though AVX is not enabled".
70
+ // We stifle these warnings; our helpers that return U64 are always inlined.
71
+ #if defined(__SSE__) && defined(__GNUC__)
72
+ #if !defined(__has_warning)
73
+ #pragma GCC diagnostic ignored "-Wpsabi"
74
+ #elif __has_warning("-Wpsabi")
75
+ #pragma GCC diagnostic ignored "-Wpsabi"
76
+ #endif
77
+ #endif
78
+
79
+ #if defined(__clang__)
80
+ #define FALLTHROUGH [[clang::fallthrough]]
81
+ #else
82
+ #define FALLTHROUGH
83
+ #endif
84
+
85
+ // We tag most helper functions as SI, to enforce good code generation
86
+ // but also work around what we think is a bug in GCC: when targeting 32-bit
87
+ // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
88
+ // MMX mm0 register, which seems to mess with unrelated code that later uses
89
+ // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
90
+ //
91
+ // It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
92
+ #if defined(__clang__) || defined(__GNUC__)
93
+ #define SI static inline __attribute__((always_inline))
94
+ #else
95
+ #define SI static inline
96
+ #endif
97
+
98
+ template <typename T, typename P>
99
+ SI T load(const P* ptr) {
100
+ T val;
101
+ small_memcpy(&val, ptr, sizeof(val));
102
+ return val;
103
+ }
104
+ template <typename T, typename P>
105
+ SI void store(P* ptr, const T& val) {
106
+ small_memcpy(ptr, &val, sizeof(val));
107
+ }
108
+
109
+ // (T)v is a cast when N == 1 and a bit-pun when N>1,
110
+ // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
111
+ template <typename D, typename S>
112
+ SI D cast(const S& v) {
113
+ #if N == 1
114
+ return (D)v;
115
+ #elif defined(__clang__)
116
+ return __builtin_convertvector(v, D);
117
+ #else
118
+ D d;
119
+ for (int i = 0; i < N; i++) {
120
+ d[i] = v[i];
121
+ }
122
+ return d;
123
+ #endif
124
+ }
125
+
126
+ template <typename D, typename S>
127
+ SI D bit_pun(const S& v) {
128
+ static_assert(sizeof(D) == sizeof(v), "");
129
+ return load<D>(&v);
130
+ }
131
+
132
+ // When we convert from float to fixed point, it's very common to want to round,
133
+ // and for some reason compilers generate better code when converting to int32_t.
134
+ // To serve both those ends, we use this function to_fixed() instead of direct cast().
135
+ #if defined(USING_NEON_FP16)
136
+ // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
137
+ SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
138
+ #else
139
+ SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
140
+ #endif
141
+
142
+
143
+ // Sometimes we do something crazy on one branch of a conditional,
144
+ // like divide by zero or convert a huge float to an integer,
145
+ // but then harmlessly select the other side. That trips up N==1
146
+ // sanitizer builds, so we make if_then_else() a macro to avoid
147
+ // evaluating the unused side.
148
+
149
+ #if N == 1
150
+ #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
151
+ #else
152
+ template <typename C, typename T>
153
+ SI T if_then_else(C cond, T t, T e) {
154
+ return bit_pun<T>( ( cond & bit_pun<C>(t)) |
155
+ (~cond & bit_pun<C>(e)) );
156
+ }
157
+ #endif
158
+
159
+
160
+ SI F F_from_Half(U16 half) {
161
+ #if defined(USING_NEON_FP16)
162
+ return bit_pun<F>(half);
163
+ #elif defined(USING_NEON_F16C)
164
+ return vcvt_f32_f16((float16x4_t)half);
165
+ #elif defined(USING_AVX512F)
166
+ return (F)_mm512_cvtph_ps((__m256i)half);
167
+ #elif defined(USING_AVX_F16C)
168
+ typedef int16_t __attribute__((vector_size(16))) I16;
169
+ return __builtin_ia32_vcvtph2ps256((I16)half);
170
+ #else
171
+ U32 wide = cast<U32>(half);
172
+ // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
173
+ U32 s = wide & 0x8000,
174
+ em = wide ^ s;
175
+
176
+ // Constructing the float is easy if the half is not denormalized.
177
+ F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
178
+
179
+ // Simply flush all denorm half floats to zero.
180
+ return if_then_else(em < 0x0400, F0, norm);
181
+ #endif
182
+ }
183
+
184
+ #if defined(__clang__)
185
+ // The -((127-15)<<10) underflows that side of the math when
186
+ // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
187
+ __attribute__((no_sanitize("unsigned-integer-overflow")))
188
+ #endif
189
+ SI U16 Half_from_F(F f) {
190
+ #if defined(USING_NEON_FP16)
191
+ return bit_pun<U16>(f);
192
+ #elif defined(USING_NEON_F16C)
193
+ return (U16)vcvt_f16_f32(f);
194
+ #elif defined(USING_AVX512F)
195
+ return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
196
+ #elif defined(USING_AVX_F16C)
197
+ return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
198
+ #else
199
+ // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
200
+ U32 sem = bit_pun<U32>(f),
201
+ s = sem & 0x80000000,
202
+ em = sem ^ s;
203
+
204
+ // For simplicity we flush denorm half floats (including all denorm floats) to zero.
205
+ return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
206
+ , (s>>16) + (em>>13) - ((127-15)<<10)));
207
+ #endif
208
+ }
209
+
210
+ // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
211
+ #if defined(USING_NEON_FP16)
212
+ SI U16 swap_endian_16(U16 v) {
213
+ return (U16)vrev16q_u8((uint8x16_t) v);
214
+ }
215
+ #elif defined(USING_NEON)
216
+ SI U16 swap_endian_16(U16 v) {
217
+ return (U16)vrev16_u8((uint8x8_t) v);
218
+ }
219
+ #endif
220
+
221
+ SI U64 swap_endian_16x4(const U64& rgba) {
222
+ return (rgba & 0x00ff00ff00ff00ff) << 8
223
+ | (rgba & 0xff00ff00ff00ff00) >> 8;
224
+ }
225
+
226
+ #if defined(USING_NEON_FP16)
227
+ SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
228
+ SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
229
+ #elif defined(USING_NEON)
230
+ SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
231
+ SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
232
+ #else
233
+ SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
234
+ SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
235
+ #endif
236
+
237
+ SI F floor_(F x) {
238
+ #if N == 1
239
+ return floorf_(x);
240
+ #elif defined(USING_NEON_FP16)
241
+ return vrndmq_f16(x);
242
+ #elif defined(__aarch64__)
243
+ return vrndmq_f32(x);
244
+ #elif defined(USING_AVX512F)
245
+ // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
246
+ // and integer santizer catches that this implicit cast changes the
247
+ // value from -1 to 65535. We'll cast manually to work around it.
248
+ // Read this as `return _mm512_floor_ps(x)`.
249
+ return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
250
+ #elif defined(USING_AVX)
251
+ return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
252
+ #elif defined(__SSE4_1__)
253
+ return _mm_floor_ps(x);
254
+ #else
255
+ // Round trip through integers with a truncating cast.
256
+ F roundtrip = cast<F>(cast<I32>(x));
257
+ // If x is negative, truncating gives the ceiling instead of the floor.
258
+ return roundtrip - if_then_else(roundtrip > x, F1, F0);
259
+
260
+ // This implementation fails for values of x that are outside
261
+ // the range an integer can represent. We expect most x to be small.
262
+ #endif
263
+ }
264
+
265
+ SI F approx_log2(F x) {
266
+ #if defined(USING_NEON_FP16)
267
+ // TODO(mtklein)
268
+ return x;
269
+ #else
270
+ // The first approximation of log2(x) is its exponent 'e', minus 127.
271
+ I32 bits = bit_pun<I32>(x);
272
+
273
+ F e = cast<F>(bits) * (1.0f / (1<<23));
274
+
275
+ // If we use the mantissa too we can refine the error signficantly.
276
+ F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
277
+
278
+ return e - 124.225514990f
279
+ - 1.498030302f*m
280
+ - 1.725879990f/(0.3520887068f + m);
281
+ #endif
282
+ }
283
+
284
+ SI F approx_log(F x) {
285
+ const float ln2 = 0.69314718f;
286
+ return ln2 * approx_log2(x);
287
+ }
288
+
289
+ SI F approx_exp2(F x) {
290
+ #if defined(USING_NEON_FP16)
291
+ // TODO(mtklein)
292
+ return x;
293
+ #else
294
+ F fract = x - floor_(x);
295
+
296
+ F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
297
+ - 1.490129070f*fract
298
+ + 27.728023300f/(4.84252568f - fract));
299
+ I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
300
+
301
+ return bit_pun<F>(bits);
302
+ #endif
303
+ }
304
+
305
+ SI F approx_pow(F x, float y) {
306
+ return if_then_else((x == F0) | (x == F1), x
307
+ , approx_exp2(approx_log2(x) * y));
308
+ }
309
+
310
+ SI F approx_exp(F x) {
311
+ const float log2_e = 1.4426950408889634074f;
312
+ return approx_exp2(log2_e * x);
313
+ }
314
+
315
+ // Return tf(x).
316
+ SI F apply_tf(const skcms_TransferFunction* tf, F x) {
317
+ #if defined(USING_NEON_FP16)
318
+ // TODO(mtklein)
319
+ (void)tf;
320
+ return x;
321
+ #else
322
+ // Peel off the sign bit and set x = |x|.
323
+ U32 bits = bit_pun<U32>(x),
324
+ sign = bits & 0x80000000;
325
+ x = bit_pun<F>(bits ^ sign);
326
+
327
+ // The transfer function has a linear part up to d, exponential at d and after.
328
+ F v = if_then_else(x < tf->d, tf->c*x + tf->f
329
+ , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
330
+
331
+ // Tack the sign bit back on.
332
+ return bit_pun<F>(sign | bit_pun<U32>(v));
333
+ #endif
334
+ }
335
+
336
+ SI F apply_pq(const skcms_TransferFunction* tf, F x) {
337
+ #if defined(USING_NEON_FP16)
338
+ // TODO(mtklein)
339
+ (void)tf;
340
+ return x;
341
+ #else
342
+ U32 bits = bit_pun<U32>(x),
343
+ sign = bits & 0x80000000;
344
+ x = bit_pun<F>(bits ^ sign);
345
+
346
+ F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
347
+ / (tf->d + tf->e * approx_pow(x, tf->c)),
348
+ tf->f);
349
+
350
+ return bit_pun<F>(sign | bit_pun<U32>(v));
351
+ #endif
352
+ }
353
+
354
+ SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
355
+ #if defined(USING_NEON_FP16)
356
+ // TODO(mtklein)
357
+ (void)tf;
358
+ return x;
359
+ #else
360
+ const float R = tf->a, G = tf->b,
361
+ a = tf->c, b = tf->d, c = tf->e,
362
+ K = tf->f + 1;
363
+ U32 bits = bit_pun<U32>(x),
364
+ sign = bits & 0x80000000;
365
+ x = bit_pun<F>(bits ^ sign);
366
+
367
+ F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
368
+ , approx_exp((x-c)*a) + b);
369
+
370
+ return K*bit_pun<F>(sign | bit_pun<U32>(v));
371
+ #endif
372
+ }
373
+
374
+ SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
375
+ #if defined(USING_NEON_FP16)
376
+ // TODO(mtklein)
377
+ (void)tf;
378
+ return x;
379
+ #else
380
+ const float R = tf->a, G = tf->b,
381
+ a = tf->c, b = tf->d, c = tf->e,
382
+ K = tf->f + 1;
383
+ U32 bits = bit_pun<U32>(x),
384
+ sign = bits & 0x80000000;
385
+ x = bit_pun<F>(bits ^ sign);
386
+ x /= K;
387
+
388
+ F v = if_then_else(x <= 1, R * approx_pow(x, G)
389
+ , a * approx_log(x - b) + c);
390
+
391
+ return bit_pun<F>(sign | bit_pun<U32>(v));
392
+ #endif
393
+ }
394
+
395
+
396
+ // Strided loads and stores of N values, starting from p.
397
+ template <typename T, typename P>
398
+ SI T load_3(const P* p) {
399
+ #if N == 1
400
+ return (T)p[0];
401
+ #elif N == 4
402
+ return T{p[ 0],p[ 3],p[ 6],p[ 9]};
403
+ #elif N == 8
404
+ return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
405
+ #elif N == 16
406
+ return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
407
+ p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
408
+ #endif
409
+ }
410
+
411
+ template <typename T, typename P>
412
+ SI T load_4(const P* p) {
413
+ #if N == 1
414
+ return (T)p[0];
415
+ #elif N == 4
416
+ return T{p[ 0],p[ 4],p[ 8],p[12]};
417
+ #elif N == 8
418
+ return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
419
+ #elif N == 16
420
+ return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
421
+ p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
422
+ #endif
423
+ }
424
+
425
+ template <typename T, typename P>
426
+ SI void store_3(P* p, const T& v) {
427
+ #if N == 1
428
+ p[0] = v;
429
+ #elif N == 4
430
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
431
+ #elif N == 8
432
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
433
+ p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
434
+ #elif N == 16
435
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
436
+ p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
437
+ p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
438
+ p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
439
+ #endif
440
+ }
441
+
442
+ template <typename T, typename P>
443
+ SI void store_4(P* p, const T& v) {
444
+ #if N == 1
445
+ p[0] = v;
446
+ #elif N == 4
447
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
448
+ #elif N == 8
449
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
450
+ p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
451
+ #elif N == 16
452
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
453
+ p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
454
+ p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
455
+ p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
456
+ #endif
457
+ }
458
+
459
+
460
+ SI U8 gather_8(const uint8_t* p, I32 ix) {
461
+ #if N == 1
462
+ U8 v = p[ix];
463
+ #elif N == 4
464
+ U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
465
+ #elif N == 8
466
+ U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
467
+ p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
468
+ #elif N == 16
469
+ U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
470
+ p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
471
+ p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
472
+ p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
473
+ #endif
474
+ return v;
475
+ }
476
+
477
+ SI U16 gather_16(const uint8_t* p, I32 ix) {
478
+ // Load the i'th 16-bit value from p.
479
+ auto load_16 = [p](int i) {
480
+ return load<uint16_t>(p + 2*i);
481
+ };
482
+ #if N == 1
483
+ U16 v = load_16(ix);
484
+ #elif N == 4
485
+ U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
486
+ #elif N == 8
487
+ U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
488
+ load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
489
+ #elif N == 16
490
+ U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
491
+ load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
492
+ load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
493
+ load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
494
+ #endif
495
+ return v;
496
+ }
497
+
498
+ SI U32 gather_32(const uint8_t* p, I32 ix) {
499
+ // Load the i'th 32-bit value from p.
500
+ auto load_32 = [p](int i) {
501
+ return load<uint32_t>(p + 4*i);
502
+ };
503
+ #if N == 1
504
+ U32 v = load_32(ix);
505
+ #elif N == 4
506
+ U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
507
+ #elif N == 8
508
+ U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
509
+ load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
510
+ #elif N == 16
511
+ U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
512
+ load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
513
+ load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
514
+ load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
515
+ #endif
516
+ // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
517
+ return v;
518
+ }
519
+
520
+ SI U32 gather_24(const uint8_t* p, I32 ix) {
521
+ // First, back up a byte. Any place we're gathering from has a safe junk byte to read
522
+ // in front of it, either a previous table value, or some tag metadata.
523
+ p -= 1;
524
+
525
+ // Load the i'th 24-bit value from p, and 1 extra byte.
526
+ auto load_24_32 = [p](int i) {
527
+ return load<uint32_t>(p + 3*i);
528
+ };
529
+
530
+ // Now load multiples of 4 bytes (a junk byte, then r,g,b).
531
+ #if N == 1
532
+ U32 v = load_24_32(ix);
533
+ #elif N == 4
534
+ U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
535
+ #elif N == 8 && !defined(USING_AVX2)
536
+ U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
537
+ load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
538
+ #elif N == 8
539
+ (void)load_24_32;
540
+ // The gather instruction here doesn't need any particular alignment,
541
+ // but the intrinsic takes a const int*.
542
+ const int* p4 = bit_pun<const int*>(p);
543
+ I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
544
+ mask = {-1,-1,-1,-1, -1,-1,-1,-1};
545
+ #if defined(__clang__)
546
+ U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
547
+ #elif defined(__GNUC__)
548
+ U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
549
+ #endif
550
+ #elif N == 16
551
+ (void)load_24_32;
552
+ // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
553
+ // And AVX-512 swapped the order of arguments. :/
554
+ const int* p4 = bit_pun<const int*>(p);
555
+ U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
556
+ #endif
557
+
558
+ // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
559
+ return v >> 8;
560
+ }
561
+
562
+ #if !defined(__arm__)
563
+ SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
564
+ // As in gather_24(), with everything doubled.
565
+ p -= 2;
566
+
567
+ // Load the i'th 48-bit value from p, and 2 extra bytes.
568
+ auto load_48_64 = [p](int i) {
569
+ return load<uint64_t>(p + 6*i);
570
+ };
571
+
572
+ #if N == 1
573
+ *v = load_48_64(ix);
574
+ #elif N == 4
575
+ *v = U64{
576
+ load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
577
+ };
578
+ #elif N == 8 && !defined(USING_AVX2)
579
+ *v = U64{
580
+ load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
581
+ load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
582
+ };
583
+ #elif N == 8
584
+ (void)load_48_64;
585
+ typedef int32_t __attribute__((vector_size(16))) Half_I32;
586
+ typedef long long __attribute__((vector_size(32))) Half_I64;
587
+
588
+ // The gather instruction here doesn't need any particular alignment,
589
+ // but the intrinsic takes a const long long*.
590
+ const long long int* p8 = bit_pun<const long long int*>(p);
591
+
592
+ Half_I64 zero = { 0, 0, 0, 0},
593
+ mask = {-1,-1,-1,-1};
594
+
595
+ ix *= 6;
596
+ Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
597
+ ix_hi = { ix[4], ix[5], ix[6], ix[7] };
598
+
599
+ #if defined(__clang__)
600
+ Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
601
+ hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
602
+ #elif defined(__GNUC__)
603
+ Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
604
+ hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
605
+ #endif
606
+ store((char*)v + 0, lo);
607
+ store((char*)v + 32, hi);
608
+ #elif N == 16
609
+ (void)load_48_64;
610
+ const long long int* p8 = bit_pun<const long long int*>(p);
611
+ __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
612
+ hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
613
+ store((char*)v + 0, lo);
614
+ store((char*)v + 64, hi);
615
+ #endif
616
+
617
+ *v >>= 16;
618
+ }
619
+ #endif
620
+
621
+ SI F F_from_U8(U8 v) {
622
+ return cast<F>(v) * (1/255.0f);
623
+ }
624
+
625
+ SI F F_from_U16_BE(U16 v) {
626
+ // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
627
+ // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
628
+ U16 lo = (v >> 8),
629
+ hi = (v << 8) & 0xffff;
630
+ return cast<F>(lo|hi) * (1/65535.0f);
631
+ }
632
+
633
+ SI U16 U16_from_F(F v) {
634
+ // 65535 == inf in FP16, so promote to FP32 before converting.
635
+ return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
636
+ }
637
+
638
+ SI F minus_1_ulp(F v) {
639
+ #if defined(USING_NEON_FP16)
640
+ return bit_pun<F>( bit_pun<U16>(v) - 1 );
641
+ #else
642
+ return bit_pun<F>( bit_pun<U32>(v) - 1 );
643
+ #endif
644
+ }
645
+
646
+ SI F table(const skcms_Curve* curve, F v) {
647
+ // Clamp the input to [0,1], then scale to a table index.
648
+ F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
649
+
650
+ // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
651
+ I32 lo = cast<I32>( ix ),
652
+ hi = cast<I32>(minus_1_ulp(ix+1.0f));
653
+ F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
654
+
655
+ // TODO: can we load l and h simultaneously? Each entry in 'h' is either
656
+ // the same as in 'l' or adjacent. We have a rough idea that's it'd always be safe
657
+ // to read adjacent entries and perhaps underflow the table by a byte or two
658
+ // (it'd be junk, but always safe to read). Not sure how to lerp yet.
659
+ F l,h;
660
+ if (curve->table_8) {
661
+ l = F_from_U8(gather_8(curve->table_8, lo));
662
+ h = F_from_U8(gather_8(curve->table_8, hi));
663
+ } else {
664
+ l = F_from_U16_BE(gather_16(curve->table_16, lo));
665
+ h = F_from_U16_BE(gather_16(curve->table_16, hi));
666
+ }
667
+ return l + (h-l)*t;
668
+ }
669
+
670
+ SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
671
+ U32 rgb = gather_24(grid_8, ix);
672
+
673
+ *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
674
+ *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
675
+ *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
676
+ }
677
+
678
+ SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
679
+ // TODO: don't forget to optimize gather_32().
680
+ U32 rgba = gather_32(grid_8, ix);
681
+
682
+ *r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
683
+ *g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
684
+ *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
685
+ *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
686
+ }
687
+
688
+ SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
689
+ #if defined(__arm__)
690
+ // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
691
+ *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
692
+ *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
693
+ *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
694
+ #else
695
+ // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
696
+ U64 rgb;
697
+ gather_48(grid_16, ix, &rgb);
698
+ rgb = swap_endian_16x4(rgb);
699
+
700
+ *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
701
+ *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
702
+ *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
703
+ #endif
704
+ }
705
+
706
+ SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
707
+ // TODO: gather_64()-based fast path?
708
+ *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
709
+ *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
710
+ *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
711
+ *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
712
+ }
713
+
714
+ static void clut(uint32_t input_channels, uint32_t output_channels,
715
+ const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
716
+ F* r, F* g, F* b, F* a) {
717
+
718
+ const int dim = (int)input_channels;
719
+ assert (0 < dim && dim <= 4);
720
+ assert (output_channels == 3 ||
721
+ output_channels == 4);
722
+
723
+ // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
724
+ I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
725
+ F weight[8]; // Weight for each contribution, again first low, then high.
726
+
727
+ // O(dim) work first: calculate index,weight from r,g,b,a.
728
+ const F inputs[] = { *r,*g,*b,*a };
729
+ for (int i = dim-1, stride = 1; i >= 0; i--) {
730
+ // x is where we logically want to sample the grid in the i-th dimension.
731
+ F x = inputs[i] * (float)(grid_points[i] - 1);
732
+
733
+ // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
734
+ I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
735
+ hi = cast<I32>(minus_1_ulp(x+1.0f));
736
+ // Notice how we fold in the accumulated stride across previous dimensions here.
737
+ index[i+0] = lo * stride;
738
+ index[i+4] = hi * stride;
739
+ stride *= grid_points[i];
740
+
741
+ // We'll interpolate between those two integer grid points by t.
742
+ F t = x - cast<F>(lo); // i.e. fract(x)
743
+ weight[i+0] = 1-t;
744
+ weight[i+4] = t;
745
+ }
746
+
747
+ *r = *g = *b = F0;
748
+ if (output_channels == 4) {
749
+ *a = F0;
750
+ }
751
+
752
+ // We'll sample 2^dim == 1<<dim table entries per pixel,
753
+ // in all combinations of low and high in each dimension.
754
+ for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
755
+
756
+ // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
757
+ // where 0 selects the low index contribution and its weight 1-t,
758
+ // or 4 the high index contribution and its weight t.
759
+
760
+ // Since 0<dim≤4, we can always just start off with the 0-th channel,
761
+ // then handle the others conditionally.
762
+ I32 ix = index [0 + (combo&1)*4];
763
+ F w = weight[0 + (combo&1)*4];
764
+
765
+ switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
766
+ case 3: ix += index [3 + (combo&8)/2];
767
+ w *= weight[3 + (combo&8)/2];
768
+ FALLTHROUGH;
769
+ // fall through
770
+
771
+ case 2: ix += index [2 + (combo&4)*1];
772
+ w *= weight[2 + (combo&4)*1];
773
+ FALLTHROUGH;
774
+ // fall through
775
+
776
+ case 1: ix += index [1 + (combo&2)*2];
777
+ w *= weight[1 + (combo&2)*2];
778
+ }
779
+
780
+ F R,G,B,A=F0;
781
+ if (output_channels == 3) {
782
+ if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
783
+ else { sample_clut_16(grid_16,ix, &R,&G,&B); }
784
+ } else {
785
+ if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
786
+ else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
787
+ }
788
+ *r += w*R;
789
+ *g += w*G;
790
+ *b += w*B;
791
+ *a += w*A;
792
+ }
793
+ }
794
+
795
+ static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
796
+ clut(a2b->input_channels, a2b->output_channels,
797
+ a2b->grid_points, a2b->grid_8, a2b->grid_16,
798
+ r,g,b,&a);
799
+ }
800
+ static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
801
+ clut(b2a->input_channels, b2a->output_channels,
802
+ b2a->grid_points, b2a->grid_8, b2a->grid_16,
803
+ r,g,b,a);
804
+ }
805
+
806
+ static void exec_ops(const Op* ops, const void** args,
807
+ const char* src, char* dst, int i) {
808
+ F r = F0, g = F0, b = F0, a = F1;
809
+ while (true) {
810
+ switch (*ops++) {
811
+ case Op_load_a8:{
812
+ a = F_from_U8(load<U8>(src + 1*i));
813
+ } break;
814
+
815
+ case Op_load_g8:{
816
+ r = g = b = F_from_U8(load<U8>(src + 1*i));
817
+ } break;
818
+
819
+ case Op_load_4444:{
820
+ U16 abgr = load<U16>(src + 2*i);
821
+
822
+ r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
823
+ g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
824
+ b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
825
+ a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
826
+ } break;
827
+
828
+ case Op_load_565:{
829
+ U16 rgb = load<U16>(src + 2*i);
830
+
831
+ r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
832
+ g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
833
+ b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
834
+ } break;
835
+
836
+ case Op_load_888:{
837
+ const uint8_t* rgb = (const uint8_t*)(src + 3*i);
838
+ #if defined(USING_NEON_FP16)
839
+ // See the explanation under USING_NEON below. This is that doubled up.
840
+ uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
841
+ v = vld3q_lane_u8(rgb+ 0, v, 0);
842
+ v = vld3q_lane_u8(rgb+ 3, v, 2);
843
+ v = vld3q_lane_u8(rgb+ 6, v, 4);
844
+ v = vld3q_lane_u8(rgb+ 9, v, 6);
845
+
846
+ v = vld3q_lane_u8(rgb+12, v, 8);
847
+ v = vld3q_lane_u8(rgb+15, v, 10);
848
+ v = vld3q_lane_u8(rgb+18, v, 12);
849
+ v = vld3q_lane_u8(rgb+21, v, 14);
850
+
851
+ r = cast<F>((U16)v.val[0]) * (1/255.0f);
852
+ g = cast<F>((U16)v.val[1]) * (1/255.0f);
853
+ b = cast<F>((U16)v.val[2]) * (1/255.0f);
854
+ #elif defined(USING_NEON)
855
+ // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
856
+ // a time. Since we're doing that, we might as well load them into 16-bit lanes.
857
+ // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
858
+ uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
859
+ v = vld3_lane_u8(rgb+0, v, 0);
860
+ v = vld3_lane_u8(rgb+3, v, 2);
861
+ v = vld3_lane_u8(rgb+6, v, 4);
862
+ v = vld3_lane_u8(rgb+9, v, 6);
863
+
864
+ // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
865
+ // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
866
+ // ARMv7 and ARMv8 impls.)
867
+ r = cast<F>((U16)v.val[0]) * (1/255.0f);
868
+ g = cast<F>((U16)v.val[1]) * (1/255.0f);
869
+ b = cast<F>((U16)v.val[2]) * (1/255.0f);
870
+ #else
871
+ r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
872
+ g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
873
+ b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
874
+ #endif
875
+ } break;
876
+
877
+ case Op_load_8888:{
878
+ U32 rgba = load<U32>(src + 4*i);
879
+
880
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
881
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
882
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
883
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
884
+ } break;
885
+
886
+ case Op_load_8888_palette8:{
887
+ const uint8_t* palette = (const uint8_t*) *args++;
888
+ I32 ix = cast<I32>(load<U8>(src + 1*i));
889
+ U32 rgba = gather_32(palette, ix);
890
+
891
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
892
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
893
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
894
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
895
+ } break;
896
+
897
+ case Op_load_1010102:{
898
+ U32 rgba = load<U32>(src + 4*i);
899
+
900
+ r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
901
+ g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
902
+ b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
903
+ a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
904
+ } break;
905
+
906
+ case Op_load_161616LE:{
907
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
908
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
909
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
910
+ #if defined(USING_NEON_FP16)
911
+ uint16x8x3_t v = vld3q_u16(rgb);
912
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
913
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
914
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
915
+ #elif defined(USING_NEON)
916
+ uint16x4x3_t v = vld3_u16(rgb);
917
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
918
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
919
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
920
+ #else
921
+ r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
922
+ g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
923
+ b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
924
+ #endif
925
+ } break;
926
+
927
+ case Op_load_16161616LE:{
928
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
929
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
930
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
931
+ #if defined(USING_NEON_FP16)
932
+ uint16x8x4_t v = vld4q_u16(rgba);
933
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
934
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
935
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
936
+ a = cast<F>((U16)v.val[3]) * (1/65535.0f);
937
+ #elif defined(USING_NEON)
938
+ uint16x4x4_t v = vld4_u16(rgba);
939
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
940
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
941
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
942
+ a = cast<F>((U16)v.val[3]) * (1/65535.0f);
943
+ #else
944
+ U64 px = load<U64>(rgba);
945
+
946
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
947
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
948
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
949
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
950
+ #endif
951
+ } break;
952
+
953
+ case Op_load_161616BE:{
954
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
955
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
956
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
957
+ #if defined(USING_NEON_FP16)
958
+ uint16x8x3_t v = vld3q_u16(rgb);
959
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
960
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
961
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
962
+ #elif defined(USING_NEON)
963
+ uint16x4x3_t v = vld3_u16(rgb);
964
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
965
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
966
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
967
+ #else
968
+ U32 R = load_3<U32>(rgb+0),
969
+ G = load_3<U32>(rgb+1),
970
+ B = load_3<U32>(rgb+2);
971
+ // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
972
+ r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
973
+ g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
974
+ b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
975
+ #endif
976
+ } break;
977
+
978
+ case Op_load_16161616BE:{
979
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
980
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
981
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
982
+ #if defined(USING_NEON_FP16)
983
+ uint16x8x4_t v = vld4q_u16(rgba);
984
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
985
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
986
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
987
+ a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
988
+ #elif defined(USING_NEON)
989
+ uint16x4x4_t v = vld4_u16(rgba);
990
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
991
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
992
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
993
+ a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
994
+ #else
995
+ U64 px = swap_endian_16x4(load<U64>(rgba));
996
+
997
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
998
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
999
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
1000
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
1001
+ #endif
1002
+ } break;
1003
+
1004
+ case Op_load_hhh:{
1005
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
1006
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1007
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1008
+ #if defined(USING_NEON_FP16)
1009
+ uint16x8x3_t v = vld3q_u16(rgb);
1010
+ U16 R = (U16)v.val[0],
1011
+ G = (U16)v.val[1],
1012
+ B = (U16)v.val[2];
1013
+ #elif defined(USING_NEON)
1014
+ uint16x4x3_t v = vld3_u16(rgb);
1015
+ U16 R = (U16)v.val[0],
1016
+ G = (U16)v.val[1],
1017
+ B = (U16)v.val[2];
1018
+ #else
1019
+ U16 R = load_3<U16>(rgb+0),
1020
+ G = load_3<U16>(rgb+1),
1021
+ B = load_3<U16>(rgb+2);
1022
+ #endif
1023
+ r = F_from_Half(R);
1024
+ g = F_from_Half(G);
1025
+ b = F_from_Half(B);
1026
+ } break;
1027
+
1028
+ case Op_load_hhhh:{
1029
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
1030
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1031
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1032
+ #if defined(USING_NEON_FP16)
1033
+ uint16x8x4_t v = vld4q_u16(rgba);
1034
+ U16 R = (U16)v.val[0],
1035
+ G = (U16)v.val[1],
1036
+ B = (U16)v.val[2],
1037
+ A = (U16)v.val[3];
1038
+ #elif defined(USING_NEON)
1039
+ uint16x4x4_t v = vld4_u16(rgba);
1040
+ U16 R = (U16)v.val[0],
1041
+ G = (U16)v.val[1],
1042
+ B = (U16)v.val[2],
1043
+ A = (U16)v.val[3];
1044
+ #else
1045
+ U64 px = load<U64>(rgba);
1046
+ U16 R = cast<U16>((px >> 0) & 0xffff),
1047
+ G = cast<U16>((px >> 16) & 0xffff),
1048
+ B = cast<U16>((px >> 32) & 0xffff),
1049
+ A = cast<U16>((px >> 48) & 0xffff);
1050
+ #endif
1051
+ r = F_from_Half(R);
1052
+ g = F_from_Half(G);
1053
+ b = F_from_Half(B);
1054
+ a = F_from_Half(A);
1055
+ } break;
1056
+
1057
+ case Op_load_fff:{
1058
+ uintptr_t ptr = (uintptr_t)(src + 12*i);
1059
+ assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1060
+ const float* rgb = (const float*)ptr; // cast to const float* to be safe.
1061
+ #if defined(USING_NEON_FP16)
1062
+ float32x4x3_t lo = vld3q_f32(rgb + 0),
1063
+ hi = vld3q_f32(rgb + 12);
1064
+ r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1065
+ g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1066
+ b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1067
+ #elif defined(USING_NEON)
1068
+ float32x4x3_t v = vld3q_f32(rgb);
1069
+ r = (F)v.val[0];
1070
+ g = (F)v.val[1];
1071
+ b = (F)v.val[2];
1072
+ #else
1073
+ r = load_3<F>(rgb+0);
1074
+ g = load_3<F>(rgb+1);
1075
+ b = load_3<F>(rgb+2);
1076
+ #endif
1077
+ } break;
1078
+
1079
+ case Op_load_ffff:{
1080
+ uintptr_t ptr = (uintptr_t)(src + 16*i);
1081
+ assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1082
+ const float* rgba = (const float*)ptr; // cast to const float* to be safe.
1083
+ #if defined(USING_NEON_FP16)
1084
+ float32x4x4_t lo = vld4q_f32(rgba + 0),
1085
+ hi = vld4q_f32(rgba + 16);
1086
+ r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1087
+ g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1088
+ b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1089
+ a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
1090
+ #elif defined(USING_NEON)
1091
+ float32x4x4_t v = vld4q_f32(rgba);
1092
+ r = (F)v.val[0];
1093
+ g = (F)v.val[1];
1094
+ b = (F)v.val[2];
1095
+ a = (F)v.val[3];
1096
+ #else
1097
+ r = load_4<F>(rgba+0);
1098
+ g = load_4<F>(rgba+1);
1099
+ b = load_4<F>(rgba+2);
1100
+ a = load_4<F>(rgba+3);
1101
+ #endif
1102
+ } break;
1103
+
1104
+ case Op_swap_rb:{
1105
+ F t = r;
1106
+ r = b;
1107
+ b = t;
1108
+ } break;
1109
+
1110
+ case Op_clamp:{
1111
+ r = max_(F0, min_(r, F1));
1112
+ g = max_(F0, min_(g, F1));
1113
+ b = max_(F0, min_(b, F1));
1114
+ a = max_(F0, min_(a, F1));
1115
+ } break;
1116
+
1117
+ case Op_invert:{
1118
+ r = F1 - r;
1119
+ g = F1 - g;
1120
+ b = F1 - b;
1121
+ a = F1 - a;
1122
+ } break;
1123
+
1124
+ case Op_force_opaque:{
1125
+ a = F1;
1126
+ } break;
1127
+
1128
+ case Op_premul:{
1129
+ r *= a;
1130
+ g *= a;
1131
+ b *= a;
1132
+ } break;
1133
+
1134
+ case Op_unpremul:{
1135
+ F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1136
+ r *= scale;
1137
+ g *= scale;
1138
+ b *= scale;
1139
+ } break;
1140
+
1141
+ case Op_matrix_3x3:{
1142
+ const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
1143
+ const float* m = &matrix->vals[0][0];
1144
+
1145
+ F R = m[0]*r + m[1]*g + m[2]*b,
1146
+ G = m[3]*r + m[4]*g + m[5]*b,
1147
+ B = m[6]*r + m[7]*g + m[8]*b;
1148
+
1149
+ r = R;
1150
+ g = G;
1151
+ b = B;
1152
+ } break;
1153
+
1154
+ case Op_matrix_3x4:{
1155
+ const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
1156
+ const float* m = &matrix->vals[0][0];
1157
+
1158
+ F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1159
+ G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1160
+ B = m[8]*r + m[9]*g + m[10]*b + m[11];
1161
+
1162
+ r = R;
1163
+ g = G;
1164
+ b = B;
1165
+ } break;
1166
+
1167
+ case Op_lab_to_xyz:{
1168
+ // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
1169
+ F L = r * 100.0f,
1170
+ A = g * 255.0f - 128.0f,
1171
+ B = b * 255.0f - 128.0f;
1172
+
1173
+ // Convert to CIE XYZ.
1174
+ F Y = (L + 16.0f) * (1/116.0f),
1175
+ X = Y + A*(1/500.0f),
1176
+ Z = Y - B*(1/200.0f);
1177
+
1178
+ X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1179
+ Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1180
+ Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1181
+
1182
+ // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1183
+ r = X * 0.9642f;
1184
+ g = Y ;
1185
+ b = Z * 0.8249f;
1186
+ } break;
1187
+
1188
+ // As above, in reverse.
1189
+ case Op_xyz_to_lab:{
1190
+ F X = r * (1/0.9642f),
1191
+ Y = g,
1192
+ Z = b * (1/0.8249f);
1193
+
1194
+ X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
1195
+ Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
1196
+ Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
1197
+
1198
+ F L = Y*116.0f - 16.0f,
1199
+ A = (X-Y)*500.0f,
1200
+ B = (Y-Z)*200.0f;
1201
+
1202
+ r = L * (1/100.f);
1203
+ g = (A + 128.0f) * (1/255.0f);
1204
+ b = (B + 128.0f) * (1/255.0f);
1205
+ } break;
1206
+
1207
+ case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1208
+ case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1209
+ case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1210
+ case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1211
+
1212
+ case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1213
+ case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1214
+ case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1215
+ case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1216
+
1217
+ case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1218
+ case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1219
+ case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1220
+ case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1221
+
1222
+ case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1223
+ case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1224
+ case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1225
+ case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1226
+
1227
+ case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1228
+ case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1229
+ case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1230
+ case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1231
+
1232
+ case Op_clut_A2B: {
1233
+ const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1234
+ clut(a2b, &r,&g,&b,a);
1235
+
1236
+ if (a2b->input_channels == 4) {
1237
+ // CMYK is opaque.
1238
+ a = F1;
1239
+ }
1240
+ } break;
1241
+
1242
+ case Op_clut_B2A: {
1243
+ const skcms_B2A* b2a = (const skcms_B2A*) *args++;
1244
+ clut(b2a, &r,&g,&b,&a);
1245
+ } break;
1246
+
1247
+ // Notice, from here on down the store_ ops all return, ending the loop.
1248
+
1249
+ case Op_store_a8: {
1250
+ store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1251
+ } return;
1252
+
1253
+ case Op_store_g8: {
1254
+ // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1255
+ store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1256
+ } return;
1257
+
1258
+ case Op_store_4444: {
1259
+ store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1260
+ | cast<U16>(to_fixed(g * 15) << 8)
1261
+ | cast<U16>(to_fixed(b * 15) << 4)
1262
+ | cast<U16>(to_fixed(a * 15) << 0));
1263
+ } return;
1264
+
1265
+ case Op_store_565: {
1266
+ store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1267
+ | cast<U16>(to_fixed(g * 63) << 5 )
1268
+ | cast<U16>(to_fixed(b * 31) << 11 ));
1269
+ } return;
1270
+
1271
+ case Op_store_888: {
1272
+ uint8_t* rgb = (uint8_t*)dst + 3*i;
1273
+ #if defined(USING_NEON_FP16)
1274
+ // See the explanation under USING_NEON below. This is that doubled up.
1275
+ U16 R = to_fixed(r * 255),
1276
+ G = to_fixed(g * 255),
1277
+ B = to_fixed(b * 255);
1278
+
1279
+ uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
1280
+ vst3q_lane_u8(rgb+ 0, v, 0);
1281
+ vst3q_lane_u8(rgb+ 3, v, 2);
1282
+ vst3q_lane_u8(rgb+ 6, v, 4);
1283
+ vst3q_lane_u8(rgb+ 9, v, 6);
1284
+
1285
+ vst3q_lane_u8(rgb+12, v, 8);
1286
+ vst3q_lane_u8(rgb+15, v, 10);
1287
+ vst3q_lane_u8(rgb+18, v, 12);
1288
+ vst3q_lane_u8(rgb+21, v, 14);
1289
+ #elif defined(USING_NEON)
1290
+ // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1291
+ // get there via U16 to save some instructions converting to float. And just
1292
+ // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1293
+ U16 R = cast<U16>(to_fixed(r * 255)),
1294
+ G = cast<U16>(to_fixed(g * 255)),
1295
+ B = cast<U16>(to_fixed(b * 255));
1296
+
1297
+ uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1298
+ vst3_lane_u8(rgb+0, v, 0);
1299
+ vst3_lane_u8(rgb+3, v, 2);
1300
+ vst3_lane_u8(rgb+6, v, 4);
1301
+ vst3_lane_u8(rgb+9, v, 6);
1302
+ #else
1303
+ store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1304
+ store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1305
+ store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1306
+ #endif
1307
+ } return;
1308
+
1309
+ case Op_store_8888: {
1310
+ store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
1311
+ | cast<U32>(to_fixed(g * 255)) << 8
1312
+ | cast<U32>(to_fixed(b * 255)) << 16
1313
+ | cast<U32>(to_fixed(a * 255)) << 24);
1314
+ } return;
1315
+
1316
+ case Op_store_1010102: {
1317
+ store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
1318
+ | cast<U32>(to_fixed(g * 1023)) << 10
1319
+ | cast<U32>(to_fixed(b * 1023)) << 20
1320
+ | cast<U32>(to_fixed(a * 3)) << 30);
1321
+ } return;
1322
+
1323
+ case Op_store_161616LE: {
1324
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
1325
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1326
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1327
+ #if defined(USING_NEON_FP16)
1328
+ uint16x8x3_t v = {{
1329
+ (uint16x8_t)U16_from_F(r),
1330
+ (uint16x8_t)U16_from_F(g),
1331
+ (uint16x8_t)U16_from_F(b),
1332
+ }};
1333
+ vst3q_u16(rgb, v);
1334
+ #elif defined(USING_NEON)
1335
+ uint16x4x3_t v = {{
1336
+ (uint16x4_t)U16_from_F(r),
1337
+ (uint16x4_t)U16_from_F(g),
1338
+ (uint16x4_t)U16_from_F(b),
1339
+ }};
1340
+ vst3_u16(rgb, v);
1341
+ #else
1342
+ store_3(rgb+0, U16_from_F(r));
1343
+ store_3(rgb+1, U16_from_F(g));
1344
+ store_3(rgb+2, U16_from_F(b));
1345
+ #endif
1346
+
1347
+ } return;
1348
+
1349
+ case Op_store_16161616LE: {
1350
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
1351
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1352
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1353
+ #if defined(USING_NEON_FP16)
1354
+ uint16x8x4_t v = {{
1355
+ (uint16x8_t)U16_from_F(r),
1356
+ (uint16x8_t)U16_from_F(g),
1357
+ (uint16x8_t)U16_from_F(b),
1358
+ (uint16x8_t)U16_from_F(a),
1359
+ }};
1360
+ vst4q_u16(rgba, v);
1361
+ #elif defined(USING_NEON)
1362
+ uint16x4x4_t v = {{
1363
+ (uint16x4_t)U16_from_F(r),
1364
+ (uint16x4_t)U16_from_F(g),
1365
+ (uint16x4_t)U16_from_F(b),
1366
+ (uint16x4_t)U16_from_F(a),
1367
+ }};
1368
+ vst4_u16(rgba, v);
1369
+ #else
1370
+ U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1371
+ | cast<U64>(to_fixed(g * 65535)) << 16
1372
+ | cast<U64>(to_fixed(b * 65535)) << 32
1373
+ | cast<U64>(to_fixed(a * 65535)) << 48;
1374
+ store(rgba, px);
1375
+ #endif
1376
+ } return;
1377
+
1378
+ case Op_store_161616BE: {
1379
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
1380
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1381
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1382
+ #if defined(USING_NEON_FP16)
1383
+ uint16x8x3_t v = {{
1384
+ (uint16x8_t)swap_endian_16(U16_from_F(r)),
1385
+ (uint16x8_t)swap_endian_16(U16_from_F(g)),
1386
+ (uint16x8_t)swap_endian_16(U16_from_F(b)),
1387
+ }};
1388
+ vst3q_u16(rgb, v);
1389
+ #elif defined(USING_NEON)
1390
+ uint16x4x3_t v = {{
1391
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1392
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1393
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1394
+ }};
1395
+ vst3_u16(rgb, v);
1396
+ #else
1397
+ U32 R = to_fixed(r * 65535),
1398
+ G = to_fixed(g * 65535),
1399
+ B = to_fixed(b * 65535);
1400
+ store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1401
+ store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1402
+ store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1403
+ #endif
1404
+
1405
+ } return;
1406
+
1407
+ case Op_store_16161616BE: {
1408
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
1409
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1410
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1411
+ #if defined(USING_NEON_FP16)
1412
+ uint16x8x4_t v = {{
1413
+ (uint16x8_t)swap_endian_16(U16_from_F(r)),
1414
+ (uint16x8_t)swap_endian_16(U16_from_F(g)),
1415
+ (uint16x8_t)swap_endian_16(U16_from_F(b)),
1416
+ (uint16x8_t)swap_endian_16(U16_from_F(a)),
1417
+ }};
1418
+ vst4q_u16(rgba, v);
1419
+ #elif defined(USING_NEON)
1420
+ uint16x4x4_t v = {{
1421
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1422
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1423
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1424
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
1425
+ }};
1426
+ vst4_u16(rgba, v);
1427
+ #else
1428
+ U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1429
+ | cast<U64>(to_fixed(g * 65535)) << 16
1430
+ | cast<U64>(to_fixed(b * 65535)) << 32
1431
+ | cast<U64>(to_fixed(a * 65535)) << 48;
1432
+ store(rgba, swap_endian_16x4(px));
1433
+ #endif
1434
+ } return;
1435
+
1436
+ case Op_store_hhh: {
1437
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
1438
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1439
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1440
+
1441
+ U16 R = Half_from_F(r),
1442
+ G = Half_from_F(g),
1443
+ B = Half_from_F(b);
1444
+ #if defined(USING_NEON_FP16)
1445
+ uint16x8x3_t v = {{
1446
+ (uint16x8_t)R,
1447
+ (uint16x8_t)G,
1448
+ (uint16x8_t)B,
1449
+ }};
1450
+ vst3q_u16(rgb, v);
1451
+ #elif defined(USING_NEON)
1452
+ uint16x4x3_t v = {{
1453
+ (uint16x4_t)R,
1454
+ (uint16x4_t)G,
1455
+ (uint16x4_t)B,
1456
+ }};
1457
+ vst3_u16(rgb, v);
1458
+ #else
1459
+ store_3(rgb+0, R);
1460
+ store_3(rgb+1, G);
1461
+ store_3(rgb+2, B);
1462
+ #endif
1463
+ } return;
1464
+
1465
+ case Op_store_hhhh: {
1466
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
1467
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1468
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1469
+
1470
+ U16 R = Half_from_F(r),
1471
+ G = Half_from_F(g),
1472
+ B = Half_from_F(b),
1473
+ A = Half_from_F(a);
1474
+ #if defined(USING_NEON_FP16)
1475
+ uint16x8x4_t v = {{
1476
+ (uint16x8_t)R,
1477
+ (uint16x8_t)G,
1478
+ (uint16x8_t)B,
1479
+ (uint16x8_t)A,
1480
+ }};
1481
+ vst4q_u16(rgba, v);
1482
+ #elif defined(USING_NEON)
1483
+ uint16x4x4_t v = {{
1484
+ (uint16x4_t)R,
1485
+ (uint16x4_t)G,
1486
+ (uint16x4_t)B,
1487
+ (uint16x4_t)A,
1488
+ }};
1489
+ vst4_u16(rgba, v);
1490
+ #else
1491
+ store(rgba, cast<U64>(R) << 0
1492
+ | cast<U64>(G) << 16
1493
+ | cast<U64>(B) << 32
1494
+ | cast<U64>(A) << 48);
1495
+ #endif
1496
+
1497
+ } return;
1498
+
1499
+ case Op_store_fff: {
1500
+ uintptr_t ptr = (uintptr_t)(dst + 12*i);
1501
+ assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1502
+ float* rgb = (float*)ptr; // for this cast to float* to be safe.
1503
+ #if defined(USING_NEON_FP16)
1504
+ float32x4x3_t lo = {{
1505
+ vcvt_f32_f16(vget_low_f16(r)),
1506
+ vcvt_f32_f16(vget_low_f16(g)),
1507
+ vcvt_f32_f16(vget_low_f16(b)),
1508
+ }}, hi = {{
1509
+ vcvt_f32_f16(vget_high_f16(r)),
1510
+ vcvt_f32_f16(vget_high_f16(g)),
1511
+ vcvt_f32_f16(vget_high_f16(b)),
1512
+ }};
1513
+ vst3q_f32(rgb + 0, lo);
1514
+ vst3q_f32(rgb + 12, hi);
1515
+ #elif defined(USING_NEON)
1516
+ float32x4x3_t v = {{
1517
+ (float32x4_t)r,
1518
+ (float32x4_t)g,
1519
+ (float32x4_t)b,
1520
+ }};
1521
+ vst3q_f32(rgb, v);
1522
+ #else
1523
+ store_3(rgb+0, r);
1524
+ store_3(rgb+1, g);
1525
+ store_3(rgb+2, b);
1526
+ #endif
1527
+ } return;
1528
+
1529
+ case Op_store_ffff: {
1530
+ uintptr_t ptr = (uintptr_t)(dst + 16*i);
1531
+ assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1532
+ float* rgba = (float*)ptr; // for this cast to float* to be safe.
1533
+ #if defined(USING_NEON_FP16)
1534
+ float32x4x4_t lo = {{
1535
+ vcvt_f32_f16(vget_low_f16(r)),
1536
+ vcvt_f32_f16(vget_low_f16(g)),
1537
+ vcvt_f32_f16(vget_low_f16(b)),
1538
+ vcvt_f32_f16(vget_low_f16(a)),
1539
+ }}, hi = {{
1540
+ vcvt_f32_f16(vget_high_f16(r)),
1541
+ vcvt_f32_f16(vget_high_f16(g)),
1542
+ vcvt_f32_f16(vget_high_f16(b)),
1543
+ vcvt_f32_f16(vget_high_f16(a)),
1544
+ }};
1545
+ vst4q_f32(rgba + 0, lo);
1546
+ vst4q_f32(rgba + 16, hi);
1547
+ #elif defined(USING_NEON)
1548
+ float32x4x4_t v = {{
1549
+ (float32x4_t)r,
1550
+ (float32x4_t)g,
1551
+ (float32x4_t)b,
1552
+ (float32x4_t)a,
1553
+ }};
1554
+ vst4q_f32(rgba, v);
1555
+ #else
1556
+ store_4(rgba+0, r);
1557
+ store_4(rgba+1, g);
1558
+ store_4(rgba+2, b);
1559
+ store_4(rgba+3, a);
1560
+ #endif
1561
+ } return;
1562
+ }
1563
+ }
1564
+ }
1565
+
1566
+
1567
+ static void run_program(const Op* program, const void** arguments,
1568
+ const char* src, char* dst, int n,
1569
+ const size_t src_bpp, const size_t dst_bpp) {
1570
+ int i = 0;
1571
+ while (n >= N) {
1572
+ exec_ops(program, arguments, src, dst, i);
1573
+ i += N;
1574
+ n -= N;
1575
+ }
1576
+ if (n > 0) {
1577
+ char tmp[4*4*N] = {0};
1578
+
1579
+ memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1580
+ exec_ops(program, arguments, tmp, tmp, 0);
1581
+ memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1582
+ }
1583
+ }
1584
+
1585
+ // Clean up any #defines we may have set so that we can be #included again.
1586
+ #if defined(USING_AVX)
1587
+ #undef USING_AVX
1588
+ #endif
1589
+ #if defined(USING_AVX_F16C)
1590
+ #undef USING_AVX_F16C
1591
+ #endif
1592
+ #if defined(USING_AVX2)
1593
+ #undef USING_AVX2
1594
+ #endif
1595
+ #if defined(USING_AVX512F)
1596
+ #undef USING_AVX512F
1597
+ #endif
1598
+
1599
+ #if defined(USING_NEON)
1600
+ #undef USING_NEON
1601
+ #endif
1602
+ #if defined(USING_NEON_F16C)
1603
+ #undef USING_NEON_F16C
1604
+ #endif
1605
+ #if defined(USING_NEON_FP16)
1606
+ #undef USING_NEON_FP16
1607
+ #endif
1608
+
1609
+ #undef FALLTHROUGH