@shopify/react-native-skia 0.1.157 → 0.1.159

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. package/android/CMakeLists.txt +35 -11
  2. package/android/build.gradle +31 -30
  3. package/android/cpp/jni/JniLoad.cpp +2 -0
  4. package/android/cpp/jni/include/JniSkiaDomView.h +89 -0
  5. package/android/cpp/rnskia-android/SkiaOpenGLRenderer.cpp +4 -3
  6. package/android/cpp/rnskia-android/SkiaOpenGLRenderer.h +4 -3
  7. package/android/src/main/java/com/shopify/reactnative/skia/RNSkiaPackage.java +2 -1
  8. package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomView.java +45 -0
  9. package/android/src/main/java/com/shopify/reactnative/skia/SkiaDomViewManager.java +64 -0
  10. package/cpp/api/JsiSkContourMeasure.h +7 -5
  11. package/cpp/api/JsiSkHostObjects.h +6 -0
  12. package/cpp/api/JsiSkImageFilterFactory.h +1 -1
  13. package/cpp/api/JsiSkPaint.h +9 -2
  14. package/cpp/api/JsiSkPath.h +1 -0
  15. package/cpp/api/JsiSkRuntimeEffect.h +36 -36
  16. package/cpp/jsi/JsiHostObject.cpp +16 -28
  17. package/cpp/jsi/JsiHostObject.h +127 -7
  18. package/cpp/jsi/JsiValue.cpp +346 -0
  19. package/cpp/jsi/JsiValue.h +222 -0
  20. package/cpp/jsi/JsiValueWrapper.h +33 -5
  21. package/cpp/rnskia/RNSkDomView.cpp +220 -0
  22. package/cpp/rnskia/RNSkDomView.h +140 -0
  23. package/cpp/rnskia/RNSkJsView.cpp +0 -4
  24. package/cpp/rnskia/RNSkJsView.h +6 -4
  25. package/cpp/rnskia/RNSkManager.cpp +7 -0
  26. package/cpp/rnskia/RNSkPictureView.h +5 -8
  27. package/cpp/rnskia/RNSkView.h +113 -5
  28. package/cpp/rnskia/dom/JsiDomApi.h +167 -0
  29. package/cpp/rnskia/dom/base/BaseNodeProp.h +72 -0
  30. package/cpp/rnskia/dom/base/DerivedNodeProp.h +187 -0
  31. package/cpp/rnskia/dom/base/DrawingContext.cpp +227 -0
  32. package/cpp/rnskia/dom/base/DrawingContext.h +136 -0
  33. package/cpp/rnskia/dom/base/JsiDependencyManager.h +294 -0
  34. package/cpp/rnskia/dom/base/JsiDomDeclarationNode.h +176 -0
  35. package/cpp/rnskia/dom/base/JsiDomDrawingNode.h +50 -0
  36. package/cpp/rnskia/dom/base/JsiDomNode.h +361 -0
  37. package/cpp/rnskia/dom/base/JsiDomRenderNode.h +267 -0
  38. package/cpp/rnskia/dom/base/NodeProp.h +130 -0
  39. package/cpp/rnskia/dom/base/NodePropsContainer.h +119 -0
  40. package/cpp/rnskia/dom/nodes/JsiBackdropFilterNode.h +38 -0
  41. package/cpp/rnskia/dom/nodes/JsiBlendNode.h +112 -0
  42. package/cpp/rnskia/dom/nodes/JsiBlurMaskNode.h +78 -0
  43. package/cpp/rnskia/dom/nodes/JsiBoxNode.h +104 -0
  44. package/cpp/rnskia/dom/nodes/JsiBoxShadowNode.h +33 -0
  45. package/cpp/rnskia/dom/nodes/JsiCircleNode.h +38 -0
  46. package/cpp/rnskia/dom/nodes/JsiColorFilterNodes.h +192 -0
  47. package/cpp/rnskia/dom/nodes/JsiCustomDrawingNode.h +123 -0
  48. package/cpp/rnskia/dom/nodes/JsiDiffRectNode.h +42 -0
  49. package/cpp/rnskia/dom/nodes/JsiFillNode.h +22 -0
  50. package/cpp/rnskia/dom/nodes/JsiGlyphsNode.h +56 -0
  51. package/cpp/rnskia/dom/nodes/JsiGroupNode.h +26 -0
  52. package/cpp/rnskia/dom/nodes/JsiImageFilterNodes.h +415 -0
  53. package/cpp/rnskia/dom/nodes/JsiImageNode.h +34 -0
  54. package/cpp/rnskia/dom/nodes/JsiImageSvgNode.h +44 -0
  55. package/cpp/rnskia/dom/nodes/JsiLayerNode.h +64 -0
  56. package/cpp/rnskia/dom/nodes/JsiLineNode.h +43 -0
  57. package/cpp/rnskia/dom/nodes/JsiOvalNode.h +34 -0
  58. package/cpp/rnskia/dom/nodes/JsiPaintNode.h +77 -0
  59. package/cpp/rnskia/dom/nodes/JsiPatchNode.h +54 -0
  60. package/cpp/rnskia/dom/nodes/JsiPathEffectNodes.h +315 -0
  61. package/cpp/rnskia/dom/nodes/JsiPathNode.h +181 -0
  62. package/cpp/rnskia/dom/nodes/JsiPictureNode.h +32 -0
  63. package/cpp/rnskia/dom/nodes/JsiPointsNode.h +51 -0
  64. package/cpp/rnskia/dom/nodes/JsiRRectNode.h +34 -0
  65. package/cpp/rnskia/dom/nodes/JsiRectNode.h +34 -0
  66. package/cpp/rnskia/dom/nodes/JsiShaderNodes.h +517 -0
  67. package/cpp/rnskia/dom/nodes/JsiTextBlobNode.h +47 -0
  68. package/cpp/rnskia/dom/nodes/JsiTextNode.h +54 -0
  69. package/cpp/rnskia/dom/nodes/JsiTextPathNode.h +32 -0
  70. package/cpp/rnskia/dom/nodes/JsiVerticesNode.h +43 -0
  71. package/cpp/rnskia/dom/props/BezierProps.h +63 -0
  72. package/cpp/rnskia/dom/props/BlendModeProp.h +101 -0
  73. package/cpp/rnskia/dom/props/BoxShadowProps.h +61 -0
  74. package/cpp/rnskia/dom/props/CircleProp.h +46 -0
  75. package/cpp/rnskia/dom/props/ClipProp.h +62 -0
  76. package/cpp/rnskia/dom/props/ColorProp.h +80 -0
  77. package/cpp/rnskia/dom/props/DrawingProp.h +33 -0
  78. package/cpp/rnskia/dom/props/FontProp.h +34 -0
  79. package/cpp/rnskia/dom/props/GlyphsProp.h +53 -0
  80. package/cpp/rnskia/dom/props/ImageProps.h +173 -0
  81. package/cpp/rnskia/dom/props/LayerProp.h +50 -0
  82. package/cpp/rnskia/dom/props/MatrixProp.h +33 -0
  83. package/cpp/rnskia/dom/props/NumbersProp.h +63 -0
  84. package/cpp/rnskia/dom/props/PaintProps.h +172 -0
  85. package/cpp/rnskia/dom/props/PathProp.h +55 -0
  86. package/cpp/rnskia/dom/props/PictureProp.h +38 -0
  87. package/cpp/rnskia/dom/props/PointProp.h +72 -0
  88. package/cpp/rnskia/dom/props/PointsProp.h +83 -0
  89. package/cpp/rnskia/dom/props/RRectProp.h +134 -0
  90. package/cpp/rnskia/dom/props/RadiusProp.h +43 -0
  91. package/cpp/rnskia/dom/props/RectProp.h +118 -0
  92. package/cpp/rnskia/dom/props/StrokeProps.h +75 -0
  93. package/cpp/rnskia/dom/props/SvgProp.h +37 -0
  94. package/cpp/rnskia/dom/props/TextBlobProp.h +128 -0
  95. package/cpp/rnskia/dom/props/TileModeProp.h +50 -0
  96. package/cpp/rnskia/dom/props/TransformProp.h +80 -0
  97. package/cpp/rnskia/dom/props/TransformsProps.h +68 -0
  98. package/cpp/rnskia/dom/props/UniformsProp.h +194 -0
  99. package/cpp/rnskia/dom/props/VertexModeProp.h +47 -0
  100. package/cpp/rnskia/dom/props/VerticesProps.h +67 -0
  101. package/cpp/rnskia/values/RNSkReadonlyValue.h +13 -4
  102. package/cpp/skia/include/android/SkAndroidFrameworkUtils.h +35 -1
  103. package/cpp/skia/include/codec/SkAndroidCodec.h +17 -1
  104. package/cpp/skia/include/codec/SkCodec.h +8 -5
  105. package/cpp/skia/include/core/SkAnnotation.h +2 -0
  106. package/cpp/skia/include/core/SkBitmap.h +52 -1
  107. package/cpp/skia/include/core/SkBlendMode.h +2 -0
  108. package/cpp/skia/include/core/SkCanvas.h +52 -31
  109. package/cpp/skia/include/core/SkCapabilities.h +44 -0
  110. package/cpp/skia/include/core/SkColor.h +7 -0
  111. package/cpp/skia/include/core/SkColorFilter.h +37 -0
  112. package/cpp/skia/include/core/SkColorSpace.h +1 -1
  113. package/cpp/skia/include/core/SkFont.h +4 -0
  114. package/cpp/skia/include/core/SkFontMgr.h +3 -0
  115. package/cpp/skia/include/core/SkGraphics.h +9 -0
  116. package/cpp/skia/include/core/SkImage.h +77 -17
  117. package/cpp/skia/include/core/SkImageEncoder.h +5 -3
  118. package/cpp/skia/include/core/SkImageGenerator.h +27 -17
  119. package/cpp/skia/include/core/SkM44.h +1 -0
  120. package/cpp/skia/include/core/SkMesh.h +120 -34
  121. package/cpp/skia/include/core/SkMilestone.h +1 -1
  122. package/cpp/skia/include/core/SkOverdrawCanvas.h +2 -1
  123. package/cpp/skia/include/core/SkPaint.h +15 -2
  124. package/cpp/skia/include/core/SkPath.h +4 -0
  125. package/cpp/skia/include/core/SkPathBuilder.h +1 -1
  126. package/cpp/skia/include/core/SkPicture.h +0 -3
  127. package/cpp/skia/include/core/SkPictureRecorder.h +0 -2
  128. package/cpp/skia/include/core/SkPixmap.h +19 -0
  129. package/cpp/skia/include/core/SkRasterHandleAllocator.h +3 -1
  130. package/cpp/skia/include/core/SkRect.h +11 -4
  131. package/cpp/skia/include/core/SkRefCnt.h +13 -1
  132. package/cpp/skia/include/core/SkRegion.h +6 -0
  133. package/cpp/skia/include/core/SkSamplingOptions.h +8 -6
  134. package/cpp/skia/include/core/SkScalar.h +6 -25
  135. package/cpp/skia/include/core/SkShader.h +20 -12
  136. package/cpp/skia/include/core/SkSpan.h +51 -19
  137. package/cpp/skia/include/core/SkStream.h +2 -2
  138. package/cpp/skia/include/core/SkString.h +11 -3
  139. package/cpp/skia/include/core/SkSurface.h +85 -8
  140. package/cpp/skia/include/core/SkTextBlob.h +5 -2
  141. package/cpp/skia/include/core/SkTypes.h +11 -10
  142. package/cpp/skia/include/docs/SkPDFDocument.h +0 -5
  143. package/cpp/skia/include/effects/Sk1DPathEffect.h +6 -1
  144. package/cpp/skia/include/effects/Sk2DPathEffect.h +4 -1
  145. package/cpp/skia/include/effects/SkColorMatrix.h +1 -0
  146. package/cpp/skia/include/effects/SkColorMatrixFilter.h +5 -8
  147. package/cpp/skia/include/effects/SkCornerPathEffect.h +5 -1
  148. package/cpp/skia/include/effects/SkDashPathEffect.h +5 -1
  149. package/cpp/skia/include/effects/SkGradientShader.h +68 -38
  150. package/cpp/skia/include/effects/SkHighContrastFilter.h +5 -1
  151. package/cpp/skia/include/effects/SkImageFilters.h +5 -4
  152. package/cpp/skia/include/effects/SkLumaColorFilter.h +4 -1
  153. package/cpp/skia/include/effects/SkOpPathEffect.h +6 -2
  154. package/cpp/skia/include/effects/SkOverdrawColorFilter.h +5 -2
  155. package/cpp/skia/include/effects/SkRuntimeEffect.h +54 -62
  156. package/cpp/skia/include/effects/SkShaderMaskFilter.h +3 -1
  157. package/cpp/skia/include/effects/SkTableColorFilter.h +8 -21
  158. package/cpp/skia/include/effects/SkTableMaskFilter.h +5 -1
  159. package/cpp/skia/include/effects/SkTrimPathEffect.h +5 -1
  160. package/cpp/skia/include/encode/SkEncoder.h +17 -0
  161. package/cpp/skia/include/encode/SkWebpEncoder.h +17 -0
  162. package/cpp/skia/include/gpu/GpuTypes.h +18 -0
  163. package/cpp/skia/include/gpu/GrBackendSurface.h +38 -17
  164. package/cpp/skia/include/gpu/GrBackendSurfaceMutableState.h +6 -71
  165. package/cpp/skia/include/gpu/GrContextOptions.h +1 -1
  166. package/cpp/skia/include/gpu/GrContextThreadSafeProxy.h +10 -9
  167. package/cpp/skia/include/gpu/GrDirectContext.h +42 -22
  168. package/cpp/skia/include/gpu/GrRecordingContext.h +6 -3
  169. package/cpp/skia/include/gpu/GrTypes.h +11 -11
  170. package/cpp/skia/include/gpu/MutableTextureState.h +122 -0
  171. package/cpp/skia/include/gpu/gl/GrGLFunctions.h +1 -0
  172. package/cpp/skia/include/gpu/gl/GrGLInterface.h +1 -0
  173. package/cpp/skia/include/gpu/graphite/BackendTexture.h +7 -0
  174. package/cpp/skia/include/gpu/graphite/CombinationBuilder.h +195 -0
  175. package/cpp/skia/include/gpu/graphite/Context.h +47 -55
  176. package/cpp/skia/include/gpu/graphite/ContextOptions.h +85 -0
  177. package/cpp/skia/include/gpu/graphite/GraphiteTypes.h +1 -17
  178. package/cpp/skia/include/gpu/graphite/ImageProvider.h +61 -0
  179. package/cpp/skia/include/gpu/graphite/Recorder.h +87 -8
  180. package/cpp/skia/include/gpu/graphite/Recording.h +19 -9
  181. package/cpp/skia/include/gpu/graphite/TextureInfo.h +40 -8
  182. package/cpp/skia/include/gpu/graphite/dawn/DawnBackendContext.h +25 -0
  183. package/cpp/skia/include/gpu/graphite/mtl/MtlBackendContext.h +3 -2
  184. package/cpp/skia/include/gpu/graphite/vk/VulkanGraphiteTypes.h +69 -0
  185. package/cpp/skia/include/gpu/mtl/MtlMemoryAllocator.h +39 -0
  186. package/cpp/skia/include/gpu/vk/GrVkBackendContext.h +21 -19
  187. package/cpp/skia/include/gpu/vk/GrVkExtensions.h +2 -50
  188. package/cpp/skia/include/gpu/vk/GrVkMemoryAllocator.h +2 -127
  189. package/cpp/skia/include/gpu/vk/GrVkTypes.h +5 -43
  190. package/cpp/skia/include/gpu/vk/VulkanBackendContext.h +46 -0
  191. package/cpp/skia/include/gpu/vk/VulkanExtensions.h +67 -0
  192. package/cpp/skia/include/gpu/vk/VulkanMemoryAllocator.h +116 -0
  193. package/cpp/skia/include/gpu/vk/VulkanTypes.h +59 -0
  194. package/cpp/skia/include/pathops/SkPathOps.h +1 -1
  195. package/cpp/skia/include/private/SkColorData.h +10 -40
  196. package/cpp/skia/include/private/SkEncodedInfo.h +9 -3
  197. package/cpp/skia/include/private/SkFloatingPoint.h +9 -6
  198. package/cpp/skia/include/private/SkHalf.h +5 -52
  199. package/cpp/skia/include/private/SkMacros.h +1 -1
  200. package/cpp/skia/include/private/SkMalloc.h +4 -0
  201. package/cpp/skia/include/private/SkPathRef.h +10 -10
  202. package/cpp/skia/include/private/SkSLModifiers.h +59 -23
  203. package/cpp/skia/include/private/SkSLProgramKind.h +1 -0
  204. package/cpp/skia/include/private/SkSLSymbol.h +7 -3
  205. package/cpp/skia/include/private/SkStringView.h +4 -0
  206. package/cpp/skia/include/private/SkTArray.h +21 -7
  207. package/cpp/skia/include/private/SkTDArray.h +173 -285
  208. package/cpp/skia/include/private/SkTHash.h +33 -32
  209. package/cpp/skia/include/private/SkTemplates.h +24 -26
  210. package/cpp/skia/include/private/SkVx.h +218 -135
  211. package/cpp/skia/include/private/chromium/GrSlug.h +3 -65
  212. package/cpp/skia/include/private/chromium/SkChromeRemoteGlyphCache.h +6 -3
  213. package/cpp/skia/include/private/chromium/Slug.h +76 -0
  214. package/cpp/skia/include/private/gpu/ganesh/GrTypesPriv.h +6 -1
  215. package/cpp/skia/include/private/gpu/ganesh/GrVkTypesPriv.h +5 -39
  216. package/cpp/skia/include/private/gpu/graphite/VulkanGraphiteTypesPriv.h +63 -0
  217. package/cpp/skia/include/{gpu/vk/GrVkVulkan.h → private/gpu/vk/SkiaVulkan.h} +2 -2
  218. package/cpp/skia/include/private/gpu/vk/VulkanTypesPriv.h +57 -0
  219. package/cpp/skia/include/sksl/DSL.h +0 -1
  220. package/cpp/skia/include/sksl/DSLBlock.h +4 -18
  221. package/cpp/skia/include/sksl/DSLCase.h +2 -8
  222. package/cpp/skia/include/sksl/DSLCore.h +8 -15
  223. package/cpp/skia/include/sksl/DSLExpression.h +51 -142
  224. package/cpp/skia/include/sksl/DSLFunction.h +7 -15
  225. package/cpp/skia/include/sksl/DSLModifiers.h +5 -2
  226. package/cpp/skia/include/sksl/DSLStatement.h +4 -39
  227. package/cpp/skia/include/sksl/DSLSymbols.h +1 -11
  228. package/cpp/skia/include/sksl/DSLType.h +20 -12
  229. package/cpp/skia/include/sksl/DSLVar.h +56 -146
  230. package/cpp/skia/include/sksl/SkSLErrorReporter.h +2 -15
  231. package/cpp/skia/include/sksl/SkSLOperator.h +62 -59
  232. package/cpp/skia/include/sksl/SkSLPosition.h +2 -0
  233. package/cpp/skia/include/sksl/SkSLVersion.h +27 -0
  234. package/cpp/skia/include/svg/SkSVGCanvas.h +1 -0
  235. package/cpp/skia/include/utils/SkAnimCodecPlayer.h +1 -1
  236. package/cpp/skia/include/utils/SkBase64.h +2 -0
  237. package/cpp/skia/include/utils/SkCustomTypeface.h +24 -11
  238. package/cpp/skia/include/utils/SkEventTracer.h +12 -1
  239. package/cpp/skia/include/utils/SkNWayCanvas.h +11 -4
  240. package/cpp/skia/include/utils/SkPaintFilterCanvas.h +9 -4
  241. package/cpp/skia/include/utils/SkParse.h +3 -0
  242. package/cpp/skia/include/utils/SkShadowUtils.h +2 -0
  243. package/cpp/skia/include/utils/SkTextUtils.h +2 -1
  244. package/cpp/skia/{include/third_party → modules}/skcms/skcms.h +10 -0
  245. package/cpp/skia/modules/skcms/skcms_internal.h +56 -0
  246. package/cpp/skia/modules/skcms/src/Transform_inl.h +1609 -0
  247. package/cpp/skia/modules/skparagraph/include/DartTypes.h +153 -0
  248. package/cpp/skia/modules/skparagraph/include/FontArguments.h +46 -0
  249. package/cpp/skia/modules/skparagraph/include/FontCollection.h +84 -0
  250. package/cpp/skia/modules/skparagraph/include/Metrics.h +98 -0
  251. package/cpp/skia/modules/skparagraph/include/Paragraph.h +111 -0
  252. package/cpp/skia/modules/skparagraph/include/ParagraphBuilder.h +69 -0
  253. package/cpp/skia/modules/skparagraph/include/ParagraphCache.h +77 -0
  254. package/cpp/skia/modules/skparagraph/include/ParagraphStyle.h +143 -0
  255. package/cpp/skia/modules/skparagraph/include/TextShadow.h +30 -0
  256. package/cpp/skia/modules/skparagraph/include/TextStyle.h +352 -0
  257. package/cpp/skia/modules/skparagraph/include/TypefaceFontProvider.h +81 -0
  258. package/cpp/skia/modules/svg/include/SkSVGAttributeParser.h +1 -1
  259. package/cpp/skia/modules/svg/include/SkSVGTypes.h +3 -3
  260. package/cpp/skia/src/core/SkLRUCache.h +126 -0
  261. package/cpp/skia/src/core/SkTInternalLList.h +302 -0
  262. package/cpp/utils/RNSkTimingInfo.h +1 -0
  263. package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.h +15 -4
  264. package/ios/RNSkia-iOS/RNSkMetalCanvasProvider.mm +40 -54
  265. package/ios/RNSkia-iOS/SkiaDomViewManager.h +8 -0
  266. package/ios/RNSkia-iOS/SkiaDomViewManager.mm +51 -0
  267. package/lib/commonjs/dom/nodes/DrawingNode.js +1 -5
  268. package/lib/commonjs/dom/nodes/DrawingNode.js.map +1 -1
  269. package/lib/commonjs/dom/nodes/JsiSkDOM.js +56 -56
  270. package/lib/commonjs/dom/nodes/JsiSkDOM.js.map +1 -1
  271. package/lib/commonjs/dom/nodes/RenderNode.js +3 -9
  272. package/lib/commonjs/dom/nodes/RenderNode.js.map +1 -1
  273. package/lib/commonjs/dom/nodes/drawings/Text.js +4 -9
  274. package/lib/commonjs/dom/nodes/drawings/Text.js.map +1 -1
  275. package/lib/commonjs/dom/types/Common.js.map +1 -1
  276. package/lib/commonjs/dom/types/Drawings.js.map +1 -1
  277. package/lib/commonjs/mock/index.js +0 -1
  278. package/lib/commonjs/mock/index.js.map +1 -1
  279. package/lib/commonjs/renderer/Canvas.js +21 -57
  280. package/lib/commonjs/renderer/Canvas.js.map +1 -1
  281. package/lib/commonjs/renderer/DependencyManager.js +0 -5
  282. package/lib/commonjs/renderer/DependencyManager.js.map +1 -1
  283. package/lib/commonjs/renderer/HostComponents.js.map +1 -1
  284. package/lib/commonjs/renderer/components/Mask.js +1 -3
  285. package/lib/commonjs/renderer/components/Mask.js.map +1 -1
  286. package/lib/commonjs/renderer/components/Paint.js +5 -18
  287. package/lib/commonjs/renderer/components/Paint.js.map +1 -1
  288. package/lib/commonjs/renderer/useCanvas.js +8 -6
  289. package/lib/commonjs/renderer/useCanvas.js.map +1 -1
  290. package/lib/commonjs/skia/core/Picture.js +1 -24
  291. package/lib/commonjs/skia/core/Picture.js.map +1 -1
  292. package/lib/commonjs/skia/core/Rect.js +1 -1
  293. package/lib/commonjs/skia/core/Rect.js.map +1 -1
  294. package/lib/commonjs/skia/types/ContourMeasure.js.map +1 -1
  295. package/lib/commonjs/skia/types/Size.js +2 -0
  296. package/lib/commonjs/skia/types/Size.js.map +1 -0
  297. package/lib/commonjs/skia/types/index.js +13 -0
  298. package/lib/commonjs/skia/types/index.js.map +1 -1
  299. package/lib/commonjs/skia/web/JsiSkColor.js +8 -0
  300. package/lib/commonjs/skia/web/JsiSkColor.js.map +1 -1
  301. package/lib/commonjs/skia/web/JsiSkContourMeasure.js +4 -7
  302. package/lib/commonjs/skia/web/JsiSkContourMeasure.js.map +1 -1
  303. package/lib/commonjs/values/web/RNSkReadonlyValue.js +4 -2
  304. package/lib/commonjs/values/web/RNSkReadonlyValue.js.map +1 -1
  305. package/lib/commonjs/views/SkiaBaseWebView.js +9 -2
  306. package/lib/commonjs/views/SkiaBaseWebView.js.map +1 -1
  307. package/lib/commonjs/views/SkiaDomView.js +152 -0
  308. package/lib/commonjs/views/SkiaDomView.js.map +1 -0
  309. package/lib/commonjs/views/SkiaDomView.web.js +55 -0
  310. package/lib/commonjs/views/SkiaDomView.web.js.map +1 -0
  311. package/lib/commonjs/views/SkiaPictureView.js +16 -2
  312. package/lib/commonjs/views/SkiaPictureView.js.map +1 -1
  313. package/lib/commonjs/views/SkiaView.js +17 -2
  314. package/lib/commonjs/views/SkiaView.js.map +1 -1
  315. package/lib/commonjs/views/index.js +13 -0
  316. package/lib/commonjs/views/index.js.map +1 -1
  317. package/lib/commonjs/views/types.js.map +1 -1
  318. package/lib/module/dom/nodes/DrawingNode.js +2 -6
  319. package/lib/module/dom/nodes/DrawingNode.js.map +1 -1
  320. package/lib/module/dom/nodes/JsiSkDOM.js +56 -56
  321. package/lib/module/dom/nodes/JsiSkDOM.js.map +1 -1
  322. package/lib/module/dom/nodes/RenderNode.js +2 -5
  323. package/lib/module/dom/nodes/RenderNode.js.map +1 -1
  324. package/lib/module/dom/nodes/drawings/Text.js +4 -9
  325. package/lib/module/dom/nodes/drawings/Text.js.map +1 -1
  326. package/lib/module/dom/types/Common.js.map +1 -1
  327. package/lib/module/dom/types/Drawings.js.map +1 -1
  328. package/lib/module/mock/index.js +0 -1
  329. package/lib/module/mock/index.js.map +1 -1
  330. package/lib/module/renderer/Canvas.js +22 -58
  331. package/lib/module/renderer/Canvas.js.map +1 -1
  332. package/lib/module/renderer/DependencyManager.js +0 -5
  333. package/lib/module/renderer/DependencyManager.js.map +1 -1
  334. package/lib/module/renderer/HostComponents.js.map +1 -1
  335. package/lib/module/renderer/components/Mask.js +2 -4
  336. package/lib/module/renderer/components/Mask.js.map +1 -1
  337. package/lib/module/renderer/components/Paint.js +3 -14
  338. package/lib/module/renderer/components/Paint.js.map +1 -1
  339. package/lib/module/renderer/useCanvas.js +6 -4
  340. package/lib/module/renderer/useCanvas.js.map +1 -1
  341. package/lib/module/skia/core/Picture.js +0 -18
  342. package/lib/module/skia/core/Picture.js.map +1 -1
  343. package/lib/module/skia/core/Rect.js +1 -1
  344. package/lib/module/skia/core/Rect.js.map +1 -1
  345. package/lib/module/skia/types/ContourMeasure.js.map +1 -1
  346. package/lib/module/skia/types/Size.js +2 -0
  347. package/lib/module/skia/types/Size.js.map +1 -0
  348. package/lib/module/skia/types/index.js +1 -0
  349. package/lib/module/skia/types/index.js.map +1 -1
  350. package/lib/module/skia/web/JsiSkColor.js +8 -0
  351. package/lib/module/skia/web/JsiSkColor.js.map +1 -1
  352. package/lib/module/skia/web/JsiSkContourMeasure.js +3 -7
  353. package/lib/module/skia/web/JsiSkContourMeasure.js.map +1 -1
  354. package/lib/module/values/web/RNSkReadonlyValue.js +4 -2
  355. package/lib/module/values/web/RNSkReadonlyValue.js.map +1 -1
  356. package/lib/module/views/SkiaBaseWebView.js +9 -2
  357. package/lib/module/views/SkiaBaseWebView.js.map +1 -1
  358. package/lib/module/views/SkiaDomView.js +128 -0
  359. package/lib/module/views/SkiaDomView.js.map +1 -0
  360. package/lib/module/views/SkiaDomView.web.js +41 -0
  361. package/lib/module/views/SkiaDomView.web.js.map +1 -0
  362. package/lib/module/views/SkiaPictureView.js +14 -2
  363. package/lib/module/views/SkiaPictureView.js.map +1 -1
  364. package/lib/module/views/SkiaView.js +15 -2
  365. package/lib/module/views/SkiaView.js.map +1 -1
  366. package/lib/module/views/index.js +1 -0
  367. package/lib/module/views/index.js.map +1 -1
  368. package/lib/module/views/types.js.map +1 -1
  369. package/lib/typescript/src/dom/nodes/JsiSkDOM.d.ts +57 -64
  370. package/lib/typescript/src/dom/nodes/RenderNode.d.ts +1 -3
  371. package/lib/typescript/src/dom/types/Common.d.ts +2 -3
  372. package/lib/typescript/src/dom/types/Drawings.d.ts +1 -3
  373. package/lib/typescript/src/renderer/Canvas.d.ts +5 -5
  374. package/lib/typescript/src/renderer/DependencyManager.d.ts +0 -2
  375. package/lib/typescript/src/renderer/HostComponents.d.ts +67 -8
  376. package/lib/typescript/src/renderer/components/Paint.d.ts +3 -4
  377. package/lib/typescript/src/renderer/useCanvas.d.ts +1 -6
  378. package/lib/typescript/src/skia/core/Picture.d.ts +0 -8
  379. package/lib/typescript/src/skia/types/ContourMeasure.d.ts +3 -2
  380. package/lib/typescript/src/skia/types/Size.d.ts +4 -0
  381. package/lib/typescript/src/skia/types/index.d.ts +1 -0
  382. package/lib/typescript/src/skia/web/JsiSkContourMeasure.d.ts +2 -6
  383. package/lib/typescript/src/views/SkiaBaseWebView.d.ts +4 -0
  384. package/lib/typescript/src/views/SkiaDomView.d.ts +31 -0
  385. package/lib/typescript/src/views/SkiaDomView.web.d.ts +7 -0
  386. package/lib/typescript/src/views/index.d.ts +1 -0
  387. package/lib/typescript/src/views/types.d.ts +12 -2
  388. package/libs/android/arm64-v8a/libskia.a +0 -0
  389. package/libs/android/arm64-v8a/libskottie.a +0 -0
  390. package/libs/android/arm64-v8a/libskparagraph.a +0 -0
  391. package/libs/android/arm64-v8a/libsksg.a +0 -0
  392. package/libs/android/arm64-v8a/libskshaper.a +0 -0
  393. package/libs/android/arm64-v8a/libskunicode.a +0 -0
  394. package/libs/android/arm64-v8a/libsvg.a +0 -0
  395. package/libs/android/armeabi-v7a/libskia.a +0 -0
  396. package/libs/android/armeabi-v7a/libskottie.a +0 -0
  397. package/libs/android/armeabi-v7a/libskparagraph.a +0 -0
  398. package/libs/android/armeabi-v7a/libsksg.a +0 -0
  399. package/libs/android/armeabi-v7a/libskshaper.a +0 -0
  400. package/libs/android/armeabi-v7a/libskunicode.a +0 -0
  401. package/libs/android/armeabi-v7a/libsvg.a +0 -0
  402. package/libs/android/x86/libskia.a +0 -0
  403. package/libs/android/x86/libskottie.a +0 -0
  404. package/libs/android/x86/libskparagraph.a +0 -0
  405. package/libs/android/x86/libsksg.a +0 -0
  406. package/libs/android/x86/libskshaper.a +0 -0
  407. package/libs/android/x86/libskunicode.a +0 -0
  408. package/libs/android/x86/libsvg.a +0 -0
  409. package/libs/android/x86_64/libskia.a +0 -0
  410. package/libs/android/x86_64/libskottie.a +0 -0
  411. package/libs/android/x86_64/libskparagraph.a +0 -0
  412. package/libs/android/x86_64/libsksg.a +0 -0
  413. package/libs/android/x86_64/libskshaper.a +0 -0
  414. package/libs/android/x86_64/libskunicode.a +0 -0
  415. package/libs/android/x86_64/libsvg.a +0 -0
  416. package/libs/ios/libskia.xcframework/ios-arm64_arm64e/libskia.a +0 -0
  417. package/libs/ios/libskia.xcframework/ios-arm64_arm64e_x86_64-simulator/libskia.a +0 -0
  418. package/libs/ios/libskottie.xcframework/ios-arm64_arm64e/libskottie.a +0 -0
  419. package/libs/ios/libskottie.xcframework/ios-arm64_arm64e_x86_64-simulator/libskottie.a +0 -0
  420. package/libs/ios/libskparagraph.xcframework/Info.plist +42 -0
  421. package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e/libskparagraph.a +0 -0
  422. package/libs/ios/libskparagraph.xcframework/ios-arm64_arm64e_x86_64-simulator/libskparagraph.a +0 -0
  423. package/libs/ios/libsksg.xcframework/Info.plist +5 -5
  424. package/libs/ios/libsksg.xcframework/ios-arm64_arm64e/libsksg.a +0 -0
  425. package/libs/ios/libsksg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsksg.a +0 -0
  426. package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e/libskshaper.a +0 -0
  427. package/libs/ios/libskshaper.xcframework/ios-arm64_arm64e_x86_64-simulator/libskshaper.a +0 -0
  428. package/libs/ios/libskunicode.xcframework/Info.plist +42 -0
  429. package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e/libskunicode.a +0 -0
  430. package/libs/ios/libskunicode.xcframework/ios-arm64_arm64e_x86_64-simulator/libskunicode.a +0 -0
  431. package/libs/ios/libsvg.xcframework/ios-arm64_arm64e/libsvg.a +0 -0
  432. package/libs/ios/libsvg.xcframework/ios-arm64_arm64e_x86_64-simulator/libsvg.a +0 -0
  433. package/package.json +6 -4
  434. package/react-native-skia.podspec +3 -1
  435. package/src/dom/nodes/DrawingNode.ts +2 -4
  436. package/src/dom/nodes/JsiSkDOM.ts +170 -56
  437. package/src/dom/nodes/RenderNode.ts +2 -14
  438. package/src/dom/nodes/drawings/Text.ts +4 -4
  439. package/src/dom/types/Common.ts +2 -4
  440. package/src/dom/types/Drawings.ts +1 -4
  441. package/src/mock/index.ts +0 -1
  442. package/src/renderer/Canvas.tsx +32 -51
  443. package/src/renderer/DependencyManager.tsx +0 -5
  444. package/src/renderer/HostComponents.ts +153 -5
  445. package/src/renderer/components/Mask.tsx +2 -2
  446. package/src/renderer/components/Paint.tsx +3 -14
  447. package/src/renderer/useCanvas.ts +5 -5
  448. package/src/skia/core/Picture.ts +0 -24
  449. package/src/skia/core/Rect.ts +1 -1
  450. package/src/skia/types/ContourMeasure.tsx +3 -2
  451. package/src/skia/types/Size.ts +4 -0
  452. package/src/skia/types/index.ts +1 -0
  453. package/src/skia/web/JsiSkColor.ts +11 -0
  454. package/src/skia/web/JsiSkContourMeasure.ts +7 -3
  455. package/src/values/web/RNSkReadonlyValue.ts +4 -2
  456. package/src/views/SkiaBaseWebView.tsx +6 -2
  457. package/src/views/SkiaDomView.tsx +120 -0
  458. package/src/views/SkiaDomView.web.tsx +37 -0
  459. package/src/views/SkiaPictureView.tsx +10 -2
  460. package/src/views/SkiaView.tsx +11 -3
  461. package/src/views/index.ts +1 -0
  462. package/src/views/types.ts +19 -2
  463. package/cpp/jsi/JsiSimpleValueWrapper.h +0 -99
  464. package/cpp/skia/include/c/sk_canvas.h +0 -159
  465. package/cpp/skia/include/c/sk_colorspace.h +0 -25
  466. package/cpp/skia/include/c/sk_data.h +0 -65
  467. package/cpp/skia/include/c/sk_image.h +0 -71
  468. package/cpp/skia/include/c/sk_imageinfo.h +0 -62
  469. package/cpp/skia/include/c/sk_maskfilter.h +0 -47
  470. package/cpp/skia/include/c/sk_matrix.h +0 -49
  471. package/cpp/skia/include/c/sk_paint.h +0 -145
  472. package/cpp/skia/include/c/sk_path.h +0 -102
  473. package/cpp/skia/include/c/sk_picture.h +0 -70
  474. package/cpp/skia/include/c/sk_shader.h +0 -143
  475. package/cpp/skia/include/c/sk_surface.h +0 -73
  476. package/cpp/skia/include/c/sk_types.h +0 -278
  477. package/cpp/skia/include/gpu/graphite/SkStuff.h +0 -47
  478. package/cpp/skia/include/private/SkNx.h +0 -430
  479. package/cpp/skia/include/private/SkNx_neon.h +0 -713
  480. package/cpp/skia/include/private/SkNx_sse.h +0 -823
  481. package/cpp/skia/include/sksl/DSLRuntimeEffects.h +0 -32
  482. package/cpp/skia/include/sksl/DSLWrapper.h +0 -77
@@ -0,0 +1,1609 @@
1
+ /*
2
+ * Copyright 2018 Google Inc.
3
+ *
4
+ * Use of this source code is governed by a BSD-style license that can be
5
+ * found in the LICENSE file.
6
+ */
7
+
8
+ // Intentionally NO #pragma once... included multiple times.
9
+
10
+ // This file is included from skcms.cc in a namespace with some pre-defines:
11
+ // - N: depth of all vectors, 1,4,8, or 16 (preprocessor define)
12
+ // - V<T>: a template to create a vector of N T's.
13
+
14
+ using F = V<Color>; // Called F for historic reasons... maybe rename C?
15
+ using I32 = V<int32_t>;
16
+ using U64 = V<uint64_t>;
17
+ using U32 = V<uint32_t>;
18
+ using U16 = V<uint16_t>;
19
+ using U8 = V<uint8_t>;
20
+
21
+
22
+ #if defined(__GNUC__) && !defined(__clang__)
23
+ // Once again, GCC is kind of weird, not allowing vector = scalar directly.
24
+ static constexpr F F0 = F() + 0.0f,
25
+ F1 = F() + 1.0f,
26
+ FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
27
+ #else
28
+ static constexpr F F0 = 0.0f,
29
+ F1 = 1.0f,
30
+ FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
31
+ #endif
32
+
33
+ // Instead of checking __AVX__ below, we'll check USING_AVX.
34
+ // This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
35
+ // Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.
36
+
37
+ #if !defined(USING_AVX) && N == 8 && defined(__AVX__)
38
+ #define USING_AVX
39
+ #endif
40
+ #if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
41
+ #define USING_AVX_F16C
42
+ #endif
43
+ #if !defined(USING_AVX2) && defined(USING_AVX) && defined(__AVX2__)
44
+ #define USING_AVX2
45
+ #endif
46
+ #if !defined(USING_AVX512F) && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
47
+ #define USING_AVX512F
48
+ #endif
49
+
50
+ // Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
51
+ // This is more for organizational clarity... skcms.cc doesn't force these.
52
+ #if N > 1 && defined(__ARM_NEON)
53
+ #define USING_NEON
54
+ #if __ARM_FP & 2
55
+ #define USING_NEON_F16C
56
+ #endif
57
+ #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(SKCMS_OPT_INTO_NEON_FP16)
58
+ #define USING_NEON_FP16
59
+ #endif
60
+ #endif
61
+
62
+ // These -Wvector-conversion warnings seem to trigger in very bogus situations,
63
+ // like vst3q_f32() expecting a 16x char rather than a 4x float vector. :/
64
+ #if defined(USING_NEON) && defined(__clang__)
65
+ #pragma clang diagnostic ignored "-Wvector-conversion"
66
+ #endif
67
+
68
+ // GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
69
+ // You'd see warnings like, "using AVX even though AVX is not enabled".
70
+ // We stifle these warnings; our helpers that return U64 are always inlined.
71
+ #if defined(__SSE__) && defined(__GNUC__)
72
+ #if !defined(__has_warning)
73
+ #pragma GCC diagnostic ignored "-Wpsabi"
74
+ #elif __has_warning("-Wpsabi")
75
+ #pragma GCC diagnostic ignored "-Wpsabi"
76
+ #endif
77
+ #endif
78
+
79
+ #if defined(__clang__)
80
+ #define FALLTHROUGH [[clang::fallthrough]]
81
+ #else
82
+ #define FALLTHROUGH
83
+ #endif
84
+
85
+ // We tag most helper functions as SI, to enforce good code generation
86
+ // but also work around what we think is a bug in GCC: when targeting 32-bit
87
+ // x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
88
+ // MMX mm0 register, which seems to mess with unrelated code that later uses
89
+ // x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
90
+ //
91
+ // It helps codegen to call __builtin_memcpy() when we know the byte count at compile time.
92
+ #if defined(__clang__) || defined(__GNUC__)
93
+ #define SI static inline __attribute__((always_inline))
94
+ #else
95
+ #define SI static inline
96
+ #endif
97
+
98
+ template <typename T, typename P>
99
+ SI T load(const P* ptr) {
100
+ T val;
101
+ small_memcpy(&val, ptr, sizeof(val));
102
+ return val;
103
+ }
104
+ template <typename T, typename P>
105
+ SI void store(P* ptr, const T& val) {
106
+ small_memcpy(ptr, &val, sizeof(val));
107
+ }
108
+
109
+ // (T)v is a cast when N == 1 and a bit-pun when N>1,
110
+ // so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
111
+ template <typename D, typename S>
112
+ SI D cast(const S& v) {
113
+ #if N == 1
114
+ return (D)v;
115
+ #elif defined(__clang__)
116
+ return __builtin_convertvector(v, D);
117
+ #else
118
+ D d;
119
+ for (int i = 0; i < N; i++) {
120
+ d[i] = v[i];
121
+ }
122
+ return d;
123
+ #endif
124
+ }
125
+
126
+ template <typename D, typename S>
127
+ SI D bit_pun(const S& v) {
128
+ static_assert(sizeof(D) == sizeof(v), "");
129
+ return load<D>(&v);
130
+ }
131
+
132
+ // When we convert from float to fixed point, it's very common to want to round,
133
+ // and for some reason compilers generate better code when converting to int32_t.
134
+ // To serve both those ends, we use this function to_fixed() instead of direct cast().
135
+ #if defined(USING_NEON_FP16)
136
+ // NEON's got a F16 -> U16 instruction, so this should be fine without going via I16.
137
+ SI U16 to_fixed(F f) { return cast<U16>(f + 0.5f); }
138
+ #else
139
+ SI U32 to_fixed(F f) { return (U32)cast<I32>(f + 0.5f); }
140
+ #endif
141
+
142
+
143
+ // Sometimes we do something crazy on one branch of a conditional,
144
+ // like divide by zero or convert a huge float to an integer,
145
+ // but then harmlessly select the other side. That trips up N==1
146
+ // sanitizer builds, so we make if_then_else() a macro to avoid
147
+ // evaluating the unused side.
148
+
149
+ #if N == 1
150
+ #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
151
+ #else
152
+ template <typename C, typename T>
153
+ SI T if_then_else(C cond, T t, T e) {
154
+ return bit_pun<T>( ( cond & bit_pun<C>(t)) |
155
+ (~cond & bit_pun<C>(e)) );
156
+ }
157
+ #endif
158
+
159
+
160
+ SI F F_from_Half(U16 half) {
161
+ #if defined(USING_NEON_FP16)
162
+ return bit_pun<F>(half);
163
+ #elif defined(USING_NEON_F16C)
164
+ return vcvt_f32_f16((float16x4_t)half);
165
+ #elif defined(USING_AVX512F)
166
+ return (F)_mm512_cvtph_ps((__m256i)half);
167
+ #elif defined(USING_AVX_F16C)
168
+ typedef int16_t __attribute__((vector_size(16))) I16;
169
+ return __builtin_ia32_vcvtph2ps256((I16)half);
170
+ #else
171
+ U32 wide = cast<U32>(half);
172
+ // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
173
+ U32 s = wide & 0x8000,
174
+ em = wide ^ s;
175
+
176
+ // Constructing the float is easy if the half is not denormalized.
177
+ F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );
178
+
179
+ // Simply flush all denorm half floats to zero.
180
+ return if_then_else(em < 0x0400, F0, norm);
181
+ #endif
182
+ }
183
+
184
+ #if defined(__clang__)
185
+ // The -((127-15)<<10) underflows that side of the math when
186
+ // we pass a denorm half float. It's harmless... we'll take the 0 side anyway.
187
+ __attribute__((no_sanitize("unsigned-integer-overflow")))
188
+ #endif
189
+ SI U16 Half_from_F(F f) {
190
+ #if defined(USING_NEON_FP16)
191
+ return bit_pun<U16>(f);
192
+ #elif defined(USING_NEON_F16C)
193
+ return (U16)vcvt_f16_f32(f);
194
+ #elif defined(USING_AVX512F)
195
+ return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
196
+ #elif defined(USING_AVX_F16C)
197
+ return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
198
+ #else
199
+ // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
200
+ U32 sem = bit_pun<U32>(f),
201
+ s = sem & 0x80000000,
202
+ em = sem ^ s;
203
+
204
+ // For simplicity we flush denorm half floats (including all denorm floats) to zero.
205
+ return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
206
+ , (s>>16) + (em>>13) - ((127-15)<<10)));
207
+ #endif
208
+ }
209
+
210
+ // Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
211
+ #if defined(USING_NEON_FP16)
212
+ SI U16 swap_endian_16(U16 v) {
213
+ return (U16)vrev16q_u8((uint8x16_t) v);
214
+ }
215
+ #elif defined(USING_NEON)
216
+ SI U16 swap_endian_16(U16 v) {
217
+ return (U16)vrev16_u8((uint8x8_t) v);
218
+ }
219
+ #endif
220
+
221
+ SI U64 swap_endian_16x4(const U64& rgba) {
222
+ return (rgba & 0x00ff00ff00ff00ff) << 8
223
+ | (rgba & 0xff00ff00ff00ff00) >> 8;
224
+ }
225
+
226
+ #if defined(USING_NEON_FP16)
227
+ SI F min_(F x, F y) { return (F)vminq_f16((float16x8_t)x, (float16x8_t)y); }
228
+ SI F max_(F x, F y) { return (F)vmaxq_f16((float16x8_t)x, (float16x8_t)y); }
229
+ #elif defined(USING_NEON)
230
+ SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
231
+ SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
232
+ #else
233
+ SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
234
+ SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
235
+ #endif
236
+
237
+ SI F floor_(F x) {
238
+ #if N == 1
239
+ return floorf_(x);
240
+ #elif defined(USING_NEON_FP16)
241
+ return vrndmq_f16(x);
242
+ #elif defined(__aarch64__)
243
+ return vrndmq_f32(x);
244
+ #elif defined(USING_AVX512F)
245
+ // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
246
+ // and integer sanitizer catches that this implicit cast changes the
247
+ // value from -1 to 65535. We'll cast manually to work around it.
248
+ // Read this as `return _mm512_floor_ps(x)`.
249
+ return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
250
+ #elif defined(USING_AVX)
251
+ return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
252
+ #elif defined(__SSE4_1__)
253
+ return _mm_floor_ps(x);
254
+ #else
255
+ // Round trip through integers with a truncating cast.
256
+ F roundtrip = cast<F>(cast<I32>(x));
257
+ // If x is negative, truncating gives the ceiling instead of the floor.
258
+ return roundtrip - if_then_else(roundtrip > x, F1, F0);
259
+
260
+ // This implementation fails for values of x that are outside
261
+ // the range an integer can represent. We expect most x to be small.
262
+ #endif
263
+ }
264
+
265
+ SI F approx_log2(F x) {
266
+ #if defined(USING_NEON_FP16)
267
+ // TODO(mtklein)
268
+ return x;
269
+ #else
270
+ // The first approximation of log2(x) is its exponent 'e', minus 127.
271
+ I32 bits = bit_pun<I32>(x);
272
+
273
+ F e = cast<F>(bits) * (1.0f / (1<<23));
274
+
275
+ // If we use the mantissa too we can refine the error significantly.
276
+ F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );
277
+
278
+ return e - 124.225514990f
279
+ - 1.498030302f*m
280
+ - 1.725879990f/(0.3520887068f + m);
281
+ #endif
282
+ }
283
+
284
+ SI F approx_log(F x) {
285
+ const float ln2 = 0.69314718f;
286
+ return ln2 * approx_log2(x);
287
+ }
288
+
289
+ SI F approx_exp2(F x) {
290
+ #if defined(USING_NEON_FP16)
291
+ // TODO(mtklein)
292
+ return x;
293
+ #else
294
+ F fract = x - floor_(x);
295
+
296
+ F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
297
+ - 1.490129070f*fract
298
+ + 27.728023300f/(4.84252568f - fract));
299
+ I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));
300
+
301
+ return bit_pun<F>(bits);
302
+ #endif
303
+ }
304
+
305
+ SI F approx_pow(F x, float y) {
306
+ return if_then_else((x == F0) | (x == F1), x
307
+ , approx_exp2(approx_log2(x) * y));
308
+ }
309
+
310
+ SI F approx_exp(F x) {
311
+ const float log2_e = 1.4426950408889634074f;
312
+ return approx_exp2(log2_e * x);
313
+ }
314
+
315
+ // Return tf(x).
316
+ SI F apply_tf(const skcms_TransferFunction* tf, F x) {
317
+ #if defined(USING_NEON_FP16)
318
+ // TODO(mtklein)
319
+ (void)tf;
320
+ return x;
321
+ #else
322
+ // Peel off the sign bit and set x = |x|.
323
+ U32 bits = bit_pun<U32>(x),
324
+ sign = bits & 0x80000000;
325
+ x = bit_pun<F>(bits ^ sign);
326
+
327
+ // The transfer function has a linear part up to d, exponential at d and after.
328
+ F v = if_then_else(x < tf->d, tf->c*x + tf->f
329
+ , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);
330
+
331
+ // Tack the sign bit back on.
332
+ return bit_pun<F>(sign | bit_pun<U32>(v));
333
+ #endif
334
+ }
335
+
336
+ SI F apply_pq(const skcms_TransferFunction* tf, F x) {
337
+ #if defined(USING_NEON_FP16)
338
+ // TODO(mtklein)
339
+ (void)tf;
340
+ return x;
341
+ #else
342
+ U32 bits = bit_pun<U32>(x),
343
+ sign = bits & 0x80000000;
344
+ x = bit_pun<F>(bits ^ sign);
345
+
346
+ F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
347
+ / (tf->d + tf->e * approx_pow(x, tf->c)),
348
+ tf->f);
349
+
350
+ return bit_pun<F>(sign | bit_pun<U32>(v));
351
+ #endif
352
+ }
353
+
354
+ SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
355
+ #if defined(USING_NEON_FP16)
356
+ // TODO(mtklein)
357
+ (void)tf;
358
+ return x;
359
+ #else
360
+ const float R = tf->a, G = tf->b,
361
+ a = tf->c, b = tf->d, c = tf->e,
362
+ K = tf->f + 1;
363
+ U32 bits = bit_pun<U32>(x),
364
+ sign = bits & 0x80000000;
365
+ x = bit_pun<F>(bits ^ sign);
366
+
367
+ F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
368
+ , approx_exp((x-c)*a) + b);
369
+
370
+ return K*bit_pun<F>(sign | bit_pun<U32>(v));
371
+ #endif
372
+ }
373
+
374
+ SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
375
+ #if defined(USING_NEON_FP16)
376
+ // TODO(mtklein)
377
+ (void)tf;
378
+ return x;
379
+ #else
380
+ const float R = tf->a, G = tf->b,
381
+ a = tf->c, b = tf->d, c = tf->e,
382
+ K = tf->f + 1;
383
+ U32 bits = bit_pun<U32>(x),
384
+ sign = bits & 0x80000000;
385
+ x = bit_pun<F>(bits ^ sign);
386
+ x /= K;
387
+
388
+ F v = if_then_else(x <= 1, R * approx_pow(x, G)
389
+ , a * approx_log(x - b) + c);
390
+
391
+ return bit_pun<F>(sign | bit_pun<U32>(v));
392
+ #endif
393
+ }
394
+
395
+
396
+ // Strided loads and stores of N values, starting from p.
397
+ template <typename T, typename P>
398
+ SI T load_3(const P* p) {
399
+ #if N == 1
400
+ return (T)p[0];
401
+ #elif N == 4
402
+ return T{p[ 0],p[ 3],p[ 6],p[ 9]};
403
+ #elif N == 8
404
+ return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
405
+ #elif N == 16
406
+ return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
407
+ p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
408
+ #endif
409
+ }
410
+
411
+ template <typename T, typename P>
412
+ SI T load_4(const P* p) {
413
+ #if N == 1
414
+ return (T)p[0];
415
+ #elif N == 4
416
+ return T{p[ 0],p[ 4],p[ 8],p[12]};
417
+ #elif N == 8
418
+ return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
419
+ #elif N == 16
420
+ return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
421
+ p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
422
+ #endif
423
+ }
424
+
425
+ template <typename T, typename P>
426
+ SI void store_3(P* p, const T& v) {
427
+ #if N == 1
428
+ p[0] = v;
429
+ #elif N == 4
430
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
431
+ #elif N == 8
432
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
433
+ p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
434
+ #elif N == 16
435
+ p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
436
+ p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
437
+ p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
438
+ p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
439
+ #endif
440
+ }
441
+
442
+ template <typename T, typename P>
443
+ SI void store_4(P* p, const T& v) {
444
+ #if N == 1
445
+ p[0] = v;
446
+ #elif N == 4
447
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
448
+ #elif N == 8
449
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
450
+ p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
451
+ #elif N == 16
452
+ p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
453
+ p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
454
+ p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
455
+ p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
456
+ #endif
457
+ }
458
+
459
+
460
+ SI U8 gather_8(const uint8_t* p, I32 ix) {
461
+ #if N == 1
462
+ U8 v = p[ix];
463
+ #elif N == 4
464
+ U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
465
+ #elif N == 8
466
+ U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
467
+ p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
468
+ #elif N == 16
469
+ U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
470
+ p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
471
+ p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
472
+ p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
473
+ #endif
474
+ return v;
475
+ }
476
+
477
+ SI U16 gather_16(const uint8_t* p, I32 ix) {
478
+ // Load the i'th 16-bit value from p.
479
+ auto load_16 = [p](int i) {
480
+ return load<uint16_t>(p + 2*i);
481
+ };
482
+ #if N == 1
483
+ U16 v = load_16(ix);
484
+ #elif N == 4
485
+ U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
486
+ #elif N == 8
487
+ U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
488
+ load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
489
+ #elif N == 16
490
+ U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
491
+ load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
492
+ load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
493
+ load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
494
+ #endif
495
+ return v;
496
+ }
497
+
498
+ SI U32 gather_32(const uint8_t* p, I32 ix) {
499
+ // Load the i'th 32-bit value from p.
500
+ auto load_32 = [p](int i) {
501
+ return load<uint32_t>(p + 4*i);
502
+ };
503
+ #if N == 1
504
+ U32 v = load_32(ix);
505
+ #elif N == 4
506
+ U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
507
+ #elif N == 8
508
+ U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
509
+ load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
510
+ #elif N == 16
511
+ U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
512
+ load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
513
+ load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
514
+ load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
515
+ #endif
516
+ // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
517
+ return v;
518
+ }
519
+
520
+ SI U32 gather_24(const uint8_t* p, I32 ix) {
521
+ // First, back up a byte. Any place we're gathering from has a safe junk byte to read
522
+ // in front of it, either a previous table value, or some tag metadata.
523
+ p -= 1;
524
+
525
+ // Load the i'th 24-bit value from p, and 1 extra byte.
526
+ auto load_24_32 = [p](int i) {
527
+ return load<uint32_t>(p + 3*i);
528
+ };
529
+
530
+ // Now load multiples of 4 bytes (a junk byte, then r,g,b).
531
+ #if N == 1
532
+ U32 v = load_24_32(ix);
533
+ #elif N == 4
534
+ U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
535
+ #elif N == 8 && !defined(USING_AVX2)
536
+ U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
537
+ load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
538
+ #elif N == 8
539
+ (void)load_24_32;
540
+ // The gather instruction here doesn't need any particular alignment,
541
+ // but the intrinsic takes a const int*.
542
+ const int* p4 = bit_pun<const int*>(p);
543
+ I32 zero = { 0, 0, 0, 0, 0, 0, 0, 0},
544
+ mask = {-1,-1,-1,-1, -1,-1,-1,-1};
545
+ #if defined(__clang__)
546
+ U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
547
+ #elif defined(__GNUC__)
548
+ U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
549
+ #endif
550
+ #elif N == 16
551
+ (void)load_24_32;
552
+ // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
553
+ // And AVX-512 swapped the order of arguments. :/
554
+ const int* p4 = bit_pun<const int*>(p);
555
+ U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
556
+ #endif
557
+
558
+ // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
559
+ return v >> 8;
560
+ }
561
+
562
+ #if !defined(__arm__)
563
+ SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
564
+ // As in gather_24(), with everything doubled.
565
+ p -= 2;
566
+
567
+ // Load the i'th 48-bit value from p, and 2 extra bytes.
568
+ auto load_48_64 = [p](int i) {
569
+ return load<uint64_t>(p + 6*i);
570
+ };
571
+
572
+ #if N == 1
573
+ *v = load_48_64(ix);
574
+ #elif N == 4
575
+ *v = U64{
576
+ load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
577
+ };
578
+ #elif N == 8 && !defined(USING_AVX2)
579
+ *v = U64{
580
+ load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
581
+ load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
582
+ };
583
+ #elif N == 8
584
+ (void)load_48_64;
585
+ typedef int32_t __attribute__((vector_size(16))) Half_I32;
586
+ typedef long long __attribute__((vector_size(32))) Half_I64;
587
+
588
+ // The gather instruction here doesn't need any particular alignment,
589
+ // but the intrinsic takes a const long long*.
590
+ const long long int* p8 = bit_pun<const long long int*>(p);
591
+
592
+ Half_I64 zero = { 0, 0, 0, 0},
593
+ mask = {-1,-1,-1,-1};
594
+
595
+ ix *= 6;
596
+ Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
597
+ ix_hi = { ix[4], ix[5], ix[6], ix[7] };
598
+
599
+ #if defined(__clang__)
600
+ Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
601
+ hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
602
+ #elif defined(__GNUC__)
603
+ Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
604
+ hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
605
+ #endif
606
+ store((char*)v + 0, lo);
607
+ store((char*)v + 32, hi);
608
+ #elif N == 16
609
+ (void)load_48_64;
610
+ const long long int* p8 = bit_pun<const long long int*>(p);
611
+ __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
612
+ hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
613
+ store((char*)v + 0, lo);
614
+ store((char*)v + 64, hi);
615
+ #endif
616
+
617
+ *v >>= 16;
618
+ }
619
+ #endif
620
+
621
+ SI F F_from_U8(U8 v) {
622
+ return cast<F>(v) * (1/255.0f);
623
+ }
624
+
625
+ SI F F_from_U16_BE(U16 v) {
626
+ // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
627
+ // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
628
+ U16 lo = (v >> 8),
629
+ hi = (v << 8) & 0xffff;
630
+ return cast<F>(lo|hi) * (1/65535.0f);
631
+ }
632
+
633
+ SI U16 U16_from_F(F v) {
634
+ // 65535 == inf in FP16, so promote to FP32 before converting.
635
+ return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
636
+ }
637
+
638
+ SI F minus_1_ulp(F v) {
639
+ #if defined(USING_NEON_FP16)
640
+ return bit_pun<F>( bit_pun<U16>(v) - 1 );
641
+ #else
642
+ return bit_pun<F>( bit_pun<U32>(v) - 1 );
643
+ #endif
644
+ }
645
+
646
+ SI F table(const skcms_Curve* curve, F v) {
647
+ // Clamp the input to [0,1], then scale to a table index.
648
+ F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);
649
+
650
+ // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
651
+ I32 lo = cast<I32>( ix ),
652
+ hi = cast<I32>(minus_1_ulp(ix+1.0f));
653
+ F t = ix - cast<F>(lo); // i.e. the fractional part of ix.
654
+
655
+ // TODO: can we load l and h simultaneously? Each entry in 'h' is either
656
+ // the same as in 'l' or adjacent. We have a rough idea that it'd always be safe
657
+ // to read adjacent entries and perhaps underflow the table by a byte or two
658
+ // (it'd be junk, but always safe to read). Not sure how to lerp yet.
659
+ F l,h;
660
+ if (curve->table_8) {
661
+ l = F_from_U8(gather_8(curve->table_8, lo));
662
+ h = F_from_U8(gather_8(curve->table_8, hi));
663
+ } else {
664
+ l = F_from_U16_BE(gather_16(curve->table_16, lo));
665
+ h = F_from_U16_BE(gather_16(curve->table_16, hi));
666
+ }
667
+ return l + (h-l)*t;
668
+ }
669
+
670
+ SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
671
+ U32 rgb = gather_24(grid_8, ix);
672
+
673
+ *r = cast<F>((rgb >> 0) & 0xff) * (1/255.0f);
674
+ *g = cast<F>((rgb >> 8) & 0xff) * (1/255.0f);
675
+ *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
676
+ }
677
+
678
+ SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
679
+ // TODO: don't forget to optimize gather_32().
680
+ U32 rgba = gather_32(grid_8, ix);
681
+
682
+ *r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
683
+ *g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
684
+ *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
685
+ *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
686
+ }
687
+
688
+ SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
689
+ #if defined(__arm__)
690
+ // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
691
+ *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
692
+ *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
693
+ *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
694
+ #else
695
+ // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
696
+ U64 rgb;
697
+ gather_48(grid_16, ix, &rgb);
698
+ rgb = swap_endian_16x4(rgb);
699
+
700
+ *r = cast<F>((rgb >> 0) & 0xffff) * (1/65535.0f);
701
+ *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
702
+ *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
703
+ #endif
704
+ }
705
+
706
+ SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
707
+ // TODO: gather_64()-based fast path?
708
+ *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
709
+ *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
710
+ *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
711
+ *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
712
+ }
713
+
714
+ static void clut(uint32_t input_channels, uint32_t output_channels,
715
+ const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
716
+ F* r, F* g, F* b, F* a) {
717
+
718
+ const int dim = (int)input_channels;
719
+ assert (0 < dim && dim <= 4);
720
+ assert (output_channels == 3 ||
721
+ output_channels == 4);
722
+
723
+ // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
724
+ I32 index [8]; // Index contribution by dimension, first low from 0, then high from 4.
725
+ F weight[8]; // Weight for each contribution, again first low, then high.
726
+
727
+ // O(dim) work first: calculate index,weight from r,g,b,a.
728
+ const F inputs[] = { *r,*g,*b,*a };
729
+ for (int i = dim-1, stride = 1; i >= 0; i--) {
730
+ // x is where we logically want to sample the grid in the i-th dimension.
731
+ F x = inputs[i] * (float)(grid_points[i] - 1);
732
+
733
+ // But we can't index at floats. lo and hi are the two integer grid points surrounding x.
734
+ I32 lo = cast<I32>( x ), // i.e. trunc(x) == floor(x) here.
735
+ hi = cast<I32>(minus_1_ulp(x+1.0f));
736
+ // Notice how we fold in the accumulated stride across previous dimensions here.
737
+ index[i+0] = lo * stride;
738
+ index[i+4] = hi * stride;
739
+ stride *= grid_points[i];
740
+
741
+ // We'll interpolate between those two integer grid points by t.
742
+ F t = x - cast<F>(lo); // i.e. fract(x)
743
+ weight[i+0] = 1-t;
744
+ weight[i+4] = t;
745
+ }
746
+
747
+ *r = *g = *b = F0;
748
+ if (output_channels == 4) {
749
+ *a = F0;
750
+ }
751
+
752
+ // We'll sample 2^dim == 1<<dim table entries per pixel,
753
+ // in all combinations of low and high in each dimension.
754
+ for (int combo = 0; combo < (1<<dim); combo++) { // This loop can be done in any order.
755
+
756
+ // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
757
+ // where 0 selects the low index contribution and its weight 1-t,
758
+ // or 4 the high index contribution and its weight t.
759
+
760
+ // Since 0<dim≤4, we can always just start off with the 0-th channel,
761
+ // then handle the others conditionally.
762
+ I32 ix = index [0 + (combo&1)*4];
763
+ F w = weight[0 + (combo&1)*4];
764
+
765
+ switch ((dim-1)&3) { // This lets the compiler know there are no other cases to handle.
766
+ case 3: ix += index [3 + (combo&8)/2];
767
+ w *= weight[3 + (combo&8)/2];
768
+ FALLTHROUGH;
769
+ // fall through
770
+
771
+ case 2: ix += index [2 + (combo&4)*1];
772
+ w *= weight[2 + (combo&4)*1];
773
+ FALLTHROUGH;
774
+ // fall through
775
+
776
+ case 1: ix += index [1 + (combo&2)*2];
777
+ w *= weight[1 + (combo&2)*2];
778
+ }
779
+
780
+ F R,G,B,A=F0;
781
+ if (output_channels == 3) {
782
+ if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
783
+ else { sample_clut_16(grid_16,ix, &R,&G,&B); }
784
+ } else {
785
+ if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
786
+ else { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
787
+ }
788
+ *r += w*R;
789
+ *g += w*G;
790
+ *b += w*B;
791
+ *a += w*A;
792
+ }
793
+ }
794
+
795
+ static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
796
+ clut(a2b->input_channels, a2b->output_channels,
797
+ a2b->grid_points, a2b->grid_8, a2b->grid_16,
798
+ r,g,b,&a);
799
+ }
800
+ static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
801
+ clut(b2a->input_channels, b2a->output_channels,
802
+ b2a->grid_points, b2a->grid_8, b2a->grid_16,
803
+ r,g,b,a);
804
+ }
805
+
806
+ static void exec_ops(const Op* ops, const void** args,
807
+ const char* src, char* dst, int i) {
808
+ F r = F0, g = F0, b = F0, a = F1;
809
+ while (true) {
810
+ switch (*ops++) {
811
+ case Op_load_a8:{
812
+ a = F_from_U8(load<U8>(src + 1*i));
813
+ } break;
814
+
815
+ case Op_load_g8:{
816
+ r = g = b = F_from_U8(load<U8>(src + 1*i));
817
+ } break;
818
+
819
+ case Op_load_4444:{
820
+ U16 abgr = load<U16>(src + 2*i);
821
+
822
+ r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
823
+ g = cast<F>((abgr >> 8) & 0xf) * (1/15.0f);
824
+ b = cast<F>((abgr >> 4) & 0xf) * (1/15.0f);
825
+ a = cast<F>((abgr >> 0) & 0xf) * (1/15.0f);
826
+ } break;
827
+
828
+ case Op_load_565:{
829
+ U16 rgb = load<U16>(src + 2*i);
830
+
831
+ r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
832
+ g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
833
+ b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
834
+ } break;
835
+
836
+ case Op_load_888:{
837
+ const uint8_t* rgb = (const uint8_t*)(src + 3*i);
838
+ #if defined(USING_NEON_FP16)
839
+ // See the explanation under USING_NEON below. This is that doubled up.
840
+ uint8x16x3_t v = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
841
+ v = vld3q_lane_u8(rgb+ 0, v, 0);
842
+ v = vld3q_lane_u8(rgb+ 3, v, 2);
843
+ v = vld3q_lane_u8(rgb+ 6, v, 4);
844
+ v = vld3q_lane_u8(rgb+ 9, v, 6);
845
+
846
+ v = vld3q_lane_u8(rgb+12, v, 8);
847
+ v = vld3q_lane_u8(rgb+15, v, 10);
848
+ v = vld3q_lane_u8(rgb+18, v, 12);
849
+ v = vld3q_lane_u8(rgb+21, v, 14);
850
+
851
+ r = cast<F>((U16)v.val[0]) * (1/255.0f);
852
+ g = cast<F>((U16)v.val[1]) * (1/255.0f);
853
+ b = cast<F>((U16)v.val[2]) * (1/255.0f);
854
+ #elif defined(USING_NEON)
855
+ // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
856
+ // a time. Since we're doing that, we might as well load them into 16-bit lanes.
857
+ // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
858
+ uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
859
+ v = vld3_lane_u8(rgb+0, v, 0);
860
+ v = vld3_lane_u8(rgb+3, v, 2);
861
+ v = vld3_lane_u8(rgb+6, v, 4);
862
+ v = vld3_lane_u8(rgb+9, v, 6);
863
+
864
+ // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
865
+ // convert to F. (Again, U32 would be even better here if drop ARMv7 or split
866
+ // ARMv7 and ARMv8 impls.)
867
+ r = cast<F>((U16)v.val[0]) * (1/255.0f);
868
+ g = cast<F>((U16)v.val[1]) * (1/255.0f);
869
+ b = cast<F>((U16)v.val[2]) * (1/255.0f);
870
+ #else
871
+ r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
872
+ g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
873
+ b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
874
+ #endif
875
+ } break;
876
+
877
+ case Op_load_8888:{
878
+ U32 rgba = load<U32>(src + 4*i);
879
+
880
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
881
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
882
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
883
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
884
+ } break;
885
+
886
+ case Op_load_8888_palette8:{
887
+ const uint8_t* palette = (const uint8_t*) *args++;
888
+ I32 ix = cast<I32>(load<U8>(src + 1*i));
889
+ U32 rgba = gather_32(palette, ix);
890
+
891
+ r = cast<F>((rgba >> 0) & 0xff) * (1/255.0f);
892
+ g = cast<F>((rgba >> 8) & 0xff) * (1/255.0f);
893
+ b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
894
+ a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
895
+ } break;
896
+
897
+ case Op_load_1010102:{
898
+ U32 rgba = load<U32>(src + 4*i);
899
+
900
+ r = cast<F>((rgba >> 0) & 0x3ff) * (1/1023.0f);
901
+ g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
902
+ b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
903
+ a = cast<F>((rgba >> 30) & 0x3 ) * (1/ 3.0f);
904
+ } break;
905
+
906
+ case Op_load_161616LE:{
907
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
908
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
909
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
910
+ #if defined(USING_NEON_FP16)
911
+ uint16x8x3_t v = vld3q_u16(rgb);
912
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
913
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
914
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
915
+ #elif defined(USING_NEON)
916
+ uint16x4x3_t v = vld3_u16(rgb);
917
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
918
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
919
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
920
+ #else
921
+ r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
922
+ g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
923
+ b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
924
+ #endif
925
+ } break;
926
+
927
+ case Op_load_16161616LE:{
928
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
929
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
930
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
931
+ #if defined(USING_NEON_FP16)
932
+ uint16x8x4_t v = vld4q_u16(rgba);
933
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
934
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
935
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
936
+ a = cast<F>((U16)v.val[3]) * (1/65535.0f);
937
+ #elif defined(USING_NEON)
938
+ uint16x4x4_t v = vld4_u16(rgba);
939
+ r = cast<F>((U16)v.val[0]) * (1/65535.0f);
940
+ g = cast<F>((U16)v.val[1]) * (1/65535.0f);
941
+ b = cast<F>((U16)v.val[2]) * (1/65535.0f);
942
+ a = cast<F>((U16)v.val[3]) * (1/65535.0f);
943
+ #else
944
+ U64 px = load<U64>(rgba);
945
+
946
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
947
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
948
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
949
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
950
+ #endif
951
+ } break;
952
+
953
+ case Op_load_161616BE:{
954
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
955
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
956
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
957
+ #if defined(USING_NEON_FP16)
958
+ uint16x8x3_t v = vld3q_u16(rgb);
959
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
960
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
961
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
962
+ #elif defined(USING_NEON)
963
+ uint16x4x3_t v = vld3_u16(rgb);
964
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
965
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
966
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
967
+ #else
968
+ U32 R = load_3<U32>(rgb+0),
969
+ G = load_3<U32>(rgb+1),
970
+ B = load_3<U32>(rgb+2);
971
+ // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
972
+ r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
973
+ g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
974
+ b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
975
+ #endif
976
+ } break;
977
+
978
+ case Op_load_16161616BE:{
979
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
980
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
981
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
982
+ #if defined(USING_NEON_FP16)
983
+ uint16x8x4_t v = vld4q_u16(rgba);
984
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
985
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
986
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
987
+ a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
988
+ #elif defined(USING_NEON)
989
+ uint16x4x4_t v = vld4_u16(rgba);
990
+ r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
991
+ g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
992
+ b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
993
+ a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
994
+ #else
995
+ U64 px = swap_endian_16x4(load<U64>(rgba));
996
+
997
+ r = cast<F>((px >> 0) & 0xffff) * (1/65535.0f);
998
+ g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
999
+ b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
1000
+ a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
1001
+ #endif
1002
+ } break;
1003
+
1004
+ case Op_load_hhh:{
1005
+ uintptr_t ptr = (uintptr_t)(src + 6*i);
1006
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1007
+ const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1008
+ #if defined(USING_NEON_FP16)
1009
+ uint16x8x3_t v = vld3q_u16(rgb);
1010
+ U16 R = (U16)v.val[0],
1011
+ G = (U16)v.val[1],
1012
+ B = (U16)v.val[2];
1013
+ #elif defined(USING_NEON)
1014
+ uint16x4x3_t v = vld3_u16(rgb);
1015
+ U16 R = (U16)v.val[0],
1016
+ G = (U16)v.val[1],
1017
+ B = (U16)v.val[2];
1018
+ #else
1019
+ U16 R = load_3<U16>(rgb+0),
1020
+ G = load_3<U16>(rgb+1),
1021
+ B = load_3<U16>(rgb+2);
1022
+ #endif
1023
+ r = F_from_Half(R);
1024
+ g = F_from_Half(G);
1025
+ b = F_from_Half(B);
1026
+ } break;
1027
+
1028
+ case Op_load_hhhh:{
1029
+ uintptr_t ptr = (uintptr_t)(src + 8*i);
1030
+ assert( (ptr & 1) == 0 ); // src must be 2-byte aligned for this
1031
+ const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
1032
+ #if defined(USING_NEON_FP16)
1033
+ uint16x8x4_t v = vld4q_u16(rgba);
1034
+ U16 R = (U16)v.val[0],
1035
+ G = (U16)v.val[1],
1036
+ B = (U16)v.val[2],
1037
+ A = (U16)v.val[3];
1038
+ #elif defined(USING_NEON)
1039
+ uint16x4x4_t v = vld4_u16(rgba);
1040
+ U16 R = (U16)v.val[0],
1041
+ G = (U16)v.val[1],
1042
+ B = (U16)v.val[2],
1043
+ A = (U16)v.val[3];
1044
+ #else
1045
+ U64 px = load<U64>(rgba);
1046
+ U16 R = cast<U16>((px >> 0) & 0xffff),
1047
+ G = cast<U16>((px >> 16) & 0xffff),
1048
+ B = cast<U16>((px >> 32) & 0xffff),
1049
+ A = cast<U16>((px >> 48) & 0xffff);
1050
+ #endif
1051
+ r = F_from_Half(R);
1052
+ g = F_from_Half(G);
1053
+ b = F_from_Half(B);
1054
+ a = F_from_Half(A);
1055
+ } break;
1056
+
1057
+ case Op_load_fff:{
1058
+ uintptr_t ptr = (uintptr_t)(src + 12*i);
1059
+ assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1060
+ const float* rgb = (const float*)ptr; // cast to const float* to be safe.
1061
+ #if defined(USING_NEON_FP16)
1062
+ float32x4x3_t lo = vld3q_f32(rgb + 0),
1063
+ hi = vld3q_f32(rgb + 12);
1064
+ r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1065
+ g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1066
+ b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1067
+ #elif defined(USING_NEON)
1068
+ float32x4x3_t v = vld3q_f32(rgb);
1069
+ r = (F)v.val[0];
1070
+ g = (F)v.val[1];
1071
+ b = (F)v.val[2];
1072
+ #else
1073
+ r = load_3<F>(rgb+0);
1074
+ g = load_3<F>(rgb+1);
1075
+ b = load_3<F>(rgb+2);
1076
+ #endif
1077
+ } break;
1078
+
1079
+ case Op_load_ffff:{
1080
+ uintptr_t ptr = (uintptr_t)(src + 16*i);
1081
+ assert( (ptr & 3) == 0 ); // src must be 4-byte aligned for this
1082
+ const float* rgba = (const float*)ptr; // cast to const float* to be safe.
1083
+ #if defined(USING_NEON_FP16)
1084
+ float32x4x4_t lo = vld4q_f32(rgba + 0),
1085
+ hi = vld4q_f32(rgba + 16);
1086
+ r = (F)vcombine_f16(vcvt_f16_f32(lo.val[0]), vcvt_f16_f32(hi.val[0]));
1087
+ g = (F)vcombine_f16(vcvt_f16_f32(lo.val[1]), vcvt_f16_f32(hi.val[1]));
1088
+ b = (F)vcombine_f16(vcvt_f16_f32(lo.val[2]), vcvt_f16_f32(hi.val[2]));
1089
+ a = (F)vcombine_f16(vcvt_f16_f32(lo.val[3]), vcvt_f16_f32(hi.val[3]));
1090
+ #elif defined(USING_NEON)
1091
+ float32x4x4_t v = vld4q_f32(rgba);
1092
+ r = (F)v.val[0];
1093
+ g = (F)v.val[1];
1094
+ b = (F)v.val[2];
1095
+ a = (F)v.val[3];
1096
+ #else
1097
+ r = load_4<F>(rgba+0);
1098
+ g = load_4<F>(rgba+1);
1099
+ b = load_4<F>(rgba+2);
1100
+ a = load_4<F>(rgba+3);
1101
+ #endif
1102
+ } break;
1103
+
1104
+ case Op_swap_rb:{
1105
+ F t = r;
1106
+ r = b;
1107
+ b = t;
1108
+ } break;
1109
+
1110
+ case Op_clamp:{
1111
+ r = max_(F0, min_(r, F1));
1112
+ g = max_(F0, min_(g, F1));
1113
+ b = max_(F0, min_(b, F1));
1114
+ a = max_(F0, min_(a, F1));
1115
+ } break;
1116
+
1117
+ case Op_invert:{
1118
+ r = F1 - r;
1119
+ g = F1 - g;
1120
+ b = F1 - b;
1121
+ a = F1 - a;
1122
+ } break;
1123
+
1124
+ case Op_force_opaque:{
1125
+ a = F1;
1126
+ } break;
1127
+
1128
+ case Op_premul:{
1129
+ r *= a;
1130
+ g *= a;
1131
+ b *= a;
1132
+ } break;
1133
+
1134
+ case Op_unpremul:{
1135
+ F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
1136
+ r *= scale;
1137
+ g *= scale;
1138
+ b *= scale;
1139
+ } break;
1140
+
1141
+ case Op_matrix_3x3:{
1142
+ const skcms_Matrix3x3* matrix = (const skcms_Matrix3x3*) *args++;
1143
+ const float* m = &matrix->vals[0][0];
1144
+
1145
+ F R = m[0]*r + m[1]*g + m[2]*b,
1146
+ G = m[3]*r + m[4]*g + m[5]*b,
1147
+ B = m[6]*r + m[7]*g + m[8]*b;
1148
+
1149
+ r = R;
1150
+ g = G;
1151
+ b = B;
1152
+ } break;
1153
+
1154
+ case Op_matrix_3x4:{
1155
+ const skcms_Matrix3x4* matrix = (const skcms_Matrix3x4*) *args++;
1156
+ const float* m = &matrix->vals[0][0];
1157
+
1158
+ F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
1159
+ G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
1160
+ B = m[8]*r + m[9]*g + m[10]*b + m[11];
1161
+
1162
+ r = R;
1163
+ g = G;
1164
+ b = B;
1165
+ } break;
1166
+
1167
+ case Op_lab_to_xyz:{
1168
+ // The L*a*b values are in r,g,b, but normalized to [0,1]. Reconstruct them:
1169
+ F L = r * 100.0f,
1170
+ A = g * 255.0f - 128.0f,
1171
+ B = b * 255.0f - 128.0f;
1172
+
1173
+ // Convert to CIE XYZ.
1174
+ F Y = (L + 16.0f) * (1/116.0f),
1175
+ X = Y + A*(1/500.0f),
1176
+ Z = Y - B*(1/200.0f);
1177
+
1178
+ X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
1179
+ Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
1180
+ Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));
1181
+
1182
+ // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
1183
+ r = X * 0.9642f;
1184
+ g = Y ;
1185
+ b = Z * 0.8249f;
1186
+ } break;
1187
+
1188
+ // As above, in reverse.
1189
+ case Op_xyz_to_lab:{
1190
+ F X = r * (1/0.9642f),
1191
+ Y = g,
1192
+ Z = b * (1/0.8249f);
1193
+
1194
+ X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
1195
+ Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
1196
+ Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));
1197
+
1198
+ F L = Y*116.0f - 16.0f,
1199
+ A = (X-Y)*500.0f,
1200
+ B = (Y-Z)*200.0f;
1201
+
1202
+ r = L * (1/100.f);
1203
+ g = (A + 128.0f) * (1/255.0f);
1204
+ b = (B + 128.0f) * (1/255.0f);
1205
+ } break;
1206
+
1207
+ case Op_tf_r:{ r = apply_tf((const skcms_TransferFunction*)*args++, r); } break;
1208
+ case Op_tf_g:{ g = apply_tf((const skcms_TransferFunction*)*args++, g); } break;
1209
+ case Op_tf_b:{ b = apply_tf((const skcms_TransferFunction*)*args++, b); } break;
1210
+ case Op_tf_a:{ a = apply_tf((const skcms_TransferFunction*)*args++, a); } break;
1211
+
1212
+ case Op_pq_r:{ r = apply_pq((const skcms_TransferFunction*)*args++, r); } break;
1213
+ case Op_pq_g:{ g = apply_pq((const skcms_TransferFunction*)*args++, g); } break;
1214
+ case Op_pq_b:{ b = apply_pq((const skcms_TransferFunction*)*args++, b); } break;
1215
+ case Op_pq_a:{ a = apply_pq((const skcms_TransferFunction*)*args++, a); } break;
1216
+
1217
+ case Op_hlg_r:{ r = apply_hlg((const skcms_TransferFunction*)*args++, r); } break;
1218
+ case Op_hlg_g:{ g = apply_hlg((const skcms_TransferFunction*)*args++, g); } break;
1219
+ case Op_hlg_b:{ b = apply_hlg((const skcms_TransferFunction*)*args++, b); } break;
1220
+ case Op_hlg_a:{ a = apply_hlg((const skcms_TransferFunction*)*args++, a); } break;
1221
+
1222
+ case Op_hlginv_r:{ r = apply_hlginv((const skcms_TransferFunction*)*args++, r); } break;
1223
+ case Op_hlginv_g:{ g = apply_hlginv((const skcms_TransferFunction*)*args++, g); } break;
1224
+ case Op_hlginv_b:{ b = apply_hlginv((const skcms_TransferFunction*)*args++, b); } break;
1225
+ case Op_hlginv_a:{ a = apply_hlginv((const skcms_TransferFunction*)*args++, a); } break;
1226
+
1227
+ case Op_table_r: { r = table((const skcms_Curve*)*args++, r); } break;
1228
+ case Op_table_g: { g = table((const skcms_Curve*)*args++, g); } break;
1229
+ case Op_table_b: { b = table((const skcms_Curve*)*args++, b); } break;
1230
+ case Op_table_a: { a = table((const skcms_Curve*)*args++, a); } break;
1231
+
1232
+ case Op_clut_A2B: {
1233
+ const skcms_A2B* a2b = (const skcms_A2B*) *args++;
1234
+ clut(a2b, &r,&g,&b,a);
1235
+
1236
+ if (a2b->input_channels == 4) {
1237
+ // CMYK is opaque.
1238
+ a = F1;
1239
+ }
1240
+ } break;
1241
+
1242
+ case Op_clut_B2A: {
1243
+ const skcms_B2A* b2a = (const skcms_B2A*) *args++;
1244
+ clut(b2a, &r,&g,&b,&a);
1245
+ } break;
1246
+
1247
+ // Notice, from here on down the store_ ops all return, ending the loop.
1248
+
1249
+ case Op_store_a8: {
1250
+ store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1251
+ } return;
1252
+
1253
+ case Op_store_g8: {
1254
+ // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1255
+ store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1256
+ } return;
1257
+
1258
+ case Op_store_4444: {
1259
+ store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1260
+ | cast<U16>(to_fixed(g * 15) << 8)
1261
+ | cast<U16>(to_fixed(b * 15) << 4)
1262
+ | cast<U16>(to_fixed(a * 15) << 0));
1263
+ } return;
1264
+
1265
+ case Op_store_565: {
1266
+ store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) << 0 )
1267
+ | cast<U16>(to_fixed(g * 63) << 5 )
1268
+ | cast<U16>(to_fixed(b * 31) << 11 ));
1269
+ } return;
1270
+
1271
+ case Op_store_888: {
1272
+ uint8_t* rgb = (uint8_t*)dst + 3*i;
1273
+ #if defined(USING_NEON_FP16)
1274
+ // See the explanation under USING_NEON below. This is that doubled up.
1275
+ U16 R = to_fixed(r * 255),
1276
+ G = to_fixed(g * 255),
1277
+ B = to_fixed(b * 255);
1278
+
1279
+ uint8x16x3_t v = {{ (uint8x16_t)R, (uint8x16_t)G, (uint8x16_t)B }};
1280
+ vst3q_lane_u8(rgb+ 0, v, 0);
1281
+ vst3q_lane_u8(rgb+ 3, v, 2);
1282
+ vst3q_lane_u8(rgb+ 6, v, 4);
1283
+ vst3q_lane_u8(rgb+ 9, v, 6);
1284
+
1285
+ vst3q_lane_u8(rgb+12, v, 8);
1286
+ vst3q_lane_u8(rgb+15, v, 10);
1287
+ vst3q_lane_u8(rgb+18, v, 12);
1288
+ vst3q_lane_u8(rgb+21, v, 14);
1289
+ #elif defined(USING_NEON)
1290
+ // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1291
+ // get there via U16 to save some instructions converting to float. And just
1292
+ // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1293
+ U16 R = cast<U16>(to_fixed(r * 255)),
1294
+ G = cast<U16>(to_fixed(g * 255)),
1295
+ B = cast<U16>(to_fixed(b * 255));
1296
+
1297
+ uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1298
+ vst3_lane_u8(rgb+0, v, 0);
1299
+ vst3_lane_u8(rgb+3, v, 2);
1300
+ vst3_lane_u8(rgb+6, v, 4);
1301
+ vst3_lane_u8(rgb+9, v, 6);
1302
+ #else
1303
+ store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1304
+ store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1305
+ store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1306
+ #endif
1307
+ } return;
1308
+
1309
+ case Op_store_8888: {
1310
+ store(dst + 4*i, cast<U32>(to_fixed(r * 255)) << 0
1311
+ | cast<U32>(to_fixed(g * 255)) << 8
1312
+ | cast<U32>(to_fixed(b * 255)) << 16
1313
+ | cast<U32>(to_fixed(a * 255)) << 24);
1314
+ } return;
1315
+
1316
+ case Op_store_1010102: {
1317
+ store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) << 0
1318
+ | cast<U32>(to_fixed(g * 1023)) << 10
1319
+ | cast<U32>(to_fixed(b * 1023)) << 20
1320
+ | cast<U32>(to_fixed(a * 3)) << 30);
1321
+ } return;
1322
+
1323
+ case Op_store_161616LE: {
1324
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
1325
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1326
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1327
+ #if defined(USING_NEON_FP16)
1328
+ uint16x8x3_t v = {{
1329
+ (uint16x8_t)U16_from_F(r),
1330
+ (uint16x8_t)U16_from_F(g),
1331
+ (uint16x8_t)U16_from_F(b),
1332
+ }};
1333
+ vst3q_u16(rgb, v);
1334
+ #elif defined(USING_NEON)
1335
+ uint16x4x3_t v = {{
1336
+ (uint16x4_t)U16_from_F(r),
1337
+ (uint16x4_t)U16_from_F(g),
1338
+ (uint16x4_t)U16_from_F(b),
1339
+ }};
1340
+ vst3_u16(rgb, v);
1341
+ #else
1342
+ store_3(rgb+0, U16_from_F(r));
1343
+ store_3(rgb+1, U16_from_F(g));
1344
+ store_3(rgb+2, U16_from_F(b));
1345
+ #endif
1346
+
1347
+ } return;
1348
+
1349
+ case Op_store_16161616LE: {
1350
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
1351
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1352
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1353
+ #if defined(USING_NEON_FP16)
1354
+ uint16x8x4_t v = {{
1355
+ (uint16x8_t)U16_from_F(r),
1356
+ (uint16x8_t)U16_from_F(g),
1357
+ (uint16x8_t)U16_from_F(b),
1358
+ (uint16x8_t)U16_from_F(a),
1359
+ }};
1360
+ vst4q_u16(rgba, v);
1361
+ #elif defined(USING_NEON)
1362
+ uint16x4x4_t v = {{
1363
+ (uint16x4_t)U16_from_F(r),
1364
+ (uint16x4_t)U16_from_F(g),
1365
+ (uint16x4_t)U16_from_F(b),
1366
+ (uint16x4_t)U16_from_F(a),
1367
+ }};
1368
+ vst4_u16(rgba, v);
1369
+ #else
1370
+ U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1371
+ | cast<U64>(to_fixed(g * 65535)) << 16
1372
+ | cast<U64>(to_fixed(b * 65535)) << 32
1373
+ | cast<U64>(to_fixed(a * 65535)) << 48;
1374
+ store(rgba, px);
1375
+ #endif
1376
+ } return;
1377
+
1378
+ case Op_store_161616BE: {
1379
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
1380
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1381
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1382
+ #if defined(USING_NEON_FP16)
1383
+ uint16x8x3_t v = {{
1384
+ (uint16x8_t)swap_endian_16(U16_from_F(r)),
1385
+ (uint16x8_t)swap_endian_16(U16_from_F(g)),
1386
+ (uint16x8_t)swap_endian_16(U16_from_F(b)),
1387
+ }};
1388
+ vst3q_u16(rgb, v);
1389
+ #elif defined(USING_NEON)
1390
+ uint16x4x3_t v = {{
1391
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1392
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1393
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1394
+ }};
1395
+ vst3_u16(rgb, v);
1396
+ #else
1397
+ U32 R = to_fixed(r * 65535),
1398
+ G = to_fixed(g * 65535),
1399
+ B = to_fixed(b * 65535);
1400
+ store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1401
+ store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1402
+ store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1403
+ #endif
1404
+
1405
+ } return;
1406
+
1407
+ case Op_store_16161616BE: {
1408
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
1409
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1410
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1411
+ #if defined(USING_NEON_FP16)
1412
+ uint16x8x4_t v = {{
1413
+ (uint16x8_t)swap_endian_16(U16_from_F(r)),
1414
+ (uint16x8_t)swap_endian_16(U16_from_F(g)),
1415
+ (uint16x8_t)swap_endian_16(U16_from_F(b)),
1416
+ (uint16x8_t)swap_endian_16(U16_from_F(a)),
1417
+ }};
1418
+ vst4q_u16(rgba, v);
1419
+ #elif defined(USING_NEON)
1420
+ uint16x4x4_t v = {{
1421
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1422
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1423
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1424
+ (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
1425
+ }};
1426
+ vst4_u16(rgba, v);
1427
+ #else
1428
+ U64 px = cast<U64>(to_fixed(r * 65535)) << 0
1429
+ | cast<U64>(to_fixed(g * 65535)) << 16
1430
+ | cast<U64>(to_fixed(b * 65535)) << 32
1431
+ | cast<U64>(to_fixed(a * 65535)) << 48;
1432
+ store(rgba, swap_endian_16x4(px));
1433
+ #endif
1434
+ } return;
1435
+
1436
+ case Op_store_hhh: {
1437
+ uintptr_t ptr = (uintptr_t)(dst + 6*i);
1438
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1439
+ uint16_t* rgb = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1440
+
1441
+ U16 R = Half_from_F(r),
1442
+ G = Half_from_F(g),
1443
+ B = Half_from_F(b);
1444
+ #if defined(USING_NEON_FP16)
1445
+ uint16x8x3_t v = {{
1446
+ (uint16x8_t)R,
1447
+ (uint16x8_t)G,
1448
+ (uint16x8_t)B,
1449
+ }};
1450
+ vst3q_u16(rgb, v);
1451
+ #elif defined(USING_NEON)
1452
+ uint16x4x3_t v = {{
1453
+ (uint16x4_t)R,
1454
+ (uint16x4_t)G,
1455
+ (uint16x4_t)B,
1456
+ }};
1457
+ vst3_u16(rgb, v);
1458
+ #else
1459
+ store_3(rgb+0, R);
1460
+ store_3(rgb+1, G);
1461
+ store_3(rgb+2, B);
1462
+ #endif
1463
+ } return;
1464
+
1465
+ case Op_store_hhhh: {
1466
+ uintptr_t ptr = (uintptr_t)(dst + 8*i);
1467
+ assert( (ptr & 1) == 0 ); // The dst pointer must be 2-byte aligned
1468
+ uint16_t* rgba = (uint16_t*)ptr; // for this cast to uint16_t* to be safe.
1469
+
1470
+ U16 R = Half_from_F(r),
1471
+ G = Half_from_F(g),
1472
+ B = Half_from_F(b),
1473
+ A = Half_from_F(a);
1474
+ #if defined(USING_NEON_FP16)
1475
+ uint16x8x4_t v = {{
1476
+ (uint16x8_t)R,
1477
+ (uint16x8_t)G,
1478
+ (uint16x8_t)B,
1479
+ (uint16x8_t)A,
1480
+ }};
1481
+ vst4q_u16(rgba, v);
1482
+ #elif defined(USING_NEON)
1483
+ uint16x4x4_t v = {{
1484
+ (uint16x4_t)R,
1485
+ (uint16x4_t)G,
1486
+ (uint16x4_t)B,
1487
+ (uint16x4_t)A,
1488
+ }};
1489
+ vst4_u16(rgba, v);
1490
+ #else
1491
+ store(rgba, cast<U64>(R) << 0
1492
+ | cast<U64>(G) << 16
1493
+ | cast<U64>(B) << 32
1494
+ | cast<U64>(A) << 48);
1495
+ #endif
1496
+
1497
+ } return;
1498
+
1499
+ case Op_store_fff: {
1500
+ uintptr_t ptr = (uintptr_t)(dst + 12*i);
1501
+ assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1502
+ float* rgb = (float*)ptr; // for this cast to float* to be safe.
1503
+ #if defined(USING_NEON_FP16)
1504
+ float32x4x3_t lo = {{
1505
+ vcvt_f32_f16(vget_low_f16(r)),
1506
+ vcvt_f32_f16(vget_low_f16(g)),
1507
+ vcvt_f32_f16(vget_low_f16(b)),
1508
+ }}, hi = {{
1509
+ vcvt_f32_f16(vget_high_f16(r)),
1510
+ vcvt_f32_f16(vget_high_f16(g)),
1511
+ vcvt_f32_f16(vget_high_f16(b)),
1512
+ }};
1513
+ vst3q_f32(rgb + 0, lo);
1514
+ vst3q_f32(rgb + 12, hi);
1515
+ #elif defined(USING_NEON)
1516
+ float32x4x3_t v = {{
1517
+ (float32x4_t)r,
1518
+ (float32x4_t)g,
1519
+ (float32x4_t)b,
1520
+ }};
1521
+ vst3q_f32(rgb, v);
1522
+ #else
1523
+ store_3(rgb+0, r);
1524
+ store_3(rgb+1, g);
1525
+ store_3(rgb+2, b);
1526
+ #endif
1527
+ } return;
1528
+
1529
+ case Op_store_ffff: {
1530
+ uintptr_t ptr = (uintptr_t)(dst + 16*i);
1531
+ assert( (ptr & 3) == 0 ); // The dst pointer must be 4-byte aligned
1532
+ float* rgba = (float*)ptr; // for this cast to float* to be safe.
1533
+ #if defined(USING_NEON_FP16)
1534
+ float32x4x4_t lo = {{
1535
+ vcvt_f32_f16(vget_low_f16(r)),
1536
+ vcvt_f32_f16(vget_low_f16(g)),
1537
+ vcvt_f32_f16(vget_low_f16(b)),
1538
+ vcvt_f32_f16(vget_low_f16(a)),
1539
+ }}, hi = {{
1540
+ vcvt_f32_f16(vget_high_f16(r)),
1541
+ vcvt_f32_f16(vget_high_f16(g)),
1542
+ vcvt_f32_f16(vget_high_f16(b)),
1543
+ vcvt_f32_f16(vget_high_f16(a)),
1544
+ }};
1545
+ vst4q_f32(rgba + 0, lo);
1546
+ vst4q_f32(rgba + 16, hi);
1547
+ #elif defined(USING_NEON)
1548
+ float32x4x4_t v = {{
1549
+ (float32x4_t)r,
1550
+ (float32x4_t)g,
1551
+ (float32x4_t)b,
1552
+ (float32x4_t)a,
1553
+ }};
1554
+ vst4q_f32(rgba, v);
1555
+ #else
1556
+ store_4(rgba+0, r);
1557
+ store_4(rgba+1, g);
1558
+ store_4(rgba+2, b);
1559
+ store_4(rgba+3, a);
1560
+ #endif
1561
+ } return;
1562
+ }
1563
+ }
1564
+ }
1565
+
1566
+
1567
+ static void run_program(const Op* program, const void** arguments,
1568
+ const char* src, char* dst, int n,
1569
+ const size_t src_bpp, const size_t dst_bpp) {
1570
+ int i = 0;
1571
+ while (n >= N) {
1572
+ exec_ops(program, arguments, src, dst, i);
1573
+ i += N;
1574
+ n -= N;
1575
+ }
1576
+ if (n > 0) {
1577
+ char tmp[4*4*N] = {0};
1578
+
1579
+ memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1580
+ exec_ops(program, arguments, tmp, tmp, 0);
1581
+ memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1582
+ }
1583
+ }
1584
+
1585
+ // Clean up any #defines we may have set so that we can be #included again.
1586
+ #if defined(USING_AVX)
1587
+ #undef USING_AVX
1588
+ #endif
1589
+ #if defined(USING_AVX_F16C)
1590
+ #undef USING_AVX_F16C
1591
+ #endif
1592
+ #if defined(USING_AVX2)
1593
+ #undef USING_AVX2
1594
+ #endif
1595
+ #if defined(USING_AVX512F)
1596
+ #undef USING_AVX512F
1597
+ #endif
1598
+
1599
+ #if defined(USING_NEON)
1600
+ #undef USING_NEON
1601
+ #endif
1602
+ #if defined(USING_NEON_F16C)
1603
+ #undef USING_NEON_F16C
1604
+ #endif
1605
+ #if defined(USING_NEON_FP16)
1606
+ #undef USING_NEON_FP16
1607
+ #endif
1608
+
1609
+ #undef FALLTHROUGH