cactus-react-native 1.5.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221) hide show
  1. package/Cactus.podspec +1 -1
  2. package/README.md +347 -241
  3. package/android/CMakeLists.txt +24 -5
  4. package/android/src/main/jniLibs/arm64-v8a/libcactus.a +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/libcurl.a +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/libmbedcrypto.a +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/libmbedtls.a +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/libmbedx509.a +0 -0
  9. package/cpp/HybridCactus.cpp +197 -117
  10. package/cpp/HybridCactus.hpp +18 -9
  11. package/cpp/cactus_ffi.h +66 -42
  12. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus.h +0 -1
  13. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_cloud.h +48 -0
  14. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_ffi.h +66 -42
  15. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_utils.h +568 -135
  16. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/engine.h +148 -17
  17. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/graph.h +145 -36
  18. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel.h +187 -6
  19. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/kernel_utils.h +49 -149
  20. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Info.plist +0 -0
  21. package/ios/cactus.xcframework/ios-arm64/cactus.framework/cactus +0 -0
  22. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus.h +0 -1
  23. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_cloud.h +48 -0
  24. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_ffi.h +66 -42
  25. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_utils.h +568 -135
  26. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/engine.h +148 -17
  27. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/graph.h +145 -36
  28. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel.h +187 -6
  29. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/kernel_utils.h +49 -149
  30. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Info.plist +0 -0
  31. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/_CodeSignature/CodeResources +1 -1
  32. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/cactus +0 -0
  33. package/lib/module/classes/CactusLM.js +16 -49
  34. package/lib/module/classes/CactusLM.js.map +1 -1
  35. package/lib/module/classes/CactusSTT.js +41 -75
  36. package/lib/module/classes/CactusSTT.js.map +1 -1
  37. package/lib/module/classes/CactusVAD.js +95 -0
  38. package/lib/module/classes/CactusVAD.js.map +1 -0
  39. package/lib/module/hooks/useCactusLM.js +10 -11
  40. package/lib/module/hooks/useCactusLM.js.map +1 -1
  41. package/lib/module/hooks/useCactusSTT.js +23 -62
  42. package/lib/module/hooks/useCactusSTT.js.map +1 -1
  43. package/lib/module/hooks/useCactusVAD.js +171 -0
  44. package/lib/module/hooks/useCactusVAD.js.map +1 -0
  45. package/lib/module/index.js +2 -3
  46. package/lib/module/index.js.map +1 -1
  47. package/lib/module/modelRegistry.js +52 -0
  48. package/lib/module/modelRegistry.js.map +1 -0
  49. package/lib/module/native/Cactus.js +103 -23
  50. package/lib/module/native/Cactus.js.map +1 -1
  51. package/lib/module/native/CactusIndex.js.map +1 -1
  52. package/lib/module/native/index.js +0 -3
  53. package/lib/module/native/index.js.map +1 -1
  54. package/lib/module/types/CactusVAD.js +4 -0
  55. package/lib/module/{specs/CactusUtil.nitro.js.map → types/CactusVAD.js.map} +1 -1
  56. package/lib/typescript/src/classes/CactusLM.d.ts +5 -7
  57. package/lib/typescript/src/classes/CactusLM.d.ts.map +1 -1
  58. package/lib/typescript/src/classes/CactusSTT.d.ts +9 -12
  59. package/lib/typescript/src/classes/CactusSTT.d.ts.map +1 -1
  60. package/lib/typescript/src/classes/CactusVAD.d.ts +20 -0
  61. package/lib/typescript/src/classes/CactusVAD.d.ts.map +1 -0
  62. package/lib/typescript/src/hooks/useCactusLM.d.ts +2 -2
  63. package/lib/typescript/src/hooks/useCactusLM.d.ts.map +1 -1
  64. package/lib/typescript/src/hooks/useCactusSTT.d.ts +6 -8
  65. package/lib/typescript/src/hooks/useCactusSTT.d.ts.map +1 -1
  66. package/lib/typescript/src/hooks/useCactusVAD.d.ts +15 -0
  67. package/lib/typescript/src/hooks/useCactusVAD.d.ts.map +1 -0
  68. package/lib/typescript/src/index.d.ts +7 -5
  69. package/lib/typescript/src/index.d.ts.map +1 -1
  70. package/lib/typescript/src/modelRegistry.d.ts +5 -0
  71. package/lib/typescript/src/modelRegistry.d.ts.map +1 -0
  72. package/lib/typescript/src/native/Cactus.d.ts +13 -11
  73. package/lib/typescript/src/native/Cactus.d.ts.map +1 -1
  74. package/lib/typescript/src/native/CactusIndex.d.ts +2 -2
  75. package/lib/typescript/src/native/CactusIndex.d.ts.map +1 -1
  76. package/lib/typescript/src/native/index.d.ts +0 -3
  77. package/lib/typescript/src/native/index.d.ts.map +1 -1
  78. package/lib/typescript/src/specs/Cactus.nitro.d.ts +7 -6
  79. package/lib/typescript/src/specs/Cactus.nitro.d.ts.map +1 -1
  80. package/lib/typescript/src/types/CactusIndex.d.ts +2 -2
  81. package/lib/typescript/src/types/CactusIndex.d.ts.map +1 -1
  82. package/lib/typescript/src/types/CactusLM.d.ts +19 -11
  83. package/lib/typescript/src/types/CactusLM.d.ts.map +1 -1
  84. package/lib/typescript/src/types/CactusSTT.d.ts +44 -12
  85. package/lib/typescript/src/types/CactusSTT.d.ts.map +1 -1
  86. package/lib/typescript/src/types/CactusVAD.d.ts +34 -0
  87. package/lib/typescript/src/types/CactusVAD.d.ts.map +1 -0
  88. package/lib/typescript/src/types/common.d.ts +1 -6
  89. package/lib/typescript/src/types/common.d.ts.map +1 -1
  90. package/nitro.json +0 -11
  91. package/nitrogen/generated/android/cactus+autolinking.cmake +0 -5
  92. package/nitrogen/generated/android/cactusOnLoad.cpp +0 -30
  93. package/nitrogen/generated/ios/Cactus-Swift-Cxx-Bridge.cpp +0 -50
  94. package/nitrogen/generated/ios/Cactus-Swift-Cxx-Bridge.hpp +9 -147
  95. package/nitrogen/generated/ios/Cactus-Swift-Cxx-Umbrella.hpp +0 -13
  96. package/nitrogen/generated/ios/CactusAutolinking.mm +0 -26
  97. package/nitrogen/generated/ios/CactusAutolinking.swift +0 -30
  98. package/nitrogen/generated/shared/c++/HybridCactusSpec.cpp +5 -4
  99. package/nitrogen/generated/shared/c++/HybridCactusSpec.hpp +7 -6
  100. package/package.json +3 -3
  101. package/src/classes/CactusLM.ts +18 -65
  102. package/src/classes/CactusSTT.ts +52 -90
  103. package/src/classes/CactusVAD.ts +129 -0
  104. package/src/hooks/useCactusLM.ts +14 -17
  105. package/src/hooks/useCactusSTT.ts +47 -98
  106. package/src/hooks/useCactusVAD.ts +215 -0
  107. package/src/index.tsx +21 -12
  108. package/src/modelRegistry.ts +65 -0
  109. package/src/native/Cactus.ts +131 -38
  110. package/src/native/CactusIndex.ts +2 -2
  111. package/src/native/index.ts +0 -3
  112. package/src/specs/Cactus.nitro.ts +16 -7
  113. package/src/types/CactusIndex.ts +2 -2
  114. package/src/types/CactusLM.ts +19 -11
  115. package/src/types/CactusSTT.ts +47 -13
  116. package/src/types/CactusVAD.ts +39 -0
  117. package/src/types/common.ts +1 -6
  118. package/android/src/main/java/com/margelo/nitro/cactus/HybridCactusCrypto.kt +0 -46
  119. package/android/src/main/java/com/margelo/nitro/cactus/HybridCactusDeviceInfo.kt +0 -27
  120. package/android/src/main/jniLibs/arm64-v8a/libcactus_util.a +0 -0
  121. package/cpp/HybridCactusUtil.cpp +0 -47
  122. package/cpp/HybridCactusUtil.hpp +0 -27
  123. package/cpp/cactus_util.h +0 -25
  124. package/ios/HybridCactusCrypto.swift +0 -37
  125. package/ios/HybridCactusDeviceInfo.swift +0 -32
  126. package/ios/cactus.xcframework/ios-arm64/cactus.framework/Headers/cactus_telemetry.h +0 -656
  127. package/ios/cactus.xcframework/ios-arm64-simulator/cactus.framework/Headers/cactus_telemetry.h +0 -656
  128. package/ios/cactus_util.xcframework/Info.plist +0 -39
  129. package/ios/cactus_util.xcframework/ios-arm64/cactus_util.framework/Headers/cactus_util.h +0 -25
  130. package/ios/cactus_util.xcframework/ios-arm64/cactus_util.framework/Headers/database.h +0 -27
  131. package/ios/cactus_util.xcframework/ios-arm64/cactus_util.framework/Headers/ios_utils.h +0 -10
  132. package/ios/cactus_util.xcframework/ios-arm64/cactus_util.framework/Headers/logging.h +0 -25
  133. package/ios/cactus_util.xcframework/ios-arm64/cactus_util.framework/Info.plist +0 -0
  134. package/ios/cactus_util.xcframework/ios-arm64/cactus_util.framework/cactus_util +0 -0
  135. package/ios/cactus_util.xcframework/ios-arm64-simulator/cactus_util.framework/Headers/cactus_util.h +0 -25
  136. package/ios/cactus_util.xcframework/ios-arm64-simulator/cactus_util.framework/Headers/database.h +0 -27
  137. package/ios/cactus_util.xcframework/ios-arm64-simulator/cactus_util.framework/Headers/ios_utils.h +0 -10
  138. package/ios/cactus_util.xcframework/ios-arm64-simulator/cactus_util.framework/Headers/logging.h +0 -25
  139. package/ios/cactus_util.xcframework/ios-arm64-simulator/cactus_util.framework/Info.plist +0 -0
  140. package/ios/cactus_util.xcframework/ios-arm64-simulator/cactus_util.framework/_CodeSignature/CodeResources +0 -135
  141. package/ios/cactus_util.xcframework/ios-arm64-simulator/cactus_util.framework/cactus_util +0 -0
  142. package/lib/module/api/Database.js +0 -45
  143. package/lib/module/api/Database.js.map +0 -1
  144. package/lib/module/api/RemoteLM.js +0 -201
  145. package/lib/module/api/RemoteLM.js.map +0 -1
  146. package/lib/module/config/CactusConfig.js +0 -12
  147. package/lib/module/config/CactusConfig.js.map +0 -1
  148. package/lib/module/models.js +0 -336
  149. package/lib/module/models.js.map +0 -1
  150. package/lib/module/native/CactusCrypto.js +0 -10
  151. package/lib/module/native/CactusCrypto.js.map +0 -1
  152. package/lib/module/native/CactusDeviceInfo.js +0 -13
  153. package/lib/module/native/CactusDeviceInfo.js.map +0 -1
  154. package/lib/module/native/CactusUtil.js +0 -36
  155. package/lib/module/native/CactusUtil.js.map +0 -1
  156. package/lib/module/specs/CactusCrypto.nitro.js +0 -4
  157. package/lib/module/specs/CactusCrypto.nitro.js.map +0 -1
  158. package/lib/module/specs/CactusDeviceInfo.nitro.js +0 -4
  159. package/lib/module/specs/CactusDeviceInfo.nitro.js.map +0 -1
  160. package/lib/module/specs/CactusUtil.nitro.js +0 -4
  161. package/lib/module/telemetry/Telemetry.js +0 -154
  162. package/lib/module/telemetry/Telemetry.js.map +0 -1
  163. package/lib/typescript/src/api/Database.d.ts +0 -12
  164. package/lib/typescript/src/api/Database.d.ts.map +0 -1
  165. package/lib/typescript/src/api/RemoteLM.d.ts +0 -14
  166. package/lib/typescript/src/api/RemoteLM.d.ts.map +0 -1
  167. package/lib/typescript/src/config/CactusConfig.d.ts +0 -7
  168. package/lib/typescript/src/config/CactusConfig.d.ts.map +0 -1
  169. package/lib/typescript/src/models.d.ts +0 -6
  170. package/lib/typescript/src/models.d.ts.map +0 -1
  171. package/lib/typescript/src/native/CactusCrypto.d.ts +0 -5
  172. package/lib/typescript/src/native/CactusCrypto.d.ts.map +0 -1
  173. package/lib/typescript/src/native/CactusDeviceInfo.d.ts +0 -7
  174. package/lib/typescript/src/native/CactusDeviceInfo.d.ts.map +0 -1
  175. package/lib/typescript/src/native/CactusUtil.d.ts +0 -6
  176. package/lib/typescript/src/native/CactusUtil.d.ts.map +0 -1
  177. package/lib/typescript/src/specs/CactusCrypto.nitro.d.ts +0 -8
  178. package/lib/typescript/src/specs/CactusCrypto.nitro.d.ts.map +0 -1
  179. package/lib/typescript/src/specs/CactusDeviceInfo.nitro.d.ts +0 -16
  180. package/lib/typescript/src/specs/CactusDeviceInfo.nitro.d.ts.map +0 -1
  181. package/lib/typescript/src/specs/CactusUtil.nitro.d.ts +0 -10
  182. package/lib/typescript/src/specs/CactusUtil.nitro.d.ts.map +0 -1
  183. package/lib/typescript/src/telemetry/Telemetry.d.ts +0 -34
  184. package/lib/typescript/src/telemetry/Telemetry.d.ts.map +0 -1
  185. package/nitrogen/generated/android/c++/JDeviceInfo.hpp +0 -74
  186. package/nitrogen/generated/android/c++/JHybridCactusCryptoSpec.cpp +0 -65
  187. package/nitrogen/generated/android/c++/JHybridCactusCryptoSpec.hpp +0 -65
  188. package/nitrogen/generated/android/c++/JHybridCactusDeviceInfoSpec.cpp +0 -85
  189. package/nitrogen/generated/android/c++/JHybridCactusDeviceInfoSpec.hpp +0 -66
  190. package/nitrogen/generated/android/kotlin/com/margelo/nitro/cactus/DeviceInfo.kt +0 -50
  191. package/nitrogen/generated/android/kotlin/com/margelo/nitro/cactus/HybridCactusCryptoSpec.kt +0 -58
  192. package/nitrogen/generated/android/kotlin/com/margelo/nitro/cactus/HybridCactusDeviceInfoSpec.kt +0 -62
  193. package/nitrogen/generated/ios/c++/HybridCactusCryptoSpecSwift.cpp +0 -11
  194. package/nitrogen/generated/ios/c++/HybridCactusCryptoSpecSwift.hpp +0 -77
  195. package/nitrogen/generated/ios/c++/HybridCactusDeviceInfoSpecSwift.cpp +0 -11
  196. package/nitrogen/generated/ios/c++/HybridCactusDeviceInfoSpecSwift.hpp +0 -88
  197. package/nitrogen/generated/ios/swift/DeviceInfo.swift +0 -98
  198. package/nitrogen/generated/ios/swift/Func_void_DeviceInfo.swift +0 -47
  199. package/nitrogen/generated/ios/swift/Func_void_std__optional_std__string_.swift +0 -54
  200. package/nitrogen/generated/ios/swift/HybridCactusCryptoSpec.swift +0 -57
  201. package/nitrogen/generated/ios/swift/HybridCactusCryptoSpec_cxx.swift +0 -139
  202. package/nitrogen/generated/ios/swift/HybridCactusDeviceInfoSpec.swift +0 -58
  203. package/nitrogen/generated/ios/swift/HybridCactusDeviceInfoSpec_cxx.swift +0 -164
  204. package/nitrogen/generated/shared/c++/DeviceInfo.hpp +0 -92
  205. package/nitrogen/generated/shared/c++/HybridCactusCryptoSpec.cpp +0 -21
  206. package/nitrogen/generated/shared/c++/HybridCactusCryptoSpec.hpp +0 -63
  207. package/nitrogen/generated/shared/c++/HybridCactusDeviceInfoSpec.cpp +0 -22
  208. package/nitrogen/generated/shared/c++/HybridCactusDeviceInfoSpec.hpp +0 -67
  209. package/nitrogen/generated/shared/c++/HybridCactusUtilSpec.cpp +0 -23
  210. package/nitrogen/generated/shared/c++/HybridCactusUtilSpec.hpp +0 -66
  211. package/src/api/Database.ts +0 -55
  212. package/src/api/RemoteLM.ts +0 -273
  213. package/src/config/CactusConfig.ts +0 -11
  214. package/src/models.ts +0 -344
  215. package/src/native/CactusCrypto.ts +0 -11
  216. package/src/native/CactusDeviceInfo.ts +0 -18
  217. package/src/native/CactusUtil.ts +0 -43
  218. package/src/specs/CactusCrypto.nitro.ts +0 -6
  219. package/src/specs/CactusDeviceInfo.nitro.ts +0 -15
  220. package/src/specs/CactusUtil.nitro.ts +0 -8
  221. package/src/telemetry/Telemetry.ts +0 -236
@@ -4,6 +4,8 @@
4
4
  #include <cstddef>
5
5
  #include <arm_neon.h>
6
6
 
7
+ enum class Precision;
8
+
7
9
  enum class ScalarOpType {
8
10
  ADD,
9
11
  SUBTRACT,
@@ -12,15 +14,17 @@ enum class ScalarOpType {
12
14
  EXP,
13
15
  SQRT,
14
16
  COS,
15
- SIN
17
+ SIN,
18
+ LOG
16
19
  };
17
20
 
18
- constexpr size_t KV_QUANT_GROUP_SIZE = 128;
21
+ constexpr size_t KV_QUANT_GROUP_SIZE = 32;
19
22
 
20
23
  void cactus_add_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
21
24
  void cactus_add_f16_clipped(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
22
25
  void cactus_subtract_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
23
26
  void cactus_multiply_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
27
+ void cactus_add_scaled_f16(const __fp16* base, const __fp16* src, __fp16* output, size_t num_elements, float scale);
24
28
  void cactus_divide_f16(const __fp16* a, const __fp16* b, __fp16* output, size_t num_elements);
25
29
 
26
30
  void cactus_add_broadcast_f16(const __fp16* a, const __fp16* b, __fp16* output,
@@ -38,14 +42,35 @@ void cactus_divide_broadcast_f16(const __fp16* a, const __fp16* b, __fp16* outpu
38
42
 
39
43
  void cactus_scalar_op_f16(const __fp16* input, __fp16* output, size_t num_elements, float scalar_value, ScalarOpType op_type);
40
44
 
45
+ void cactus_gemv_int8(const int8_t* A, float A_scale,
46
+ const int8_t* B, const __fp16* B_scales,
47
+ __fp16* C, size_t K, size_t N, size_t group_size);
48
+
49
+ void cactus_gemm_int8(const int8_t* A, const float* A_scales,
50
+ const int8_t* B, const __fp16* B_scales,
51
+ __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
52
+
41
53
  void cactus_matmul_int8(const int8_t* A, const float* A_scales,
42
54
  const int8_t* B, const __fp16* B_scales,
43
55
  __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
44
56
 
57
+ void cactus_gemv_int4(const int8_t* A, float A_scale,
58
+ const int8_t* B_packed, const __fp16* B_scales,
59
+ __fp16* C, size_t K, size_t N, size_t group_size);
60
+
61
+ void cactus_gemm_int4(const int8_t* A, const float* A_scales,
62
+ const int8_t* B_packed, const __fp16* B_scales,
63
+ __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
64
+
45
65
  void cactus_matmul_int4(const int8_t* A, const float* A_scales,
46
- const uint8_t* B_packed, const __fp16* B_scales,
66
+ const int8_t* B_packed, const __fp16* B_scales,
47
67
  __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
48
68
 
69
+ void cactus_matmul_integer(Precision precision,
70
+ const int8_t* A, const float* A_scales,
71
+ const int8_t* B, const __fp16* B_scales,
72
+ __fp16* C, size_t M, size_t K, size_t N, size_t group_size);
73
+
49
74
  void cactus_matmul_f16(const __fp16* a, const __fp16* b_transposed, __fp16* c,
50
75
  size_t M, size_t K, size_t N);
51
76
 
@@ -75,19 +100,70 @@ void cactus_rms_norm_f16(const __fp16* input, const __fp16* weight, __fp16* outp
75
100
  void cactus_rope_f16(const __fp16* input, __fp16* output, size_t batch_size, size_t seq_len,
76
101
  size_t num_heads, size_t head_dim, size_t start_pos, float theta);
77
102
 
103
+ void cactus_gpt_j_rope_f16(const __fp16* input, __fp16* output, size_t batch_size, size_t seq_len,
104
+ size_t num_heads, size_t head_dim, size_t rot_dim, size_t start_pos, float theta);
105
+
78
106
  void cactus_softmax_f16(const __fp16* input, __fp16* output, size_t batch_size,
79
107
  size_t seq_len, size_t vocab_size);
80
108
 
109
+ void cactus_relu_f16(const __fp16* input, __fp16* output, size_t num_elements);
110
+
81
111
  void cactus_silu_f16(const __fp16* input, __fp16* output, size_t num_elements);
82
112
 
83
113
  void cactus_gelu_f16(const __fp16* input, __fp16* output, size_t num_elements);
84
114
 
85
115
  void cactus_gelu_f16_erf(const __fp16* input, __fp16* output, size_t num_elements);
86
116
 
117
+ void cactus_sigmoid_f16(const __fp16* input, __fp16* output, size_t num_elements);
118
+
119
+ void cactus_tanh_f16(const __fp16* input, __fp16* output, size_t num_elements);
120
+
121
+ void cactus_glu_f16(
122
+ const __fp16* input,
123
+ __fp16* output,
124
+ size_t outer_size,
125
+ size_t split_size,
126
+ size_t inner_size
127
+ );
128
+
129
+ void cactus_glu_f32(
130
+ const float* input,
131
+ float* output,
132
+ size_t outer_size,
133
+ size_t split_size,
134
+ size_t inner_size
135
+ );
136
+
137
+ void cactus_batchnorm_f16(
138
+ const __fp16* input,
139
+ const float* weight,
140
+ const float* bias,
141
+ const float* running_mean,
142
+ const float* running_var,
143
+ __fp16* output,
144
+ size_t outer_size,
145
+ size_t channels,
146
+ size_t inner_size,
147
+ float epsilon
148
+ );
149
+
150
+ void cactus_batchnorm_f32(
151
+ const float* input,
152
+ const float* weight,
153
+ const float* bias,
154
+ const float* running_mean,
155
+ const float* running_var,
156
+ float* output,
157
+ size_t outer_size,
158
+ size_t channels,
159
+ size_t inner_size,
160
+ float epsilon
161
+ );
162
+
87
163
  void cactus_attention_f16(const __fp16* queries, const __fp16* keys, const __fp16* values, __fp16* output,
88
164
  size_t batch_size, size_t seq_len, size_t kv_seq_len, size_t num_q_heads, size_t num_kv_heads,
89
165
  size_t head_dim, float scale, const __fp16* mask, size_t position_offset = 0, size_t window_size = 0,
90
- bool is_causal = true);
166
+ bool is_causal = true, bool mask_is_additive = false, bool mask_per_head = false);
91
167
 
92
168
  void cactus_attention_hybrid_int8_fp16(
93
169
  const __fp16* queries,
@@ -100,7 +176,7 @@ void cactus_attention_hybrid_int8_fp16(
100
176
  __fp16* output,
101
177
  size_t batch_size, size_t seq_len, size_t cache_len, size_t new_len,
102
178
  size_t num_q_heads, size_t num_kv_heads, size_t head_dim,
103
- float scale, size_t position_offset = 0, bool is_causal = true,
179
+ float scale, size_t position_offset = 0, bool is_causal = true, size_t window_size = 0,
104
180
  size_t group_size = KV_QUANT_GROUP_SIZE);
105
181
 
106
182
  void cactus_conv1d_causal_depthwise_f16(
@@ -124,6 +200,96 @@ void cactus_conv1d_f16_k3(
124
200
  size_t stride
125
201
  );
126
202
 
203
+ void cactus_conv1d_f16(
204
+ const __fp16* input,
205
+ const __fp16* weight,
206
+ const __fp16* bias,
207
+ __fp16* output,
208
+ size_t N,
209
+ size_t L,
210
+ size_t C_in,
211
+ size_t C_out,
212
+ size_t K,
213
+ size_t stride
214
+ );
215
+
216
+ void cactus_stft_f16(
217
+ const __fp16* input,
218
+ const __fp16* weight,
219
+ __fp16* output,
220
+ size_t N, size_t L,
221
+ size_t C_in, size_t C_out,
222
+ size_t K, size_t stride,
223
+ size_t num_fft_bins
224
+ );
225
+
226
+ void cactus_conv1d_f16_k7s3_oc8(
227
+ const __fp16* input,
228
+ const __fp16* Wpack,
229
+ const __fp16* bias,
230
+ __fp16* output,
231
+ size_t N,
232
+ size_t L,
233
+ size_t C_in,
234
+ size_t C_out
235
+ );
236
+
237
+ void cactus_conv1d_same_depthwise_f16_k9(
238
+ const __fp16* input,
239
+ const __fp16* weight,
240
+ const __fp16* bias,
241
+ __fp16* output,
242
+ size_t N,
243
+ size_t L,
244
+ size_t C
245
+ );
246
+
247
+ void cactus_conv2d_f16_k3s2p1_nchw(
248
+ const __fp16* input,
249
+ const __fp16* weight,
250
+ const __fp16* bias,
251
+ __fp16* output,
252
+ size_t N,
253
+ size_t C_in,
254
+ size_t H,
255
+ size_t W,
256
+ size_t C_out
257
+ );
258
+
259
+ void cactus_conv2d_depthwise_f16_k3s2p1_nchw(
260
+ const __fp16* input,
261
+ const __fp16* weight,
262
+ const __fp16* bias,
263
+ __fp16* output,
264
+ size_t N,
265
+ size_t C,
266
+ size_t H,
267
+ size_t W
268
+ );
269
+
270
+ void cactus_conv2d_pointwise_f16_1x1_nchw_gemm(
271
+ const __fp16* input,
272
+ const __fp16* weight,
273
+ const __fp16* bias,
274
+ __fp16* output,
275
+ size_t N,
276
+ size_t C_in,
277
+ size_t H,
278
+ size_t W,
279
+ size_t C_out
280
+ );
281
+
282
+ void cactus_conv1d_pointwise_f16_gemm(
283
+ const __fp16* input,
284
+ const __fp16* weight,
285
+ const __fp16* bias,
286
+ __fp16* output,
287
+ size_t N,
288
+ size_t L,
289
+ size_t C_in,
290
+ size_t C_out
291
+ );
292
+
127
293
  void cactus_bilinear_interpolation_f16(const __fp16* input, __fp16* output, size_t src_height, size_t src_width, size_t embed_dim,
128
294
  size_t dst_height, size_t dst_width);
129
295
 
@@ -162,4 +328,19 @@ inline size_t kv_scales_count(size_t seq_len, size_t kv_heads, size_t head_dim,
162
328
 
163
329
  void cactus_unpack_int4_to_int8(const uint8_t* packed, int8_t* unpacked, size_t unpacked_count);
164
330
 
165
- #endif
331
+ void cactus_lstm_cell_f16(
332
+ const __fp16* x_input,
333
+ const __fp16* h_prev,
334
+ const __fp16* c_prev,
335
+ const __fp16* weight_ih,
336
+ const __fp16* weight_hh,
337
+ const __fp16* bias_ih,
338
+ const __fp16* bias_hh,
339
+ __fp16* h_new,
340
+ __fp16* c_new,
341
+ size_t batch_size,
342
+ size_t input_size,
343
+ size_t hidden_size
344
+ );
345
+
346
+ #endif
@@ -4,6 +4,7 @@
4
4
  #include <arm_neon.h>
5
5
  #if defined(__APPLE__)
6
6
  #include <TargetConditionals.h>
7
+ #include <sys/sysctl.h>
7
8
  #endif
8
9
  #if defined(__ANDROID__)
9
10
  #include <sys/auxv.h>
@@ -43,58 +44,33 @@ inline void stream_store_f16x8(__fp16* dst, float16x8_t val) {
43
44
  #endif
44
45
  }
45
46
 
46
- #if defined(__ARM_FEATURE_DOTPROD)
47
- inline int32x4_t accum_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
48
- return vdotq_s32(acc, a, b);
49
- }
50
- #else
51
- inline int32x4_t accum_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
52
- int16x8_t prod_low = vmull_s8(vget_low_s8(a), vget_low_s8(b));
53
- int32x4_t acc_high = vpaddlq_s16(vmull_s8(vget_high_s8(a), vget_high_s8(b)));
54
- return vaddq_s32(vaddq_s32(acc, vpaddlq_s16(prod_low)), acc_high);
55
- }
56
- #endif
57
-
58
- // I8MM support: runtime detection on Android, compile-time on Apple
59
- #if defined(__ANDROID__) && defined(__aarch64__)
60
-
61
- inline bool cactus_has_i8mm() {
62
- static int8_t supported = -1;
63
- if (supported == -1) {
64
- unsigned long hwcaps = getauxval(AT_HWCAP2);
65
- supported = (hwcaps & HWCAP2_I8MM) ? 1 : 0;
66
- }
67
- return supported;
68
- }
69
-
70
- __attribute__((target("arch=armv8.2-a+i8mm")))
71
- inline int32x4_t accum_matmul(int32x4_t acc, int8x16_t a, int8x16_t b) {
72
- return vmmlaq_s32(acc, a, b);
73
- }
74
-
75
- #elif defined(__APPLE__) && defined(__aarch64__)
76
-
77
- inline bool cactus_has_i8mm() {
78
- return true;
79
- }
47
+ inline bool cpu_has_sme2() {
48
+ #if defined(__aarch64__)
49
+ static std::once_flag once;
50
+ static bool has = false;
51
+
52
+ std::call_once(once, []() {
80
53
 
81
- __attribute__((target("i8mm")))
82
- inline int32x4_t accum_matmul(int32x4_t acc, int8x16_t a, int8x16_t b) {
83
- return vmmlaq_s32(acc, a, b);
84
- }
54
+ #if defined(__APPLE__)
55
+ int ret = 0;
56
+ size_t size = sizeof(ret);
57
+ if (sysctlbyname("hw.optional.arm.FEAT_SME2", &ret, &size, nullptr, 0) == 0) {
58
+ has = ret == 1;
59
+ }
60
+
61
+ #elif defined(__ANDROID__)
62
+ unsigned long hwcap2 = getauxval(AT_HWCAP2);
63
+ #ifdef HWCAP2_SME2
64
+ has = (hwcap2 & HWCAP2_SME2) != 0;
65
+ #endif
85
66
 
67
+ #endif
68
+ });
69
+
70
+ return has;
86
71
  #else
87
-
88
- inline bool cactus_has_i8mm() {
89
- return false;
90
- }
91
-
72
+ return false;
92
73
  #endif
93
-
94
- inline float16x8_t accum_f16_dot(float16x8_t acc, float16x8_t a_low, float16x8_t a_high,
95
- float16x8_t b_low, float16x8_t b_high) {
96
- acc = vfmaq_f16(acc, a_low, b_low);
97
- return vfmaq_f16(acc, a_high, b_high);
98
74
  }
99
75
 
100
76
  inline float32x4_t fast_exp_f32x4(float32x4_t x) {
@@ -154,100 +130,10 @@ inline float32x4_t fast_tanh_f32x4(float32x4_t x) {
154
130
  return result;
155
131
  }
156
132
 
157
- inline int8x16_t unpack_int4_lo(uint8x16_t packed) {
158
- uint8x16_t lo = vandq_u8(packed, vdupq_n_u8(0x0F));
159
- uint8x16_t sign_mask = vcgtq_u8(lo, vdupq_n_u8(7));
160
- uint8x16_t correction = vandq_u8(sign_mask, vdupq_n_u8(16));
161
- return vreinterpretq_s8_u8(vsubq_u8(lo, correction));
162
- }
163
-
164
- inline int8x16_t unpack_int4_hi(uint8x16_t packed) {
165
- uint8x16_t hi = vshrq_n_u8(packed, 4);
166
- uint8x16_t sign_mask = vcgtq_u8(hi, vdupq_n_u8(7));
167
- uint8x16_t correction = vandq_u8(sign_mask, vdupq_n_u8(16));
168
- return vreinterpretq_s8_u8(vsubq_u8(hi, correction));
169
- }
170
-
171
- inline void unpack_int4_to_int8x32(uint8x16_t packed, int8x16_t& out_lo, int8x16_t& out_hi) {
172
- int8x16_t lo_nibbles = unpack_int4_lo(packed);
173
- int8x16_t hi_nibbles = unpack_int4_hi(packed);
174
- int8x16x2_t interleaved = vzipq_s8(lo_nibbles, hi_nibbles);
175
- out_lo = interleaved.val[0];
176
- out_hi = interleaved.val[1];
177
- }
178
-
179
- inline int32x4_t int4_dot_asm(int32x4_t acc, uint8x16_t packed, int8x16_t a_lo, int8x16_t a_hi) {
180
- #if defined(__aarch64__)
181
- int8x16_t b_lo, b_hi;
182
-
183
- __asm__ __volatile__ (
184
- "movi v16.16b, #0x0F \n" // low nibble mask
185
- "movi v17.16b, #7 \n" // sign threshold
186
- "movi v18.16b, #16 \n" // sign correction
187
-
188
- "and %[b_lo].16b, %[packed].16b, v16.16b \n"
189
-
190
- "ushr %[b_hi].16b, %[packed].16b, #4 \n"
191
-
192
- "cmgt v19.16b, %[b_lo].16b, v17.16b \n"
193
- "and v19.16b, v19.16b, v18.16b \n"
194
- "sub %[b_lo].16b, %[b_lo].16b, v19.16b \n"
195
-
196
- "cmgt v20.16b, %[b_hi].16b, v17.16b \n"
197
- "and v20.16b, v20.16b, v18.16b \n"
198
- "sub %[b_hi].16b, %[b_hi].16b, v20.16b \n"
199
-
200
- "zip1 v21.16b, %[b_lo].16b, %[b_hi].16b \n"
201
- "zip2 v22.16b, %[b_lo].16b, %[b_hi].16b \n"
202
-
203
- ".arch armv8.2-a+dotprod \n"
204
- "sdot %[acc].4s, %[a_lo].16b, v21.16b \n"
205
- "sdot %[acc].4s, %[a_hi].16b, v22.16b \n"
206
-
207
- : [acc] "+w"(acc), [b_lo] "=w"(b_lo), [b_hi] "=w"(b_hi)
208
- : [packed] "w"(packed), [a_lo] "w"(a_lo), [a_hi] "w"(a_hi)
209
- : "v16", "v17", "v18", "v19", "v20", "v21", "v22"
210
- );
211
-
212
- return acc;
213
- #else
214
- int8x16_t b_lo, b_hi;
215
- unpack_int4_to_int8x32(packed, b_lo, b_hi);
216
- acc = accum_dot(acc, a_lo, b_lo);
217
- acc = accum_dot(acc, a_hi, b_hi);
218
- return acc;
219
- #endif
220
- }
221
-
222
- inline int32_t int4_dot_m1_asm(const int8_t* a_ptr, const uint8_t* b_packed, size_t group_size) {
223
- #if defined(__aarch64__)
224
- int32x4_t acc = vdupq_n_s32(0);
225
-
226
- for (size_t k = 0; k < group_size; k += 64) {
227
- uint8x16_t p0 = vld1q_u8(b_packed + k/2);
228
- uint8x16_t p1 = vld1q_u8(b_packed + k/2 + 16);
229
-
230
- int8x16_t a0 = vld1q_s8(a_ptr + k);
231
- int8x16_t a1 = vld1q_s8(a_ptr + k + 16);
232
- int8x16_t a2 = vld1q_s8(a_ptr + k + 32);
233
- int8x16_t a3 = vld1q_s8(a_ptr + k + 48);
234
-
235
- acc = int4_dot_asm(acc, p0, a0, a1);
236
- acc = int4_dot_asm(acc, p1, a2, a3);
237
- }
238
-
239
- return vaddvq_s32(acc);
240
- #else
241
- int32x4_t acc = vdupq_n_s32(0);
242
- for (size_t k = 0; k < group_size; k += 32) {
243
- uint8x16_t packed = vld1q_u8(b_packed + k/2);
244
- int8x16_t b_lo, b_hi;
245
- unpack_int4_to_int8x32(packed, b_lo, b_hi);
246
- acc = accum_dot(acc, vld1q_s8(a_ptr + k), b_lo);
247
- acc = accum_dot(acc, vld1q_s8(a_ptr + k + 16), b_hi);
248
- }
249
- return vaddvq_s32(acc);
250
- #endif
133
+ inline void unpack_int4_as_int8x16x2(const uint8_t* ptr, int8x16_t& high_decoded, int8x16_t& low_decoded) {
134
+ int8x16_t packed = vreinterpretq_s8_u8(vld1q_u8(ptr));
135
+ high_decoded = vshrq_n_s8(packed, 4);
136
+ low_decoded = vshrq_n_s8(vshlq_n_s8(packed, 4), 4);
251
137
  }
252
138
 
253
139
  namespace CactusThreading {
@@ -431,18 +317,32 @@ namespace CactusThreading {
431
317
  struct GemmThreading {
432
318
  #if defined(__ANDROID__)
433
319
  static size_t get_num_threads(size_t M, size_t pool_size) {
434
- if (M <= 1) return 1;
435
- return pool_size;
320
+ if (M <= 1) return 1;
321
+ return pool_size;
322
+ }
323
+ static size_t get_gemv_threads(size_t /*N_blocks*/, size_t /*pool_size*/) {
324
+ return 1;
436
325
  }
437
326
  #elif defined(__APPLE__) && TARGET_OS_IPHONE
327
+ static constexpr size_t GEMV_MIN_N_BLOCKS = 512;
438
328
  static size_t get_num_threads(size_t M, size_t pool_size) {
439
- if (M <= 1) return std::min(pool_size, static_cast<size_t>(2));
440
- return pool_size;
329
+ if (M <= 1) return std::min(pool_size, static_cast<size_t>(2));
330
+ return pool_size;
331
+ }
332
+ static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) {
333
+ if (N_blocks < GEMV_MIN_N_BLOCKS) return 1;
334
+ return std::min(pool_size, static_cast<size_t>(3));
441
335
  }
442
- #else // Mac
336
+ #else
337
+ static constexpr size_t GEMV_MIN_N_BLOCKS = 256;
443
338
  static size_t get_num_threads(size_t M, size_t pool_size) {
444
339
  if (M <= 1) return std::min(pool_size, static_cast<size_t>(4));
445
- return pool_size;
340
+ return pool_size;
341
+ }
342
+ static size_t get_gemv_threads(size_t N_blocks, size_t pool_size) {
343
+ if (N_blocks < GEMV_MIN_N_BLOCKS) return 1;
344
+ if (N_blocks < 512) return std::min(pool_size, static_cast<size_t>(2));
345
+ return std::min(pool_size, static_cast<size_t>(5));
446
346
  }
447
347
  #endif
448
348
  };
@@ -599,4 +499,4 @@ namespace CactusThreading {
599
499
  }
600
500
 
601
501
 
602
- #endif // KERNEL_UTILS_H
502
+ #endif // KERNEL_UTILS_H
@@ -7,7 +7,6 @@
7
7
  #include "engine/engine.h"
8
8
  #include "models/model.h"
9
9
  #include "ffi/cactus_ffi.h"
10
- #include "ffi/cactus_telemetry.h"
11
10
  #include "npu/npu.h"
12
11
 
13
12
  #endif // CACTUS_H
@@ -0,0 +1,48 @@
1
+ #ifndef CACTUS_CLOUD_H
2
+ #define CACTUS_CLOUD_H
3
+
4
+ #include "cactus_utils.h"
5
+ #include <string>
6
+ #include <vector>
7
+
8
+ namespace cactus {
9
+ namespace ffi {
10
+
11
+ struct CloudResponse {
12
+ std::string transcript;
13
+ std::string api_key_hash;
14
+ bool used_cloud = false;
15
+ std::string error;
16
+ };
17
+
18
+ struct CloudCompletionRequest {
19
+ std::vector<cactus::engine::ChatMessage> messages;
20
+ std::vector<ToolFunction> tools;
21
+ std::string local_output;
22
+ std::vector<std::string> local_function_calls;
23
+ bool has_images = false;
24
+ std::string cloud_key;
25
+ };
26
+
27
+ struct CloudCompletionResult {
28
+ bool ok = false;
29
+ bool used_cloud = false;
30
+ std::string response;
31
+ std::vector<std::string> function_calls;
32
+ std::string error;
33
+ };
34
+
35
+ std::string cloud_base64_encode(const uint8_t* data, size_t len);
36
+ std::vector<uint8_t> cloud_build_wav(const uint8_t* pcm, size_t pcm_bytes);
37
+ std::string resolve_cloud_api_key(const char* cloud_key_param);
38
+ CloudResponse cloud_transcribe_request(const std::string& audio_b64,
39
+ const std::string& fallback_text,
40
+ long timeout_seconds = 15L,
41
+ const char* cloud_key = nullptr);
42
+ CloudCompletionResult cloud_complete_request(const CloudCompletionRequest& request,
43
+ long timeout_ms);
44
+
45
+ } // namespace ffi
46
+ } // namespace cactus
47
+
48
+ #endif // CACTUS_CLOUD_H