mlx-cpu 0.30.1__py3-none-manylinux_2_35_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. mlx/__main__.py +27 -0
  2. mlx/_reprlib_fix.py +16 -0
  3. mlx/extension.py +88 -0
  4. mlx/include/mlx/3rdparty/pocketfft.h +3581 -0
  5. mlx/include/mlx/allocator.h +73 -0
  6. mlx/include/mlx/array.h +645 -0
  7. mlx/include/mlx/backend/common/binary.h +97 -0
  8. mlx/include/mlx/backend/common/broadcasting.h +11 -0
  9. mlx/include/mlx/backend/common/buffer_cache.h +157 -0
  10. mlx/include/mlx/backend/common/compiled.h +77 -0
  11. mlx/include/mlx/backend/common/copy.h +50 -0
  12. mlx/include/mlx/backend/common/hadamard.h +109 -0
  13. mlx/include/mlx/backend/common/matmul.h +67 -0
  14. mlx/include/mlx/backend/common/reduce.h +59 -0
  15. mlx/include/mlx/backend/common/slicing.h +20 -0
  16. mlx/include/mlx/backend/common/ternary.h +85 -0
  17. mlx/include/mlx/backend/common/unary.h +29 -0
  18. mlx/include/mlx/backend/common/utils.h +205 -0
  19. mlx/include/mlx/backend/cpu/arange.h +28 -0
  20. mlx/include/mlx/backend/cpu/available.h +9 -0
  21. mlx/include/mlx/backend/cpu/binary.h +517 -0
  22. mlx/include/mlx/backend/cpu/binary_ops.h +98 -0
  23. mlx/include/mlx/backend/cpu/binary_two.h +166 -0
  24. mlx/include/mlx/backend/cpu/compiled_preamble.h +12 -0
  25. mlx/include/mlx/backend/cpu/copy.h +36 -0
  26. mlx/include/mlx/backend/cpu/encoder.h +67 -0
  27. mlx/include/mlx/backend/cpu/eval.h +12 -0
  28. mlx/include/mlx/backend/cpu/gemm.h +26 -0
  29. mlx/include/mlx/backend/cpu/gemms/simd_gemm.h +139 -0
  30. mlx/include/mlx/backend/cpu/jit_compiler.h +20 -0
  31. mlx/include/mlx/backend/cpu/lapack.h +80 -0
  32. mlx/include/mlx/backend/cpu/simd/accelerate_fp16_simd.h +56 -0
  33. mlx/include/mlx/backend/cpu/simd/accelerate_simd.h +329 -0
  34. mlx/include/mlx/backend/cpu/simd/base_simd.h +295 -0
  35. mlx/include/mlx/backend/cpu/simd/math.h +193 -0
  36. mlx/include/mlx/backend/cpu/simd/neon_fp16_simd.h +212 -0
  37. mlx/include/mlx/backend/cpu/simd/simd.h +4 -0
  38. mlx/include/mlx/backend/cpu/simd/type.h +11 -0
  39. mlx/include/mlx/backend/cpu/slicing.h +21 -0
  40. mlx/include/mlx/backend/cpu/ternary.h +154 -0
  41. mlx/include/mlx/backend/cpu/threefry.h +21 -0
  42. mlx/include/mlx/backend/cpu/unary.h +281 -0
  43. mlx/include/mlx/backend/cpu/unary_ops.h +180 -0
  44. mlx/include/mlx/backend/cuda/allocator.h +89 -0
  45. mlx/include/mlx/backend/cuda/conv/conv.h +126 -0
  46. mlx/include/mlx/backend/cuda/cublas_utils.h +96 -0
  47. mlx/include/mlx/backend/cuda/cuda.h +10 -0
  48. mlx/include/mlx/backend/cuda/cuda_utils.h +89 -0
  49. mlx/include/mlx/backend/cuda/cudnn_utils.h +171 -0
  50. mlx/include/mlx/backend/cuda/device/config.h +12 -0
  51. mlx/include/mlx/backend/cuda/device.h +189 -0
  52. mlx/include/mlx/backend/cuda/event.h +78 -0
  53. mlx/include/mlx/backend/cuda/gemms/cublas_gemm.h +114 -0
  54. mlx/include/mlx/backend/cuda/gemms/gemv.h +24 -0
  55. mlx/include/mlx/backend/cuda/jit_module.h +119 -0
  56. mlx/include/mlx/backend/cuda/lru_cache.h +189 -0
  57. mlx/include/mlx/backend/cuda/quantized/cublas_qqmm.h +88 -0
  58. mlx/include/mlx/backend/cuda/quantized/cuda_fp4.h +83 -0
  59. mlx/include/mlx/backend/cuda/quantized/qqmm_utils.h +30 -0
  60. mlx/include/mlx/backend/cuda/quantized/quantized.h +45 -0
  61. mlx/include/mlx/backend/cuda/utils.h +46 -0
  62. mlx/include/mlx/backend/cuda/worker.h +55 -0
  63. mlx/include/mlx/backend/gpu/available.h +9 -0
  64. mlx/include/mlx/backend/gpu/copy.h +57 -0
  65. mlx/include/mlx/backend/gpu/eval.h +18 -0
  66. mlx/include/mlx/backend/gpu/slicing.h +36 -0
  67. mlx/include/mlx/backend/metal/allocator.h +79 -0
  68. mlx/include/mlx/backend/metal/binary.h +33 -0
  69. mlx/include/mlx/backend/metal/device.h +283 -0
  70. mlx/include/mlx/backend/metal/jit/includes.h +57 -0
  71. mlx/include/mlx/backend/metal/jit/indexing.h +76 -0
  72. mlx/include/mlx/backend/metal/kernels/arange.h +9 -0
  73. mlx/include/mlx/backend/metal/kernels/atomic.h +345 -0
  74. mlx/include/mlx/backend/metal/kernels/bf16.h +16 -0
  75. mlx/include/mlx/backend/metal/kernels/bf16_math.h +380 -0
  76. mlx/include/mlx/backend/metal/kernels/binary.h +199 -0
  77. mlx/include/mlx/backend/metal/kernels/binary_ops.h +326 -0
  78. mlx/include/mlx/backend/metal/kernels/binary_two.h +244 -0
  79. mlx/include/mlx/backend/metal/kernels/cexpf.h +134 -0
  80. mlx/include/mlx/backend/metal/kernels/complex.h +173 -0
  81. mlx/include/mlx/backend/metal/kernels/copy.h +276 -0
  82. mlx/include/mlx/backend/metal/kernels/defines.h +24 -0
  83. mlx/include/mlx/backend/metal/kernels/erf.h +69 -0
  84. mlx/include/mlx/backend/metal/kernels/expm1f.h +90 -0
  85. mlx/include/mlx/backend/metal/kernels/fft/radix.h +328 -0
  86. mlx/include/mlx/backend/metal/kernels/fft/readwrite.h +624 -0
  87. mlx/include/mlx/backend/metal/kernels/fft.h +486 -0
  88. mlx/include/mlx/backend/metal/kernels/fp4.h +59 -0
  89. mlx/include/mlx/backend/metal/kernels/fp8.h +82 -0
  90. mlx/include/mlx/backend/metal/kernels/fp_quantized.h +1804 -0
  91. mlx/include/mlx/backend/metal/kernels/fp_quantized_nax.h +1059 -0
  92. mlx/include/mlx/backend/metal/kernels/gemv_masked.h +827 -0
  93. mlx/include/mlx/backend/metal/kernels/hadamard.h +182 -0
  94. mlx/include/mlx/backend/metal/kernels/indexing/gather.h +51 -0
  95. mlx/include/mlx/backend/metal/kernels/indexing/gather_axis.h +44 -0
  96. mlx/include/mlx/backend/metal/kernels/indexing/gather_front.h +24 -0
  97. mlx/include/mlx/backend/metal/kernels/indexing/indexing.h +23 -0
  98. mlx/include/mlx/backend/metal/kernels/indexing/masked_scatter.h +38 -0
  99. mlx/include/mlx/backend/metal/kernels/indexing/scatter.h +59 -0
  100. mlx/include/mlx/backend/metal/kernels/indexing/scatter_axis.h +52 -0
  101. mlx/include/mlx/backend/metal/kernels/logsumexp.h +140 -0
  102. mlx/include/mlx/backend/metal/kernels/quantized.h +2502 -0
  103. mlx/include/mlx/backend/metal/kernels/quantized_nax.h +1705 -0
  104. mlx/include/mlx/backend/metal/kernels/quantized_utils.h +90 -0
  105. mlx/include/mlx/backend/metal/kernels/reduce.h +5 -0
  106. mlx/include/mlx/backend/metal/kernels/reduce_utils.h +6 -0
  107. mlx/include/mlx/backend/metal/kernels/reduction/ops.h +275 -0
  108. mlx/include/mlx/backend/metal/kernels/reduction/reduce_all.h +66 -0
  109. mlx/include/mlx/backend/metal/kernels/reduction/reduce_col.h +398 -0
  110. mlx/include/mlx/backend/metal/kernels/reduction/reduce_init.h +8 -0
  111. mlx/include/mlx/backend/metal/kernels/reduction/reduce_row.h +369 -0
  112. mlx/include/mlx/backend/metal/kernels/scan.h +514 -0
  113. mlx/include/mlx/backend/metal/kernels/sdpa_vector.h +415 -0
  114. mlx/include/mlx/backend/metal/kernels/softmax.h +190 -0
  115. mlx/include/mlx/backend/metal/kernels/sort.h +715 -0
  116. mlx/include/mlx/backend/metal/kernels/steel/attn/attn.h +296 -0
  117. mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h +476 -0
  118. mlx/include/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_nax.h +481 -0
  119. mlx/include/mlx/backend/metal/kernels/steel/attn/loader.h +264 -0
  120. mlx/include/mlx/backend/metal/kernels/steel/attn/mma.h +750 -0
  121. mlx/include/mlx/backend/metal/kernels/steel/attn/nax.h +1076 -0
  122. mlx/include/mlx/backend/metal/kernels/steel/attn/params.h +44 -0
  123. mlx/include/mlx/backend/metal/kernels/steel/attn/transforms.h +71 -0
  124. mlx/include/mlx/backend/metal/kernels/steel/conv/conv.h +13 -0
  125. mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv.h +176 -0
  126. mlx/include/mlx/backend/metal/kernels/steel/conv/kernels/steel_conv_general.h +225 -0
  127. mlx/include/mlx/backend/metal/kernels/steel/conv/loader.h +6 -0
  128. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_l.h +451 -0
  129. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_channel_n.h +319 -0
  130. mlx/include/mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h +381 -0
  131. mlx/include/mlx/backend/metal/kernels/steel/conv/params.h +62 -0
  132. mlx/include/mlx/backend/metal/kernels/steel/defines.h +7 -0
  133. mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm.h +295 -0
  134. mlx/include/mlx/backend/metal/kernels/steel/gemm/gemm_nax.h +156 -0
  135. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused.h +346 -0
  136. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_fused_nax.h +207 -0
  137. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather.h +459 -0
  138. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_gather_nax.h +132 -0
  139. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h +719 -0
  140. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_segmented.h +266 -0
  141. mlx/include/mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_splitk.h +227 -0
  142. mlx/include/mlx/backend/metal/kernels/steel/gemm/loader.h +137 -0
  143. mlx/include/mlx/backend/metal/kernels/steel/gemm/mma.h +1146 -0
  144. mlx/include/mlx/backend/metal/kernels/steel/gemm/nax.h +1084 -0
  145. mlx/include/mlx/backend/metal/kernels/steel/gemm/params.h +64 -0
  146. mlx/include/mlx/backend/metal/kernels/steel/gemm/transforms.h +72 -0
  147. mlx/include/mlx/backend/metal/kernels/steel/utils/integral_constant.h +134 -0
  148. mlx/include/mlx/backend/metal/kernels/steel/utils/type_traits.h +55 -0
  149. mlx/include/mlx/backend/metal/kernels/steel/utils.h +42 -0
  150. mlx/include/mlx/backend/metal/kernels/ternary.h +145 -0
  151. mlx/include/mlx/backend/metal/kernels/ternary_ops.h +10 -0
  152. mlx/include/mlx/backend/metal/kernels/unary.h +63 -0
  153. mlx/include/mlx/backend/metal/kernels/unary_ops.h +454 -0
  154. mlx/include/mlx/backend/metal/kernels/utils.h +444 -0
  155. mlx/include/mlx/backend/metal/matmul.h +144 -0
  156. mlx/include/mlx/backend/metal/metal.h +22 -0
  157. mlx/include/mlx/backend/metal/reduce.h +41 -0
  158. mlx/include/mlx/backend/metal/resident.h +32 -0
  159. mlx/include/mlx/backend/metal/scan.h +17 -0
  160. mlx/include/mlx/backend/metal/ternary.h +21 -0
  161. mlx/include/mlx/backend/metal/unary.h +21 -0
  162. mlx/include/mlx/backend/metal/utils.h +84 -0
  163. mlx/include/mlx/backend/no_gpu/apple_memory.h +16 -0
  164. mlx/include/mlx/backend/no_gpu/linux_memory.h +22 -0
  165. mlx/include/mlx/compile.h +44 -0
  166. mlx/include/mlx/compile_impl.h +69 -0
  167. mlx/include/mlx/device.h +31 -0
  168. mlx/include/mlx/distributed/distributed.h +60 -0
  169. mlx/include/mlx/distributed/distributed_impl.h +59 -0
  170. mlx/include/mlx/distributed/jaccl/jaccl.h +12 -0
  171. mlx/include/mlx/distributed/mpi/mpi.h +12 -0
  172. mlx/include/mlx/distributed/mpi/mpi_declarations.h +28 -0
  173. mlx/include/mlx/distributed/nccl/nccl.h +12 -0
  174. mlx/include/mlx/distributed/ops.h +56 -0
  175. mlx/include/mlx/distributed/primitives.h +156 -0
  176. mlx/include/mlx/distributed/reduction_ops.h +38 -0
  177. mlx/include/mlx/distributed/ring/ring.h +12 -0
  178. mlx/include/mlx/distributed/utils.h +67 -0
  179. mlx/include/mlx/dtype.h +115 -0
  180. mlx/include/mlx/dtype_utils.h +119 -0
  181. mlx/include/mlx/einsum.h +22 -0
  182. mlx/include/mlx/event.h +58 -0
  183. mlx/include/mlx/export.h +136 -0
  184. mlx/include/mlx/export_impl.h +98 -0
  185. mlx/include/mlx/fast.h +102 -0
  186. mlx/include/mlx/fast_primitives.h +427 -0
  187. mlx/include/mlx/fence.h +39 -0
  188. mlx/include/mlx/fft.h +167 -0
  189. mlx/include/mlx/graph_utils.h +66 -0
  190. mlx/include/mlx/io/gguf.h +20 -0
  191. mlx/include/mlx/io/load.h +175 -0
  192. mlx/include/mlx/io.h +61 -0
  193. mlx/include/mlx/linalg.h +111 -0
  194. mlx/include/mlx/memory.h +78 -0
  195. mlx/include/mlx/mlx.h +25 -0
  196. mlx/include/mlx/ops.h +1627 -0
  197. mlx/include/mlx/primitives.h +2524 -0
  198. mlx/include/mlx/random.h +282 -0
  199. mlx/include/mlx/scheduler.h +188 -0
  200. mlx/include/mlx/small_vector.h +540 -0
  201. mlx/include/mlx/stream.h +41 -0
  202. mlx/include/mlx/threadpool.h +133 -0
  203. mlx/include/mlx/transforms.h +229 -0
  204. mlx/include/mlx/transforms_impl.h +86 -0
  205. mlx/include/mlx/types/bf16.h +187 -0
  206. mlx/include/mlx/types/complex.h +113 -0
  207. mlx/include/mlx/types/fp16.h +234 -0
  208. mlx/include/mlx/types/half_types.h +58 -0
  209. mlx/include/mlx/types/limits.h +70 -0
  210. mlx/include/mlx/utils.h +175 -0
  211. mlx/include/mlx/version.h +20 -0
  212. mlx/lib/libmlx.so +0 -0
  213. mlx/py.typed +1 -0
  214. mlx/share/cmake/MLX/FindNCCL.cmake +54 -0
  215. mlx/share/cmake/MLX/Findnvpl.cmake +3 -0
  216. mlx/share/cmake/MLX/MLXConfig.cmake +66 -0
  217. mlx/share/cmake/MLX/MLXConfigVersion.cmake +65 -0
  218. mlx/share/cmake/MLX/MLXTargets-release.cmake +19 -0
  219. mlx/share/cmake/MLX/MLXTargets.cmake +106 -0
  220. mlx/share/cmake/MLX/extension.cmake +50 -0
  221. mlx/utils.py +325 -0
  222. mlx_cpu-0.30.1.dist-info/METADATA +142 -0
  223. mlx_cpu-0.30.1.dist-info/RECORD +231 -0
  224. mlx_cpu-0.30.1.dist-info/WHEEL +5 -0
  225. mlx_cpu-0.30.1.dist-info/licenses/LICENSE +21 -0
  226. mlx_cpu-0.30.1.dist-info/sboms/auditwheel.cdx.json +1 -0
  227. mlx_cpu-0.30.1.dist-info/top_level.txt +1 -0
  228. mlx_cpu.libs/libblas-bd8a282c.so.3.10.0 +0 -0
  229. mlx_cpu.libs/libgfortran-3ec47101.so.5.0.0 +0 -0
  230. mlx_cpu.libs/liblapack-86b2c207.so.3.10.0 +0 -0
  231. mlx_cpu.libs/libquadmath-67d31475.so.0.0.0 +0 -0
@@ -0,0 +1,715 @@
1
+ // Copyright © 2023-2024 Apple Inc.
2
+
3
+ #define MLX_MTL_CONST static constant constexpr const
4
+ #define MLX_MTL_LOOP_UNROLL _Pragma("clang loop unroll(full)")
5
+
6
+ using namespace metal;
7
+
8
+ // Based on GPU merge sort algorithm at
9
+ // https://github.com/NVIDIA/cccl/tree/main/cub/cub
10
+
11
+ ///////////////////////////////////////////////////////////////////////////////
12
+ // Thread-level sort
13
+ ///////////////////////////////////////////////////////////////////////////////
14
+
15
+ template <typename T>
16
+ METAL_FUNC void thread_swap(thread T& a, thread T& b) {
17
+ T w = a;
18
+ a = b;
19
+ b = w;
20
+ }
21
+
22
+ template <typename T, typename = void>
23
+ struct Init {
24
+ static constexpr constant T v = Limits<T>::max;
25
+ };
26
+
27
+ template <typename T>
28
+ struct Init<T, metal::enable_if_t<metal::is_floating_point_v<T>>> {
29
+ static constexpr constant T v = metal::numeric_limits<T>::quiet_NaN();
30
+ };
31
+
32
+ template <typename T>
33
+ struct LessThan {
34
+ static constexpr constant T init = Init<T>::v;
35
+ METAL_FUNC bool operator()(T a, T b) const {
36
+ if constexpr (
37
+ metal::is_floating_point_v<T> || metal::is_same_v<T, complex64_t>) {
38
+ bool an = isnan(a);
39
+ bool bn = isnan(b);
40
+ if (an | bn) {
41
+ return (!an) & bn;
42
+ }
43
+ }
44
+ return a < b;
45
+ }
46
+ };
47
+
48
+ template <
49
+ typename ValT,
50
+ typename IdxT,
51
+ bool ARG_SORT,
52
+ short N_PER_THREAD,
53
+ typename CompareOp>
54
+ struct ThreadSort {
55
+ static METAL_FUNC void sort(
56
+ thread ValT (&vals)[N_PER_THREAD],
57
+ thread IdxT (&idxs)[N_PER_THREAD]) {
58
+ CompareOp op;
59
+ MLX_MTL_LOOP_UNROLL
60
+ for (short i = 0; i < N_PER_THREAD; ++i) {
61
+ MLX_MTL_LOOP_UNROLL
62
+ for (short j = i & 1; j < N_PER_THREAD - 1; j += 2) {
63
+ if (op(vals[j + 1], vals[j])) {
64
+ thread_swap(vals[j + 1], vals[j]);
65
+ if (ARG_SORT) {
66
+ thread_swap(idxs[j + 1], idxs[j]);
67
+ }
68
+ }
69
+ }
70
+ }
71
+ }
72
+ };
73
+
74
+ ///////////////////////////////////////////////////////////////////////////////
75
+ // Threadgroup-level sort
76
+ ///////////////////////////////////////////////////////////////////////////////
77
+
78
+ template <
79
+ typename ValT,
80
+ typename IdxT,
81
+ bool ARG_SORT,
82
+ short BLOCK_THREADS,
83
+ short N_PER_THREAD,
84
+ typename CompareOp>
85
+ struct BlockMergeSort {
86
+ using thread_sort_t =
87
+ ThreadSort<ValT, IdxT, ARG_SORT, N_PER_THREAD, CompareOp>;
88
+ static METAL_FUNC int merge_partition(
89
+ const threadgroup ValT* As,
90
+ const threadgroup ValT* Bs,
91
+ short A_sz,
92
+ short B_sz,
93
+ short sort_md) {
94
+ CompareOp op;
95
+
96
+ short A_st = max(0, sort_md - B_sz);
97
+ short A_ed = min(sort_md, A_sz);
98
+
99
+ while (A_st < A_ed) {
100
+ short md = A_st + (A_ed - A_st) / 2;
101
+ auto a = As[md];
102
+ auto b = Bs[sort_md - 1 - md];
103
+
104
+ if (op(b, a)) {
105
+ A_ed = md;
106
+ } else {
107
+ A_st = md + 1;
108
+ }
109
+ }
110
+
111
+ return A_ed;
112
+ }
113
+
114
+ static METAL_FUNC void merge_step(
115
+ const threadgroup ValT* As,
116
+ const threadgroup ValT* Bs,
117
+ const threadgroup IdxT* As_idx,
118
+ const threadgroup IdxT* Bs_idx,
119
+ short A_sz,
120
+ short B_sz,
121
+ thread ValT (&vals)[N_PER_THREAD],
122
+ thread IdxT (&idxs)[N_PER_THREAD]) {
123
+ CompareOp op;
124
+ short a_idx = 0;
125
+ short b_idx = 0;
126
+
127
+ for (int i = 0; i < N_PER_THREAD; ++i) {
128
+ auto a = As[a_idx];
129
+ auto b = Bs[b_idx];
130
+ bool pred = (b_idx < B_sz) && (a_idx >= A_sz || op(b, a));
131
+
132
+ vals[i] = pred ? b : a;
133
+ if (ARG_SORT) {
134
+ idxs[i] = pred ? Bs_idx[b_idx] : As_idx[a_idx];
135
+ }
136
+
137
+ b_idx += short(pred);
138
+ a_idx += short(!pred);
139
+ }
140
+ }
141
+
142
+ static METAL_FUNC void sort(
143
+ threadgroup ValT* tgp_vals [[threadgroup(0)]],
144
+ threadgroup IdxT* tgp_idxs [[threadgroup(1)]],
145
+ int size_sorted_axis,
146
+ uint3 lid [[thread_position_in_threadgroup]]) {
147
+ // Get thread location
148
+ int idx = lid.x * N_PER_THREAD;
149
+
150
+ // Load from shared memory
151
+ thread ValT thread_vals[N_PER_THREAD];
152
+ thread IdxT thread_idxs[N_PER_THREAD];
153
+ for (int i = 0; i < N_PER_THREAD; ++i) {
154
+ thread_vals[i] = tgp_vals[idx + i];
155
+ if (ARG_SORT) {
156
+ thread_idxs[i] = tgp_idxs[idx + i];
157
+ }
158
+ }
159
+
160
+ // Per thread sort
161
+ if (idx < size_sorted_axis) {
162
+ thread_sort_t::sort(thread_vals, thread_idxs);
163
+ }
164
+
165
+ // Do merges using threadgroup memory
166
+ for (int merge_threads = 2; merge_threads <= BLOCK_THREADS;
167
+ merge_threads *= 2) {
168
+ // Update threadgroup memory
169
+ threadgroup_barrier(mem_flags::mem_threadgroup);
170
+ for (int i = 0; i < N_PER_THREAD; ++i) {
171
+ tgp_vals[idx + i] = thread_vals[i];
172
+ if (ARG_SORT) {
173
+ tgp_idxs[idx + i] = thread_idxs[i];
174
+ }
175
+ }
176
+ threadgroup_barrier(mem_flags::mem_threadgroup);
177
+
178
+ // Find location in merge step
179
+ int merge_group = lid.x / merge_threads;
180
+ int merge_lane = lid.x % merge_threads;
181
+
182
+ int sort_sz = N_PER_THREAD * merge_threads;
183
+ int sort_st = N_PER_THREAD * merge_threads * merge_group;
184
+
185
+ // As = tgp_vals[A_st:A_ed] is sorted
186
+ // Bs = tgp_vals[B_st:B_ed] is sorted
187
+ int A_st = sort_st;
188
+ int A_ed = sort_st + sort_sz / 2;
189
+ int B_st = sort_st + sort_sz / 2;
190
+ int B_ed = sort_st + sort_sz;
191
+
192
+ const threadgroup ValT* As = tgp_vals + A_st;
193
+ const threadgroup ValT* Bs = tgp_vals + B_st;
194
+ int A_sz = A_ed - A_st;
195
+ int B_sz = B_ed - B_st;
196
+
197
+ // Find a partition of merge elements
198
+ // Ci = merge(As[partition:], Bs[sort_md - partition:])
199
+ // of size N_PER_THREAD for each merge lane i
200
+ // C = [Ci] is sorted
201
+ int sort_md = N_PER_THREAD * merge_lane;
202
+ int partition = merge_partition(As, Bs, A_sz, B_sz, sort_md);
203
+
204
+ As += partition;
205
+ Bs += sort_md - partition;
206
+
207
+ A_sz -= partition;
208
+ B_sz -= sort_md - partition;
209
+
210
+ const threadgroup IdxT* As_idx =
211
+ ARG_SORT ? tgp_idxs + A_st + partition : nullptr;
212
+ const threadgroup IdxT* Bs_idx =
213
+ ARG_SORT ? tgp_idxs + B_st + sort_md - partition : nullptr;
214
+
215
+ // Merge starting at the partition and store results in thread registers
216
+ merge_step(As, Bs, As_idx, Bs_idx, A_sz, B_sz, thread_vals, thread_idxs);
217
+ }
218
+
219
+ // Write out to shared memory
220
+ threadgroup_barrier(mem_flags::mem_threadgroup);
221
+ for (int i = 0; i < N_PER_THREAD; ++i) {
222
+ tgp_vals[idx + i] = thread_vals[i];
223
+ if (ARG_SORT) {
224
+ tgp_idxs[idx + i] = thread_idxs[i];
225
+ }
226
+ }
227
+ }
228
+ };
229
+
230
+ ///////////////////////////////////////////////////////////////////////////////
231
+ // Kernel sort
232
+ ///////////////////////////////////////////////////////////////////////////////
233
+
234
+ template <
235
+ typename T,
236
+ typename U,
237
+ bool ARG_SORT,
238
+ short BLOCK_THREADS,
239
+ short N_PER_THREAD,
240
+ typename CompareOp = LessThan<T>>
241
+ struct KernelMergeSort {
242
+ using ValT = T;
243
+ using IdxT = uint;
244
+ using block_merge_sort_t = BlockMergeSort<
245
+ ValT,
246
+ IdxT,
247
+ ARG_SORT,
248
+ BLOCK_THREADS,
249
+ N_PER_THREAD,
250
+ CompareOp>;
251
+
252
+ MLX_MTL_CONST short N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;
253
+
254
+ static METAL_FUNC void block_sort(
255
+ const device T* inp,
256
+ device U* out,
257
+ const constant int& size_sorted_axis,
258
+ const constant int& in_stride_sorted_axis,
259
+ const constant int& out_stride_sorted_axis,
260
+ const constant int& in_stride_segment_axis,
261
+ const constant int& out_stride_segment_axis,
262
+ threadgroup ValT* tgp_vals,
263
+ threadgroup IdxT* tgp_idxs,
264
+ uint3 tid [[threadgroup_position_in_grid]],
265
+ uint3 lid [[thread_position_in_threadgroup]]) {
266
+ // tid.y tells us the segment index
267
+ inp += tid.y * in_stride_segment_axis;
268
+ out += tid.y * out_stride_segment_axis;
269
+
270
+ // Copy into threadgroup memory
271
+ for (short i = lid.x; i < N_PER_BLOCK; i += BLOCK_THREADS) {
272
+ tgp_vals[i] = i < size_sorted_axis ? inp[i * in_stride_sorted_axis]
273
+ : ValT(CompareOp::init);
274
+ if (ARG_SORT) {
275
+ tgp_idxs[i] = i;
276
+ }
277
+ }
278
+
279
+ // Sort elements within the block
280
+ threadgroup_barrier(mem_flags::mem_threadgroup);
281
+
282
+ block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis, lid);
283
+
284
+ threadgroup_barrier(mem_flags::mem_threadgroup);
285
+
286
+ // Write output
287
+ for (int i = lid.x; i < size_sorted_axis; i += BLOCK_THREADS) {
288
+ if (ARG_SORT) {
289
+ out[i * out_stride_sorted_axis] = tgp_idxs[i];
290
+ } else {
291
+ out[i * out_stride_sorted_axis] = tgp_vals[i];
292
+ }
293
+ }
294
+ }
295
+ };
296
+
297
+ template <
298
+ typename T,
299
+ typename U,
300
+ bool ARG_SORT,
301
+ short BLOCK_THREADS,
302
+ short N_PER_THREAD>
303
+ [[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void block_sort(
304
+ const device T* inp [[buffer(0)]],
305
+ device U* out [[buffer(1)]],
306
+ const constant int& size_sorted_axis [[buffer(2)]],
307
+ const constant int& in_stride_sorted_axis [[buffer(3)]],
308
+ const constant int& out_stride_sorted_axis [[buffer(4)]],
309
+ const constant int& in_stride_segment_axis [[buffer(5)]],
310
+ const constant int& out_stride_segment_axis [[buffer(6)]],
311
+ uint3 tid [[threadgroup_position_in_grid]],
312
+ uint3 lid [[thread_position_in_threadgroup]]) {
313
+ using sort_kernel =
314
+ KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;
315
+ using ValT = typename sort_kernel::ValT;
316
+ using IdxT = typename sort_kernel::IdxT;
317
+
318
+ if (ARG_SORT) {
319
+ threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
320
+ threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
321
+ sort_kernel::block_sort(
322
+ inp,
323
+ out,
324
+ size_sorted_axis,
325
+ in_stride_sorted_axis,
326
+ out_stride_sorted_axis,
327
+ in_stride_segment_axis,
328
+ out_stride_segment_axis,
329
+ tgp_vals,
330
+ tgp_idxs,
331
+ tid,
332
+ lid);
333
+ } else {
334
+ threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
335
+ sort_kernel::block_sort(
336
+ inp,
337
+ out,
338
+ size_sorted_axis,
339
+ in_stride_sorted_axis,
340
+ out_stride_sorted_axis,
341
+ in_stride_segment_axis,
342
+ out_stride_segment_axis,
343
+ tgp_vals,
344
+ nullptr,
345
+ tid,
346
+ lid);
347
+ }
348
+ }
349
+
350
+ constant constexpr const int zero_helper = 0;
351
+
352
+ template <
353
+ typename T,
354
+ typename U,
355
+ bool ARG_SORT,
356
+ short BLOCK_THREADS,
357
+ short N_PER_THREAD>
358
+ [[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void block_sort_nc(
359
+ const device T* inp [[buffer(0)]],
360
+ device U* out [[buffer(1)]],
361
+ const constant int& size_sorted_axis [[buffer(2)]],
362
+ const constant int& in_stride_sorted_axis [[buffer(3)]],
363
+ const constant int& out_stride_sorted_axis [[buffer(4)]],
364
+ const constant int& nc_dim [[buffer(5)]],
365
+ const constant int* nc_shape [[buffer(6)]],
366
+ const constant int64_t* in_nc_strides [[buffer(7)]],
367
+ const constant int64_t* out_nc_strides [[buffer(8)]],
368
+ uint3 tid [[threadgroup_position_in_grid]],
369
+ uint3 lid [[thread_position_in_threadgroup]]) {
370
+ using sort_kernel =
371
+ KernelMergeSort<T, U, ARG_SORT, BLOCK_THREADS, N_PER_THREAD>;
372
+ using ValT = typename sort_kernel::ValT;
373
+ using IdxT = typename sort_kernel::IdxT;
374
+
375
+ auto in_block_idx = elem_to_loc(tid.y, nc_shape, in_nc_strides, nc_dim);
376
+ auto out_block_idx = elem_to_loc(tid.y, nc_shape, out_nc_strides, nc_dim);
377
+ inp += in_block_idx;
378
+ out += out_block_idx;
379
+
380
+ if (ARG_SORT) {
381
+ threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
382
+ threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
383
+ sort_kernel::block_sort(
384
+ inp,
385
+ out,
386
+ size_sorted_axis,
387
+ in_stride_sorted_axis,
388
+ out_stride_sorted_axis,
389
+ zero_helper,
390
+ zero_helper,
391
+ tgp_vals,
392
+ tgp_idxs,
393
+ tid,
394
+ lid);
395
+ } else {
396
+ threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
397
+ sort_kernel::block_sort(
398
+ inp,
399
+ out,
400
+ size_sorted_axis,
401
+ in_stride_sorted_axis,
402
+ out_stride_sorted_axis,
403
+ zero_helper,
404
+ zero_helper,
405
+ tgp_vals,
406
+ nullptr,
407
+ tid,
408
+ lid);
409
+ }
410
+ }
411
+
412
+ template <
413
+ typename ValT,
414
+ typename IdxT,
415
+ bool ARG_SORT,
416
+ short BLOCK_THREADS,
417
+ short N_PER_THREAD,
418
+ typename CompareOp = LessThan<ValT>>
419
+ struct KernelMultiBlockMergeSort {
420
+ using block_merge_sort_t = BlockMergeSort<
421
+ ValT,
422
+ IdxT,
423
+ ARG_SORT,
424
+ BLOCK_THREADS,
425
+ N_PER_THREAD,
426
+ CompareOp>;
427
+
428
+ MLX_MTL_CONST short N_PER_BLOCK = BLOCK_THREADS * N_PER_THREAD;
429
+
430
+ static METAL_FUNC void block_sort(
431
+ const device ValT* inp,
432
+ device ValT* out_vals,
433
+ device IdxT* out_idxs,
434
+ const constant int& size_sorted_axis,
435
+ const constant int& stride_sorted_axis,
436
+ threadgroup ValT* tgp_vals,
437
+ threadgroup IdxT* tgp_idxs,
438
+ uint3 tid [[threadgroup_position_in_grid]],
439
+ uint3 lid [[thread_position_in_threadgroup]]) {
440
+ // tid.y tells us the segment index
441
+ int base_idx = tid.x * N_PER_BLOCK;
442
+
443
+ // Copy into threadgroup memory
444
+ for (short i = lid.x; i < N_PER_BLOCK; i += BLOCK_THREADS) {
445
+ int idx = base_idx + i;
446
+ tgp_vals[i] = idx < size_sorted_axis ? inp[idx * stride_sorted_axis]
447
+ : ValT(CompareOp::init);
448
+ tgp_idxs[i] = idx;
449
+ }
450
+
451
+ // Sort elements within the block
452
+ threadgroup_barrier(mem_flags::mem_threadgroup);
453
+
454
+ block_merge_sort_t::sort(tgp_vals, tgp_idxs, size_sorted_axis, lid);
455
+
456
+ threadgroup_barrier(mem_flags::mem_threadgroup);
457
+
458
+ // Write output
459
+ for (int i = lid.x; i < N_PER_BLOCK; i += BLOCK_THREADS) {
460
+ int idx = base_idx + i;
461
+ if (idx < size_sorted_axis) {
462
+ out_vals[idx] = tgp_vals[i];
463
+ out_idxs[idx] = tgp_idxs[i];
464
+ }
465
+ }
466
+ }
467
+
468
+ static METAL_FUNC int merge_partition(
469
+ const device ValT* As,
470
+ const device ValT* Bs,
471
+ int A_sz,
472
+ int B_sz,
473
+ int sort_md) {
474
+ CompareOp op;
475
+
476
+ int A_st = max(0, sort_md - B_sz);
477
+ int A_ed = min(sort_md, A_sz);
478
+
479
+ while (A_st < A_ed) {
480
+ int md = A_st + (A_ed - A_st) / 2;
481
+ auto a = As[md];
482
+ auto b = Bs[sort_md - 1 - md];
483
+
484
+ if (op(b, a)) {
485
+ A_ed = md;
486
+ } else {
487
+ A_st = md + 1;
488
+ }
489
+ }
490
+
491
+ return A_ed;
492
+ }
493
+ };
494
+
495
+ template <
496
+ typename ValT,
497
+ typename IdxT,
498
+ bool ARG_SORT,
499
+ short BLOCK_THREADS,
500
+ short N_PER_THREAD>
501
+ [[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void mb_block_sort(
502
+ const device ValT* inp [[buffer(0)]],
503
+ device ValT* out_vals [[buffer(1)]],
504
+ device IdxT* out_idxs [[buffer(2)]],
505
+ const constant int& size_sorted_axis [[buffer(3)]],
506
+ const constant int& stride_sorted_axis [[buffer(4)]],
507
+ const constant int& nc_dim [[buffer(5)]],
508
+ const constant int* nc_shape [[buffer(6)]],
509
+ const constant int64_t* nc_strides [[buffer(7)]],
510
+ uint3 tid [[threadgroup_position_in_grid]],
511
+ uint3 lid [[thread_position_in_threadgroup]]) {
512
+ using sort_kernel = KernelMultiBlockMergeSort<
513
+ ValT,
514
+ IdxT,
515
+ ARG_SORT,
516
+ BLOCK_THREADS,
517
+ N_PER_THREAD>;
518
+
519
+ auto block_idx = elem_to_loc(tid.y, nc_shape, nc_strides, nc_dim);
520
+ inp += block_idx;
521
+ out_vals += tid.y * size_sorted_axis;
522
+ out_idxs += tid.y * size_sorted_axis;
523
+
524
+ threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
525
+ threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
526
+
527
+ sort_kernel::block_sort(
528
+ inp,
529
+ out_vals,
530
+ out_idxs,
531
+ size_sorted_axis,
532
+ stride_sorted_axis,
533
+ tgp_vals,
534
+ tgp_idxs,
535
+ tid,
536
+ lid);
537
+ }
538
+
539
+ template <
540
+ typename ValT,
541
+ typename IdxT,
542
+ bool ARG_SORT,
543
+ short BLOCK_THREADS,
544
+ short N_PER_THREAD>
545
+ [[kernel]] void mb_block_partition(
546
+ device IdxT* block_partitions [[buffer(0)]],
547
+ const device ValT* dev_vals [[buffer(1)]],
548
+ const device IdxT* dev_idxs [[buffer(2)]],
549
+ const constant int& size_sorted_axis [[buffer(3)]],
550
+ const constant int& merge_tiles [[buffer(4)]],
551
+ const constant int& n_blocks [[buffer(5)]],
552
+ uint3 tid [[threadgroup_position_in_grid]],
553
+ uint3 lid [[thread_position_in_threadgroup]],
554
+ uint3 tgp_dims [[threads_per_threadgroup]]) {
555
+ using sort_kernel = KernelMultiBlockMergeSort<
556
+ ValT,
557
+ IdxT,
558
+ ARG_SORT,
559
+ BLOCK_THREADS,
560
+ N_PER_THREAD>;
561
+
562
+ block_partitions += tid.y * tgp_dims.x;
563
+ dev_vals += tid.y * size_sorted_axis;
564
+ dev_idxs += tid.y * size_sorted_axis;
565
+
566
+ for (int i = lid.x; i <= n_blocks; i += tgp_dims.x) {
567
+ // Find location in merge step
568
+ int merge_group = i / merge_tiles;
569
+ int merge_lane = i % merge_tiles;
570
+
571
+ int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
572
+ int sort_st = sort_kernel::N_PER_BLOCK * merge_tiles * merge_group;
573
+
574
+ int A_st = min(size_sorted_axis, sort_st);
575
+ int A_ed = min(size_sorted_axis, sort_st + sort_sz / 2);
576
+ int B_st = A_ed;
577
+ int B_ed = min(size_sorted_axis, B_st + sort_sz / 2);
578
+
579
+ int partition_at = min(B_ed - A_st, sort_kernel::N_PER_BLOCK * merge_lane);
580
+ int partition = sort_kernel::merge_partition(
581
+ dev_vals + A_st,
582
+ dev_vals + B_st,
583
+ A_ed - A_st,
584
+ B_ed - B_st,
585
+ partition_at);
586
+
587
+ block_partitions[i] = A_st + partition;
588
+ }
589
+ }
590
+
591
+ template <
592
+ typename ValT,
593
+ typename IdxT,
594
+ bool ARG_SORT,
595
+ short BLOCK_THREADS,
596
+ short N_PER_THREAD,
597
+ typename CompareOp = LessThan<ValT>>
598
+ [[kernel, max_total_threads_per_threadgroup(BLOCK_THREADS)]] void
599
+ mb_block_merge(
600
+ const device IdxT* block_partitions [[buffer(0)]],
601
+ const device ValT* dev_vals_in [[buffer(1)]],
602
+ const device IdxT* dev_idxs_in [[buffer(2)]],
603
+ device ValT* dev_vals_out [[buffer(3)]],
604
+ device IdxT* dev_idxs_out [[buffer(4)]],
605
+ const constant int& size_sorted_axis [[buffer(5)]],
606
+ const constant int& merge_tiles [[buffer(6)]],
607
+ const constant int& num_tiles [[buffer(7)]],
608
+ uint3 tid [[threadgroup_position_in_grid]],
609
+ uint3 lid [[thread_position_in_threadgroup]]) {
610
+ using sort_kernel = KernelMultiBlockMergeSort<
611
+ ValT,
612
+ IdxT,
613
+ ARG_SORT,
614
+ BLOCK_THREADS,
615
+ N_PER_THREAD,
616
+ CompareOp>;
617
+
618
+ using block_sort_t = typename sort_kernel::block_merge_sort_t;
619
+
620
+ block_partitions += tid.y * (num_tiles + 1);
621
+ dev_vals_in += tid.y * size_sorted_axis;
622
+ dev_idxs_in += tid.y * size_sorted_axis;
623
+ dev_vals_out += tid.y * size_sorted_axis;
624
+ dev_idxs_out += tid.y * size_sorted_axis;
625
+
626
+ int block_idx = tid.x;
627
+ int merge_group = block_idx / merge_tiles;
628
+ int sort_st = sort_kernel::N_PER_BLOCK * merge_tiles * merge_group;
629
+ int sort_sz = sort_kernel::N_PER_BLOCK * merge_tiles;
630
+ int sort_md = sort_kernel::N_PER_BLOCK * block_idx - sort_st;
631
+
632
+ int A_st = block_partitions[block_idx + 0];
633
+ int A_ed = block_partitions[block_idx + 1];
634
+ int B_st = min(size_sorted_axis, 2 * sort_st + sort_sz / 2 + sort_md - A_st);
635
+ int B_ed = min(
636
+ size_sorted_axis,
637
+ 2 * sort_st + sort_sz / 2 + sort_md + sort_kernel::N_PER_BLOCK - A_ed);
638
+
639
+ if ((block_idx % merge_tiles) == merge_tiles - 1) {
640
+ A_ed = min(size_sorted_axis, sort_st + sort_sz / 2);
641
+ B_ed = min(size_sorted_axis, sort_st + sort_sz);
642
+ }
643
+
644
+ int A_sz = A_ed - A_st;
645
+ int B_sz = B_ed - B_st;
646
+
647
+ // Load from global memory
648
+ thread ValT thread_vals[N_PER_THREAD];
649
+ thread IdxT thread_idxs[N_PER_THREAD];
650
+ for (int i = 0; i < N_PER_THREAD; i++) {
651
+ int idx = BLOCK_THREADS * i + lid.x;
652
+ if (idx < (A_sz + B_sz)) {
653
+ thread_vals[i] = (idx < A_sz) ? dev_vals_in[A_st + idx]
654
+ : dev_vals_in[B_st + idx - A_sz];
655
+ thread_idxs[i] = (idx < A_sz) ? dev_idxs_in[A_st + idx]
656
+ : dev_idxs_in[B_st + idx - A_sz];
657
+ } else {
658
+ thread_vals[i] = CompareOp::init;
659
+ thread_idxs[i] = 0;
660
+ }
661
+ }
662
+
663
+ // Write to shared memory
664
+ threadgroup ValT tgp_vals[sort_kernel::N_PER_BLOCK];
665
+ threadgroup IdxT tgp_idxs[sort_kernel::N_PER_BLOCK];
666
+ threadgroup_barrier(mem_flags::mem_threadgroup);
667
+ for (int i = 0; i < N_PER_THREAD; i++) {
668
+ int idx = BLOCK_THREADS * i + lid.x;
669
+ tgp_vals[idx] = thread_vals[i];
670
+ tgp_idxs[idx] = thread_idxs[i];
671
+ }
672
+ threadgroup_barrier(mem_flags::mem_threadgroup);
673
+
674
+ // Merge
675
+ int sort_md_local = min(A_sz + B_sz, N_PER_THREAD * int(lid.x));
676
+
677
+ int A_st_local = block_sort_t::merge_partition(
678
+ tgp_vals, tgp_vals + A_sz, A_sz, B_sz, sort_md_local);
679
+ int A_ed_local = A_sz;
680
+
681
+ int B_st_local = sort_md_local - A_st_local;
682
+ int B_ed_local = B_sz;
683
+
684
+ int A_sz_local = A_ed_local - A_st_local;
685
+ int B_sz_local = B_ed_local - B_st_local;
686
+
687
+ // Do merge
688
+ block_sort_t::merge_step(
689
+ tgp_vals + A_st_local,
690
+ tgp_vals + A_ed_local + B_st_local,
691
+ tgp_idxs + A_st_local,
692
+ tgp_idxs + A_ed_local + B_st_local,
693
+ A_sz_local,
694
+ B_sz_local,
695
+ thread_vals,
696
+ thread_idxs);
697
+
698
+ threadgroup_barrier(mem_flags::mem_threadgroup);
699
+ for (int i = 0; i < N_PER_THREAD; ++i) {
700
+ int idx = lid.x * N_PER_THREAD;
701
+ tgp_vals[idx + i] = thread_vals[i];
702
+ tgp_idxs[idx + i] = thread_idxs[i];
703
+ }
704
+
705
+ threadgroup_barrier(mem_flags::mem_threadgroup);
706
+ // Write output
707
+ int base_idx = tid.x * sort_kernel::N_PER_BLOCK;
708
+ for (int i = lid.x; i < sort_kernel::N_PER_BLOCK; i += BLOCK_THREADS) {
709
+ int idx = base_idx + i;
710
+ if (idx < size_sorted_axis) {
711
+ dev_vals_out[idx] = tgp_vals[i];
712
+ dev_idxs_out[idx] = tgp_idxs[i];
713
+ }
714
+ }
715
+ }