llama_cpp 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
 // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
 //
 bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
         const char * name,
         void * data,
-        size_t size);
+        size_t size,
+        size_t max_size);
 
 // set data from host memory into the device
 void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
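For context, a minimal caller-side sketch of the new five-argument signature. It is not part of the diff: map_model_buffer, tensor_sizes, n_tensors and the buffer name "data" are illustrative; only ggml_metal_add_buffer and its new max_size parameter come from the header change above. The idea is that max_size is the size of the largest tensor placed in the buffer, so the backend can size its shared views accordingly.

    // Sketch only: compute the largest tensor size that lives in this host buffer,
    // then register the buffer with the new 5-argument signature.
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    #include "ggml-metal.h"   // declares ggml_metal_add_buffer

    static bool map_model_buffer(struct ggml_metal_context * ctx_metal,
                                 void * data, size_t data_size,
                                 const size_t * tensor_sizes, int n_tensors) {
        size_t max_tensor_size = 0;                    // illustrative: caller tracks per-tensor sizes
        for (int i = 0; i < n_tensors; ++i) {
            if (tensor_sizes[i] > max_tensor_size) {
                max_tensor_size = tensor_sizes[i];
            }
        }

        // new in 0.2.2: the extra max_size argument
        if (!ggml_metal_add_buffer(ctx_metal, "data", data, data_size, max_tensor_size)) {
            fprintf(stderr, "failed to map buffer\n");
            return false;
        }

        return true;
    }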
@@ -57,6 +57,7 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_q5_k);
     GGML_METAL_DECL_KERNEL(get_rows_q6_k);
     GGML_METAL_DECL_KERNEL(rms_norm);
+    GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
@@ -66,8 +67,10 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
     GGML_METAL_DECL_KERNEL(rope);
+    GGML_METAL_DECL_KERNEL(alibi_f32);
     GGML_METAL_DECL_KERNEL(cpy_f32_f16);
     GGML_METAL_DECL_KERNEL(cpy_f32_f32);
+    GGML_METAL_DECL_KERNEL(cpy_f16_f16);
 
 #undef GGML_METAL_DECL_KERNEL
 };
@@ -162,6 +165,7 @@ struct ggml_metal_context * ggml_metal_init(void) {
         GGML_METAL_ADD_KERNEL(get_rows_q5_k);
         GGML_METAL_ADD_KERNEL(get_rows_q6_k);
         GGML_METAL_ADD_KERNEL(rms_norm);
+        GGML_METAL_ADD_KERNEL(norm);
         GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
@@ -171,12 +175,22 @@ struct ggml_metal_context * ggml_metal_init(void) {
         GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
         GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
         GGML_METAL_ADD_KERNEL(rope);
+        GGML_METAL_ADD_KERNEL(alibi_f32);
         GGML_METAL_ADD_KERNEL(cpy_f32_f16);
         GGML_METAL_ADD_KERNEL(cpy_f32_f32);
+        GGML_METAL_ADD_KERNEL(cpy_f16_f16);
 
 #undef GGML_METAL_ADD_KERNEL
     }
 
+    fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
+    if (ctx->device.maxTransferRate != 0) {
+        fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+    } else {
+        fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
+    }
+
     return ctx;
 }
 
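With the lines added above, ggml_metal_init now reports the device limits at startup. Illustrative output only; the numbers below are made up and only the format strings come from the diff:

    ggml_metal_init: recommendedMaxWorkingSetSize = 10922.67 MB
    ggml_metal_init: hasUnifiedMemory = true
    ggml_metal_init: maxTransferRate = built-in GPU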
@@ -193,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
 static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
+    const int64_t tsize = ggml_nbytes(t);
+
+    // find the view that contains the tensor fully
     for (int i = 0; i < ctx->n_buffers; ++i) {
         const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
 
-        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+        if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
             *offs = (size_t) ioffs;
 
             //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
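The new condition above requires the whole tensor to fit in a view, not just its first byte. A tiny standalone C illustration of the difference (all values are illustrative, not taken from the diff):

    // A 512-byte tensor starting at offset 1000 inside a 1200-byte view passes the
    // old start-only test but fails the new containment test.
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ioffs     = 1000;   // illustrative tensor start offset within the view
        const int64_t tsize     = 512;    // illustrative tensor size in bytes
        const int64_t view_size = 1200;   // illustrative view size in bytes

        const int fits_old = (ioffs >= 0 && ioffs < view_size);           // 1: the start is inside
        const int fits_new = (ioffs >= 0 && ioffs + tsize <= view_size);  // 0: the end spills past the view

        printf("old check: %d, new check: %d\n", fits_old, fits_new);
        return 0;
    }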
@@ -214,7 +231,8 @@ bool ggml_metal_add_buffer(
         struct ggml_metal_context * ctx,
         const char * name,
         void * data,
-        size_t size) {
+        size_t size,
+        size_t max_size) {
     if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
         fprintf(stderr, "%s: too many buffers\n", __func__);
         return false;
@@ -231,30 +249,68 @@ bool ggml_metal_add_buffer(
             }
         }
 
-        size_t page_size = getpagesize();
-        size_t aligned_size = size;
-        if ((aligned_size % page_size) != 0) {
-            aligned_size += (page_size - (aligned_size % page_size));
+        const size_t size_page = getpagesize();
+
+        size_t size_aligned = size;
+        if ((size_aligned % size_page) != 0) {
+            size_aligned += (size_page - (size_aligned % size_page));
         }
 
-        ctx->buffers[ctx->n_buffers].name = name;
-        ctx->buffers[ctx->n_buffers].data = data;
-        ctx->buffers[ctx->n_buffers].size = size;
+        // the buffer fits into the max buffer size allowed by the device
+        if (size_aligned <= ctx->device.maxBufferLength) {
+            ctx->buffers[ctx->n_buffers].name = name;
+            ctx->buffers[ctx->n_buffers].data = data;
+            ctx->buffers[ctx->n_buffers].size = size;
 
-        if (ctx->device.maxBufferLength < aligned_size) {
-            fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
-            return false;
-        }
-        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
+            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
 
-        if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-            return false;
+            ++ctx->n_buffers;
         } else {
-            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+            // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+            // one of the views
+            const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+            const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
+            const size_t size_view = ctx->device.maxBufferLength;
+
+            for (size_t i = 0; i < size; i += size_step) {
+                const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+                ctx->buffers[ctx->n_buffers].name = name;
+                ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+                ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+                if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                    fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                    return false;
+                }
+
+                fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+                if (i + size_step < size) {
+                    fprintf(stderr, "\n");
+                }
+
+                ++ctx->n_buffers;
+            }
         }
 
-        ++ctx->n_buffers;
+        fprintf(stderr, ", (%8.2f / %8.2f)",
+            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
+            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
+            fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
+        } else {
+            fprintf(stderr, "\n");
+        }
     }
 
     return true;
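A self-contained C sketch of the view layout introduced above, under illustrative constants (not from the diff): views of length size_view begin every size_step bytes, and because consecutive views overlap by size_ovlp, which is at least max_size rounded up, a tensor of at most max_size bytes always fits entirely inside the view that begins at or just before its start offset.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    int main(void) {
        const size_t size_page = 4096;        // illustrative page size
        const size_t max_len   = 16 * 4096;   // illustrative device.maxBufferLength
        const size_t max_size  = 3000;        // illustrative largest tensor size
        const size_t size      = 40 * 4096;   // illustrative total (page-aligned) buffer size

        // same arithmetic as the diff
        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page;
        const size_t size_step = max_len - size_ovlp;
        const size_t size_view = max_len;

        // every possible placement of a max_size tensor lands fully inside some view
        for (size_t off = 0; off + max_size <= size; ++off) {
            const size_t view_start = (off / size_step) * size_step;   // the view that begins at or before off
            const size_t view_len   = (view_start + size_view <= size) ? size_view : (size - view_start);
            assert(off + max_size <= view_start + view_len);
        }

        printf("size_ovlp = %zu, size_step = %zu, size_view = %zu: all placements covered\n",
               size_ovlp, size_step, size_view);
        return 0;
    }

Splitting into overlapping views is what lets buffers larger than the device's maxBufferLength still be mapped with newBufferWithBytesNoCopy, at the cost of a small amount of duplicated address range per view.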
@@ -735,6 +791,70 @@ void ggml_metal_graph_compute(
 
                         [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                     } break;
+                case GGML_OP_NORM:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        const float eps = 1e-5f;
+
+                        const int nth = 256;
+
+                        [encoder setComputePipelineState:ctx->pipeline_norm];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+                        [encoder setBytes:&eps length:sizeof( float) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+
+                        const int64_t nrows = ggml_nrows(src0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_ALIBI:
+                    {
+                        if (encoder == nil) {
+                            encoder = [command_buffer computeCommandEncoder];
+                        }
+
+                        GGML_ASSERT((src0t == GGML_TYPE_F32));
+
+                        const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past);
+                        const int n_head = ((int32_t *) src1->data)[1];
+                        const float max_bias = ((float *) src1->data)[2];
+
+                        if (__builtin_popcount(n_head) != 1) {
+                            GGML_ASSERT(false && "only power-of-two n_head implemented");
+                        }
+
+                        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+                        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+
+                        [encoder setComputePipelineState:ctx->pipeline_alibi_f32];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
+                        [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
+                        [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&m0 length:sizeof( float) atIndex:18];
+                        const int nth = 32;
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
                 case GGML_OP_ROPE:
                     {
                         if (encoder == nil) {
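The GGML_OP_ALIBI case above passes only m0 to the kernel; the slope for each head is then derived on the GPU as pow(m0, i2 + 1). A short C sketch of the host-side slope math, assuming a power-of-two n_head as the code asserts (all numbers are illustrative):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int   n_head   = 8;      // illustrative, must be a power of two here
        const float max_bias = 8.0f;   // illustrative ALiBi maximum bias

        // same as the host code above
        const int   n_heads_log2_floor = 1 << (int) floor(log2(n_head));
        const float m0 = powf(2.0f, -max_bias / n_heads_log2_floor);

        for (int head = 0; head < n_head; ++head) {
            // matches m_k = pow(m0, i2 + 1) in kernel_alibi_f32 further down in this diff,
            // assuming the third tensor dimension indexes the attention head
            const float m_k = powf(m0, head + 1);
            printf("head %d: slope m_k = %g\n", head, m_k);
        }
        return 0;
    }

In kernel_alibi_f32 further down, each element then receives m_k * (i00 - ne00 + 1), so the bias is zero at the last position of a row and grows in magnitude toward earlier positions.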
@@ -788,6 +908,14 @@ void ggml_metal_graph_compute(
                             default: GGML_ASSERT(false && "not implemented");
                         };
                     } break;
+                case GGML_TYPE_F16:
+                    {
+                        switch (dstt) {
+                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
+                            case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
+                            default: GGML_ASSERT(false && "not implemented");
+                        };
+                    } break;
                 default: GGML_ASSERT(false && "not implemented");
             }
 
@@ -831,4 +959,14 @@ void ggml_metal_graph_compute(
     dispatch_barrier_sync(queue, ^{});
 
     [command_buffers[n_cb - 1] waitUntilCompleted];
+
+    // check status of command buffers
+    // needed to detect if the device ran out-of-memory for example (#1881)
+    for (int i = 0; i < n_cb; i++) {
+        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
+        if (status != MTLCommandBufferStatusCompleted) {
+            fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
+            GGML_ASSERT(false);
+        }
+    }
 }
@@ -256,6 +256,72 @@ kernel void kernel_get_rows_q4_1(
         (device float *) ((device char *) dst + i*nb1), ne00);
 }
 
+kernel void kernel_norm(
+        device const void * src0,
+        device float * dst,
+        constant int64_t & ne00,
+        constant uint64_t & nb01,
+        constant float & eps,
+        threadgroup float * sum [[threadgroup(0)]],
+        uint tgpig[[threadgroup_position_in_grid]],
+        uint tpitg[[thread_position_in_threadgroup]],
+        uint ntg[[threads_per_threadgroup]]) {
+    device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01);
+    // MEAN
+    // parallel sum
+    sum[tpitg] = 0.0f;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        sum[tpitg] += x[i00];
+    }
+    // reduce
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    for (uint i = ntg/2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            sum[tpitg] += sum[tpitg + i];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    // broadcast
+    if (tpitg == 0) {
+        sum[0] /= ne00;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    const float mean = sum[0];
+
+    // recenter
+    device float * y = dst + tgpig*ne00;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = x[i00] - mean;
+    }
+
+    // VARIANCE
+    // parallel sum
+    sum[tpitg] = 0.0f;
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        sum[tpitg] += y[i00] * y[i00];
+    }
+    // reduce
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    for (uint i = ntg/2; i > 0; i /= 2) {
+        if (tpitg < i) {
+            sum[tpitg] += sum[tpitg + i];
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    // broadcast
+    if (tpitg == 0) {
+        sum[0] /= ne00;
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    const float variance = sum[0];
+
+    const float scale = 1.0f/sqrt(variance + eps);
+    for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
+        y[i00] = y[i00] * scale;
+    }
+}
+
+
 kernel void kernel_rms_norm(
         device const void * src0,
         device float * dst,
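As a CPU-side cross-check for the new kernel_norm, a minimal single-row reference of the same computation: subtract the row mean, then scale by 1/sqrt(variance + eps). The function name norm_row_ref and the sample input are illustrative; eps mirrors the 1e-5f passed from the host code earlier in this diff.

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    // Reference for one row of ne00 floats: subtract the mean, then scale by
    // 1/sqrt(variance + eps), the same two passes kernel_norm performs per row.
    static void norm_row_ref(const float * x, float * y, int64_t ne00, float eps) {
        float mean = 0.0f;
        for (int64_t i = 0; i < ne00; ++i) {
            mean += x[i];
        }
        mean /= ne00;

        float variance = 0.0f;
        for (int64_t i = 0; i < ne00; ++i) {
            y[i] = x[i] - mean;              // recenter, as the kernel does
            variance += y[i] * y[i];
        }
        variance /= ne00;

        const float scale = 1.0f / sqrtf(variance + eps);
        for (int64_t i = 0; i < ne00; ++i) {
            y[i] *= scale;
        }
    }

    int main(void) {
        const float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };   // illustrative input row
        float y[4];
        norm_row_ref(x, y, 4, 1e-5f);
        printf("%f %f %f %f\n", y[0], y[1], y[2], y[3]);
        return 0;
    }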
@@ -485,6 +551,48 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }
 
+kernel void kernel_alibi_f32(
+        device const float * src0,
+        device float * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant int64_t & ne03,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant uint64_t & nb03,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        constant int64_t & ne2,
+        constant int64_t & ne3,
+        constant uint64_t & nb0,
+        constant uint64_t & nb1,
+        constant uint64_t & nb2,
+        constant uint64_t & nb3,
+        constant float & m0,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3 ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+    float m_k = pow(m0, i2 + 1);
+    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
+        device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1);
+    }
+}
+
 kernel void kernel_rope(
         device const void * src0,
         device float * dst,
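kernel_alibi_f32 above and the copy kernels below share the same index unflattening: the flat element index of the source row is re-expressed in destination coordinates (i0, i1, i2, i3). A standalone C sketch of that arithmetic with small illustrative shapes (not from the diff):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        // illustrative shapes and row indices
        const int64_t ne00 = 4, ne01 = 3, ne02 = 2;   // source row length and row counts
        const int64_t ne0  = 6, ne1  = 2, ne2  = 2;   // destination shape

        const int64_t i01 = 2, i02 = 1, i03 = 0;      // source row picked by the threadgroup

        // same arithmetic as the kernels
        const int64_t n  = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
        const int64_t i3 = n / (ne2*ne1*ne0);
        const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
        const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
        const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);

        printf("flat index n = %lld maps to (i0, i1, i2, i3) = (%lld, %lld, %lld, %lld)\n",
               (long long) n, (long long) i0, (long long) i1, (long long) i2, (long long) i3);
        return 0;
    }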
@@ -540,6 +648,47 @@ kernel void kernel_rope(
     }
 }
 
+kernel void kernel_cpy_f16_f16(
+        device const half * src0,
+        device half * dst,
+        constant int64_t & ne00,
+        constant int64_t & ne01,
+        constant int64_t & ne02,
+        constant int64_t & ne03,
+        constant uint64_t & nb00,
+        constant uint64_t & nb01,
+        constant uint64_t & nb02,
+        constant uint64_t & nb03,
+        constant int64_t & ne0,
+        constant int64_t & ne1,
+        constant int64_t & ne2,
+        constant int64_t & ne3,
+        constant uint64_t & nb0,
+        constant uint64_t & nb1,
+        constant uint64_t & nb2,
+        constant uint64_t & nb3,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3 ntg[[threads_per_threadgroup]]) {
+    const int64_t i03 = tgpig[2];
+    const int64_t i02 = tgpig[1];
+    const int64_t i01 = tgpig[0];
+
+    const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+
+    const int64_t i3 = n / (ne2*ne1*ne0);
+    const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
+    const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
+    const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
+
+    device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
+
+    for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
+        device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
+        dst_data[i00] = src[0];
+    }
+}
+
 kernel void kernel_cpy_f32_f16(
         device const float * src0,
         device half * dst,