llama_cpp 0.0.5 → 0.0.7

@@ -169,14 +169,27 @@
169
169
  //
170
170
  //
171
171
 
172
- #ifdef __cplusplus
173
- extern "C" {
172
+ #ifdef GGML_SHARED
173
+ # if defined(_WIN32) && !defined(__MINGW32__)
174
+ # ifdef GGML_BUILD
175
+ # define GGML_API __declspec(dllexport)
176
+ # else
177
+ # define GGML_API __declspec(dllimport)
178
+ # endif
179
+ # else
180
+ # define GGML_API __attribute__ ((visibility ("default")))
181
+ # endif
182
+ #else
183
+ # define GGML_API
174
184
  #endif
175
185
 
176
186
  #include <stdint.h>
177
187
  #include <stddef.h>
178
188
  #include <stdbool.h>
179
189
 
190
+ #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
+ #define GGML_FILE_VERSION 1
192
+
180
193
  #define GGML_MAX_DIMS 4
181
194
  #define GGML_MAX_NODES 4096
182
195
  #define GGML_MAX_PARAMS 16
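
The hunk above replaces the bare extern "C" guard at this position with a GGML_API export macro (dllexport/dllimport on Windows when GGML_SHARED and GGML_BUILD are defined, default symbol visibility elsewhere, and empty for static builds) and adds the GGML_FILE_MAGIC / GGML_FILE_VERSION constants; the extern "C" opener itself moves a few lines down, as the next hunk shows. Below is a minimal sketch of how a consumer might validate those two header words. Only the two constants come from ggml.h; the assumption that a file begins with a 32-bit magic followed by a 32-bit version is illustrative, not taken from this diff.

// Hypothetical header check; the file layout beyond the constants is an assumption.
#include <stdint.h>
#include <stdio.h>

#define GGML_FILE_MAGIC   0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1

static int check_ggml_header(const char * path) {
    FILE * f = fopen(path, "rb");
    if (!f) {
        return 0;
    }

    uint32_t magic = 0, version = 0;
    int ok = fread(&magic,   sizeof(magic),   1, f) == 1 &&
             fread(&version, sizeof(version), 1, f) == 1 &&
             magic   == GGML_FILE_MAGIC &&
             version == GGML_FILE_VERSION;

    fclose(f);
    return ok; // 1 if the leading magic/version words match, 0 otherwise
}
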
@@ -184,672 +197,696 @@ extern "C" {
184
197
  #define GGML_MAX_OPT 4
185
198
  #define GGML_DEFAULT_N_THREADS 4
186
199
 
200
+ #ifdef __cplusplus
201
+ extern "C" {
202
+ #endif
203
+
187
204
  #ifdef __ARM_NEON
188
- // we use the built-in 16-bit float type
189
- typedef __fp16 ggml_fp16_t;
205
+ // we use the built-in 16-bit float type
206
+ typedef __fp16 ggml_fp16_t;
190
207
  #else
191
- typedef uint16_t ggml_fp16_t;
208
+ typedef uint16_t ggml_fp16_t;
192
209
  #endif
193
210
 
194
- // convert FP16 <-> FP32
195
- float ggml_fp16_to_fp32(ggml_fp16_t x);
196
- ggml_fp16_t ggml_fp32_to_fp16(float x);
197
-
198
- struct ggml_object;
199
- struct ggml_context;
200
-
201
- enum ggml_type {
202
- // explicitly numbered values are used in llama.cpp files
203
- GGML_TYPE_F32 = 0,
204
- GGML_TYPE_F16 = 1,
205
- GGML_TYPE_Q4_0 = 2,
206
- GGML_TYPE_Q4_1 = 3,
207
- GGML_TYPE_Q8_0 = 4,
208
- GGML_TYPE_I8,
209
- GGML_TYPE_I16,
210
- GGML_TYPE_I32,
211
- GGML_TYPE_COUNT,
212
- };
213
-
214
- // available tensor operations:
215
- enum ggml_op {
216
- GGML_OP_NONE = 0,
217
-
218
- GGML_OP_DUP,
219
- GGML_OP_ADD,
220
- GGML_OP_SUB,
221
- GGML_OP_MUL,
222
- GGML_OP_DIV,
223
- GGML_OP_SQR,
224
- GGML_OP_SQRT,
225
- GGML_OP_SUM,
226
- GGML_OP_MEAN,
227
- GGML_OP_REPEAT,
228
- GGML_OP_ABS,
229
- GGML_OP_SGN,
230
- GGML_OP_NEG,
231
- GGML_OP_STEP,
232
- GGML_OP_RELU,
233
- GGML_OP_GELU,
234
- GGML_OP_SILU,
235
- GGML_OP_NORM, // normalize
236
- GGML_OP_RMS_NORM,
237
-
238
- GGML_OP_MUL_MAT,
239
-
240
- GGML_OP_SCALE,
241
- GGML_OP_CPY,
242
- GGML_OP_CONT,
243
- GGML_OP_RESHAPE,
244
- GGML_OP_VIEW,
245
- GGML_OP_PERMUTE,
246
- GGML_OP_TRANSPOSE,
247
- GGML_OP_GET_ROWS,
248
- GGML_OP_DIAG_MASK_INF,
249
- GGML_OP_SOFT_MAX,
250
- GGML_OP_ROPE,
251
- GGML_OP_CONV_1D_1S,
252
- GGML_OP_CONV_1D_2S,
253
-
254
- GGML_OP_FLASH_ATTN,
255
- GGML_OP_FLASH_FF,
256
-
257
- GGML_OP_MAP_UNARY,
258
- GGML_OP_MAP_BINARY,
259
-
260
- GGML_OP_COUNT,
261
- };
262
-
263
-
264
- // ggml object
265
- struct ggml_object {
266
- size_t offs;
267
- size_t size;
268
-
269
- struct ggml_object * next;
270
-
271
- char padding[8];
272
- };
273
-
274
- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
275
-
276
- // n-dimensional tensor
277
- struct ggml_tensor {
278
- enum ggml_type type;
279
-
280
- int n_dims;
281
- int64_t ne[GGML_MAX_DIMS]; // number of elements
282
- size_t nb[GGML_MAX_DIMS]; // stride in bytes:
283
- // nb[0] = sizeof(type)
284
- // nb[1] = nb[0] * ne[0] + padding
285
- // nb[i] = nb[i-1] * ne[i-1]
286
-
287
- // compute data
288
- enum ggml_op op;
289
-
290
- bool is_param;
291
-
292
- struct ggml_tensor * grad;
293
- struct ggml_tensor * src0;
294
- struct ggml_tensor * src1;
295
- struct ggml_tensor * opt[GGML_MAX_OPT];
296
-
297
- // thread scheduling
298
- int n_tasks;
299
-
300
- // performance
301
- int perf_runs;
302
- int64_t perf_cycles;
303
- int64_t perf_time_us;
304
-
305
- void * data;
306
- char padding[8];
307
- };
308
-
309
- // computation graph
310
- struct ggml_cgraph {
311
- int n_nodes;
312
- int n_leafs;
313
- int n_threads;
314
-
315
- size_t work_size;
316
- struct ggml_tensor * work;
317
-
318
- struct ggml_tensor * nodes[GGML_MAX_NODES];
319
- struct ggml_tensor * grads[GGML_MAX_NODES];
320
- struct ggml_tensor * leafs[GGML_MAX_NODES];
321
-
322
- // performance
323
- int perf_runs;
324
- int64_t perf_cycles;
325
- int64_t perf_time_us;
326
- };
327
-
328
- // scratch buffer
329
- struct ggml_scratch {
330
- size_t offs;
331
- size_t size;
332
- void * data;
333
- };
211
+ // convert FP16 <-> FP32
212
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
213
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
214
+
215
+ struct ggml_object;
216
+ struct ggml_context;
217
+
218
+ enum ggml_type {
219
+ GGML_TYPE_F32 = 0,
220
+ GGML_TYPE_F16 = 1,
221
+ GGML_TYPE_Q4_0 = 2,
222
+ GGML_TYPE_Q4_1 = 3,
223
+ GGML_TYPE_Q4_2 = 4,
224
+ GGML_TYPE_Q4_3 = 5,
225
+ GGML_TYPE_Q5_0 = 6,
226
+ GGML_TYPE_Q5_1 = 7,
227
+ GGML_TYPE_Q8_0 = 8,
228
+ GGML_TYPE_Q8_1 = 9,
229
+ GGML_TYPE_I8,
230
+ GGML_TYPE_I16,
231
+ GGML_TYPE_I32,
232
+ GGML_TYPE_COUNT,
233
+ };
234
+
235
+ // available tensor operations:
236
+ enum ggml_op {
237
+ GGML_OP_NONE = 0,
238
+
239
+ GGML_OP_DUP,
240
+ GGML_OP_ADD,
241
+ GGML_OP_SUB,
242
+ GGML_OP_MUL,
243
+ GGML_OP_DIV,
244
+ GGML_OP_SQR,
245
+ GGML_OP_SQRT,
246
+ GGML_OP_SUM,
247
+ GGML_OP_MEAN,
248
+ GGML_OP_REPEAT,
249
+ GGML_OP_ABS,
250
+ GGML_OP_SGN,
251
+ GGML_OP_NEG,
252
+ GGML_OP_STEP,
253
+ GGML_OP_RELU,
254
+ GGML_OP_GELU,
255
+ GGML_OP_SILU,
256
+ GGML_OP_NORM, // normalize
257
+ GGML_OP_RMS_NORM,
258
+
259
+ GGML_OP_MUL_MAT,
260
+
261
+ GGML_OP_SCALE,
262
+ GGML_OP_CPY,
263
+ GGML_OP_CONT,
264
+ GGML_OP_RESHAPE,
265
+ GGML_OP_VIEW,
266
+ GGML_OP_PERMUTE,
267
+ GGML_OP_TRANSPOSE,
268
+ GGML_OP_GET_ROWS,
269
+ GGML_OP_DIAG_MASK_INF,
270
+ GGML_OP_SOFT_MAX,
271
+ GGML_OP_ROPE,
272
+ GGML_OP_CONV_1D_1S,
273
+ GGML_OP_CONV_1D_2S,
274
+
275
+ GGML_OP_FLASH_ATTN,
276
+ GGML_OP_FLASH_FF,
277
+
278
+ GGML_OP_MAP_UNARY,
279
+ GGML_OP_MAP_BINARY,
280
+
281
+ GGML_OP_COUNT,
282
+ };
283
+
284
+
285
+ // ggml object
286
+ struct ggml_object {
287
+ size_t offs;
288
+ size_t size;
289
+
290
+ struct ggml_object * next;
291
+
292
+ char padding[8];
293
+ };
294
+
295
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
296
+
297
+ // n-dimensional tensor
298
+ struct ggml_tensor {
299
+ enum ggml_type type;
300
+
301
+ int n_dims;
302
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
303
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
304
+ // nb[0] = sizeof(type)
305
+ // nb[1] = nb[0] * ne[0] + padding
306
+ // nb[i] = nb[i-1] * ne[i-1]
307
+
308
+ // compute data
309
+ enum ggml_op op;
310
+
311
+ bool is_param;
312
+
313
+ struct ggml_tensor * grad;
314
+ struct ggml_tensor * src0;
315
+ struct ggml_tensor * src1;
316
+ struct ggml_tensor * opt[GGML_MAX_OPT];
317
+
318
+ // thread scheduling
319
+ int n_tasks;
320
+
321
+ // performance
322
+ int perf_runs;
323
+ int64_t perf_cycles;
324
+ int64_t perf_time_us;
325
+
326
+ void * data;
327
+ char padding[8];
328
+ };
329
+
330
+ // computation graph
331
+ struct ggml_cgraph {
332
+ int n_nodes;
333
+ int n_leafs;
334
+ int n_threads;
335
+
336
+ size_t work_size;
337
+ struct ggml_tensor * work;
338
+
339
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
340
+ struct ggml_tensor * grads[GGML_MAX_NODES];
341
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
342
+
343
+ // performance
344
+ int perf_runs;
345
+ int64_t perf_cycles;
346
+ int64_t perf_time_us;
347
+ };
348
+
349
+ // scratch buffer
350
+ struct ggml_scratch {
351
+ size_t offs;
352
+ size_t size;
353
+ void * data;
354
+ };
334
355
 
335
- struct ggml_init_params {
336
- // memory pool
337
- size_t mem_size; // bytes
338
- void * mem_buffer; // if NULL, memory will be allocated internally
339
- bool no_alloc; // don't allocate memory for the tensor data
340
- };
356
+ struct ggml_init_params {
357
+ // memory pool
358
+ size_t mem_size; // bytes
359
+ void * mem_buffer; // if NULL, memory will be allocated internally
360
+ bool no_alloc; // don't allocate memory for the tensor data
361
+ };
341
362
 
342
- void ggml_time_init(void); // call this once at the beginning of the program
343
- int64_t ggml_time_ms(void);
344
- int64_t ggml_time_us(void);
345
- int64_t ggml_cycles(void);
346
- int64_t ggml_cycles_per_ms(void);
363
+ // misc
347
364
 
348
- void ggml_print_object (const struct ggml_object * obj);
349
- void ggml_print_objects(const struct ggml_context * ctx);
365
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
366
+ GGML_API int64_t ggml_time_ms(void);
367
+ GGML_API int64_t ggml_time_us(void);
368
+ GGML_API int64_t ggml_cycles(void);
369
+ GGML_API int64_t ggml_cycles_per_ms(void);
350
370
 
351
- int64_t ggml_nelements(const struct ggml_tensor * tensor);
352
- size_t ggml_nbytes (const struct ggml_tensor * tensor);
371
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
372
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);
353
373
 
354
- int ggml_blck_size (enum ggml_type type);
355
- size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
356
- float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
374
+ GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
375
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
357
376
 
358
- const char * ggml_type_name(enum ggml_type type);
377
+ GGML_API int ggml_blck_size (enum ggml_type type);
378
+ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
379
+ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
359
380
 
360
- size_t ggml_element_size(const struct ggml_tensor * tensor);
381
+ GGML_API const char * ggml_type_name(enum ggml_type type);
361
382
 
362
- struct ggml_context * ggml_init(struct ggml_init_params params);
363
- void ggml_free(struct ggml_context * ctx);
383
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
364
384
 
365
- size_t ggml_used_mem(const struct ggml_context * ctx);
385
+ GGML_API bool ggml_is_quantized(enum ggml_type type);
366
386
 
367
- size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
387
+ // main
368
388
 
369
- struct ggml_tensor * ggml_new_tensor(
370
- struct ggml_context * ctx,
371
- enum ggml_type type,
372
- int n_dims,
373
- const int64_t *ne);
374
-
375
- struct ggml_tensor * ggml_new_tensor_1d(
376
- struct ggml_context * ctx,
377
- enum ggml_type type,
378
- int64_t ne0);
379
-
380
- struct ggml_tensor * ggml_new_tensor_2d(
381
- struct ggml_context * ctx,
382
- enum ggml_type type,
383
- int64_t ne0,
384
- int64_t ne1);
385
-
386
- struct ggml_tensor * ggml_new_tensor_3d(
387
- struct ggml_context * ctx,
388
- enum ggml_type type,
389
- int64_t ne0,
390
- int64_t ne1,
391
- int64_t ne2);
392
-
393
- struct ggml_tensor * ggml_new_tensor_4d(
394
- struct ggml_context * ctx,
395
- enum ggml_type type,
396
- int64_t ne0,
397
- int64_t ne1,
398
- int64_t ne2,
399
- int64_t ne3);
400
-
401
- struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
402
- struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
403
-
404
- struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
405
- struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
406
-
407
- struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
408
- struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
409
- struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
410
-
411
- int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
412
- void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
413
-
414
- float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
415
- void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
416
-
417
- void * ggml_get_data (const struct ggml_tensor * tensor);
418
- float * ggml_get_data_f32(const struct ggml_tensor * tensor);
419
-
420
- //
421
- // operations on tensors with backpropagation
422
- //
423
-
424
- struct ggml_tensor * ggml_dup(
425
- struct ggml_context * ctx,
426
- struct ggml_tensor * a);
427
-
428
- struct ggml_tensor * ggml_add(
429
- struct ggml_context * ctx,
430
- struct ggml_tensor * a,
431
- struct ggml_tensor * b);
389
+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
390
+ GGML_API void ggml_free(struct ggml_context * ctx);
432
391
 
392
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
433
393
 
434
- struct ggml_tensor * ggml_add_inplace(
435
- struct ggml_context * ctx,
436
- struct ggml_tensor * a,
437
- struct ggml_tensor * b);
394
+ GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
438
395
 
439
- struct ggml_tensor * ggml_sub(
440
- struct ggml_context * ctx,
441
- struct ggml_tensor * a,
442
- struct ggml_tensor * b);
396
+ GGML_API struct ggml_tensor * ggml_new_tensor(
397
+ struct ggml_context * ctx,
398
+ enum ggml_type type,
399
+ int n_dims,
400
+ const int64_t *ne);
443
401
 
444
- struct ggml_tensor * ggml_mul(
445
- struct ggml_context * ctx,
446
- struct ggml_tensor * a,
447
- struct ggml_tensor * b);
402
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
403
+ struct ggml_context * ctx,
404
+ enum ggml_type type,
405
+ int64_t ne0);
448
406
 
449
- struct ggml_tensor * ggml_div(
450
- struct ggml_context * ctx,
451
- struct ggml_tensor * a,
452
- struct ggml_tensor * b);
453
-
454
- struct ggml_tensor * ggml_sqr(
455
- struct ggml_context * ctx,
456
- struct ggml_tensor * a);
457
-
458
- struct ggml_tensor * ggml_sqrt(
459
- struct ggml_context * ctx,
460
- struct ggml_tensor * a);
461
-
462
- // return scalar
463
- // TODO: compute sum along rows
464
- struct ggml_tensor * ggml_sum(
465
- struct ggml_context * ctx,
466
- struct ggml_tensor * a);
467
-
468
- // mean along rows
469
- struct ggml_tensor * ggml_mean(
470
- struct ggml_context * ctx,
471
- struct ggml_tensor * a);
472
-
473
- // if a is the same shape as b, and a is not parameter, return a
474
- // otherwise, return a new tensor: repeat(a) to fit in b
475
- struct ggml_tensor * ggml_repeat(
476
- struct ggml_context * ctx,
477
- struct ggml_tensor * a,
478
- struct ggml_tensor * b);
479
-
480
- struct ggml_tensor * ggml_abs(
481
- struct ggml_context * ctx,
482
- struct ggml_tensor * a);
483
-
484
- struct ggml_tensor * ggml_sgn(
485
- struct ggml_context * ctx,
486
- struct ggml_tensor * a);
487
-
488
- struct ggml_tensor * ggml_neg(
489
- struct ggml_context * ctx,
490
- struct ggml_tensor * a);
491
-
492
- struct ggml_tensor * ggml_step(
493
- struct ggml_context * ctx,
494
- struct ggml_tensor * a);
495
-
496
- struct ggml_tensor * ggml_relu(
497
- struct ggml_context * ctx,
498
- struct ggml_tensor * a);
499
-
500
- // TODO: double-check this computation is correct
501
- struct ggml_tensor * ggml_gelu(
502
- struct ggml_context * ctx,
503
- struct ggml_tensor * a);
504
-
505
- struct ggml_tensor * ggml_silu(
506
- struct ggml_context * ctx,
507
- struct ggml_tensor * a);
508
-
509
- // normalize along rows
510
- // TODO: eps is hardcoded to 1e-5 for now
511
- struct ggml_tensor * ggml_norm(
512
- struct ggml_context * ctx,
513
- struct ggml_tensor * a);
514
-
515
- struct ggml_tensor * ggml_rms_norm(
516
- struct ggml_context * ctx,
517
- struct ggml_tensor * a);
518
-
519
- // A: m rows, n columns
520
- // B: p rows, n columns (i.e. we transpose it internally)
521
- // result is m columns, p rows
522
- struct ggml_tensor * ggml_mul_mat(
523
- struct ggml_context * ctx,
524
- struct ggml_tensor * a,
525
- struct ggml_tensor * b);
526
-
527
- //
528
- // operations on tensors without backpropagation
529
- //
530
-
531
- // in-place, returns view(a)
532
- struct ggml_tensor * ggml_scale(
533
- struct ggml_context * ctx,
534
- struct ggml_tensor * a,
535
- struct ggml_tensor * b);
536
-
537
- // a -> b, return view(b)
538
- struct ggml_tensor * ggml_cpy(
539
- struct ggml_context * ctx,
540
- struct ggml_tensor * a,
541
- struct ggml_tensor * b);
542
-
543
- // make contiguous
544
- struct ggml_tensor * ggml_cont(
545
- struct ggml_context * ctx,
546
- struct ggml_tensor * a);
547
-
548
- // return view(a), b specifies the new shape
549
- // TODO: when we start computing gradient, make a copy instead of view
550
- struct ggml_tensor * ggml_reshape(
551
- struct ggml_context * ctx,
552
- struct ggml_tensor * a,
553
- struct ggml_tensor * b);
554
-
555
- // return view(a)
556
- // TODO: when we start computing gradient, make a copy instead of view
557
- struct ggml_tensor * ggml_reshape_2d(
558
- struct ggml_context * ctx,
559
- struct ggml_tensor * a,
560
- int64_t ne0,
561
- int64_t ne1);
562
-
563
- // return view(a)
564
- // TODO: when we start computing gradient, make a copy instead of view
565
- struct ggml_tensor * ggml_reshape_3d(
566
- struct ggml_context * ctx,
567
- struct ggml_tensor * a,
568
- int64_t ne0,
569
- int64_t ne1,
570
- int64_t ne2);
571
-
572
- // offset in bytes
573
- struct ggml_tensor * ggml_view_1d(
574
- struct ggml_context * ctx,
575
- struct ggml_tensor * a,
576
- int64_t ne0,
577
- size_t offset);
578
-
579
- struct ggml_tensor * ggml_view_2d(
580
- struct ggml_context * ctx,
581
- struct ggml_tensor * a,
582
- int64_t ne0,
583
- int64_t ne1,
584
- size_t nb1, // row stride in bytes
585
- size_t offset);
586
-
587
- struct ggml_tensor * ggml_view_3d(
588
- struct ggml_context * ctx,
589
- struct ggml_tensor * a,
590
- int64_t ne0,
591
- int64_t ne1,
592
- int64_t ne2,
593
- size_t nb1, // row stride in bytes
594
- size_t nb2, // slice stride in bytes
595
- size_t offset);
596
-
597
- struct ggml_tensor * ggml_permute(
598
- struct ggml_context * ctx,
599
- struct ggml_tensor * a,
600
- int axis0,
601
- int axis1,
602
- int axis2,
603
- int axis3);
604
-
605
- // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
606
- struct ggml_tensor * ggml_transpose(
607
- struct ggml_context * ctx,
608
- struct ggml_tensor * a);
609
-
610
- struct ggml_tensor * ggml_get_rows(
611
- struct ggml_context * ctx,
612
- struct ggml_tensor * a,
613
- struct ggml_tensor * b);
614
-
615
- // set elements above the diagonal to -INF
616
- // in-place, returns view(a)
617
- struct ggml_tensor * ggml_diag_mask_inf(
618
- struct ggml_context * ctx,
619
- struct ggml_tensor * a,
620
- int n_past);
621
-
622
- // in-place, returns view(a)
623
- struct ggml_tensor * ggml_soft_max(
624
- struct ggml_context * ctx,
625
- struct ggml_tensor * a);
626
-
627
- // rotary position embedding
628
- // in-place, returns view(a)
629
- // if mode == 1, skip n_past elements
630
- // TODO: avoid creating a new tensor every time
631
- struct ggml_tensor * ggml_rope(
632
- struct ggml_context * ctx,
633
- struct ggml_tensor * a,
634
- int n_past,
635
- int n_dims,
636
- int mode);
637
-
638
- // padding = 1
639
- // TODO: we don't support extra parameters for now
640
- // that's why we are hard-coding the stride, padding, and dilation
641
- // not great ..
642
- struct ggml_tensor * ggml_conv_1d_1s(
643
- struct ggml_context * ctx,
644
- struct ggml_tensor * a,
645
- struct ggml_tensor * b);
646
-
647
- struct ggml_tensor * ggml_conv_1d_2s(
648
- struct ggml_context * ctx,
649
- struct ggml_tensor * a,
650
- struct ggml_tensor * b);
651
-
652
- struct ggml_tensor * ggml_flash_attn(
653
- struct ggml_context * ctx,
654
- struct ggml_tensor * q,
655
- struct ggml_tensor * k,
656
- struct ggml_tensor * v,
657
- bool masked);
658
-
659
- struct ggml_tensor * ggml_flash_ff(
660
- struct ggml_context * ctx,
661
- struct ggml_tensor * a,
662
- struct ggml_tensor * b0,
663
- struct ggml_tensor * b1,
664
- struct ggml_tensor * c0,
665
- struct ggml_tensor * c1);
666
-
667
- // Mapping operations
668
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
669
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
670
-
671
- struct ggml_tensor * ggml_map_unary_f32(
672
- struct ggml_context * ctx,
673
- struct ggml_tensor * a,
674
- const ggml_unary_op_f32_t fun);
675
-
676
- struct ggml_tensor * ggml_map_binary_f32(
677
- struct ggml_context * ctx,
678
- struct ggml_tensor * a,
679
- struct ggml_tensor * b,
680
- const ggml_binary_op_f32_t fun);
681
-
682
- //
683
- // automatic differentiation
684
- //
685
-
686
- void ggml_set_param(
687
- struct ggml_context * ctx,
688
- struct ggml_tensor * tensor);
689
-
690
- void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
691
-
692
- struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
693
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
694
-
695
- void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
696
- void ggml_graph_reset (struct ggml_cgraph * cgraph);
697
-
698
- // print info and performance information for the graph
699
- void ggml_graph_print(const struct ggml_cgraph * cgraph);
700
-
701
- // dump the graph into a file using the dot format
702
- void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
703
-
704
- //
705
- // optimization
706
- //
707
-
708
- // optimization methods
709
- enum ggml_opt_type {
710
- GGML_OPT_ADAM,
711
- GGML_OPT_LBFGS,
712
- };
713
-
714
- // linesearch methods
715
- enum ggml_linesearch {
716
- GGML_LINESEARCH_DEFAULT = 1,
717
-
718
- GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
719
- GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
720
- GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
721
- };
722
-
723
- // optimization return values
724
- enum ggml_opt_result {
725
- GGML_OPT_OK = 0,
726
- GGML_OPT_DID_NOT_CONVERGE,
727
- GGML_OPT_NO_CONTEXT,
728
- GGML_OPT_INVALID_WOLFE,
729
- GGML_OPT_FAIL,
407
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
408
+ struct ggml_context * ctx,
409
+ enum ggml_type type,
410
+ int64_t ne0,
411
+ int64_t ne1);
730
412
 
731
- GGML_LINESEARCH_FAIL = -128,
732
- GGML_LINESEARCH_MINIMUM_STEP,
733
- GGML_LINESEARCH_MAXIMUM_STEP,
734
- GGML_LINESEARCH_MAXIMUM_ITERATIONS,
735
- GGML_LINESEARCH_INVALID_PARAMETERS,
736
- };
413
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
414
+ struct ggml_context * ctx,
415
+ enum ggml_type type,
416
+ int64_t ne0,
417
+ int64_t ne1,
418
+ int64_t ne2);
737
419
 
738
- // optimization parameters
739
- //
740
- // see ggml.c (ggml_opt_default_params) for default values
741
- //
742
- struct ggml_opt_params {
743
- enum ggml_opt_type type;
420
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
421
+ struct ggml_context * ctx,
422
+ enum ggml_type type,
423
+ int64_t ne0,
424
+ int64_t ne1,
425
+ int64_t ne2,
426
+ int64_t ne3);
427
+
428
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
429
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
430
+
431
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
432
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
433
+
434
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
435
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
436
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
437
+
438
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
439
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
440
+
441
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
442
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
443
+
444
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
445
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
446
+
447
+ //
448
+ // operations on tensors with backpropagation
449
+ //
744
450
 
745
- int n_threads;
451
+ GGML_API struct ggml_tensor * ggml_dup(
452
+ struct ggml_context * ctx,
453
+ struct ggml_tensor * a);
454
+
455
+ GGML_API struct ggml_tensor * ggml_add(
456
+ struct ggml_context * ctx,
457
+ struct ggml_tensor * a,
458
+ struct ggml_tensor * b);
459
+
460
+ GGML_API struct ggml_tensor * ggml_add_inplace(
461
+ struct ggml_context * ctx,
462
+ struct ggml_tensor * a,
463
+ struct ggml_tensor * b);
464
+
465
+ GGML_API struct ggml_tensor * ggml_sub(
466
+ struct ggml_context * ctx,
467
+ struct ggml_tensor * a,
468
+ struct ggml_tensor * b);
469
+
470
+ GGML_API struct ggml_tensor * ggml_mul(
471
+ struct ggml_context * ctx,
472
+ struct ggml_tensor * a,
473
+ struct ggml_tensor * b);
474
+
475
+ GGML_API struct ggml_tensor * ggml_div(
476
+ struct ggml_context * ctx,
477
+ struct ggml_tensor * a,
478
+ struct ggml_tensor * b);
479
+
480
+ GGML_API struct ggml_tensor * ggml_sqr(
481
+ struct ggml_context * ctx,
482
+ struct ggml_tensor * a);
483
+
484
+ GGML_API struct ggml_tensor * ggml_sqrt(
485
+ struct ggml_context * ctx,
486
+ struct ggml_tensor * a);
487
+
488
+ // return scalar
489
+ // TODO: compute sum along rows
490
+ GGML_API struct ggml_tensor * ggml_sum(
491
+ struct ggml_context * ctx,
492
+ struct ggml_tensor * a);
493
+
494
+ // mean along rows
495
+ GGML_API struct ggml_tensor * ggml_mean(
496
+ struct ggml_context * ctx,
497
+ struct ggml_tensor * a);
498
+
499
+ // if a is the same shape as b, and a is not parameter, return a
500
+ // otherwise, return a new tensor: repeat(a) to fit in b
501
+ GGML_API struct ggml_tensor * ggml_repeat(
502
+ struct ggml_context * ctx,
503
+ struct ggml_tensor * a,
504
+ struct ggml_tensor * b);
505
+
506
+ GGML_API struct ggml_tensor * ggml_abs(
507
+ struct ggml_context * ctx,
508
+ struct ggml_tensor * a);
509
+
510
+ GGML_API struct ggml_tensor * ggml_sgn(
511
+ struct ggml_context * ctx,
512
+ struct ggml_tensor * a);
513
+
514
+ GGML_API struct ggml_tensor * ggml_neg(
515
+ struct ggml_context * ctx,
516
+ struct ggml_tensor * a);
517
+
518
+ GGML_API struct ggml_tensor * ggml_step(
519
+ struct ggml_context * ctx,
520
+ struct ggml_tensor * a);
521
+
522
+ GGML_API struct ggml_tensor * ggml_relu(
523
+ struct ggml_context * ctx,
524
+ struct ggml_tensor * a);
525
+
526
+ // TODO: double-check this computation is correct
527
+ GGML_API struct ggml_tensor * ggml_gelu(
528
+ struct ggml_context * ctx,
529
+ struct ggml_tensor * a);
530
+
531
+ GGML_API struct ggml_tensor * ggml_silu(
532
+ struct ggml_context * ctx,
533
+ struct ggml_tensor * a);
534
+
535
+ // normalize along rows
536
+ // TODO: eps is hardcoded to 1e-5 for now
537
+ GGML_API struct ggml_tensor * ggml_norm(
538
+ struct ggml_context * ctx,
539
+ struct ggml_tensor * a);
540
+
541
+ GGML_API struct ggml_tensor * ggml_rms_norm(
542
+ struct ggml_context * ctx,
543
+ struct ggml_tensor * a);
544
+
545
+ // A: m rows, n columns
546
+ // B: p rows, n columns (i.e. we transpose it internally)
547
+ // result is m columns, p rows
548
+ GGML_API struct ggml_tensor * ggml_mul_mat(
549
+ struct ggml_context * ctx,
550
+ struct ggml_tensor * a,
551
+ struct ggml_tensor * b);
746
552
 
747
- // delta-based convergence test
748
553
  //
749
- // if past == 0 - disabled
750
- // if past > 0:
751
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
554
+ // operations on tensors without backpropagation
752
555
  //
753
- int past;
754
- float delta;
755
556
 
756
- // maximum number of iterations without improvement
557
+ // in-place, returns view(a)
558
+ GGML_API struct ggml_tensor * ggml_scale(
559
+ struct ggml_context * ctx,
560
+ struct ggml_tensor * a,
561
+ struct ggml_tensor * b);
562
+
563
+ // a -> b, return view(b)
564
+ GGML_API struct ggml_tensor * ggml_cpy(
565
+ struct ggml_context * ctx,
566
+ struct ggml_tensor * a,
567
+ struct ggml_tensor * b);
568
+
569
+ // make contiguous
570
+ GGML_API struct ggml_tensor * ggml_cont(
571
+ struct ggml_context * ctx,
572
+ struct ggml_tensor * a);
573
+
574
+ // return view(a), b specifies the new shape
575
+ // TODO: when we start computing gradient, make a copy instead of view
576
+ GGML_API struct ggml_tensor * ggml_reshape(
577
+ struct ggml_context * ctx,
578
+ struct ggml_tensor * a,
579
+ struct ggml_tensor * b);
580
+
581
+ // return view(a)
582
+ // TODO: when we start computing gradient, make a copy instead of view
583
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
584
+ struct ggml_context * ctx,
585
+ struct ggml_tensor * a,
586
+ int64_t ne0,
587
+ int64_t ne1);
588
+
589
+ // return view(a)
590
+ // TODO: when we start computing gradient, make a copy instead of view
591
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
592
+ struct ggml_context * ctx,
593
+ struct ggml_tensor * a,
594
+ int64_t ne0,
595
+ int64_t ne1,
596
+ int64_t ne2);
597
+
598
+ // offset in bytes
599
+ GGML_API struct ggml_tensor * ggml_view_1d(
600
+ struct ggml_context * ctx,
601
+ struct ggml_tensor * a,
602
+ int64_t ne0,
603
+ size_t offset);
604
+
605
+ GGML_API struct ggml_tensor * ggml_view_2d(
606
+ struct ggml_context * ctx,
607
+ struct ggml_tensor * a,
608
+ int64_t ne0,
609
+ int64_t ne1,
610
+ size_t nb1, // row stride in bytes
611
+ size_t offset);
612
+
613
+ GGML_API struct ggml_tensor * ggml_view_3d(
614
+ struct ggml_context * ctx,
615
+ struct ggml_tensor * a,
616
+ int64_t ne0,
617
+ int64_t ne1,
618
+ int64_t ne2,
619
+ size_t nb1, // row stride in bytes
620
+ size_t nb2, // slice stride in bytes
621
+ size_t offset);
622
+
623
+ GGML_API struct ggml_tensor * ggml_permute(
624
+ struct ggml_context * ctx,
625
+ struct ggml_tensor * a,
626
+ int axis0,
627
+ int axis1,
628
+ int axis2,
629
+ int axis3);
630
+
631
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
632
+ GGML_API struct ggml_tensor * ggml_transpose(
633
+ struct ggml_context * ctx,
634
+ struct ggml_tensor * a);
635
+
636
+ GGML_API struct ggml_tensor * ggml_get_rows(
637
+ struct ggml_context * ctx,
638
+ struct ggml_tensor * a,
639
+ struct ggml_tensor * b);
640
+
641
+ // set elements above the diagonal to -INF
642
+ // in-place, returns view(a)
643
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
644
+ struct ggml_context * ctx,
645
+ struct ggml_tensor * a,
646
+ int n_past);
647
+
648
+ // in-place, returns view(a)
649
+ GGML_API struct ggml_tensor * ggml_soft_max(
650
+ struct ggml_context * ctx,
651
+ struct ggml_tensor * a);
652
+
653
+ // rotary position embedding
654
+ // in-place, returns view(a)
655
+ // if mode & 1 == 1, skip n_past elements
656
+ // if mode & 2 == 1, GPT-NeoX style
657
+ // TODO: avoid creating a new tensor every time
658
+ GGML_API struct ggml_tensor * ggml_rope(
659
+ struct ggml_context * ctx,
660
+ struct ggml_tensor * a,
661
+ int n_past,
662
+ int n_dims,
663
+ int mode);
664
+
665
+ // padding = 1
666
+ // TODO: we don't support extra parameters for now
667
+ // that's why we are hard-coding the stride, padding, and dilation
668
+ // not great ..
669
+ GGML_API struct ggml_tensor * ggml_conv_1d_1s(
670
+ struct ggml_context * ctx,
671
+ struct ggml_tensor * a,
672
+ struct ggml_tensor * b);
673
+
674
+ GGML_API struct ggml_tensor * ggml_conv_1d_2s(
675
+ struct ggml_context * ctx,
676
+ struct ggml_tensor * a,
677
+ struct ggml_tensor * b);
678
+
679
+ GGML_API struct ggml_tensor * ggml_flash_attn(
680
+ struct ggml_context * ctx,
681
+ struct ggml_tensor * q,
682
+ struct ggml_tensor * k,
683
+ struct ggml_tensor * v,
684
+ bool masked);
685
+
686
+ GGML_API struct ggml_tensor * ggml_flash_ff(
687
+ struct ggml_context * ctx,
688
+ struct ggml_tensor * a,
689
+ struct ggml_tensor * b0,
690
+ struct ggml_tensor * b1,
691
+ struct ggml_tensor * c0,
692
+ struct ggml_tensor * c1);
693
+
694
+ // Mapping operations
695
+ GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
696
+ GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
697
+
698
+ GGML_API struct ggml_tensor * ggml_map_unary_f32(
699
+ struct ggml_context * ctx,
700
+ struct ggml_tensor * a,
701
+ const ggml_unary_op_f32_t fun);
702
+
703
+ GGML_API struct ggml_tensor * ggml_map_binary_f32(
704
+ struct ggml_context * ctx,
705
+ struct ggml_tensor * a,
706
+ struct ggml_tensor * b,
707
+ const ggml_binary_op_f32_t fun);
708
+
757
709
  //
758
- // if 0 - disabled
759
- // if > 0:
760
- // assume convergence if no cost improvement in this number of iterations
710
+ // automatic differentiation
761
711
  //
762
- int max_no_improvement;
763
712
 
764
- bool print_forward_graph;
765
- bool print_backward_graph;
713
+ GGML_API void ggml_set_param(
714
+ struct ggml_context * ctx,
715
+ struct ggml_tensor * tensor);
766
716
 
767
- // ADAM parameters
768
- struct {
769
- int n_iter;
717
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
770
718
 
771
- float alpha; // learning rate
772
- float beta1;
773
- float beta2;
774
- float eps; // epsilon for numerical stability
775
- float eps_f; // epsilon for convergence test
776
- float eps_g; // epsilon for convergence test
777
- } adam;
719
+ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
720
+ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
778
721
 
779
- // LBFGS parameters
780
- struct {
781
- int m; // number of corrections to approximate the inv. Hessian
782
- int n_iter;
783
- int max_linesearch;
722
+ GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
723
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
784
724
 
785
- float eps; // convergence tolerance
786
- float ftol; // line search tolerance
787
- float wolfe;
788
- float min_step;
789
- float max_step;
725
+ // print info and performance information for the graph
726
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
790
727
 
791
- enum ggml_linesearch linesearch;
792
- } lbfgs;
793
- };
728
+ // dump the graph into a file using the dot format
729
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
794
730
 
795
- struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
731
+ //
732
+ // optimization
733
+ //
796
734
 
797
- // optimize the function defined by the tensor f
798
- enum ggml_opt_result ggml_opt(
799
- struct ggml_context * ctx,
800
- struct ggml_opt_params params,
801
- struct ggml_tensor * f);
735
+ // optimization methods
736
+ enum ggml_opt_type {
737
+ GGML_OPT_ADAM,
738
+ GGML_OPT_LBFGS,
739
+ };
740
+
741
+ // linesearch methods
742
+ enum ggml_linesearch {
743
+ GGML_LINESEARCH_DEFAULT = 1,
744
+
745
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
746
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
747
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
748
+ };
749
+
750
+ // optimization return values
751
+ enum ggml_opt_result {
752
+ GGML_OPT_OK = 0,
753
+ GGML_OPT_DID_NOT_CONVERGE,
754
+ GGML_OPT_NO_CONTEXT,
755
+ GGML_OPT_INVALID_WOLFE,
756
+ GGML_OPT_FAIL,
757
+
758
+ GGML_LINESEARCH_FAIL = -128,
759
+ GGML_LINESEARCH_MINIMUM_STEP,
760
+ GGML_LINESEARCH_MAXIMUM_STEP,
761
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
762
+ GGML_LINESEARCH_INVALID_PARAMETERS,
763
+ };
764
+
765
+ // optimization parameters
766
+ //
767
+ // see ggml.c (ggml_opt_default_params) for default values
768
+ //
769
+ struct ggml_opt_params {
770
+ enum ggml_opt_type type;
771
+
772
+ int n_threads;
773
+
774
+ // delta-based convergence test
775
+ //
776
+ // if past == 0 - disabled
777
+ // if past > 0:
778
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
779
+ //
780
+ int past;
781
+ float delta;
782
+
783
+ // maximum number of iterations without improvement
784
+ //
785
+ // if 0 - disabled
786
+ // if > 0:
787
+ // assume convergence if no cost improvement in this number of iterations
788
+ //
789
+ int max_no_improvement;
790
+
791
+ bool print_forward_graph;
792
+ bool print_backward_graph;
793
+
794
+ // ADAM parameters
795
+ struct {
796
+ int n_iter;
797
+
798
+ float alpha; // learning rate
799
+ float beta1;
800
+ float beta2;
801
+ float eps; // epsilon for numerical stability
802
+ float eps_f; // epsilon for convergence test
803
+ float eps_g; // epsilon for convergence test
804
+ } adam;
805
+
806
+ // LBFGS parameters
807
+ struct {
808
+ int m; // number of corrections to approximate the inv. Hessian
809
+ int n_iter;
810
+ int max_linesearch;
811
+
812
+ float eps; // convergence tolerance
813
+ float ftol; // line search tolerance
814
+ float wolfe;
815
+ float min_step;
816
+ float max_step;
817
+
818
+ enum ggml_linesearch linesearch;
819
+ } lbfgs;
820
+ };
821
+
822
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
823
+
824
+ // optimize the function defined by the tensor f
825
+ GGML_API enum ggml_opt_result ggml_opt(
826
+ struct ggml_context * ctx,
827
+ struct ggml_opt_params params,
828
+ struct ggml_tensor * f);
802
829
 
803
- //
804
- // quantization
805
- //
830
+ //
831
+ // quantization
832
+ //
806
833
 
807
- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
808
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
834
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
835
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
836
+ GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
837
+ GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
838
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
839
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
840
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
809
841
 
810
- //
811
- // system info
812
- //
842
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
813
843
 
814
- int ggml_cpu_has_avx(void);
815
- int ggml_cpu_has_avx2(void);
816
- int ggml_cpu_has_avx512(void);
817
- int ggml_cpu_has_avx512_vbmi(void);
818
- int ggml_cpu_has_avx512_vnni(void);
819
- int ggml_cpu_has_fma(void);
820
- int ggml_cpu_has_neon(void);
821
- int ggml_cpu_has_arm_fma(void);
822
- int ggml_cpu_has_f16c(void);
823
- int ggml_cpu_has_fp16_va(void);
824
- int ggml_cpu_has_wasm_simd(void);
825
- int ggml_cpu_has_blas(void);
826
- int ggml_cpu_has_sse3(void);
827
- int ggml_cpu_has_vsx(void);
844
+ //
845
+ // system info
846
+ //
828
847
 
848
+ GGML_API int ggml_cpu_has_avx (void);
849
+ GGML_API int ggml_cpu_has_avx2 (void);
850
+ GGML_API int ggml_cpu_has_avx512 (void);
851
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
852
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
853
+ GGML_API int ggml_cpu_has_fma (void);
854
+ GGML_API int ggml_cpu_has_neon (void);
855
+ GGML_API int ggml_cpu_has_arm_fma (void);
856
+ GGML_API int ggml_cpu_has_f16c (void);
857
+ GGML_API int ggml_cpu_has_fp16_va (void);
858
+ GGML_API int ggml_cpu_has_wasm_simd (void);
859
+ GGML_API int ggml_cpu_has_blas (void);
860
+ GGML_API int ggml_cpu_has_cublas (void);
861
+ GGML_API int ggml_cpu_has_clblast (void);
862
+ GGML_API int ggml_cpu_has_gpublas (void);
863
+ GGML_API int ggml_cpu_has_sse3 (void);
864
+ GGML_API int ggml_cpu_has_vsx (void);
829
865
 
830
- //
831
- // Internal types and functions exposed for tests and benchmarks
832
- //
866
+ //
867
+ // Internal types and functions exposed for tests and benchmarks
868
+ //
833
869
 
834
870
  #ifdef __cplusplus
835
- // restrict not standard in C++
871
+ // restrict not standard in C++
836
872
  #define GGML_RESTRICT
837
873
  #else
838
874
  #define GGML_RESTRICT restrict
839
875
  #endif
840
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
841
- typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
842
- typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
843
-
844
- typedef struct {
845
- dequantize_row_q_t dequantize_row_q;
846
- quantize_row_q_t quantize_row_q;
847
- quantize_row_q_t quantize_row_q_reference;
848
- quantize_row_q_t quantize_row_q_dot;
849
- vec_dot_q_t vec_dot_q;
850
- } quantize_fns_t;
851
-
852
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
876
+ typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
877
+ typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
878
+ typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
879
+
880
+ typedef struct {
881
+ dequantize_row_q_t dequantize_row_q;
882
+ quantize_row_q_t quantize_row_q;
883
+ quantize_row_q_t quantize_row_q_reference;
884
+ quantize_row_q_t quantize_row_q_dot;
885
+ vec_dot_q_t vec_dot_q;
886
+ enum ggml_type vec_dot_type;
887
+ } quantize_fns_t;
888
+
889
+ quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
853
890
 
854
891
  #ifdef __cplusplus
855
892
  }
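
Taken as a whole, this second hunk mostly prefixes the existing public declarations with GGML_API and widens the quantization surface: new ggml_type values (Q4_2, Q4_3, Q5_0, Q5_1, Q8_1, with Q8_0 renumbered), the matching ggml_quantize_q4_2/q4_3/q5_0/q5_1/q8_0 helpers, ggml_quantize_chunk, ggml_is_quantized, new backend probes (ggml_cpu_has_cublas, ggml_cpu_has_clblast, ggml_cpu_has_gpublas), and a vec_dot_type field in quantize_fns_t. As a quick orientation to the API declared above, here is a minimal usage sketch using only functions visible in this header; the pool size, tensor values, and thread count are arbitrary illustration, and error handling is omitted.

// Minimal sketch: build and evaluate a tiny graph with the API declared above.
// Sizes and values are arbitrary; error handling is omitted for brevity.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024, // 16 MB pool (arbitrary)
        /* .mem_buffer = */ NULL,             // let ggml allocate internally
        /* .no_alloc   = */ false,
    };

    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_f32(ctx, 2.0f);
    struct ggml_tensor * b = ggml_new_f32(ctx, 3.0f);
    struct ggml_tensor * c = ggml_mul(ctx, a, b); // evaluated when the graph runs

    struct ggml_cgraph gf = ggml_build_forward(c);
    gf.n_threads = 1;
    ggml_graph_compute(ctx, &gf);

    printf("a * b      = %f\n", ggml_get_f32_1d(c, 0));
    printf("used bytes = %zu\n", ggml_used_mem(ctx));

    ggml_free(ctx);
    return 0;
}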