llama_cpp 0.0.5 → 0.0.7

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
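
Among the header changes shown below, 0.0.7 introduces a GGML_API export macro for shared-library builds and the GGML_FILE_MAGIC / GGML_FILE_VERSION constants. The following sketch is purely illustrative and not part of the packaged diff: the helper name check_ggml_header and the assumption that a model file begins with a 4-byte magic followed by a 4-byte version word are hypothetical; only the two constant values are taken from the header below.

    /* Hypothetical sketch: validate the leading magic/version of a ggml file.
       Assumes the file starts with a 4-byte magic and a 4-byte version word;
       only the constant values are taken from the diffed header. */
    #include <stdint.h>
    #include <stdio.h>

    #define GGML_FILE_MAGIC   0x67676d6c /* "ggml" */
    #define GGML_FILE_VERSION 1

    static int check_ggml_header(const char * path) {
        FILE * f = fopen(path, "rb");
        if (!f) {
            return 0;
        }

        uint32_t magic   = 0;
        uint32_t version = 0;
        int ok = fread(&magic,   sizeof(magic),   1, f) == 1 &&
                 fread(&version, sizeof(version), 1, f) == 1 &&
                 magic   == GGML_FILE_MAGIC &&
                 version == GGML_FILE_VERSION;

        fclose(f);
        return ok;
    }
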
@@ -169,14 +169,27 @@
169
169
  //
170
170
  //
171
171
 
172
- #ifdef __cplusplus
173
- extern "C" {
172
+ #ifdef GGML_SHARED
173
+ # if defined(_WIN32) && !defined(__MINGW32__)
174
+ # ifdef GGML_BUILD
175
+ # define GGML_API __declspec(dllexport)
176
+ # else
177
+ # define GGML_API __declspec(dllimport)
178
+ # endif
179
+ # else
180
+ # define GGML_API __attribute__ ((visibility ("default")))
181
+ # endif
182
+ #else
183
+ # define GGML_API
174
184
  #endif
175
185
 
176
186
  #include <stdint.h>
177
187
  #include <stddef.h>
178
188
  #include <stdbool.h>
179
189
 
190
+ #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
+ #define GGML_FILE_VERSION 1
192
+
180
193
  #define GGML_MAX_DIMS 4
181
194
  #define GGML_MAX_NODES 4096
182
195
  #define GGML_MAX_PARAMS 16
@@ -184,672 +197,696 @@ extern "C" {
184
197
  #define GGML_MAX_OPT 4
185
198
  #define GGML_DEFAULT_N_THREADS 4
186
199
 
200
+ #ifdef __cplusplus
201
+ extern "C" {
202
+ #endif
203
+
187
204
  #ifdef __ARM_NEON
188
- // we use the built-in 16-bit float type
189
- typedef __fp16 ggml_fp16_t;
205
+ // we use the built-in 16-bit float type
206
+ typedef __fp16 ggml_fp16_t;
190
207
  #else
191
- typedef uint16_t ggml_fp16_t;
208
+ typedef uint16_t ggml_fp16_t;
192
209
  #endif
193
210
 
194
- // convert FP16 <-> FP32
195
- float ggml_fp16_to_fp32(ggml_fp16_t x);
196
- ggml_fp16_t ggml_fp32_to_fp16(float x);
197
-
198
- struct ggml_object;
199
- struct ggml_context;
200
-
201
- enum ggml_type {
202
- // explicitly numbered values are used in llama.cpp files
203
- GGML_TYPE_F32 = 0,
204
- GGML_TYPE_F16 = 1,
205
- GGML_TYPE_Q4_0 = 2,
206
- GGML_TYPE_Q4_1 = 3,
207
- GGML_TYPE_Q8_0 = 4,
208
- GGML_TYPE_I8,
209
- GGML_TYPE_I16,
210
- GGML_TYPE_I32,
211
- GGML_TYPE_COUNT,
212
- };
213
-
214
- // available tensor operations:
215
- enum ggml_op {
216
- GGML_OP_NONE = 0,
217
-
218
- GGML_OP_DUP,
219
- GGML_OP_ADD,
220
- GGML_OP_SUB,
221
- GGML_OP_MUL,
222
- GGML_OP_DIV,
223
- GGML_OP_SQR,
224
- GGML_OP_SQRT,
225
- GGML_OP_SUM,
226
- GGML_OP_MEAN,
227
- GGML_OP_REPEAT,
228
- GGML_OP_ABS,
229
- GGML_OP_SGN,
230
- GGML_OP_NEG,
231
- GGML_OP_STEP,
232
- GGML_OP_RELU,
233
- GGML_OP_GELU,
234
- GGML_OP_SILU,
235
- GGML_OP_NORM, // normalize
236
- GGML_OP_RMS_NORM,
237
-
238
- GGML_OP_MUL_MAT,
239
-
240
- GGML_OP_SCALE,
241
- GGML_OP_CPY,
242
- GGML_OP_CONT,
243
- GGML_OP_RESHAPE,
244
- GGML_OP_VIEW,
245
- GGML_OP_PERMUTE,
246
- GGML_OP_TRANSPOSE,
247
- GGML_OP_GET_ROWS,
248
- GGML_OP_DIAG_MASK_INF,
249
- GGML_OP_SOFT_MAX,
250
- GGML_OP_ROPE,
251
- GGML_OP_CONV_1D_1S,
252
- GGML_OP_CONV_1D_2S,
253
-
254
- GGML_OP_FLASH_ATTN,
255
- GGML_OP_FLASH_FF,
256
-
257
- GGML_OP_MAP_UNARY,
258
- GGML_OP_MAP_BINARY,
259
-
260
- GGML_OP_COUNT,
261
- };
262
-
263
-
264
- // ggml object
265
- struct ggml_object {
266
- size_t offs;
267
- size_t size;
268
-
269
- struct ggml_object * next;
270
-
271
- char padding[8];
272
- };
273
-
274
- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
275
-
276
- // n-dimensional tensor
277
- struct ggml_tensor {
278
- enum ggml_type type;
279
-
280
- int n_dims;
281
- int64_t ne[GGML_MAX_DIMS]; // number of elements
282
- size_t nb[GGML_MAX_DIMS]; // stride in bytes:
283
- // nb[0] = sizeof(type)
284
- // nb[1] = nb[0] * ne[0] + padding
285
- // nb[i] = nb[i-1] * ne[i-1]
286
-
287
- // compute data
288
- enum ggml_op op;
289
-
290
- bool is_param;
291
-
292
- struct ggml_tensor * grad;
293
- struct ggml_tensor * src0;
294
- struct ggml_tensor * src1;
295
- struct ggml_tensor * opt[GGML_MAX_OPT];
296
-
297
- // thread scheduling
298
- int n_tasks;
299
-
300
- // performance
301
- int perf_runs;
302
- int64_t perf_cycles;
303
- int64_t perf_time_us;
304
-
305
- void * data;
306
- char padding[8];
307
- };
308
-
309
- // computation graph
310
- struct ggml_cgraph {
311
- int n_nodes;
312
- int n_leafs;
313
- int n_threads;
314
-
315
- size_t work_size;
316
- struct ggml_tensor * work;
317
-
318
- struct ggml_tensor * nodes[GGML_MAX_NODES];
319
- struct ggml_tensor * grads[GGML_MAX_NODES];
320
- struct ggml_tensor * leafs[GGML_MAX_NODES];
321
-
322
- // performance
323
- int perf_runs;
324
- int64_t perf_cycles;
325
- int64_t perf_time_us;
326
- };
327
-
328
- // scratch buffer
329
- struct ggml_scratch {
330
- size_t offs;
331
- size_t size;
332
- void * data;
333
- };
211
+ // convert FP16 <-> FP32
212
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
213
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
214
+
215
+ struct ggml_object;
216
+ struct ggml_context;
217
+
218
+ enum ggml_type {
219
+ GGML_TYPE_F32 = 0,
220
+ GGML_TYPE_F16 = 1,
221
+ GGML_TYPE_Q4_0 = 2,
222
+ GGML_TYPE_Q4_1 = 3,
223
+ GGML_TYPE_Q4_2 = 4,
224
+ GGML_TYPE_Q4_3 = 5,
225
+ GGML_TYPE_Q5_0 = 6,
226
+ GGML_TYPE_Q5_1 = 7,
227
+ GGML_TYPE_Q8_0 = 8,
228
+ GGML_TYPE_Q8_1 = 9,
229
+ GGML_TYPE_I8,
230
+ GGML_TYPE_I16,
231
+ GGML_TYPE_I32,
232
+ GGML_TYPE_COUNT,
233
+ };
234
+
235
+ // available tensor operations:
236
+ enum ggml_op {
237
+ GGML_OP_NONE = 0,
238
+
239
+ GGML_OP_DUP,
240
+ GGML_OP_ADD,
241
+ GGML_OP_SUB,
242
+ GGML_OP_MUL,
243
+ GGML_OP_DIV,
244
+ GGML_OP_SQR,
245
+ GGML_OP_SQRT,
246
+ GGML_OP_SUM,
247
+ GGML_OP_MEAN,
248
+ GGML_OP_REPEAT,
249
+ GGML_OP_ABS,
250
+ GGML_OP_SGN,
251
+ GGML_OP_NEG,
252
+ GGML_OP_STEP,
253
+ GGML_OP_RELU,
254
+ GGML_OP_GELU,
255
+ GGML_OP_SILU,
256
+ GGML_OP_NORM, // normalize
257
+ GGML_OP_RMS_NORM,
258
+
259
+ GGML_OP_MUL_MAT,
260
+
261
+ GGML_OP_SCALE,
262
+ GGML_OP_CPY,
263
+ GGML_OP_CONT,
264
+ GGML_OP_RESHAPE,
265
+ GGML_OP_VIEW,
266
+ GGML_OP_PERMUTE,
267
+ GGML_OP_TRANSPOSE,
268
+ GGML_OP_GET_ROWS,
269
+ GGML_OP_DIAG_MASK_INF,
270
+ GGML_OP_SOFT_MAX,
271
+ GGML_OP_ROPE,
272
+ GGML_OP_CONV_1D_1S,
273
+ GGML_OP_CONV_1D_2S,
274
+
275
+ GGML_OP_FLASH_ATTN,
276
+ GGML_OP_FLASH_FF,
277
+
278
+ GGML_OP_MAP_UNARY,
279
+ GGML_OP_MAP_BINARY,
280
+
281
+ GGML_OP_COUNT,
282
+ };
283
+
284
+
285
+ // ggml object
286
+ struct ggml_object {
287
+ size_t offs;
288
+ size_t size;
289
+
290
+ struct ggml_object * next;
291
+
292
+ char padding[8];
293
+ };
294
+
295
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
296
+
297
+ // n-dimensional tensor
298
+ struct ggml_tensor {
299
+ enum ggml_type type;
300
+
301
+ int n_dims;
302
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
303
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
304
+ // nb[0] = sizeof(type)
305
+ // nb[1] = nb[0] * ne[0] + padding
306
+ // nb[i] = nb[i-1] * ne[i-1]
307
+
308
+ // compute data
309
+ enum ggml_op op;
310
+
311
+ bool is_param;
312
+
313
+ struct ggml_tensor * grad;
314
+ struct ggml_tensor * src0;
315
+ struct ggml_tensor * src1;
316
+ struct ggml_tensor * opt[GGML_MAX_OPT];
317
+
318
+ // thread scheduling
319
+ int n_tasks;
320
+
321
+ // performance
322
+ int perf_runs;
323
+ int64_t perf_cycles;
324
+ int64_t perf_time_us;
325
+
326
+ void * data;
327
+ char padding[8];
328
+ };
329
+
330
+ // computation graph
331
+ struct ggml_cgraph {
332
+ int n_nodes;
333
+ int n_leafs;
334
+ int n_threads;
335
+
336
+ size_t work_size;
337
+ struct ggml_tensor * work;
338
+
339
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
340
+ struct ggml_tensor * grads[GGML_MAX_NODES];
341
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
342
+
343
+ // performance
344
+ int perf_runs;
345
+ int64_t perf_cycles;
346
+ int64_t perf_time_us;
347
+ };
348
+
349
+ // scratch buffer
350
+ struct ggml_scratch {
351
+ size_t offs;
352
+ size_t size;
353
+ void * data;
354
+ };
334
355
 
335
- struct ggml_init_params {
336
- // memory pool
337
- size_t mem_size; // bytes
338
- void * mem_buffer; // if NULL, memory will be allocated internally
339
- bool no_alloc; // don't allocate memory for the tensor data
340
- };
356
+ struct ggml_init_params {
357
+ // memory pool
358
+ size_t mem_size; // bytes
359
+ void * mem_buffer; // if NULL, memory will be allocated internally
360
+ bool no_alloc; // don't allocate memory for the tensor data
361
+ };
341
362
 
342
- void ggml_time_init(void); // call this once at the beginning of the program
343
- int64_t ggml_time_ms(void);
344
- int64_t ggml_time_us(void);
345
- int64_t ggml_cycles(void);
346
- int64_t ggml_cycles_per_ms(void);
363
+ // misc
347
364
 
348
- void ggml_print_object (const struct ggml_object * obj);
349
- void ggml_print_objects(const struct ggml_context * ctx);
365
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
366
+ GGML_API int64_t ggml_time_ms(void);
367
+ GGML_API int64_t ggml_time_us(void);
368
+ GGML_API int64_t ggml_cycles(void);
369
+ GGML_API int64_t ggml_cycles_per_ms(void);
350
370
 
351
- int64_t ggml_nelements(const struct ggml_tensor * tensor);
352
- size_t ggml_nbytes (const struct ggml_tensor * tensor);
371
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
372
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);
353
373
 
354
- int ggml_blck_size (enum ggml_type type);
355
- size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
356
- float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
374
+ GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
375
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
357
376
 
358
- const char * ggml_type_name(enum ggml_type type);
377
+ GGML_API int ggml_blck_size (enum ggml_type type);
378
+ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
379
+ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
359
380
 
360
- size_t ggml_element_size(const struct ggml_tensor * tensor);
381
+ GGML_API const char * ggml_type_name(enum ggml_type type);
361
382
 
362
- struct ggml_context * ggml_init(struct ggml_init_params params);
363
- void ggml_free(struct ggml_context * ctx);
383
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
364
384
 
365
- size_t ggml_used_mem(const struct ggml_context * ctx);
385
+ GGML_API bool ggml_is_quantized(enum ggml_type type);
366
386
 
367
- size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
387
+ // main
368
388
 
369
- struct ggml_tensor * ggml_new_tensor(
370
- struct ggml_context * ctx,
371
- enum ggml_type type,
372
- int n_dims,
373
- const int64_t *ne);
374
-
375
- struct ggml_tensor * ggml_new_tensor_1d(
376
- struct ggml_context * ctx,
377
- enum ggml_type type,
378
- int64_t ne0);
379
-
380
- struct ggml_tensor * ggml_new_tensor_2d(
381
- struct ggml_context * ctx,
382
- enum ggml_type type,
383
- int64_t ne0,
384
- int64_t ne1);
385
-
386
- struct ggml_tensor * ggml_new_tensor_3d(
387
- struct ggml_context * ctx,
388
- enum ggml_type type,
389
- int64_t ne0,
390
- int64_t ne1,
391
- int64_t ne2);
392
-
393
- struct ggml_tensor * ggml_new_tensor_4d(
394
- struct ggml_context * ctx,
395
- enum ggml_type type,
396
- int64_t ne0,
397
- int64_t ne1,
398
- int64_t ne2,
399
- int64_t ne3);
400
-
401
- struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
402
- struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
403
-
404
- struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
405
- struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
406
-
407
- struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
408
- struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
409
- struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
410
-
411
- int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
412
- void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
413
-
414
- float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
415
- void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
416
-
417
- void * ggml_get_data (const struct ggml_tensor * tensor);
418
- float * ggml_get_data_f32(const struct ggml_tensor * tensor);
419
-
420
- //
421
- // operations on tensors with backpropagation
422
- //
423
-
424
- struct ggml_tensor * ggml_dup(
425
- struct ggml_context * ctx,
426
- struct ggml_tensor * a);
427
-
428
- struct ggml_tensor * ggml_add(
429
- struct ggml_context * ctx,
430
- struct ggml_tensor * a,
431
- struct ggml_tensor * b);
389
+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
390
+ GGML_API void ggml_free(struct ggml_context * ctx);
432
391
 
392
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
433
393
 
434
- struct ggml_tensor * ggml_add_inplace(
435
- struct ggml_context * ctx,
436
- struct ggml_tensor * a,
437
- struct ggml_tensor * b);
394
+ GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
438
395
 
439
- struct ggml_tensor * ggml_sub(
440
- struct ggml_context * ctx,
441
- struct ggml_tensor * a,
442
- struct ggml_tensor * b);
396
+ GGML_API struct ggml_tensor * ggml_new_tensor(
397
+ struct ggml_context * ctx,
398
+ enum ggml_type type,
399
+ int n_dims,
400
+ const int64_t *ne);
443
401
 
444
- struct ggml_tensor * ggml_mul(
445
- struct ggml_context * ctx,
446
- struct ggml_tensor * a,
447
- struct ggml_tensor * b);
402
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
403
+ struct ggml_context * ctx,
404
+ enum ggml_type type,
405
+ int64_t ne0);
448
406
 
449
- struct ggml_tensor * ggml_div(
450
- struct ggml_context * ctx,
451
- struct ggml_tensor * a,
452
- struct ggml_tensor * b);
453
-
454
- struct ggml_tensor * ggml_sqr(
455
- struct ggml_context * ctx,
456
- struct ggml_tensor * a);
457
-
458
- struct ggml_tensor * ggml_sqrt(
459
- struct ggml_context * ctx,
460
- struct ggml_tensor * a);
461
-
462
- // return scalar
463
- // TODO: compute sum along rows
464
- struct ggml_tensor * ggml_sum(
465
- struct ggml_context * ctx,
466
- struct ggml_tensor * a);
467
-
468
- // mean along rows
469
- struct ggml_tensor * ggml_mean(
470
- struct ggml_context * ctx,
471
- struct ggml_tensor * a);
472
-
473
- // if a is the same shape as b, and a is not parameter, return a
474
- // otherwise, return a new tensor: repeat(a) to fit in b
475
- struct ggml_tensor * ggml_repeat(
476
- struct ggml_context * ctx,
477
- struct ggml_tensor * a,
478
- struct ggml_tensor * b);
479
-
480
- struct ggml_tensor * ggml_abs(
481
- struct ggml_context * ctx,
482
- struct ggml_tensor * a);
483
-
484
- struct ggml_tensor * ggml_sgn(
485
- struct ggml_context * ctx,
486
- struct ggml_tensor * a);
487
-
488
- struct ggml_tensor * ggml_neg(
489
- struct ggml_context * ctx,
490
- struct ggml_tensor * a);
491
-
492
- struct ggml_tensor * ggml_step(
493
- struct ggml_context * ctx,
494
- struct ggml_tensor * a);
495
-
496
- struct ggml_tensor * ggml_relu(
497
- struct ggml_context * ctx,
498
- struct ggml_tensor * a);
499
-
500
- // TODO: double-check this computation is correct
501
- struct ggml_tensor * ggml_gelu(
502
- struct ggml_context * ctx,
503
- struct ggml_tensor * a);
504
-
505
- struct ggml_tensor * ggml_silu(
506
- struct ggml_context * ctx,
507
- struct ggml_tensor * a);
508
-
509
- // normalize along rows
510
- // TODO: eps is hardcoded to 1e-5 for now
511
- struct ggml_tensor * ggml_norm(
512
- struct ggml_context * ctx,
513
- struct ggml_tensor * a);
514
-
515
- struct ggml_tensor * ggml_rms_norm(
516
- struct ggml_context * ctx,
517
- struct ggml_tensor * a);
518
-
519
- // A: m rows, n columns
520
- // B: p rows, n columns (i.e. we transpose it internally)
521
- // result is m columns, p rows
522
- struct ggml_tensor * ggml_mul_mat(
523
- struct ggml_context * ctx,
524
- struct ggml_tensor * a,
525
- struct ggml_tensor * b);
526
-
527
- //
528
- // operations on tensors without backpropagation
529
- //
530
-
531
- // in-place, returns view(a)
532
- struct ggml_tensor * ggml_scale(
533
- struct ggml_context * ctx,
534
- struct ggml_tensor * a,
535
- struct ggml_tensor * b);
536
-
537
- // a -> b, return view(b)
538
- struct ggml_tensor * ggml_cpy(
539
- struct ggml_context * ctx,
540
- struct ggml_tensor * a,
541
- struct ggml_tensor * b);
542
-
543
- // make contiguous
544
- struct ggml_tensor * ggml_cont(
545
- struct ggml_context * ctx,
546
- struct ggml_tensor * a);
547
-
548
- // return view(a), b specifies the new shape
549
- // TODO: when we start computing gradient, make a copy instead of view
550
- struct ggml_tensor * ggml_reshape(
551
- struct ggml_context * ctx,
552
- struct ggml_tensor * a,
553
- struct ggml_tensor * b);
554
-
555
- // return view(a)
556
- // TODO: when we start computing gradient, make a copy instead of view
557
- struct ggml_tensor * ggml_reshape_2d(
558
- struct ggml_context * ctx,
559
- struct ggml_tensor * a,
560
- int64_t ne0,
561
- int64_t ne1);
562
-
563
- // return view(a)
564
- // TODO: when we start computing gradient, make a copy instead of view
565
- struct ggml_tensor * ggml_reshape_3d(
566
- struct ggml_context * ctx,
567
- struct ggml_tensor * a,
568
- int64_t ne0,
569
- int64_t ne1,
570
- int64_t ne2);
571
-
572
- // offset in bytes
573
- struct ggml_tensor * ggml_view_1d(
574
- struct ggml_context * ctx,
575
- struct ggml_tensor * a,
576
- int64_t ne0,
577
- size_t offset);
578
-
579
- struct ggml_tensor * ggml_view_2d(
580
- struct ggml_context * ctx,
581
- struct ggml_tensor * a,
582
- int64_t ne0,
583
- int64_t ne1,
584
- size_t nb1, // row stride in bytes
585
- size_t offset);
586
-
587
- struct ggml_tensor * ggml_view_3d(
588
- struct ggml_context * ctx,
589
- struct ggml_tensor * a,
590
- int64_t ne0,
591
- int64_t ne1,
592
- int64_t ne2,
593
- size_t nb1, // row stride in bytes
594
- size_t nb2, // slice stride in bytes
595
- size_t offset);
596
-
597
- struct ggml_tensor * ggml_permute(
598
- struct ggml_context * ctx,
599
- struct ggml_tensor * a,
600
- int axis0,
601
- int axis1,
602
- int axis2,
603
- int axis3);
604
-
605
- // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
606
- struct ggml_tensor * ggml_transpose(
607
- struct ggml_context * ctx,
608
- struct ggml_tensor * a);
609
-
610
- struct ggml_tensor * ggml_get_rows(
611
- struct ggml_context * ctx,
612
- struct ggml_tensor * a,
613
- struct ggml_tensor * b);
614
-
615
- // set elements above the diagonal to -INF
616
- // in-place, returns view(a)
617
- struct ggml_tensor * ggml_diag_mask_inf(
618
- struct ggml_context * ctx,
619
- struct ggml_tensor * a,
620
- int n_past);
621
-
622
- // in-place, returns view(a)
623
- struct ggml_tensor * ggml_soft_max(
624
- struct ggml_context * ctx,
625
- struct ggml_tensor * a);
626
-
627
- // rotary position embedding
628
- // in-place, returns view(a)
629
- // if mode == 1, skip n_past elements
630
- // TODO: avoid creating a new tensor every time
631
- struct ggml_tensor * ggml_rope(
632
- struct ggml_context * ctx,
633
- struct ggml_tensor * a,
634
- int n_past,
635
- int n_dims,
636
- int mode);
637
-
638
- // padding = 1
639
- // TODO: we don't support extra parameters for now
640
- // that's why we are hard-coding the stride, padding, and dilation
641
- // not great ..
642
- struct ggml_tensor * ggml_conv_1d_1s(
643
- struct ggml_context * ctx,
644
- struct ggml_tensor * a,
645
- struct ggml_tensor * b);
646
-
647
- struct ggml_tensor * ggml_conv_1d_2s(
648
- struct ggml_context * ctx,
649
- struct ggml_tensor * a,
650
- struct ggml_tensor * b);
651
-
652
- struct ggml_tensor * ggml_flash_attn(
653
- struct ggml_context * ctx,
654
- struct ggml_tensor * q,
655
- struct ggml_tensor * k,
656
- struct ggml_tensor * v,
657
- bool masked);
658
-
659
- struct ggml_tensor * ggml_flash_ff(
660
- struct ggml_context * ctx,
661
- struct ggml_tensor * a,
662
- struct ggml_tensor * b0,
663
- struct ggml_tensor * b1,
664
- struct ggml_tensor * c0,
665
- struct ggml_tensor * c1);
666
-
667
- // Mapping operations
668
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
669
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
670
-
671
- struct ggml_tensor * ggml_map_unary_f32(
672
- struct ggml_context * ctx,
673
- struct ggml_tensor * a,
674
- const ggml_unary_op_f32_t fun);
675
-
676
- struct ggml_tensor * ggml_map_binary_f32(
677
- struct ggml_context * ctx,
678
- struct ggml_tensor * a,
679
- struct ggml_tensor * b,
680
- const ggml_binary_op_f32_t fun);
681
-
682
- //
683
- // automatic differentiation
684
- //
685
-
686
- void ggml_set_param(
687
- struct ggml_context * ctx,
688
- struct ggml_tensor * tensor);
689
-
690
- void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
691
-
692
- struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
693
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
694
-
695
- void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
696
- void ggml_graph_reset (struct ggml_cgraph * cgraph);
697
-
698
- // print info and performance information for the graph
699
- void ggml_graph_print(const struct ggml_cgraph * cgraph);
700
-
701
- // dump the graph into a file using the dot format
702
- void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
703
-
704
- //
705
- // optimization
706
- //
707
-
708
- // optimization methods
709
- enum ggml_opt_type {
710
- GGML_OPT_ADAM,
711
- GGML_OPT_LBFGS,
712
- };
713
-
714
- // linesearch methods
715
- enum ggml_linesearch {
716
- GGML_LINESEARCH_DEFAULT = 1,
717
-
718
- GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
719
- GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
720
- GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
721
- };
722
-
723
- // optimization return values
724
- enum ggml_opt_result {
725
- GGML_OPT_OK = 0,
726
- GGML_OPT_DID_NOT_CONVERGE,
727
- GGML_OPT_NO_CONTEXT,
728
- GGML_OPT_INVALID_WOLFE,
729
- GGML_OPT_FAIL,
407
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
408
+ struct ggml_context * ctx,
409
+ enum ggml_type type,
410
+ int64_t ne0,
411
+ int64_t ne1);
730
412
 
731
- GGML_LINESEARCH_FAIL = -128,
732
- GGML_LINESEARCH_MINIMUM_STEP,
733
- GGML_LINESEARCH_MAXIMUM_STEP,
734
- GGML_LINESEARCH_MAXIMUM_ITERATIONS,
735
- GGML_LINESEARCH_INVALID_PARAMETERS,
736
- };
413
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
414
+ struct ggml_context * ctx,
415
+ enum ggml_type type,
416
+ int64_t ne0,
417
+ int64_t ne1,
418
+ int64_t ne2);
737
419
 
738
- // optimization parameters
739
- //
740
- // see ggml.c (ggml_opt_default_params) for default values
741
- //
742
- struct ggml_opt_params {
743
- enum ggml_opt_type type;
420
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
421
+ struct ggml_context * ctx,
422
+ enum ggml_type type,
423
+ int64_t ne0,
424
+ int64_t ne1,
425
+ int64_t ne2,
426
+ int64_t ne3);
427
+
428
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
429
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
430
+
431
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
432
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
433
+
434
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
435
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
436
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
437
+
438
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
439
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
440
+
441
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
442
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
443
+
444
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
445
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
446
+
447
+ //
448
+ // operations on tensors with backpropagation
449
+ //
744
450
 
745
- int n_threads;
451
+ GGML_API struct ggml_tensor * ggml_dup(
452
+ struct ggml_context * ctx,
453
+ struct ggml_tensor * a);
454
+
455
+ GGML_API struct ggml_tensor * ggml_add(
456
+ struct ggml_context * ctx,
457
+ struct ggml_tensor * a,
458
+ struct ggml_tensor * b);
459
+
460
+ GGML_API struct ggml_tensor * ggml_add_inplace(
461
+ struct ggml_context * ctx,
462
+ struct ggml_tensor * a,
463
+ struct ggml_tensor * b);
464
+
465
+ GGML_API struct ggml_tensor * ggml_sub(
466
+ struct ggml_context * ctx,
467
+ struct ggml_tensor * a,
468
+ struct ggml_tensor * b);
469
+
470
+ GGML_API struct ggml_tensor * ggml_mul(
471
+ struct ggml_context * ctx,
472
+ struct ggml_tensor * a,
473
+ struct ggml_tensor * b);
474
+
475
+ GGML_API struct ggml_tensor * ggml_div(
476
+ struct ggml_context * ctx,
477
+ struct ggml_tensor * a,
478
+ struct ggml_tensor * b);
479
+
480
+ GGML_API struct ggml_tensor * ggml_sqr(
481
+ struct ggml_context * ctx,
482
+ struct ggml_tensor * a);
483
+
484
+ GGML_API struct ggml_tensor * ggml_sqrt(
485
+ struct ggml_context * ctx,
486
+ struct ggml_tensor * a);
487
+
488
+ // return scalar
489
+ // TODO: compute sum along rows
490
+ GGML_API struct ggml_tensor * ggml_sum(
491
+ struct ggml_context * ctx,
492
+ struct ggml_tensor * a);
493
+
494
+ // mean along rows
495
+ GGML_API struct ggml_tensor * ggml_mean(
496
+ struct ggml_context * ctx,
497
+ struct ggml_tensor * a);
498
+
499
+ // if a is the same shape as b, and a is not parameter, return a
500
+ // otherwise, return a new tensor: repeat(a) to fit in b
501
+ GGML_API struct ggml_tensor * ggml_repeat(
502
+ struct ggml_context * ctx,
503
+ struct ggml_tensor * a,
504
+ struct ggml_tensor * b);
505
+
506
+ GGML_API struct ggml_tensor * ggml_abs(
507
+ struct ggml_context * ctx,
508
+ struct ggml_tensor * a);
509
+
510
+ GGML_API struct ggml_tensor * ggml_sgn(
511
+ struct ggml_context * ctx,
512
+ struct ggml_tensor * a);
513
+
514
+ GGML_API struct ggml_tensor * ggml_neg(
515
+ struct ggml_context * ctx,
516
+ struct ggml_tensor * a);
517
+
518
+ GGML_API struct ggml_tensor * ggml_step(
519
+ struct ggml_context * ctx,
520
+ struct ggml_tensor * a);
521
+
522
+ GGML_API struct ggml_tensor * ggml_relu(
523
+ struct ggml_context * ctx,
524
+ struct ggml_tensor * a);
525
+
526
+ // TODO: double-check this computation is correct
527
+ GGML_API struct ggml_tensor * ggml_gelu(
528
+ struct ggml_context * ctx,
529
+ struct ggml_tensor * a);
530
+
531
+ GGML_API struct ggml_tensor * ggml_silu(
532
+ struct ggml_context * ctx,
533
+ struct ggml_tensor * a);
534
+
535
+ // normalize along rows
536
+ // TODO: eps is hardcoded to 1e-5 for now
537
+ GGML_API struct ggml_tensor * ggml_norm(
538
+ struct ggml_context * ctx,
539
+ struct ggml_tensor * a);
540
+
541
+ GGML_API struct ggml_tensor * ggml_rms_norm(
542
+ struct ggml_context * ctx,
543
+ struct ggml_tensor * a);
544
+
545
+ // A: m rows, n columns
546
+ // B: p rows, n columns (i.e. we transpose it internally)
547
+ // result is m columns, p rows
548
+ GGML_API struct ggml_tensor * ggml_mul_mat(
549
+ struct ggml_context * ctx,
550
+ struct ggml_tensor * a,
551
+ struct ggml_tensor * b);
746
552
 
747
- // delta-based convergence test
748
553
  //
749
- // if past == 0 - disabled
750
- // if past > 0:
751
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
554
+ // operations on tensors without backpropagation
752
555
  //
753
- int past;
754
- float delta;
755
556
 
756
- // maximum number of iterations without improvement
557
+ // in-place, returns view(a)
558
+ GGML_API struct ggml_tensor * ggml_scale(
559
+ struct ggml_context * ctx,
560
+ struct ggml_tensor * a,
561
+ struct ggml_tensor * b);
562
+
563
+ // a -> b, return view(b)
564
+ GGML_API struct ggml_tensor * ggml_cpy(
565
+ struct ggml_context * ctx,
566
+ struct ggml_tensor * a,
567
+ struct ggml_tensor * b);
568
+
569
+ // make contiguous
570
+ GGML_API struct ggml_tensor * ggml_cont(
571
+ struct ggml_context * ctx,
572
+ struct ggml_tensor * a);
573
+
574
+ // return view(a), b specifies the new shape
575
+ // TODO: when we start computing gradient, make a copy instead of view
576
+ GGML_API struct ggml_tensor * ggml_reshape(
577
+ struct ggml_context * ctx,
578
+ struct ggml_tensor * a,
579
+ struct ggml_tensor * b);
580
+
581
+ // return view(a)
582
+ // TODO: when we start computing gradient, make a copy instead of view
583
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
584
+ struct ggml_context * ctx,
585
+ struct ggml_tensor * a,
586
+ int64_t ne0,
587
+ int64_t ne1);
588
+
589
+ // return view(a)
590
+ // TODO: when we start computing gradient, make a copy instead of view
591
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
592
+ struct ggml_context * ctx,
593
+ struct ggml_tensor * a,
594
+ int64_t ne0,
595
+ int64_t ne1,
596
+ int64_t ne2);
597
+
598
+ // offset in bytes
599
+ GGML_API struct ggml_tensor * ggml_view_1d(
600
+ struct ggml_context * ctx,
601
+ struct ggml_tensor * a,
602
+ int64_t ne0,
603
+ size_t offset);
604
+
605
+ GGML_API struct ggml_tensor * ggml_view_2d(
606
+ struct ggml_context * ctx,
607
+ struct ggml_tensor * a,
608
+ int64_t ne0,
609
+ int64_t ne1,
610
+ size_t nb1, // row stride in bytes
611
+ size_t offset);
612
+
613
+ GGML_API struct ggml_tensor * ggml_view_3d(
614
+ struct ggml_context * ctx,
615
+ struct ggml_tensor * a,
616
+ int64_t ne0,
617
+ int64_t ne1,
618
+ int64_t ne2,
619
+ size_t nb1, // row stride in bytes
620
+ size_t nb2, // slice stride in bytes
621
+ size_t offset);
622
+
623
+ GGML_API struct ggml_tensor * ggml_permute(
624
+ struct ggml_context * ctx,
625
+ struct ggml_tensor * a,
626
+ int axis0,
627
+ int axis1,
628
+ int axis2,
629
+ int axis3);
630
+
631
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
632
+ GGML_API struct ggml_tensor * ggml_transpose(
633
+ struct ggml_context * ctx,
634
+ struct ggml_tensor * a);
635
+
636
+ GGML_API struct ggml_tensor * ggml_get_rows(
637
+ struct ggml_context * ctx,
638
+ struct ggml_tensor * a,
639
+ struct ggml_tensor * b);
640
+
641
+ // set elements above the diagonal to -INF
642
+ // in-place, returns view(a)
643
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
644
+ struct ggml_context * ctx,
645
+ struct ggml_tensor * a,
646
+ int n_past);
647
+
648
+ // in-place, returns view(a)
649
+ GGML_API struct ggml_tensor * ggml_soft_max(
650
+ struct ggml_context * ctx,
651
+ struct ggml_tensor * a);
652
+
653
+ // rotary position embedding
654
+ // in-place, returns view(a)
655
+ // if mode & 1 == 1, skip n_past elements
656
+ // if mode & 2 == 1, GPT-NeoX style
657
+ // TODO: avoid creating a new tensor every time
658
+ GGML_API struct ggml_tensor * ggml_rope(
659
+ struct ggml_context * ctx,
660
+ struct ggml_tensor * a,
661
+ int n_past,
662
+ int n_dims,
663
+ int mode);
664
+
665
+ // padding = 1
666
+ // TODO: we don't support extra parameters for now
667
+ // that's why we are hard-coding the stride, padding, and dilation
668
+ // not great ..
669
+ GGML_API struct ggml_tensor * ggml_conv_1d_1s(
670
+ struct ggml_context * ctx,
671
+ struct ggml_tensor * a,
672
+ struct ggml_tensor * b);
673
+
674
+ GGML_API struct ggml_tensor * ggml_conv_1d_2s(
675
+ struct ggml_context * ctx,
676
+ struct ggml_tensor * a,
677
+ struct ggml_tensor * b);
678
+
679
+ GGML_API struct ggml_tensor * ggml_flash_attn(
680
+ struct ggml_context * ctx,
681
+ struct ggml_tensor * q,
682
+ struct ggml_tensor * k,
683
+ struct ggml_tensor * v,
684
+ bool masked);
685
+
686
+ GGML_API struct ggml_tensor * ggml_flash_ff(
687
+ struct ggml_context * ctx,
688
+ struct ggml_tensor * a,
689
+ struct ggml_tensor * b0,
690
+ struct ggml_tensor * b1,
691
+ struct ggml_tensor * c0,
692
+ struct ggml_tensor * c1);
693
+
694
+ // Mapping operations
695
+ GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
696
+ GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
697
+
698
+ GGML_API struct ggml_tensor * ggml_map_unary_f32(
699
+ struct ggml_context * ctx,
700
+ struct ggml_tensor * a,
701
+ const ggml_unary_op_f32_t fun);
702
+
703
+ GGML_API struct ggml_tensor * ggml_map_binary_f32(
704
+ struct ggml_context * ctx,
705
+ struct ggml_tensor * a,
706
+ struct ggml_tensor * b,
707
+ const ggml_binary_op_f32_t fun);
708
+
757
709
  //
758
- // if 0 - disabled
759
- // if > 0:
760
- // assume convergence if no cost improvement in this number of iterations
710
+ // automatic differentiation
761
711
  //
762
- int max_no_improvement;
763
712
 
764
- bool print_forward_graph;
765
- bool print_backward_graph;
713
+ GGML_API void ggml_set_param(
714
+ struct ggml_context * ctx,
715
+ struct ggml_tensor * tensor);
766
716
 
767
- // ADAM parameters
768
- struct {
769
- int n_iter;
717
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
770
718
 
771
- float alpha; // learning rate
772
- float beta1;
773
- float beta2;
774
- float eps; // epsilon for numerical stability
775
- float eps_f; // epsilon for convergence test
776
- float eps_g; // epsilon for convergence test
777
- } adam;
719
+ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
720
+ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
778
721
 
779
- // LBFGS parameters
780
- struct {
781
- int m; // number of corrections to approximate the inv. Hessian
782
- int n_iter;
783
- int max_linesearch;
722
+ GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
723
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
784
724
 
785
- float eps; // convergence tolerance
786
- float ftol; // line search tolerance
787
- float wolfe;
788
- float min_step;
789
- float max_step;
725
+ // print info and performance information for the graph
726
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
790
727
 
791
- enum ggml_linesearch linesearch;
792
- } lbfgs;
793
- };
728
+ // dump the graph into a file using the dot format
729
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
794
730
 
795
- struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
731
+ //
732
+ // optimization
733
+ //
796
734
 
797
- // optimize the function defined by the tensor f
798
- enum ggml_opt_result ggml_opt(
799
- struct ggml_context * ctx,
800
- struct ggml_opt_params params,
801
- struct ggml_tensor * f);
735
+ // optimization methods
736
+ enum ggml_opt_type {
737
+ GGML_OPT_ADAM,
738
+ GGML_OPT_LBFGS,
739
+ };
740
+
741
+ // linesearch methods
742
+ enum ggml_linesearch {
743
+ GGML_LINESEARCH_DEFAULT = 1,
744
+
745
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
746
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
747
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
748
+ };
749
+
750
+ // optimization return values
751
+ enum ggml_opt_result {
752
+ GGML_OPT_OK = 0,
753
+ GGML_OPT_DID_NOT_CONVERGE,
754
+ GGML_OPT_NO_CONTEXT,
755
+ GGML_OPT_INVALID_WOLFE,
756
+ GGML_OPT_FAIL,
757
+
758
+ GGML_LINESEARCH_FAIL = -128,
759
+ GGML_LINESEARCH_MINIMUM_STEP,
760
+ GGML_LINESEARCH_MAXIMUM_STEP,
761
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
762
+ GGML_LINESEARCH_INVALID_PARAMETERS,
763
+ };
764
+
765
+ // optimization parameters
766
+ //
767
+ // see ggml.c (ggml_opt_default_params) for default values
768
+ //
769
+ struct ggml_opt_params {
770
+ enum ggml_opt_type type;
771
+
772
+ int n_threads;
773
+
774
+ // delta-based convergence test
775
+ //
776
+ // if past == 0 - disabled
777
+ // if past > 0:
778
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
779
+ //
780
+ int past;
781
+ float delta;
782
+
783
+ // maximum number of iterations without improvement
784
+ //
785
+ // if 0 - disabled
786
+ // if > 0:
787
+ // assume convergence if no cost improvement in this number of iterations
788
+ //
789
+ int max_no_improvement;
790
+
791
+ bool print_forward_graph;
792
+ bool print_backward_graph;
793
+
794
+ // ADAM parameters
795
+ struct {
796
+ int n_iter;
797
+
798
+ float alpha; // learning rate
799
+ float beta1;
800
+ float beta2;
801
+ float eps; // epsilon for numerical stability
802
+ float eps_f; // epsilon for convergence test
803
+ float eps_g; // epsilon for convergence test
804
+ } adam;
805
+
806
+ // LBFGS parameters
807
+ struct {
808
+ int m; // number of corrections to approximate the inv. Hessian
809
+ int n_iter;
810
+ int max_linesearch;
811
+
812
+ float eps; // convergence tolerance
813
+ float ftol; // line search tolerance
814
+ float wolfe;
815
+ float min_step;
816
+ float max_step;
817
+
818
+ enum ggml_linesearch linesearch;
819
+ } lbfgs;
820
+ };
821
+
822
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
823
+
824
+ // optimize the function defined by the tensor f
825
+ GGML_API enum ggml_opt_result ggml_opt(
826
+ struct ggml_context * ctx,
827
+ struct ggml_opt_params params,
828
+ struct ggml_tensor * f);
802
829
 
803
- //
804
- // quantization
805
- //
830
+ //
831
+ // quantization
832
+ //
806
833
 
807
- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
808
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
834
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
835
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
836
+ GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
837
+ GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
838
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
839
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
840
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
809
841
 
810
- //
811
- // system info
812
- //
842
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
813
843
 
814
- int ggml_cpu_has_avx(void);
815
- int ggml_cpu_has_avx2(void);
816
- int ggml_cpu_has_avx512(void);
817
- int ggml_cpu_has_avx512_vbmi(void);
818
- int ggml_cpu_has_avx512_vnni(void);
819
- int ggml_cpu_has_fma(void);
820
- int ggml_cpu_has_neon(void);
821
- int ggml_cpu_has_arm_fma(void);
822
- int ggml_cpu_has_f16c(void);
823
- int ggml_cpu_has_fp16_va(void);
824
- int ggml_cpu_has_wasm_simd(void);
825
- int ggml_cpu_has_blas(void);
826
- int ggml_cpu_has_sse3(void);
827
- int ggml_cpu_has_vsx(void);
844
+ //
845
+ // system info
846
+ //
828
847
 
848
+ GGML_API int ggml_cpu_has_avx (void);
849
+ GGML_API int ggml_cpu_has_avx2 (void);
850
+ GGML_API int ggml_cpu_has_avx512 (void);
851
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
852
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
853
+ GGML_API int ggml_cpu_has_fma (void);
854
+ GGML_API int ggml_cpu_has_neon (void);
855
+ GGML_API int ggml_cpu_has_arm_fma (void);
856
+ GGML_API int ggml_cpu_has_f16c (void);
857
+ GGML_API int ggml_cpu_has_fp16_va (void);
858
+ GGML_API int ggml_cpu_has_wasm_simd (void);
859
+ GGML_API int ggml_cpu_has_blas (void);
860
+ GGML_API int ggml_cpu_has_cublas (void);
861
+ GGML_API int ggml_cpu_has_clblast (void);
862
+ GGML_API int ggml_cpu_has_gpublas (void);
863
+ GGML_API int ggml_cpu_has_sse3 (void);
864
+ GGML_API int ggml_cpu_has_vsx (void);
829
865
 
830
- //
831
- // Internal types and functions exposed for tests and benchmarks
832
- //
866
+ //
867
+ // Internal types and functions exposed for tests and benchmarks
868
+ //
833
869
 
834
870
  #ifdef __cplusplus
835
- // restrict not standard in C++
871
+ // restrict not standard in C++
836
872
  #define GGML_RESTRICT
837
873
  #else
838
874
  #define GGML_RESTRICT restrict
839
875
  #endif
840
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
841
- typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
842
- typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
843
-
844
- typedef struct {
845
- dequantize_row_q_t dequantize_row_q;
846
- quantize_row_q_t quantize_row_q;
847
- quantize_row_q_t quantize_row_q_reference;
848
- quantize_row_q_t quantize_row_q_dot;
849
- vec_dot_q_t vec_dot_q;
850
- } quantize_fns_t;
851
-
852
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
876
+ typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
877
+ typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
878
+ typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
879
+
880
+ typedef struct {
881
+ dequantize_row_q_t dequantize_row_q;
882
+ quantize_row_q_t quantize_row_q;
883
+ quantize_row_q_t quantize_row_q_reference;
884
+ quantize_row_q_t quantize_row_q_dot;
885
+ vec_dot_q_t vec_dot_q;
886
+ enum ggml_type vec_dot_type;
887
+ } quantize_fns_t;
888
+
889
+ quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
853
890
 
854
891
  #ifdef __cplusplus
855
892
  }