llama_cpp 0.0.6 → 0.0.7

@@ -169,14 +169,27 @@
 //
 //

- #ifdef __cplusplus
- extern "C" {
+ #ifdef GGML_SHARED
+ # if defined(_WIN32) && !defined(__MINGW32__)
+ # ifdef GGML_BUILD
+ # define GGML_API __declspec(dllexport)
+ # else
+ # define GGML_API __declspec(dllimport)
+ # endif
+ # else
+ # define GGML_API __attribute__ ((visibility ("default")))
+ # endif
+ #else
+ # define GGML_API
 #endif

 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>

+ #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
+ #define GGML_FILE_VERSION 1
+
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 16
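The first hunk replaces the bare extern "C" prologue with a GGML_API export/visibility macro (dllexport/dllimport on Windows shared builds, default symbol visibility elsewhere, empty for static builds) and adds the GGML_FILE_MAGIC / GGML_FILE_VERSION constants. Calling code is unaffected. A minimal consumer sketch, assuming only what this header declares; the header name "ggml.h", the link line (e.g. cc main.c -lggml), and passing -DGGML_SHARED when linking against the shared library are assumptions, not part of this release:

// sketch: allocate a ggml context, create a tensor, report memory use
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024, // 16 MB pool
        /* .mem_buffer = */ NULL,             // let ggml allocate it
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    printf("used mem: %zu bytes, nelements: %lld\n",
           ggml_used_mem(ctx), (long long) ggml_nelements(t));
    ggml_free(ctx);
    return 0;
}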
@@ -184,682 +197,696 @@ extern "C" {
 #define GGML_MAX_OPT 4
 #define GGML_DEFAULT_N_THREADS 4

+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
 #ifdef __ARM_NEON
- // we use the built-in 16-bit float type
- typedef __fp16 ggml_fp16_t;
+ // we use the built-in 16-bit float type
+ typedef __fp16 ggml_fp16_t;
 #else
- typedef uint16_t ggml_fp16_t;
+ typedef uint16_t ggml_fp16_t;
 #endif

- // convert FP16 <-> FP32
- float ggml_fp16_to_fp32(ggml_fp16_t x);
- ggml_fp16_t ggml_fp32_to_fp16(float x);
-
- struct ggml_object;
- struct ggml_context;
-
- enum ggml_type {
- // explicitly numbered values are used in llama.cpp files
- GGML_TYPE_F32 = 0,
- GGML_TYPE_F16 = 1,
- GGML_TYPE_Q4_0 = 2,
- GGML_TYPE_Q4_1 = 3,
- GGML_TYPE_Q4_2 = 4,
- GGML_TYPE_Q4_3 = 5,
- GGML_TYPE_Q8_0 = 6,
- GGML_TYPE_I8,
- GGML_TYPE_I16,
- GGML_TYPE_I32,
- GGML_TYPE_COUNT,
- };
-
- // available tensor operations:
- enum ggml_op {
- GGML_OP_NONE = 0,
-
- GGML_OP_DUP,
- GGML_OP_ADD,
- GGML_OP_SUB,
- GGML_OP_MUL,
- GGML_OP_DIV,
- GGML_OP_SQR,
- GGML_OP_SQRT,
- GGML_OP_SUM,
- GGML_OP_MEAN,
- GGML_OP_REPEAT,
- GGML_OP_ABS,
- GGML_OP_SGN,
- GGML_OP_NEG,
- GGML_OP_STEP,
- GGML_OP_RELU,
- GGML_OP_GELU,
- GGML_OP_SILU,
- GGML_OP_NORM, // normalize
- GGML_OP_RMS_NORM,
-
- GGML_OP_MUL_MAT,
-
- GGML_OP_SCALE,
- GGML_OP_CPY,
- GGML_OP_CONT,
- GGML_OP_RESHAPE,
- GGML_OP_VIEW,
- GGML_OP_PERMUTE,
- GGML_OP_TRANSPOSE,
- GGML_OP_GET_ROWS,
- GGML_OP_DIAG_MASK_INF,
- GGML_OP_SOFT_MAX,
- GGML_OP_ROPE,
- GGML_OP_CONV_1D_1S,
- GGML_OP_CONV_1D_2S,
-
- GGML_OP_FLASH_ATTN,
- GGML_OP_FLASH_FF,
-
- GGML_OP_MAP_UNARY,
- GGML_OP_MAP_BINARY,
-
- GGML_OP_COUNT,
- };
-
-
- // ggml object
- struct ggml_object {
- size_t offs;
- size_t size;
-
- struct ggml_object * next;
-
- char padding[8];
- };
-
- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
- // n-dimensional tensor
- struct ggml_tensor {
- enum ggml_type type;
-
- int n_dims;
- int64_t ne[GGML_MAX_DIMS]; // number of elements
- size_t nb[GGML_MAX_DIMS]; // stride in bytes:
- // nb[0] = sizeof(type)
- // nb[1] = nb[0] * ne[0] + padding
- // nb[i] = nb[i-1] * ne[i-1]
-
- // compute data
- enum ggml_op op;
-
- bool is_param;
-
- struct ggml_tensor * grad;
- struct ggml_tensor * src0;
- struct ggml_tensor * src1;
- struct ggml_tensor * opt[GGML_MAX_OPT];
-
- // thread scheduling
- int n_tasks;
-
- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
-
- void * data;
- char padding[8];
- };
-
- // computation graph
- struct ggml_cgraph {
- int n_nodes;
- int n_leafs;
- int n_threads;
-
- size_t work_size;
- struct ggml_tensor * work;
-
- struct ggml_tensor * nodes[GGML_MAX_NODES];
- struct ggml_tensor * grads[GGML_MAX_NODES];
- struct ggml_tensor * leafs[GGML_MAX_NODES];
-
- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
- };
-
- // scratch buffer
- struct ggml_scratch {
- size_t offs;
- size_t size;
- void * data;
- };
+ // convert FP16 <-> FP32
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
+
+ struct ggml_object;
+ struct ggml_context;
+
+ enum ggml_type {
+ GGML_TYPE_F32 = 0,
+ GGML_TYPE_F16 = 1,
+ GGML_TYPE_Q4_0 = 2,
+ GGML_TYPE_Q4_1 = 3,
+ GGML_TYPE_Q4_2 = 4,
+ GGML_TYPE_Q4_3 = 5,
+ GGML_TYPE_Q5_0 = 6,
+ GGML_TYPE_Q5_1 = 7,
+ GGML_TYPE_Q8_0 = 8,
+ GGML_TYPE_Q8_1 = 9,
+ GGML_TYPE_I8,
+ GGML_TYPE_I16,
+ GGML_TYPE_I32,
+ GGML_TYPE_COUNT,
+ };
+
+ // available tensor operations:
+ enum ggml_op {
+ GGML_OP_NONE = 0,
+
+ GGML_OP_DUP,
+ GGML_OP_ADD,
+ GGML_OP_SUB,
+ GGML_OP_MUL,
+ GGML_OP_DIV,
+ GGML_OP_SQR,
+ GGML_OP_SQRT,
+ GGML_OP_SUM,
+ GGML_OP_MEAN,
+ GGML_OP_REPEAT,
+ GGML_OP_ABS,
+ GGML_OP_SGN,
+ GGML_OP_NEG,
+ GGML_OP_STEP,
+ GGML_OP_RELU,
+ GGML_OP_GELU,
+ GGML_OP_SILU,
+ GGML_OP_NORM, // normalize
+ GGML_OP_RMS_NORM,
+
+ GGML_OP_MUL_MAT,
+
+ GGML_OP_SCALE,
+ GGML_OP_CPY,
+ GGML_OP_CONT,
+ GGML_OP_RESHAPE,
+ GGML_OP_VIEW,
+ GGML_OP_PERMUTE,
+ GGML_OP_TRANSPOSE,
+ GGML_OP_GET_ROWS,
+ GGML_OP_DIAG_MASK_INF,
+ GGML_OP_SOFT_MAX,
+ GGML_OP_ROPE,
+ GGML_OP_CONV_1D_1S,
+ GGML_OP_CONV_1D_2S,
+
+ GGML_OP_FLASH_ATTN,
+ GGML_OP_FLASH_FF,
+
+ GGML_OP_MAP_UNARY,
+ GGML_OP_MAP_BINARY,
+
+ GGML_OP_COUNT,
+ };
+
+
+ // ggml object
+ struct ggml_object {
+ size_t offs;
+ size_t size;
+
+ struct ggml_object * next;
+
+ char padding[8];
+ };
+
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
+ // n-dimensional tensor
+ struct ggml_tensor {
+ enum ggml_type type;
+
+ int n_dims;
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
+ // nb[0] = sizeof(type)
+ // nb[1] = nb[0] * ne[0] + padding
+ // nb[i] = nb[i-1] * ne[i-1]
+
+ // compute data
+ enum ggml_op op;
+
+ bool is_param;
+
+ struct ggml_tensor * grad;
+ struct ggml_tensor * src0;
+ struct ggml_tensor * src1;
+ struct ggml_tensor * opt[GGML_MAX_OPT];
+
+ // thread scheduling
+ int n_tasks;
+
+ // performance
+ int perf_runs;
+ int64_t perf_cycles;
+ int64_t perf_time_us;
+
+ void * data;
+ char padding[8];
+ };
+
+ // computation graph
+ struct ggml_cgraph {
+ int n_nodes;
+ int n_leafs;
+ int n_threads;
+
+ size_t work_size;
+ struct ggml_tensor * work;
+
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
+ struct ggml_tensor * grads[GGML_MAX_NODES];
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
+
+ // performance
+ int perf_runs;
+ int64_t perf_cycles;
+ int64_t perf_time_us;
+ };
+
+ // scratch buffer
+ struct ggml_scratch {
+ size_t offs;
+ size_t size;
+ void * data;
+ };

- struct ggml_init_params {
- // memory pool
- size_t mem_size; // bytes
- void * mem_buffer; // if NULL, memory will be allocated internally
- bool no_alloc; // don't allocate memory for the tensor data
- };
+ struct ggml_init_params {
+ // memory pool
+ size_t mem_size; // bytes
+ void * mem_buffer; // if NULL, memory will be allocated internally
+ bool no_alloc; // don't allocate memory for the tensor data
+ };

- void ggml_time_init(void); // call this once at the beginning of the program
- int64_t ggml_time_ms(void);
- int64_t ggml_time_us(void);
- int64_t ggml_cycles(void);
- int64_t ggml_cycles_per_ms(void);
+ // misc

- void ggml_print_object (const struct ggml_object * obj);
- void ggml_print_objects(const struct ggml_context * ctx);
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
+ GGML_API int64_t ggml_time_ms(void);
+ GGML_API int64_t ggml_time_us(void);
+ GGML_API int64_t ggml_cycles(void);
+ GGML_API int64_t ggml_cycles_per_ms(void);

- int64_t ggml_nelements(const struct ggml_tensor * tensor);
- size_t ggml_nbytes (const struct ggml_tensor * tensor);
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);

- int ggml_blck_size (enum ggml_type type);
- size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
- float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+ GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);

- const char * ggml_type_name(enum ggml_type type);
+ GGML_API int ggml_blck_size (enum ggml_type type);
+ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

- size_t ggml_element_size(const struct ggml_tensor * tensor);
+ GGML_API const char * ggml_type_name(enum ggml_type type);

- bool ggml_is_quantized(enum ggml_type type);
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

- struct ggml_context * ggml_init(struct ggml_init_params params);
- void ggml_free(struct ggml_context * ctx);
+ GGML_API bool ggml_is_quantized(enum ggml_type type);

- size_t ggml_used_mem(const struct ggml_context * ctx);
+ // main

- size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+ GGML_API void ggml_free(struct ggml_context * ctx);

- struct ggml_tensor * ggml_new_tensor(
- struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t *ne);
-
- struct ggml_tensor * ggml_new_tensor_1d(
- struct ggml_context * ctx,
- enum ggml_type type,
- int64_t ne0);
-
- struct ggml_tensor * ggml_new_tensor_2d(
- struct ggml_context * ctx,
- enum ggml_type type,
- int64_t ne0,
- int64_t ne1);
-
- struct ggml_tensor * ggml_new_tensor_3d(
- struct ggml_context * ctx,
- enum ggml_type type,
- int64_t ne0,
- int64_t ne1,
- int64_t ne2);
-
- struct ggml_tensor * ggml_new_tensor_4d(
- struct ggml_context * ctx,
- enum ggml_type type,
- int64_t ne0,
- int64_t ne1,
- int64_t ne2,
- int64_t ne3);
-
- struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
- struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-
- struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
- struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
-
- struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
- struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
- struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
- int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
- void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
- float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
- void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
- void * ggml_get_data (const struct ggml_tensor * tensor);
- float * ggml_get_data_f32(const struct ggml_tensor * tensor);
-
- //
- // operations on tensors with backpropagation
- //
-
- struct ggml_tensor * ggml_dup(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_add(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

+ GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

- struct ggml_tensor * ggml_add_inplace(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ GGML_API struct ggml_tensor * ggml_new_tensor(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int n_dims,
+ const int64_t *ne);

- struct ggml_tensor * ggml_sub(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0);

- struct ggml_tensor * ggml_mul(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1);

- struct ggml_tensor * ggml_div(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- struct ggml_tensor * ggml_sqr(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_sqrt(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // return scalar
- // TODO: compute sum along rows
- struct ggml_tensor * ggml_sum(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // mean along rows
- struct ggml_tensor * ggml_mean(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // if a is the same shape as b, and a is not parameter, return a
- // otherwise, return a new tensor: repeat(a) to fit in b
- struct ggml_tensor * ggml_repeat(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- struct ggml_tensor * ggml_abs(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_sgn(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_neg(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_step(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_relu(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // TODO: double-check this computation is correct
- struct ggml_tensor * ggml_gelu(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_silu(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // normalize along rows
- // TODO: eps is hardcoded to 1e-5 for now
- struct ggml_tensor * ggml_norm(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_rms_norm(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // A: m rows, n columns
- // B: p rows, n columns (i.e. we transpose it internally)
- // result is m columns, p rows
- struct ggml_tensor * ggml_mul_mat(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- //
- // operations on tensors without backpropagation
- //
-
- // in-place, returns view(a)
- struct ggml_tensor * ggml_scale(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- // a -> b, return view(b)
- struct ggml_tensor * ggml_cpy(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- // make contiguous
- struct ggml_tensor * ggml_cont(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // return view(a), b specifies the new shape
- // TODO: when we start computing gradient, make a copy instead of view
- struct ggml_tensor * ggml_reshape(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- // return view(a)
- // TODO: when we start computing gradient, make a copy instead of view
- struct ggml_tensor * ggml_reshape_2d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- int64_t ne1);
-
- // return view(a)
- // TODO: when we start computing gradient, make a copy instead of view
- struct ggml_tensor * ggml_reshape_3d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- int64_t ne1,
- int64_t ne2);
-
- // offset in bytes
- struct ggml_tensor * ggml_view_1d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- size_t offset);
-
- struct ggml_tensor * ggml_view_2d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- int64_t ne1,
- size_t nb1, // row stride in bytes
- size_t offset);
-
- struct ggml_tensor * ggml_view_3d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- int64_t ne1,
- int64_t ne2,
- size_t nb1, // row stride in bytes
- size_t nb2, // slice stride in bytes
- size_t offset);
-
- struct ggml_tensor * ggml_permute(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int axis0,
- int axis1,
- int axis2,
- int axis3);
-
- // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
- struct ggml_tensor * ggml_transpose(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_get_rows(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- // set elements above the diagonal to -INF
- // in-place, returns view(a)
- struct ggml_tensor * ggml_diag_mask_inf(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int n_past);
-
- // in-place, returns view(a)
- struct ggml_tensor * ggml_soft_max(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // rotary position embedding
- // in-place, returns view(a)
- // if mode & 1 == 1, skip n_past elements
- // if mode & 2 == 1, GPT-NeoX style
- // TODO: avoid creating a new tensor every time
- struct ggml_tensor * ggml_rope(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int n_past,
- int n_dims,
- int mode);
-
- // padding = 1
- // TODO: we don't support extra parameters for now
- // that's why we are hard-coding the stride, padding, and dilation
- // not great ..
- struct ggml_tensor * ggml_conv_1d_1s(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- struct ggml_tensor * ggml_conv_1d_2s(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- struct ggml_tensor * ggml_flash_attn(
- struct ggml_context * ctx,
- struct ggml_tensor * q,
- struct ggml_tensor * k,
- struct ggml_tensor * v,
- bool masked);
-
- struct ggml_tensor * ggml_flash_ff(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b0,
- struct ggml_tensor * b1,
- struct ggml_tensor * c0,
- struct ggml_tensor * c1);
-
- // Mapping operations
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
- struct ggml_tensor * ggml_map_unary_f32(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- const ggml_unary_op_f32_t fun);
-
- struct ggml_tensor * ggml_map_binary_f32(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- const ggml_binary_op_f32_t fun);
-
- //
- // automatic differentiation
- //
-
- void ggml_set_param(
- struct ggml_context * ctx,
- struct ggml_tensor * tensor);
-
- void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-
- struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-
- void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
- void ggml_graph_reset (struct ggml_cgraph * cgraph);
-
- // print info and performance information for the graph
- void ggml_graph_print(const struct ggml_cgraph * cgraph);
-
- // dump the graph into a file using the dot format
- void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
-
- //
- // optimization
- //
-
- // optimization methods
- enum ggml_opt_type {
- GGML_OPT_ADAM,
- GGML_OPT_LBFGS,
- };
-
- // linesearch methods
- enum ggml_linesearch {
- GGML_LINESEARCH_DEFAULT = 1,
-
- GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
- GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
- GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
- };
-
- // optimization return values
- enum ggml_opt_result {
- GGML_OPT_OK = 0,
- GGML_OPT_DID_NOT_CONVERGE,
- GGML_OPT_NO_CONTEXT,
- GGML_OPT_INVALID_WOLFE,
- GGML_OPT_FAIL,
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2);

- GGML_LINESEARCH_FAIL = -128,
- GGML_LINESEARCH_MINIMUM_STEP,
- GGML_LINESEARCH_MAXIMUM_STEP,
- GGML_LINESEARCH_MAXIMUM_ITERATIONS,
- GGML_LINESEARCH_INVALID_PARAMETERS,
- };
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);

- // optimization parameters
- //
- // see ggml.c (ggml_opt_default_params) for default values
- //
- struct ggml_opt_params {
- enum ggml_opt_type type;
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

- int n_threads;
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- // delta-based convergence test
 //
- // if past == 0 - disabled
- // if past > 0:
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+ // operations on tensors with backpropagation
 //
- int past;
- float delta;

- // maximum number of iterations without improvement
+ GGML_API struct ggml_tensor * ggml_dup(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_add(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_add_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_sub(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_mul(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_div(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_sqr(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sqrt(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // return scalar
+ // TODO: compute sum along rows
+ GGML_API struct ggml_tensor * ggml_sum(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // mean along rows
+ GGML_API struct ggml_tensor * ggml_mean(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // if a is the same shape as b, and a is not parameter, return a
+ // otherwise, return a new tensor: repeat(a) to fit in b
+ GGML_API struct ggml_tensor * ggml_repeat(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_abs(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sgn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_neg(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_step(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_relu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // TODO: double-check this computation is correct
+ GGML_API struct ggml_tensor * ggml_gelu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_silu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // normalize along rows
+ // TODO: eps is hardcoded to 1e-5 for now
+ GGML_API struct ggml_tensor * ggml_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_rms_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // A: m rows, n columns
+ // B: p rows, n columns (i.e. we transpose it internally)
+ // result is m columns, p rows
+ GGML_API struct ggml_tensor * ggml_mul_mat(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
 //
- // if 0 - disabled
- // if > 0:
- // assume convergence if no cost improvement in this number of iterations
+ // operations on tensors without backpropagation
 //
- int max_no_improvement;

- bool print_forward_graph;
- bool print_backward_graph;
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_scale(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // a -> b, return view(b)
+ GGML_API struct ggml_tensor * ggml_cpy(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // make contiguous
+ GGML_API struct ggml_tensor * ggml_cont(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // return view(a), b specifies the new shape
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // return view(a)
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1);
+
+ // return view(a)
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2);
+
+ // offset in bytes
+ GGML_API struct ggml_tensor * ggml_view_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_view_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ size_t nb1, // row stride in bytes
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_view_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ size_t nb1, // row stride in bytes
+ size_t nb2, // slice stride in bytes
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_permute(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int axis0,
+ int axis1,
+ int axis2,
+ int axis3);
+
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
+ GGML_API struct ggml_tensor * ggml_transpose(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_get_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // set elements above the diagonal to -INF
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // rotary position embedding
+ // in-place, returns view(a)
+ // if mode & 1 == 1, skip n_past elements
+ // if mode & 2 == 1, GPT-NeoX style
+ // TODO: avoid creating a new tensor every time
+ GGML_API struct ggml_tensor * ggml_rope(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
+ // padding = 1
+ // TODO: we don't support extra parameters for now
+ // that's why we are hard-coding the stride, padding, and dilation
+ // not great ..
+ GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_flash_attn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * q,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ bool masked);
+
+ GGML_API struct ggml_tensor * ggml_flash_ff(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b0,
+ struct ggml_tensor * b1,
+ struct ggml_tensor * c0,
+ struct ggml_tensor * c1);
+
+ // Mapping operations
+ GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+ GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+ GGML_API struct ggml_tensor * ggml_map_unary_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ const ggml_unary_op_f32_t fun);
+
+ GGML_API struct ggml_tensor * ggml_map_binary_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ const ggml_binary_op_f32_t fun);
+
+ //
+ // automatic differentiation
+ //

- // ADAM parameters
- struct {
- int n_iter;
+ GGML_API void ggml_set_param(
+ struct ggml_context * ctx,
+ struct ggml_tensor * tensor);

- float alpha; // learning rate
- float beta1;
- float beta2;
- float eps; // epsilon for numerical stability
- float eps_f; // epsilon for convergence test
- float eps_g; // epsilon for convergence test
- } adam;
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

- // LBFGS parameters
- struct {
- int m; // number of corrections to approximate the inv. Hessian
- int n_iter;
- int max_linesearch;
+ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
+ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

- float eps; // convergence tolerance
- float ftol; // line search tolerance
- float wolfe;
- float min_step;
- float max_step;
+ GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);

- enum ggml_linesearch linesearch;
- } lbfgs;
- };
+ // print info and performance information for the graph
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

- struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+ // dump the graph into a file using the dot format
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);

- // optimize the function defined by the tensor f
- enum ggml_opt_result ggml_opt(
- struct ggml_context * ctx,
- struct ggml_opt_params params,
- struct ggml_tensor * f);
+ //
+ // optimization
+ //

- //
- // quantization
- //
+ // optimization methods
+ enum ggml_opt_type {
+ GGML_OPT_ADAM,
+ GGML_OPT_LBFGS,
+ };
+
+ // linesearch methods
+ enum ggml_linesearch {
+ GGML_LINESEARCH_DEFAULT = 1,
+
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
+ };
+
+ // optimization return values
+ enum ggml_opt_result {
+ GGML_OPT_OK = 0,
+ GGML_OPT_DID_NOT_CONVERGE,
+ GGML_OPT_NO_CONTEXT,
+ GGML_OPT_INVALID_WOLFE,
+ GGML_OPT_FAIL,
+
+ GGML_LINESEARCH_FAIL = -128,
+ GGML_LINESEARCH_MINIMUM_STEP,
+ GGML_LINESEARCH_MAXIMUM_STEP,
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
+ GGML_LINESEARCH_INVALID_PARAMETERS,
+ };
+
+ // optimization parameters
+ //
+ // see ggml.c (ggml_opt_default_params) for default values
+ //
+ struct ggml_opt_params {
+ enum ggml_opt_type type;
+
+ int n_threads;
+
+ // delta-based convergence test
+ //
+ // if past == 0 - disabled
+ // if past > 0:
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+ //
+ int past;
+ float delta;
+
+ // maximum number of iterations without improvement
+ //
+ // if 0 - disabled
+ // if > 0:
+ // assume convergence if no cost improvement in this number of iterations
+ //
+ int max_no_improvement;
+
+ bool print_forward_graph;
+ bool print_backward_graph;
+
+ // ADAM parameters
+ struct {
+ int n_iter;
+
+ float alpha; // learning rate
+ float beta1;
+ float beta2;
+ float eps; // epsilon for numerical stability
+ float eps_f; // epsilon for convergence test
+ float eps_g; // epsilon for convergence test
+ } adam;
+
+ // LBFGS parameters
+ struct {
+ int m; // number of corrections to approximate the inv. Hessian
+ int n_iter;
+ int max_linesearch;
+
+ float eps; // convergence tolerance
+ float ftol; // line search tolerance
+ float wolfe;
+ float min_step;
+ float max_step;
+
+ enum ggml_linesearch linesearch;
+ } lbfgs;
+ };
+
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+
+ // optimize the function defined by the tensor f
+ GGML_API enum ggml_opt_result ggml_opt(
+ struct ggml_context * ctx,
+ struct ggml_opt_params params,
+ struct ggml_tensor * f);

- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
- size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+ //
+ // quantization
+ //

- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);

- //
- // system info
- //
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

- int ggml_cpu_has_avx(void);
- int ggml_cpu_has_avx2(void);
- int ggml_cpu_has_avx512(void);
- int ggml_cpu_has_avx512_vbmi(void);
- int ggml_cpu_has_avx512_vnni(void);
- int ggml_cpu_has_fma(void);
- int ggml_cpu_has_neon(void);
- int ggml_cpu_has_arm_fma(void);
- int ggml_cpu_has_f16c(void);
- int ggml_cpu_has_fp16_va(void);
- int ggml_cpu_has_wasm_simd(void);
- int ggml_cpu_has_blas(void);
- int ggml_cpu_has_cublas(void);
- int ggml_cpu_has_sse3(void);
- int ggml_cpu_has_vsx(void);
+ //
+ // system info
+ //

+ GGML_API int ggml_cpu_has_avx (void);
+ GGML_API int ggml_cpu_has_avx2 (void);
+ GGML_API int ggml_cpu_has_avx512 (void);
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
+ GGML_API int ggml_cpu_has_fma (void);
+ GGML_API int ggml_cpu_has_neon (void);
+ GGML_API int ggml_cpu_has_arm_fma (void);
+ GGML_API int ggml_cpu_has_f16c (void);
+ GGML_API int ggml_cpu_has_fp16_va (void);
+ GGML_API int ggml_cpu_has_wasm_simd (void);
+ GGML_API int ggml_cpu_has_blas (void);
+ GGML_API int ggml_cpu_has_cublas (void);
+ GGML_API int ggml_cpu_has_clblast (void);
+ GGML_API int ggml_cpu_has_gpublas (void);
+ GGML_API int ggml_cpu_has_sse3 (void);
+ GGML_API int ggml_cpu_has_vsx (void);

- //
- // Internal types and functions exposed for tests and benchmarks
- //
+ //
+ // Internal types and functions exposed for tests and benchmarks
+ //

 #ifdef __cplusplus
- // restrict not standard in C++
+ // restrict not standard in C++
 #define GGML_RESTRICT
 #else
 #define GGML_RESTRICT restrict
 #endif
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
-
- typedef struct {
- dequantize_row_q_t dequantize_row_q;
- quantize_row_q_t quantize_row_q;
- quantize_row_q_t quantize_row_q_reference;
- quantize_row_q_t quantize_row_q_dot;
- vec_dot_q_t vec_dot_q;
- } quantize_fns_t;
-
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+ typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+ typedef struct {
+ dequantize_row_q_t dequantize_row_q;
+ quantize_row_q_t quantize_row_q;
+ quantize_row_q_t quantize_row_q_reference;
+ quantize_row_q_t quantize_row_q_dot;
+ vec_dot_q_t vec_dot_q;
+ enum ggml_type vec_dot_type;
+ } quantize_fns_t;
+
+ quantize_fns_t ggml_internal_get_quantize_fn(size_t i);

 #ifdef __cplusplus
 }
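Besides the GGML_API annotations, the vendored ggml in 0.0.7 adds the Q5_0/Q5_1/Q8_1 tensor types, the ggml_quantize_q5_0/q5_1/q8_0 entry points, the clblast/gpublas feature probes, and a vec_dot_type field in quantize_fns_t. A minimal sketch of driving the new quantizers through ggml_quantize_chunk, using only declarations visible in this header; the requirement that the element count be a multiple of the block size and the 16-bucket histogram follow llama.cpp usage of the time and should be treated as assumptions:

// sketch: quantize a float buffer to Q5_0 and report the packed size
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"

int main(void) {
    const int n = 256;                        // element count (assumed block-multiple)
    float * src = malloc(n * sizeof(float));
    for (int i = 0; i < n; ++i) src[i] = 0.01f * i;

    const enum ggml_type type = GGML_TYPE_Q5_0;
    // size the destination from the header's block helpers
    const size_t dst_size = (n / ggml_blck_size(type)) * ggml_type_size(type);
    void * dst = malloc(dst_size);

    int64_t hist[16] = {0};                   // histogram buckets (assumption: 16 is enough)
    const size_t written = ggml_quantize_chunk(type, src, dst, 0, n, hist);
    printf("quantized %d floats into %zu bytes (%s)\n", n, written, ggml_type_name(type));

    free(dst);
    free(src);
    return 0;
}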