llama_cpp 0.0.6 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -169,14 +169,27 @@
  //
  //

- #ifdef __cplusplus
- extern "C" {
+ #ifdef GGML_SHARED
+ # if defined(_WIN32) && !defined(__MINGW32__)
+ # ifdef GGML_BUILD
+ # define GGML_API __declspec(dllexport)
+ # else
+ # define GGML_API __declspec(dllimport)
+ # endif
+ # else
+ # define GGML_API __attribute__ ((visibility ("default")))
+ # endif
+ #else
+ # define GGML_API
  #endif

  #include <stdint.h>
  #include <stddef.h>
  #include <stdbool.h>

+ #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
+ #define GGML_FILE_VERSION 1
+
  #define GGML_MAX_DIMS 4
  #define GGML_MAX_NODES 4096
  #define GGML_MAX_PARAMS 16
@@ -184,682 +197,738 @@ extern "C" {
  #define GGML_MAX_OPT 4
  #define GGML_DEFAULT_N_THREADS 4

+ #define GGML_ASSERT(x) \
+ do { \
+ if (!(x)) { \
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+ abort(); \
+ } \
+ } while (0)
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
  #ifdef __ARM_NEON
- // we use the built-in 16-bit float type
- typedef __fp16 ggml_fp16_t;
+ // we use the built-in 16-bit float type
+ typedef __fp16 ggml_fp16_t;
  #else
- typedef uint16_t ggml_fp16_t;
+ typedef uint16_t ggml_fp16_t;
  #endif

- // convert FP16 <-> FP32
- float ggml_fp16_to_fp32(ggml_fp16_t x);
- ggml_fp16_t ggml_fp32_to_fp16(float x);
-
- struct ggml_object;
- struct ggml_context;
-
- enum ggml_type {
- // explicitly numbered values are used in llama.cpp files
- GGML_TYPE_F32 = 0,
- GGML_TYPE_F16 = 1,
- GGML_TYPE_Q4_0 = 2,
- GGML_TYPE_Q4_1 = 3,
- GGML_TYPE_Q4_2 = 4,
- GGML_TYPE_Q4_3 = 5,
- GGML_TYPE_Q8_0 = 6,
- GGML_TYPE_I8,
- GGML_TYPE_I16,
- GGML_TYPE_I32,
- GGML_TYPE_COUNT,
- };
-
- // available tensor operations:
- enum ggml_op {
- GGML_OP_NONE = 0,
-
- GGML_OP_DUP,
- GGML_OP_ADD,
- GGML_OP_SUB,
- GGML_OP_MUL,
- GGML_OP_DIV,
- GGML_OP_SQR,
- GGML_OP_SQRT,
- GGML_OP_SUM,
- GGML_OP_MEAN,
- GGML_OP_REPEAT,
- GGML_OP_ABS,
- GGML_OP_SGN,
- GGML_OP_NEG,
- GGML_OP_STEP,
- GGML_OP_RELU,
- GGML_OP_GELU,
- GGML_OP_SILU,
- GGML_OP_NORM, // normalize
- GGML_OP_RMS_NORM,
-
- GGML_OP_MUL_MAT,
-
- GGML_OP_SCALE,
- GGML_OP_CPY,
- GGML_OP_CONT,
- GGML_OP_RESHAPE,
- GGML_OP_VIEW,
- GGML_OP_PERMUTE,
- GGML_OP_TRANSPOSE,
- GGML_OP_GET_ROWS,
- GGML_OP_DIAG_MASK_INF,
- GGML_OP_SOFT_MAX,
- GGML_OP_ROPE,
- GGML_OP_CONV_1D_1S,
- GGML_OP_CONV_1D_2S,
-
- GGML_OP_FLASH_ATTN,
- GGML_OP_FLASH_FF,
-
- GGML_OP_MAP_UNARY,
- GGML_OP_MAP_BINARY,
-
- GGML_OP_COUNT,
- };
-
-
- // ggml object
- struct ggml_object {
- size_t offs;
- size_t size;
-
- struct ggml_object * next;
-
- char padding[8];
- };
-
- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
- // n-dimensional tensor
- struct ggml_tensor {
- enum ggml_type type;
-
- int n_dims;
- int64_t ne[GGML_MAX_DIMS]; // number of elements
- size_t nb[GGML_MAX_DIMS]; // stride in bytes:
- // nb[0] = sizeof(type)
- // nb[1] = nb[0] * ne[0] + padding
- // nb[i] = nb[i-1] * ne[i-1]
-
- // compute data
- enum ggml_op op;
-
- bool is_param;
-
- struct ggml_tensor * grad;
- struct ggml_tensor * src0;
- struct ggml_tensor * src1;
- struct ggml_tensor * opt[GGML_MAX_OPT];
-
- // thread scheduling
- int n_tasks;
-
- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
-
- void * data;
- char padding[8];
- };
-
- // computation graph
- struct ggml_cgraph {
- int n_nodes;
- int n_leafs;
- int n_threads;
-
- size_t work_size;
- struct ggml_tensor * work;
-
- struct ggml_tensor * nodes[GGML_MAX_NODES];
- struct ggml_tensor * grads[GGML_MAX_NODES];
- struct ggml_tensor * leafs[GGML_MAX_NODES];
-
- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
- };
-
- // scratch buffer
- struct ggml_scratch {
- size_t offs;
- size_t size;
- void * data;
- };
+ // convert FP16 <-> FP32
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
+
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+
+ struct ggml_object;
+ struct ggml_context;
+
+ enum ggml_type {
+ GGML_TYPE_F32 = 0,
+ GGML_TYPE_F16 = 1,
+ GGML_TYPE_Q4_0 = 2,
+ GGML_TYPE_Q4_1 = 3,
+ GGML_TYPE_Q4_2 = 4,
+ // GGML_TYPE_Q4_3 (5) support has been removed
+ GGML_TYPE_Q5_0 = 6,
+ GGML_TYPE_Q5_1 = 7,
+ GGML_TYPE_Q8_0 = 8,
+ GGML_TYPE_Q8_1 = 9,
+ GGML_TYPE_I8,
+ GGML_TYPE_I16,
+ GGML_TYPE_I32,
+ GGML_TYPE_COUNT,
+ };
+
+ // model file types
+ enum ggml_ftype {
+ GGML_FTYPE_UNKNOWN = -1,
+ GGML_FTYPE_ALL_F32 = 0,
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ };
+
+ // available tensor operations:
+ enum ggml_op {
+ GGML_OP_NONE = 0,
+
+ GGML_OP_DUP,
+ GGML_OP_ADD,
+ GGML_OP_SUB,
+ GGML_OP_MUL,
+ GGML_OP_DIV,
+ GGML_OP_SQR,
+ GGML_OP_SQRT,
+ GGML_OP_SUM,
+ GGML_OP_MEAN,
+ GGML_OP_REPEAT,
+ GGML_OP_ABS,
+ GGML_OP_SGN,
+ GGML_OP_NEG,
+ GGML_OP_STEP,
+ GGML_OP_RELU,
+ GGML_OP_GELU,
+ GGML_OP_SILU,
+ GGML_OP_NORM, // normalize
+ GGML_OP_RMS_NORM,
+
+ GGML_OP_MUL_MAT,
+
+ GGML_OP_SCALE,
+ GGML_OP_CPY,
+ GGML_OP_CONT,
+ GGML_OP_RESHAPE,
+ GGML_OP_VIEW,
+ GGML_OP_PERMUTE,
+ GGML_OP_TRANSPOSE,
+ GGML_OP_GET_ROWS,
+ GGML_OP_DIAG_MASK_INF,
+ GGML_OP_SOFT_MAX,
+ GGML_OP_ROPE,
+ GGML_OP_ALIBI,
+ GGML_OP_CONV_1D_1S,
+ GGML_OP_CONV_1D_2S,
+
+ GGML_OP_FLASH_ATTN,
+ GGML_OP_FLASH_FF,
+
+ GGML_OP_MAP_UNARY,
+ GGML_OP_MAP_BINARY,
+
+ GGML_OP_COUNT,
+ };
+
+
+ // ggml object
+ struct ggml_object {
+ size_t offs;
+ size_t size;
+
+ struct ggml_object * next;
+
+ char padding[8];
+ };
+
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
+ // n-dimensional tensor
+ struct ggml_tensor {
+ enum ggml_type type;
+
+ int n_dims;
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
+ // nb[0] = sizeof(type)
+ // nb[1] = nb[0] * ne[0] + padding
+ // nb[i] = nb[i-1] * ne[i-1]
+
+ // compute data
+ enum ggml_op op;
+
+ bool is_param;
+
+ struct ggml_tensor * grad;
+ struct ggml_tensor * src0;
+ struct ggml_tensor * src1;
+ struct ggml_tensor * opt[GGML_MAX_OPT];
+
+ // thread scheduling
+ int n_tasks;
+
+ // performance
+ int perf_runs;
+ int64_t perf_cycles;
+ int64_t perf_time_us;
+
+ void * data;
+
+ char name[32];
+
+ char padding[8]; // TODO: remove and add padding to name?
+ };
+
+ // computation graph
+ struct ggml_cgraph {
+ int n_nodes;
+ int n_leafs;
+ int n_threads;
+
+ size_t work_size;
+ struct ggml_tensor * work;
+
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
+ struct ggml_tensor * grads[GGML_MAX_NODES];
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
+
+ // performance
+ int perf_runs;
+ int64_t perf_cycles;
+ int64_t perf_time_us;
+ };

- struct ggml_init_params {
- // memory pool
- size_t mem_size; // bytes
- void * mem_buffer; // if NULL, memory will be allocated internally
- bool no_alloc; // don't allocate memory for the tensor data
- };
+ // scratch buffer
+ struct ggml_scratch {
+ size_t offs;
+ size_t size;
+ void * data;
+ };

- void ggml_time_init(void); // call this once at the beginning of the program
- int64_t ggml_time_ms(void);
- int64_t ggml_time_us(void);
- int64_t ggml_cycles(void);
- int64_t ggml_cycles_per_ms(void);
+ struct ggml_init_params {
+ // memory pool
+ size_t mem_size; // bytes
+ void * mem_buffer; // if NULL, memory will be allocated internally
+ bool no_alloc; // don't allocate memory for the tensor data
+ };

- void ggml_print_object (const struct ggml_object * obj);
- void ggml_print_objects(const struct ggml_context * ctx);
+ // misc

- int64_t ggml_nelements(const struct ggml_tensor * tensor);
- size_t ggml_nbytes (const struct ggml_tensor * tensor);
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
+ GGML_API int64_t ggml_time_ms(void);
+ GGML_API int64_t ggml_time_us(void);
+ GGML_API int64_t ggml_cycles(void);
+ GGML_API int64_t ggml_cycles_per_ms(void);

- int ggml_blck_size (enum ggml_type type);
- size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
- float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);

- const char * ggml_type_name(enum ggml_type type);
+ GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);

- size_t ggml_element_size(const struct ggml_tensor * tensor);
+ GGML_API int ggml_blck_size (enum ggml_type type);
+ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

- bool ggml_is_quantized(enum ggml_type type);
+ GGML_API const char * ggml_type_name(enum ggml_type type);

- struct ggml_context * ggml_init(struct ggml_init_params params);
- void ggml_free(struct ggml_context * ctx);
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

- size_t ggml_used_mem(const struct ggml_context * ctx);
+ GGML_API bool ggml_is_quantized(enum ggml_type type);

- size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+ // TODO: temporary until model loading of ggml examples is refactored
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

- struct ggml_tensor * ggml_new_tensor(
- struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t *ne);
-
- struct ggml_tensor * ggml_new_tensor_1d(
- struct ggml_context * ctx,
- enum ggml_type type,
- int64_t ne0);
-
- struct ggml_tensor * ggml_new_tensor_2d(
- struct ggml_context * ctx,
- enum ggml_type type,
- int64_t ne0,
- int64_t ne1);
-
- struct ggml_tensor * ggml_new_tensor_3d(
- struct ggml_context * ctx,
- enum ggml_type type,
- int64_t ne0,
- int64_t ne1,
- int64_t ne2);
-
- struct ggml_tensor * ggml_new_tensor_4d(
- struct ggml_context * ctx,
- enum ggml_type type,
- int64_t ne0,
- int64_t ne1,
- int64_t ne2,
- int64_t ne3);
-
- struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
- struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-
- struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
- struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
-
- struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
- struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
- struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
- int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
- void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
- float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
- void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
- void * ggml_get_data (const struct ggml_tensor * tensor);
- float * ggml_get_data_f32(const struct ggml_tensor * tensor);
-
- //
- // operations on tensors with backpropagation
- //
-
- struct ggml_tensor * ggml_dup(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_add(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ // main

+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+ GGML_API void ggml_free(struct ggml_context * ctx);

- struct ggml_tensor * ggml_add_inplace(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);

- struct ggml_tensor * ggml_sub(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

- struct ggml_tensor * ggml_mul(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
+ GGML_API struct ggml_tensor * ggml_new_tensor(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int n_dims,
+ const int64_t *ne);

- struct ggml_tensor * ggml_div(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- struct ggml_tensor * ggml_sqr(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_sqrt(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // return scalar
- // TODO: compute sum along rows
- struct ggml_tensor * ggml_sum(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // mean along rows
- struct ggml_tensor * ggml_mean(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // if a is the same shape as b, and a is not parameter, return a
- // otherwise, return a new tensor: repeat(a) to fit in b
- struct ggml_tensor * ggml_repeat(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- struct ggml_tensor * ggml_abs(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_sgn(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_neg(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_step(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_relu(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // TODO: double-check this computation is correct
- struct ggml_tensor * ggml_gelu(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_silu(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // normalize along rows
- // TODO: eps is hardcoded to 1e-5 for now
- struct ggml_tensor * ggml_norm(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_rms_norm(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // A: m rows, n columns
- // B: p rows, n columns (i.e. we transpose it internally)
- // result is m columns, p rows
- struct ggml_tensor * ggml_mul_mat(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- //
- // operations on tensors without backpropagation
- //
-
- // in-place, returns view(a)
- struct ggml_tensor * ggml_scale(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- // a -> b, return view(b)
- struct ggml_tensor * ggml_cpy(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- // make contiguous
- struct ggml_tensor * ggml_cont(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // return view(a), b specifies the new shape
- // TODO: when we start computing gradient, make a copy instead of view
- struct ggml_tensor * ggml_reshape(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- // return view(a)
- // TODO: when we start computing gradient, make a copy instead of view
- struct ggml_tensor * ggml_reshape_2d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- int64_t ne1);
-
- // return view(a)
- // TODO: when we start computing gradient, make a copy instead of view
- struct ggml_tensor * ggml_reshape_3d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- int64_t ne1,
- int64_t ne2);
-
- // offset in bytes
- struct ggml_tensor * ggml_view_1d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- size_t offset);
-
- struct ggml_tensor * ggml_view_2d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- int64_t ne1,
- size_t nb1, // row stride in bytes
- size_t offset);
-
- struct ggml_tensor * ggml_view_3d(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int64_t ne0,
- int64_t ne1,
- int64_t ne2,
- size_t nb1, // row stride in bytes
- size_t nb2, // slice stride in bytes
- size_t offset);
-
- struct ggml_tensor * ggml_permute(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int axis0,
- int axis1,
- int axis2,
- int axis3);
-
- // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
- struct ggml_tensor * ggml_transpose(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- struct ggml_tensor * ggml_get_rows(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- // set elements above the diagonal to -INF
- // in-place, returns view(a)
- struct ggml_tensor * ggml_diag_mask_inf(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int n_past);
-
- // in-place, returns view(a)
- struct ggml_tensor * ggml_soft_max(
- struct ggml_context * ctx,
- struct ggml_tensor * a);
-
- // rotary position embedding
- // in-place, returns view(a)
- // if mode & 1 == 1, skip n_past elements
- // if mode & 2 == 1, GPT-NeoX style
- // TODO: avoid creating a new tensor every time
- struct ggml_tensor * ggml_rope(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- int n_past,
- int n_dims,
- int mode);
-
- // padding = 1
- // TODO: we don't support extra parameters for now
- // that's why we are hard-coding the stride, padding, and dilation
- // not great ..
- struct ggml_tensor * ggml_conv_1d_1s(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- struct ggml_tensor * ggml_conv_1d_2s(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b);
-
- struct ggml_tensor * ggml_flash_attn(
- struct ggml_context * ctx,
- struct ggml_tensor * q,
- struct ggml_tensor * k,
- struct ggml_tensor * v,
- bool masked);
-
- struct ggml_tensor * ggml_flash_ff(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b0,
- struct ggml_tensor * b1,
- struct ggml_tensor * c0,
- struct ggml_tensor * c1);
-
- // Mapping operations
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
- struct ggml_tensor * ggml_map_unary_f32(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- const ggml_unary_op_f32_t fun);
-
- struct ggml_tensor * ggml_map_binary_f32(
- struct ggml_context * ctx,
- struct ggml_tensor * a,
- struct ggml_tensor * b,
- const ggml_binary_op_f32_t fun);
-
- //
- // automatic differentiation
- //
-
- void ggml_set_param(
- struct ggml_context * ctx,
- struct ggml_tensor * tensor);
-
- void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-
- struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-
- void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
- void ggml_graph_reset (struct ggml_cgraph * cgraph);
-
- // print info and performance information for the graph
- void ggml_graph_print(const struct ggml_cgraph * cgraph);
-
- // dump the graph into a file using the dot format
- void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
-
- //
- // optimization
- //
-
- // optimization methods
- enum ggml_opt_type {
- GGML_OPT_ADAM,
- GGML_OPT_LBFGS,
- };
-
- // linesearch methods
- enum ggml_linesearch {
- GGML_LINESEARCH_DEFAULT = 1,
-
- GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
- GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
- GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
- };
-
- // optimization return values
- enum ggml_opt_result {
- GGML_OPT_OK = 0,
- GGML_OPT_DID_NOT_CONVERGE,
- GGML_OPT_NO_CONTEXT,
- GGML_OPT_INVALID_WOLFE,
- GGML_OPT_FAIL,
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0);

- GGML_LINESEARCH_FAIL = -128,
- GGML_LINESEARCH_MINIMUM_STEP,
- GGML_LINESEARCH_MAXIMUM_STEP,
- GGML_LINESEARCH_MAXIMUM_ITERATIONS,
- GGML_LINESEARCH_INVALID_PARAMETERS,
- };
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1);

- // optimization parameters
- //
- // see ggml.c (ggml_opt_default_params) for default values
- //
- struct ggml_opt_params {
- enum ggml_opt_type type;
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2);
+
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

- int n_threads;
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
+
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+ GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);

- // delta-based convergence test
  //
- // if past == 0 - disabled
- // if past > 0:
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+ // operations on tensors with backpropagation
  //

- int past;
- float delta;

- // maximum number of iterations without improvement
+ GGML_API struct ggml_tensor * ggml_dup(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_add(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_add_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_sub(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_mul(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_div(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_sqr(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sqrt(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // return scalar
+ // TODO: compute sum along rows
+ GGML_API struct ggml_tensor * ggml_sum(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // mean along rows
+ GGML_API struct ggml_tensor * ggml_mean(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // if a is the same shape as b, and a is not parameter, return a
+ // otherwise, return a new tensor: repeat(a) to fit in b
+ GGML_API struct ggml_tensor * ggml_repeat(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_abs(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_sgn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_neg(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_step(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_relu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // TODO: double-check this computation is correct
+ GGML_API struct ggml_tensor * ggml_gelu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_silu(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // normalize along rows
+ // TODO: eps is hardcoded to 1e-5 for now
+ GGML_API struct ggml_tensor * ggml_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_rms_norm(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // A: m rows, n columns
+ // B: p rows, n columns (i.e. we transpose it internally)
+ // result is m columns, p rows
+ GGML_API struct ggml_tensor * ggml_mul_mat(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
  //
- // if 0 - disabled
- // if > 0:
- // assume convergence if no cost improvement in this number of iterations
+ // operations on tensors without backpropagation
  //
- int max_no_improvement;

- bool print_forward_graph;
- bool print_backward_graph;
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_scale(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // a -> b, return view(b)
+ GGML_API struct ggml_tensor * ggml_cpy(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // make contiguous
+ GGML_API struct ggml_tensor * ggml_cont(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // return view(a), b specifies the new shape
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // return view(a)
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1);
+
+ // return view(a)
+ // TODO: when we start computing gradient, make a copy instead of view
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2);
+
+ // offset in bytes
+ GGML_API struct ggml_tensor * ggml_view_1d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_view_2d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ size_t nb1, // row stride in bytes
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_view_3d(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ size_t nb1, // row stride in bytes
+ size_t nb2, // slice stride in bytes
+ size_t offset);
+
+ GGML_API struct ggml_tensor * ggml_permute(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int axis0,
+ int axis1,
+ int axis2,
+ int axis3);
+
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
+ GGML_API struct ggml_tensor * ggml_transpose(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ GGML_API struct ggml_tensor * ggml_get_rows(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ // set elements above the diagonal to -INF
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past);
+
+ // in-place, returns view(a)
+ GGML_API struct ggml_tensor * ggml_soft_max(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a);
+
+ // rotary position embedding
+ // in-place, returns view(a)
+ // if mode & 1 == 1, skip n_past elements
+ // if mode & 2 == 1, GPT-NeoX style
+ // TODO: avoid creating a new tensor every time
+ GGML_API struct ggml_tensor * ggml_rope(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode);
+
+ // alibi position embedding
+ // in-place, returns view(a)
+ struct ggml_tensor * ggml_alibi(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_head);
+
+ // padding = 1
+ // TODO: we don't support extra parameters for now
+ // that's why we are hard-coding the stride, padding, and dilation
+ // not great ..
+ GGML_API struct ggml_tensor * ggml_conv_1d_1s(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_conv_1d_2s(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);
+
+ GGML_API struct ggml_tensor * ggml_flash_attn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * q,
+ struct ggml_tensor * k,
+ struct ggml_tensor * v,
+ bool masked);
+
+ GGML_API struct ggml_tensor * ggml_flash_ff(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b0,
+ struct ggml_tensor * b1,
+ struct ggml_tensor * c0,
+ struct ggml_tensor * c1);
+
+ // Mapping operations
+ typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+ GGML_API struct ggml_tensor * ggml_map_unary_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ const ggml_unary_op_f32_t fun);
+
+ GGML_API struct ggml_tensor * ggml_map_binary_f32(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ const ggml_binary_op_f32_t fun);

- // ADAM parameters
- struct {
- int n_iter;
+ //
+ // automatic differentiation
+ //

- float alpha; // learning rate
- float beta1;
- float beta2;
- float eps; // epsilon for numerical stability
- float eps_f; // epsilon for convergence test
- float eps_g; // epsilon for convergence test
- } adam;
+ GGML_API void ggml_set_param(
+ struct ggml_context * ctx,
+ struct ggml_tensor * tensor);

- // LBFGS parameters
- struct {
- int m; // number of corrections to approximate the inv. Hessian
- int n_iter;
- int max_linesearch;
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

- float eps; // convergence tolerance
- float ftol; // line search tolerance
- float wolfe;
- float min_step;
- float max_step;
+ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
+ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

- enum ggml_linesearch linesearch;
- } lbfgs;
- };
+ GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);

- struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+ // print info and performance information for the graph
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

- // optimize the function defined by the tensor f
- enum ggml_opt_result ggml_opt(
- struct ggml_context * ctx,
- struct ggml_opt_params params,
- struct ggml_tensor * f);
+ // dump the graph into a file using the dot format
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);

- //
- // quantization
- //
+ //
+ // optimization
+ //

- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
- size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
+ // optimization methods
+ enum ggml_opt_type {
+ GGML_OPT_ADAM,
+ GGML_OPT_LBFGS,
+ };
+
+ // linesearch methods
+ enum ggml_linesearch {
+ GGML_LINESEARCH_DEFAULT = 1,
+
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
+ };
+
+ // optimization return values
+ enum ggml_opt_result {
+ GGML_OPT_OK = 0,
+ GGML_OPT_DID_NOT_CONVERGE,
+ GGML_OPT_NO_CONTEXT,
+ GGML_OPT_INVALID_WOLFE,
+ GGML_OPT_FAIL,
+
+ GGML_LINESEARCH_FAIL = -128,
+ GGML_LINESEARCH_MINIMUM_STEP,
+ GGML_LINESEARCH_MAXIMUM_STEP,
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
+ GGML_LINESEARCH_INVALID_PARAMETERS,
+ };
+
+ // optimization parameters
+ //
+ // see ggml.c (ggml_opt_default_params) for default values
+ //
+ struct ggml_opt_params {
+ enum ggml_opt_type type;
+
+ int n_threads;
+
+ // delta-based convergence test
+ //
+ // if past == 0 - disabled
+ // if past > 0:
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+ //
+ int past;
+ float delta;
+
+ // maximum number of iterations without improvement
+ //
+ // if 0 - disabled
+ // if > 0:
+ // assume convergence if no cost improvement in this number of iterations
+ //
+ int max_no_improvement;
+
+ bool print_forward_graph;
+ bool print_backward_graph;
+
+ // ADAM parameters
+ struct {
+ int n_iter;
+
+ float alpha; // learning rate
+ float beta1;
+ float beta2;
+ float eps; // epsilon for numerical stability
+ float eps_f; // epsilon for convergence test
+ float eps_g; // epsilon for convergence test
+ } adam;
+
+ // LBFGS parameters
+ struct {
+ int m; // number of corrections to approximate the inv. Hessian
+ int n_iter;
+ int max_linesearch;
+
+ float eps; // convergence tolerance
+ float ftol; // line search tolerance
+ float wolfe;
+ float min_step;
+ float max_step;
+
+ enum ggml_linesearch linesearch;
+ } lbfgs;
+ };
+
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+
+ // optimize the function defined by the tensor f
+ GGML_API enum ggml_opt_result ggml_opt(
+ struct ggml_context * ctx,
+ struct ggml_opt_params params,
+ struct ggml_tensor * f);

- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+ //
+ // quantization
+ //

- //
- // system info
- //
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);

- int ggml_cpu_has_avx(void);
- int ggml_cpu_has_avx2(void);
- int ggml_cpu_has_avx512(void);
- int ggml_cpu_has_avx512_vbmi(void);
- int ggml_cpu_has_avx512_vnni(void);
- int ggml_cpu_has_fma(void);
- int ggml_cpu_has_neon(void);
- int ggml_cpu_has_arm_fma(void);
- int ggml_cpu_has_f16c(void);
- int ggml_cpu_has_fp16_va(void);
- int ggml_cpu_has_wasm_simd(void);
- int ggml_cpu_has_blas(void);
- int ggml_cpu_has_cublas(void);
- int ggml_cpu_has_sse3(void);
- int ggml_cpu_has_vsx(void);
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

+ //
+ // system info
+ //

- //
- // Internal types and functions exposed for tests and benchmarks
- //
+ GGML_API int ggml_cpu_has_avx (void);
+ GGML_API int ggml_cpu_has_avx2 (void);
+ GGML_API int ggml_cpu_has_avx512 (void);
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
+ GGML_API int ggml_cpu_has_fma (void);
+ GGML_API int ggml_cpu_has_neon (void);
+ GGML_API int ggml_cpu_has_arm_fma (void);
+ GGML_API int ggml_cpu_has_f16c (void);
+ GGML_API int ggml_cpu_has_fp16_va (void);
+ GGML_API int ggml_cpu_has_wasm_simd (void);
+ GGML_API int ggml_cpu_has_blas (void);
+ GGML_API int ggml_cpu_has_cublas (void);
+ GGML_API int ggml_cpu_has_clblast (void);
+ GGML_API int ggml_cpu_has_gpublas (void);
+ GGML_API int ggml_cpu_has_sse3 (void);
+ GGML_API int ggml_cpu_has_vsx (void);
+
+ //
+ // Internal types and functions exposed for tests and benchmarks
+ //

  #ifdef __cplusplus
- // restrict not standard in C++
+ // restrict not standard in C++
  #define GGML_RESTRICT
  #else
  #define GGML_RESTRICT restrict
  #endif
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
- typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
- typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
-
- typedef struct {
- dequantize_row_q_t dequantize_row_q;
- quantize_row_q_t quantize_row_q;
- quantize_row_q_t quantize_row_q_reference;
- quantize_row_q_t quantize_row_q_dot;
- vec_dot_q_t vec_dot_q;
- } quantize_fns_t;
-
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+ typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+ typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+ typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+ typedef struct {
+ dequantize_row_q_t dequantize_row_q;
+ quantize_row_q_t quantize_row_q;
+ quantize_row_q_t quantize_row_q_reference;
+ quantize_row_q_t quantize_row_q_dot;
+ vec_dot_q_t vec_dot_q;
+ enum ggml_type vec_dot_type;
+ } quantize_fns_t;
+
+ quantize_fns_t ggml_internal_get_quantize_fn(size_t i);

  #ifdef __cplusplus
  }
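
For reference, here is a minimal, hedged sketch of how the 0.1.0 API surface declared in this header could be exercised. It is not part of the diff; it only uses declarations visible above (ggml_init, ggml_new_tensor_1d, ggml_set_f32, the new ggml_set_name/ggml_get_name pair, ggml_nelements, ggml_nbytes, ggml_free), and the 16 MiB pool size is an arbitrary illustrative value.

/*
 * Minimal usage sketch (not part of the diff): create a context, allocate a
 * small F32 tensor, give it a name via the new ggml_set_name, and print its
 * size before freeing the context.
 */
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16 * 1024 * 1024, // bytes for the memory pool (example value)
        .mem_buffer = NULL,             // let ggml allocate the pool internally
        .no_alloc   = false,
    };

    struct ggml_context * ctx = ggml_init(params);

    // one-dimensional F32 tensor with 8 elements, filled with 1.0f
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_set_f32(t, 1.0f);
    ggml_set_name(t, "ones"); // ggml_set_name/ggml_get_name are new in 0.1.0

    printf("%s: %lld elements, %zu bytes\n",
           ggml_get_name(t), (long long) ggml_nelements(t), ggml_nbytes(t));

    ggml_free(ctx);
    return 0;
}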