llama_cpp 0.0.6 → 0.1.0

@@ -169,14 +169,27 @@
169
169
  //
170
170
  //
171
171
 
172
- #ifdef __cplusplus
173
- extern "C" {
172
+ #ifdef GGML_SHARED
173
+ # if defined(_WIN32) && !defined(__MINGW32__)
174
+ # ifdef GGML_BUILD
175
+ # define GGML_API __declspec(dllexport)
176
+ # else
177
+ # define GGML_API __declspec(dllimport)
178
+ # endif
179
+ # else
180
+ # define GGML_API __attribute__ ((visibility ("default")))
181
+ # endif
182
+ #else
183
+ # define GGML_API
174
184
  #endif
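
The new GGML_SHARED / GGML_BUILD pair controls symbol visibility: when ggml is built as a shared library the exported declarations get __declspec(dllexport) on MSVC (or default ELF visibility elsewhere), while consumers that define only GGML_SHARED import them; static builds leave GGML_API empty. A minimal consumer sketch, with the build flags shown only as illustrative comments (the file names and compiler invocations are assumptions, not part of the gem):

    // consumer.c -- links against a shared libggml.
    //
    //   cc -c ggml.c     -DGGML_SHARED -DGGML_BUILD -fPIC   (library build: GGML_API exports)
    //   cc    consumer.c -DGGML_SHARED -lggml               (consumer build: GGML_API imports)
    //
    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // any GGML_API function resolves through the shared library
        printf("block size of F16: %d\n", ggml_blck_size(GGML_TYPE_F16));
        return 0;
    }
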
175
185
 
176
186
  #include <stdint.h>
177
187
  #include <stddef.h>
178
188
  #include <stdbool.h>
179
189
 
190
+ #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
191
+ #define GGML_FILE_VERSION 1
192
+
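
GGML_FILE_MAGIC is the ASCII string "ggml" packed into a uint32, and GGML_FILE_VERSION versions the plain ggml container. A hedged sketch of the usual header check when opening such a file (the path handling is illustrative; formats layered on top of ggml, such as the llama variants, use their own magics):

    #include <stdio.h>
    #include <stdint.h>
    #include "ggml.h"

    // returns 0 when the file starts with the expected ggml magic and version
    int check_ggml_header(const char * path) {
        FILE * f = fopen(path, "rb");
        if (!f) return -1;

        uint32_t magic = 0, version = 0;
        size_t n  = fread(&magic,   sizeof(magic),   1, f);
               n += fread(&version, sizeof(version), 1, f);
        fclose(f);

        if (n != 2)                       return -1;
        if (magic   != GGML_FILE_MAGIC)   return -1; // 0x67676d6c == "ggml"
        if (version != GGML_FILE_VERSION) return -1;
        return 0;
    }
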
180
193
  #define GGML_MAX_DIMS 4
181
194
  #define GGML_MAX_NODES 4096
182
195
  #define GGML_MAX_PARAMS 16
@@ -184,682 +197,738 @@ extern "C" {
184
197
  #define GGML_MAX_OPT 4
185
198
  #define GGML_DEFAULT_N_THREADS 4
186
199
 
200
+ #define GGML_ASSERT(x) \
201
+ do { \
202
+ if (!(x)) { \
203
+ fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
204
+ abort(); \
205
+ } \
206
+ } while (0)
207
+
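
GGML_ASSERT is an always-on check: unlike assert() it is not removed by NDEBUG, and on failure it prints the file, line and failed expression before calling abort(). A small illustrative use (the bounds check is a made-up example; the including file needs <stdio.h> and <stdlib.h> for the fprintf/abort the macro expands to):

    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"

    float get_element(const struct ggml_tensor * t, int i) {
        GGML_ASSERT(t != NULL);
        GGML_ASSERT(i >= 0 && i < ggml_nelements(t)); // aborts with file:line on violation
        return ggml_get_f32_1d(t, i);
    }
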
208
+ #ifdef __cplusplus
209
+ extern "C" {
210
+ #endif
211
+
187
212
  #ifdef __ARM_NEON
188
- // we use the built-in 16-bit float type
189
- typedef __fp16 ggml_fp16_t;
213
+ // we use the built-in 16-bit float type
214
+ typedef __fp16 ggml_fp16_t;
190
215
  #else
191
- typedef uint16_t ggml_fp16_t;
216
+ typedef uint16_t ggml_fp16_t;
192
217
  #endif
193
218
 
194
- // convert FP16 <-> FP32
195
- float ggml_fp16_to_fp32(ggml_fp16_t x);
196
- ggml_fp16_t ggml_fp32_to_fp16(float x);
197
-
198
- struct ggml_object;
199
- struct ggml_context;
200
-
201
- enum ggml_type {
202
- // explicitly numbered values are used in llama.cpp files
203
- GGML_TYPE_F32 = 0,
204
- GGML_TYPE_F16 = 1,
205
- GGML_TYPE_Q4_0 = 2,
206
- GGML_TYPE_Q4_1 = 3,
207
- GGML_TYPE_Q4_2 = 4,
208
- GGML_TYPE_Q4_3 = 5,
209
- GGML_TYPE_Q8_0 = 6,
210
- GGML_TYPE_I8,
211
- GGML_TYPE_I16,
212
- GGML_TYPE_I32,
213
- GGML_TYPE_COUNT,
214
- };
215
-
216
- // available tensor operations:
217
- enum ggml_op {
218
- GGML_OP_NONE = 0,
219
-
220
- GGML_OP_DUP,
221
- GGML_OP_ADD,
222
- GGML_OP_SUB,
223
- GGML_OP_MUL,
224
- GGML_OP_DIV,
225
- GGML_OP_SQR,
226
- GGML_OP_SQRT,
227
- GGML_OP_SUM,
228
- GGML_OP_MEAN,
229
- GGML_OP_REPEAT,
230
- GGML_OP_ABS,
231
- GGML_OP_SGN,
232
- GGML_OP_NEG,
233
- GGML_OP_STEP,
234
- GGML_OP_RELU,
235
- GGML_OP_GELU,
236
- GGML_OP_SILU,
237
- GGML_OP_NORM, // normalize
238
- GGML_OP_RMS_NORM,
239
-
240
- GGML_OP_MUL_MAT,
241
-
242
- GGML_OP_SCALE,
243
- GGML_OP_CPY,
244
- GGML_OP_CONT,
245
- GGML_OP_RESHAPE,
246
- GGML_OP_VIEW,
247
- GGML_OP_PERMUTE,
248
- GGML_OP_TRANSPOSE,
249
- GGML_OP_GET_ROWS,
250
- GGML_OP_DIAG_MASK_INF,
251
- GGML_OP_SOFT_MAX,
252
- GGML_OP_ROPE,
253
- GGML_OP_CONV_1D_1S,
254
- GGML_OP_CONV_1D_2S,
255
-
256
- GGML_OP_FLASH_ATTN,
257
- GGML_OP_FLASH_FF,
258
-
259
- GGML_OP_MAP_UNARY,
260
- GGML_OP_MAP_BINARY,
261
-
262
- GGML_OP_COUNT,
263
- };
264
-
265
-
266
- // ggml object
267
- struct ggml_object {
268
- size_t offs;
269
- size_t size;
270
-
271
- struct ggml_object * next;
272
-
273
- char padding[8];
274
- };
275
-
276
- static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
277
-
278
- // n-dimensional tensor
279
- struct ggml_tensor {
280
- enum ggml_type type;
281
-
282
- int n_dims;
283
- int64_t ne[GGML_MAX_DIMS]; // number of elements
284
- size_t nb[GGML_MAX_DIMS]; // stride in bytes:
285
- // nb[0] = sizeof(type)
286
- // nb[1] = nb[0] * ne[0] + padding
287
- // nb[i] = nb[i-1] * ne[i-1]
288
-
289
- // compute data
290
- enum ggml_op op;
291
-
292
- bool is_param;
293
-
294
- struct ggml_tensor * grad;
295
- struct ggml_tensor * src0;
296
- struct ggml_tensor * src1;
297
- struct ggml_tensor * opt[GGML_MAX_OPT];
298
-
299
- // thread scheduling
300
- int n_tasks;
301
-
302
- // performance
303
- int perf_runs;
304
- int64_t perf_cycles;
305
- int64_t perf_time_us;
306
-
307
- void * data;
308
- char padding[8];
309
- };
310
-
311
- // computation graph
312
- struct ggml_cgraph {
313
- int n_nodes;
314
- int n_leafs;
315
- int n_threads;
316
-
317
- size_t work_size;
318
- struct ggml_tensor * work;
319
-
320
- struct ggml_tensor * nodes[GGML_MAX_NODES];
321
- struct ggml_tensor * grads[GGML_MAX_NODES];
322
- struct ggml_tensor * leafs[GGML_MAX_NODES];
323
-
324
- // performance
325
- int perf_runs;
326
- int64_t perf_cycles;
327
- int64_t perf_time_us;
328
- };
329
-
330
- // scratch buffer
331
- struct ggml_scratch {
332
- size_t offs;
333
- size_t size;
334
- void * data;
335
- };
219
+ // convert FP16 <-> FP32
220
+ GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
221
+ GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
222
+
223
+ GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
224
+ GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
225
+
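
Alongside the scalar FP16 converters there are now row (bulk) variants. A short round-trip sketch (the buffer contents are arbitrary; 65504 is the largest finite FP16 value):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        float       src[4]  = { 0.5f, -1.0f, 3.25f, 65504.0f };
        ggml_fp16_t half[4];
        float       back[4];

        ggml_fp32_to_fp16_row(src,  half, 4); // bulk FP32 -> FP16
        ggml_fp16_to_fp32_row(half, back, 4); // bulk FP16 -> FP32

        for (int i = 0; i < 4; ++i) {
            printf("%g -> %g\n", src[i], back[i]);
        }
        return 0;
    }
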
226
+ struct ggml_object;
227
+ struct ggml_context;
228
+
229
+ enum ggml_type {
230
+ GGML_TYPE_F32 = 0,
231
+ GGML_TYPE_F16 = 1,
232
+ GGML_TYPE_Q4_0 = 2,
233
+ GGML_TYPE_Q4_1 = 3,
234
+ GGML_TYPE_Q4_2 = 4,
235
+ // GGML_TYPE_Q4_3 (5) support has been removed
236
+ GGML_TYPE_Q5_0 = 6,
237
+ GGML_TYPE_Q5_1 = 7,
238
+ GGML_TYPE_Q8_0 = 8,
239
+ GGML_TYPE_Q8_1 = 9,
240
+ GGML_TYPE_I8,
241
+ GGML_TYPE_I16,
242
+ GGML_TYPE_I32,
243
+ GGML_TYPE_COUNT,
244
+ };
245
+
246
+ // model file types
247
+ enum ggml_ftype {
248
+ GGML_FTYPE_UNKNOWN = -1,
249
+ GGML_FTYPE_ALL_F32 = 0,
250
+ GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
251
+ GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
252
+ GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
253
+ GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
254
+ GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
255
+ GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
256
+ GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
257
+ GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
258
+ };
259
+
260
+ // available tensor operations:
261
+ enum ggml_op {
262
+ GGML_OP_NONE = 0,
263
+
264
+ GGML_OP_DUP,
265
+ GGML_OP_ADD,
266
+ GGML_OP_SUB,
267
+ GGML_OP_MUL,
268
+ GGML_OP_DIV,
269
+ GGML_OP_SQR,
270
+ GGML_OP_SQRT,
271
+ GGML_OP_SUM,
272
+ GGML_OP_MEAN,
273
+ GGML_OP_REPEAT,
274
+ GGML_OP_ABS,
275
+ GGML_OP_SGN,
276
+ GGML_OP_NEG,
277
+ GGML_OP_STEP,
278
+ GGML_OP_RELU,
279
+ GGML_OP_GELU,
280
+ GGML_OP_SILU,
281
+ GGML_OP_NORM, // normalize
282
+ GGML_OP_RMS_NORM,
283
+
284
+ GGML_OP_MUL_MAT,
285
+
286
+ GGML_OP_SCALE,
287
+ GGML_OP_CPY,
288
+ GGML_OP_CONT,
289
+ GGML_OP_RESHAPE,
290
+ GGML_OP_VIEW,
291
+ GGML_OP_PERMUTE,
292
+ GGML_OP_TRANSPOSE,
293
+ GGML_OP_GET_ROWS,
294
+ GGML_OP_DIAG_MASK_INF,
295
+ GGML_OP_SOFT_MAX,
296
+ GGML_OP_ROPE,
297
+ GGML_OP_ALIBI,
298
+ GGML_OP_CONV_1D_1S,
299
+ GGML_OP_CONV_1D_2S,
300
+
301
+ GGML_OP_FLASH_ATTN,
302
+ GGML_OP_FLASH_FF,
303
+
304
+ GGML_OP_MAP_UNARY,
305
+ GGML_OP_MAP_BINARY,
306
+
307
+ GGML_OP_COUNT,
308
+ };
309
+
310
+
311
+ // ggml object
312
+ struct ggml_object {
313
+ size_t offs;
314
+ size_t size;
315
+
316
+ struct ggml_object * next;
317
+
318
+ char padding[8];
319
+ };
320
+
321
+ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
322
+
323
+ // n-dimensional tensor
324
+ struct ggml_tensor {
325
+ enum ggml_type type;
326
+
327
+ int n_dims;
328
+ int64_t ne[GGML_MAX_DIMS]; // number of elements
329
+ size_t nb[GGML_MAX_DIMS]; // stride in bytes:
330
+ // nb[0] = sizeof(type)
331
+ // nb[1] = nb[0] * ne[0] + padding
332
+ // nb[i] = nb[i-1] * ne[i-1]
333
+
334
+ // compute data
335
+ enum ggml_op op;
336
+
337
+ bool is_param;
338
+
339
+ struct ggml_tensor * grad;
340
+ struct ggml_tensor * src0;
341
+ struct ggml_tensor * src1;
342
+ struct ggml_tensor * opt[GGML_MAX_OPT];
343
+
344
+ // thread scheduling
345
+ int n_tasks;
346
+
347
+ // performance
348
+ int perf_runs;
349
+ int64_t perf_cycles;
350
+ int64_t perf_time_us;
351
+
352
+ void * data;
353
+
354
+ char name[32];
355
+
356
+ char padding[8]; // TODO: remove and add padding to name?
357
+ };
358
+
359
+ // computation graph
360
+ struct ggml_cgraph {
361
+ int n_nodes;
362
+ int n_leafs;
363
+ int n_threads;
364
+
365
+ size_t work_size;
366
+ struct ggml_tensor * work;
367
+
368
+ struct ggml_tensor * nodes[GGML_MAX_NODES];
369
+ struct ggml_tensor * grads[GGML_MAX_NODES];
370
+ struct ggml_tensor * leafs[GGML_MAX_NODES];
371
+
372
+ // performance
373
+ int perf_runs;
374
+ int64_t perf_cycles;
375
+ int64_t perf_time_us;
376
+ };
336
377
 
337
- struct ggml_init_params {
338
- // memory pool
339
- size_t mem_size; // bytes
340
- void * mem_buffer; // if NULL, memory will be allocated internally
341
- bool no_alloc; // don't allocate memory for the tensor data
342
- };
378
+ // scratch buffer
379
+ struct ggml_scratch {
380
+ size_t offs;
381
+ size_t size;
382
+ void * data;
383
+ };
343
384
 
344
- void ggml_time_init(void); // call this once at the beginning of the program
345
- int64_t ggml_time_ms(void);
346
- int64_t ggml_time_us(void);
347
- int64_t ggml_cycles(void);
348
- int64_t ggml_cycles_per_ms(void);
385
+ struct ggml_init_params {
386
+ // memory pool
387
+ size_t mem_size; // bytes
388
+ void * mem_buffer; // if NULL, memory will be allocated internally
389
+ bool no_alloc; // don't allocate memory for the tensor data
390
+ };
349
391
 
350
- void ggml_print_object (const struct ggml_object * obj);
351
- void ggml_print_objects(const struct ggml_context * ctx);
392
+ // misc
352
393
 
353
- int64_t ggml_nelements(const struct ggml_tensor * tensor);
354
- size_t ggml_nbytes (const struct ggml_tensor * tensor);
394
+ GGML_API void ggml_time_init(void); // call this once at the beginning of the program
395
+ GGML_API int64_t ggml_time_ms(void);
396
+ GGML_API int64_t ggml_time_us(void);
397
+ GGML_API int64_t ggml_cycles(void);
398
+ GGML_API int64_t ggml_cycles_per_ms(void);
355
399
 
356
- int ggml_blck_size (enum ggml_type type);
357
- size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
358
- float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
400
+ GGML_API void ggml_print_object (const struct ggml_object * obj);
401
+ GGML_API void ggml_print_objects(const struct ggml_context * ctx);
359
402
 
360
- const char * ggml_type_name(enum ggml_type type);
403
+ GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
404
+ GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
361
405
 
362
- size_t ggml_element_size(const struct ggml_tensor * tensor);
406
+ GGML_API int ggml_blck_size (enum ggml_type type);
407
+ GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
408
+ GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
363
409
 
364
- bool ggml_is_quantized(enum ggml_type type);
410
+ GGML_API const char * ggml_type_name(enum ggml_type type);
365
411
 
366
- struct ggml_context * ggml_init(struct ggml_init_params params);
367
- void ggml_free(struct ggml_context * ctx);
412
+ GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
368
413
 
369
- size_t ggml_used_mem(const struct ggml_context * ctx);
414
+ GGML_API bool ggml_is_quantized(enum ggml_type type);
370
415
 
371
- size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
416
+ // TODO: temporary until model loading of ggml examples is refactored
417
+ GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
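
ggml_ftype describes how a whole model file is quantized, while ggml_type describes a single tensor; ggml_ftype_to_ggml_type maps the former to the tensor type used for the bulk of the weights. A small introspection sketch (the chosen ftype is arbitrary):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q5_1;        // e.g. a "mostly Q5_1" file
        enum ggml_type  type  = ggml_ftype_to_ggml_type(ftype);

        printf("tensor type    : %s\n",  ggml_type_name(type));
        printf("block size     : %d\n",  ggml_blck_size(type));
        printf("bytes per block: %zu\n", ggml_type_size(type));
        printf("quantized      : %s\n",  ggml_is_quantized(type) ? "yes" : "no");
        return 0;
    }
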
372
418
 
373
- struct ggml_tensor * ggml_new_tensor(
374
- struct ggml_context * ctx,
375
- enum ggml_type type,
376
- int n_dims,
377
- const int64_t *ne);
378
-
379
- struct ggml_tensor * ggml_new_tensor_1d(
380
- struct ggml_context * ctx,
381
- enum ggml_type type,
382
- int64_t ne0);
383
-
384
- struct ggml_tensor * ggml_new_tensor_2d(
385
- struct ggml_context * ctx,
386
- enum ggml_type type,
387
- int64_t ne0,
388
- int64_t ne1);
389
-
390
- struct ggml_tensor * ggml_new_tensor_3d(
391
- struct ggml_context * ctx,
392
- enum ggml_type type,
393
- int64_t ne0,
394
- int64_t ne1,
395
- int64_t ne2);
396
-
397
- struct ggml_tensor * ggml_new_tensor_4d(
398
- struct ggml_context * ctx,
399
- enum ggml_type type,
400
- int64_t ne0,
401
- int64_t ne1,
402
- int64_t ne2,
403
- int64_t ne3);
404
-
405
- struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
406
- struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
407
-
408
- struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
409
- struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
410
-
411
- struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
412
- struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
413
- struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
414
-
415
- int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
416
- void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
417
-
418
- float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
419
- void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
420
-
421
- void * ggml_get_data (const struct ggml_tensor * tensor);
422
- float * ggml_get_data_f32(const struct ggml_tensor * tensor);
423
-
424
- //
425
- // operations on tensors with backpropagation
426
- //
427
-
428
- struct ggml_tensor * ggml_dup(
429
- struct ggml_context * ctx,
430
- struct ggml_tensor * a);
431
-
432
- struct ggml_tensor * ggml_add(
433
- struct ggml_context * ctx,
434
- struct ggml_tensor * a,
435
- struct ggml_tensor * b);
419
+ // main
436
420
 
421
+ GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
422
+ GGML_API void ggml_free(struct ggml_context * ctx);
437
423
 
438
- struct ggml_tensor * ggml_add_inplace(
439
- struct ggml_context * ctx,
440
- struct ggml_tensor * a,
441
- struct ggml_tensor * b);
424
+ GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
442
425
 
443
- struct ggml_tensor * ggml_sub(
444
- struct ggml_context * ctx,
445
- struct ggml_tensor * a,
446
- struct ggml_tensor * b);
426
+ GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
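
Every tensor is carved out of the fixed memory pool handed to ggml_init; ggml_used_mem reports how much of that pool has been consumed so far. A minimal sketch (the 16 MB pool and the tensor size are arbitrary choices):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            .mem_size   = 16*1024*1024, // 16 MB pool, allocated internally since mem_buffer is NULL
            .mem_buffer = NULL,
            .no_alloc   = false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        (void) t;

        printf("used: %zu of %zu bytes\n", ggml_used_mem(ctx), params.mem_size);

        ggml_free(ctx);
        return 0;
    }
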
447
427
 
448
- struct ggml_tensor * ggml_mul(
449
- struct ggml_context * ctx,
450
- struct ggml_tensor * a,
451
- struct ggml_tensor * b);
428
+ GGML_API struct ggml_tensor * ggml_new_tensor(
429
+ struct ggml_context * ctx,
430
+ enum ggml_type type,
431
+ int n_dims,
432
+ const int64_t *ne);
452
433
 
453
- struct ggml_tensor * ggml_div(
454
- struct ggml_context * ctx,
455
- struct ggml_tensor * a,
456
- struct ggml_tensor * b);
457
-
458
- struct ggml_tensor * ggml_sqr(
459
- struct ggml_context * ctx,
460
- struct ggml_tensor * a);
461
-
462
- struct ggml_tensor * ggml_sqrt(
463
- struct ggml_context * ctx,
464
- struct ggml_tensor * a);
465
-
466
- // return scalar
467
- // TODO: compute sum along rows
468
- struct ggml_tensor * ggml_sum(
469
- struct ggml_context * ctx,
470
- struct ggml_tensor * a);
471
-
472
- // mean along rows
473
- struct ggml_tensor * ggml_mean(
474
- struct ggml_context * ctx,
475
- struct ggml_tensor * a);
476
-
477
- // if a is the same shape as b, and a is not parameter, return a
478
- // otherwise, return a new tensor: repeat(a) to fit in b
479
- struct ggml_tensor * ggml_repeat(
480
- struct ggml_context * ctx,
481
- struct ggml_tensor * a,
482
- struct ggml_tensor * b);
483
-
484
- struct ggml_tensor * ggml_abs(
485
- struct ggml_context * ctx,
486
- struct ggml_tensor * a);
487
-
488
- struct ggml_tensor * ggml_sgn(
489
- struct ggml_context * ctx,
490
- struct ggml_tensor * a);
491
-
492
- struct ggml_tensor * ggml_neg(
493
- struct ggml_context * ctx,
494
- struct ggml_tensor * a);
495
-
496
- struct ggml_tensor * ggml_step(
497
- struct ggml_context * ctx,
498
- struct ggml_tensor * a);
499
-
500
- struct ggml_tensor * ggml_relu(
501
- struct ggml_context * ctx,
502
- struct ggml_tensor * a);
503
-
504
- // TODO: double-check this computation is correct
505
- struct ggml_tensor * ggml_gelu(
506
- struct ggml_context * ctx,
507
- struct ggml_tensor * a);
508
-
509
- struct ggml_tensor * ggml_silu(
510
- struct ggml_context * ctx,
511
- struct ggml_tensor * a);
512
-
513
- // normalize along rows
514
- // TODO: eps is hardcoded to 1e-5 for now
515
- struct ggml_tensor * ggml_norm(
516
- struct ggml_context * ctx,
517
- struct ggml_tensor * a);
518
-
519
- struct ggml_tensor * ggml_rms_norm(
520
- struct ggml_context * ctx,
521
- struct ggml_tensor * a);
522
-
523
- // A: m rows, n columns
524
- // B: p rows, n columns (i.e. we transpose it internally)
525
- // result is m columns, p rows
526
- struct ggml_tensor * ggml_mul_mat(
527
- struct ggml_context * ctx,
528
- struct ggml_tensor * a,
529
- struct ggml_tensor * b);
530
-
531
- //
532
- // operations on tensors without backpropagation
533
- //
534
-
535
- // in-place, returns view(a)
536
- struct ggml_tensor * ggml_scale(
537
- struct ggml_context * ctx,
538
- struct ggml_tensor * a,
539
- struct ggml_tensor * b);
540
-
541
- // a -> b, return view(b)
542
- struct ggml_tensor * ggml_cpy(
543
- struct ggml_context * ctx,
544
- struct ggml_tensor * a,
545
- struct ggml_tensor * b);
546
-
547
- // make contiguous
548
- struct ggml_tensor * ggml_cont(
549
- struct ggml_context * ctx,
550
- struct ggml_tensor * a);
551
-
552
- // return view(a), b specifies the new shape
553
- // TODO: when we start computing gradient, make a copy instead of view
554
- struct ggml_tensor * ggml_reshape(
555
- struct ggml_context * ctx,
556
- struct ggml_tensor * a,
557
- struct ggml_tensor * b);
558
-
559
- // return view(a)
560
- // TODO: when we start computing gradient, make a copy instead of view
561
- struct ggml_tensor * ggml_reshape_2d(
562
- struct ggml_context * ctx,
563
- struct ggml_tensor * a,
564
- int64_t ne0,
565
- int64_t ne1);
566
-
567
- // return view(a)
568
- // TODO: when we start computing gradient, make a copy instead of view
569
- struct ggml_tensor * ggml_reshape_3d(
570
- struct ggml_context * ctx,
571
- struct ggml_tensor * a,
572
- int64_t ne0,
573
- int64_t ne1,
574
- int64_t ne2);
575
-
576
- // offset in bytes
577
- struct ggml_tensor * ggml_view_1d(
578
- struct ggml_context * ctx,
579
- struct ggml_tensor * a,
580
- int64_t ne0,
581
- size_t offset);
582
-
583
- struct ggml_tensor * ggml_view_2d(
584
- struct ggml_context * ctx,
585
- struct ggml_tensor * a,
586
- int64_t ne0,
587
- int64_t ne1,
588
- size_t nb1, // row stride in bytes
589
- size_t offset);
590
-
591
- struct ggml_tensor * ggml_view_3d(
592
- struct ggml_context * ctx,
593
- struct ggml_tensor * a,
594
- int64_t ne0,
595
- int64_t ne1,
596
- int64_t ne2,
597
- size_t nb1, // row stride in bytes
598
- size_t nb2, // slice stride in bytes
599
- size_t offset);
600
-
601
- struct ggml_tensor * ggml_permute(
602
- struct ggml_context * ctx,
603
- struct ggml_tensor * a,
604
- int axis0,
605
- int axis1,
606
- int axis2,
607
- int axis3);
608
-
609
- // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
610
- struct ggml_tensor * ggml_transpose(
611
- struct ggml_context * ctx,
612
- struct ggml_tensor * a);
613
-
614
- struct ggml_tensor * ggml_get_rows(
615
- struct ggml_context * ctx,
616
- struct ggml_tensor * a,
617
- struct ggml_tensor * b);
618
-
619
- // set elements above the diagonal to -INF
620
- // in-place, returns view(a)
621
- struct ggml_tensor * ggml_diag_mask_inf(
622
- struct ggml_context * ctx,
623
- struct ggml_tensor * a,
624
- int n_past);
625
-
626
- // in-place, returns view(a)
627
- struct ggml_tensor * ggml_soft_max(
628
- struct ggml_context * ctx,
629
- struct ggml_tensor * a);
630
-
631
- // rotary position embedding
632
- // in-place, returns view(a)
633
- // if mode & 1 == 1, skip n_past elements
634
- // if mode & 2 == 1, GPT-NeoX style
635
- // TODO: avoid creating a new tensor every time
636
- struct ggml_tensor * ggml_rope(
637
- struct ggml_context * ctx,
638
- struct ggml_tensor * a,
639
- int n_past,
640
- int n_dims,
641
- int mode);
642
-
643
- // padding = 1
644
- // TODO: we don't support extra parameters for now
645
- // that's why we are hard-coding the stride, padding, and dilation
646
- // not great ..
647
- struct ggml_tensor * ggml_conv_1d_1s(
648
- struct ggml_context * ctx,
649
- struct ggml_tensor * a,
650
- struct ggml_tensor * b);
651
-
652
- struct ggml_tensor * ggml_conv_1d_2s(
653
- struct ggml_context * ctx,
654
- struct ggml_tensor * a,
655
- struct ggml_tensor * b);
656
-
657
- struct ggml_tensor * ggml_flash_attn(
658
- struct ggml_context * ctx,
659
- struct ggml_tensor * q,
660
- struct ggml_tensor * k,
661
- struct ggml_tensor * v,
662
- bool masked);
663
-
664
- struct ggml_tensor * ggml_flash_ff(
665
- struct ggml_context * ctx,
666
- struct ggml_tensor * a,
667
- struct ggml_tensor * b0,
668
- struct ggml_tensor * b1,
669
- struct ggml_tensor * c0,
670
- struct ggml_tensor * c1);
671
-
672
- // Mapping operations
673
- typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
674
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
675
-
676
- struct ggml_tensor * ggml_map_unary_f32(
677
- struct ggml_context * ctx,
678
- struct ggml_tensor * a,
679
- const ggml_unary_op_f32_t fun);
680
-
681
- struct ggml_tensor * ggml_map_binary_f32(
682
- struct ggml_context * ctx,
683
- struct ggml_tensor * a,
684
- struct ggml_tensor * b,
685
- const ggml_binary_op_f32_t fun);
686
-
687
- //
688
- // automatic differentiation
689
- //
690
-
691
- void ggml_set_param(
692
- struct ggml_context * ctx,
693
- struct ggml_tensor * tensor);
694
-
695
- void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
696
-
697
- struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
698
- struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
699
-
700
- void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
701
- void ggml_graph_reset (struct ggml_cgraph * cgraph);
702
-
703
- // print info and performance information for the graph
704
- void ggml_graph_print(const struct ggml_cgraph * cgraph);
705
-
706
- // dump the graph into a file using the dot format
707
- void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
708
-
709
- //
710
- // optimization
711
- //
712
-
713
- // optimization methods
714
- enum ggml_opt_type {
715
- GGML_OPT_ADAM,
716
- GGML_OPT_LBFGS,
717
- };
718
-
719
- // linesearch methods
720
- enum ggml_linesearch {
721
- GGML_LINESEARCH_DEFAULT = 1,
722
-
723
- GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
724
- GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
725
- GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
726
- };
727
-
728
- // optimization return values
729
- enum ggml_opt_result {
730
- GGML_OPT_OK = 0,
731
- GGML_OPT_DID_NOT_CONVERGE,
732
- GGML_OPT_NO_CONTEXT,
733
- GGML_OPT_INVALID_WOLFE,
734
- GGML_OPT_FAIL,
434
+ GGML_API struct ggml_tensor * ggml_new_tensor_1d(
435
+ struct ggml_context * ctx,
436
+ enum ggml_type type,
437
+ int64_t ne0);
735
438
 
736
- GGML_LINESEARCH_FAIL = -128,
737
- GGML_LINESEARCH_MINIMUM_STEP,
738
- GGML_LINESEARCH_MAXIMUM_STEP,
739
- GGML_LINESEARCH_MAXIMUM_ITERATIONS,
740
- GGML_LINESEARCH_INVALID_PARAMETERS,
741
- };
439
+ GGML_API struct ggml_tensor * ggml_new_tensor_2d(
440
+ struct ggml_context * ctx,
441
+ enum ggml_type type,
442
+ int64_t ne0,
443
+ int64_t ne1);
742
444
 
743
- // optimization parameters
744
- //
745
- // see ggml.c (ggml_opt_default_params) for default values
746
- //
747
- struct ggml_opt_params {
748
- enum ggml_opt_type type;
445
+ GGML_API struct ggml_tensor * ggml_new_tensor_3d(
446
+ struct ggml_context * ctx,
447
+ enum ggml_type type,
448
+ int64_t ne0,
449
+ int64_t ne1,
450
+ int64_t ne2);
451
+
452
+ GGML_API struct ggml_tensor * ggml_new_tensor_4d(
453
+ struct ggml_context * ctx,
454
+ enum ggml_type type,
455
+ int64_t ne0,
456
+ int64_t ne1,
457
+ int64_t ne2,
458
+ int64_t ne3);
459
+
460
+ GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
461
+ GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
462
+
463
+ GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
464
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
465
+
466
+ GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
467
+ GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
468
+ GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
749
469
 
750
- int n_threads;
470
+ GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
471
+ GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
472
+
473
+ GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
474
+ GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
475
+
476
+ GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
477
+ GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
478
+
479
+ GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
480
+ GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name);
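
ggml_set_name / ggml_get_name attach a label to a tensor (stored in the new 32-byte name field), which helps when printing or dumping graphs. A short sketch (the tensor shape and the label are illustrative):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { .mem_size = 1024*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 8);
        ggml_set_name(w, "attn_q.weight");            // keep labels shorter than 32 bytes
        printf("tensor name: %s\n", ggml_get_name(w));

        ggml_free(ctx);
        return 0;
    }
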
751
481
 
752
- // delta-based convergence test
753
482
  //
754
- // if past == 0 - disabled
755
- // if past > 0:
756
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
483
+ // operations on tensors with backpropagation
757
484
  //
758
- int past;
759
- float delta;
760
485
 
761
- // maximum number of iterations without improvement
486
+ GGML_API struct ggml_tensor * ggml_dup(
487
+ struct ggml_context * ctx,
488
+ struct ggml_tensor * a);
489
+
490
+ GGML_API struct ggml_tensor * ggml_add(
491
+ struct ggml_context * ctx,
492
+ struct ggml_tensor * a,
493
+ struct ggml_tensor * b);
494
+
495
+ GGML_API struct ggml_tensor * ggml_add_inplace(
496
+ struct ggml_context * ctx,
497
+ struct ggml_tensor * a,
498
+ struct ggml_tensor * b);
499
+
500
+ GGML_API struct ggml_tensor * ggml_sub(
501
+ struct ggml_context * ctx,
502
+ struct ggml_tensor * a,
503
+ struct ggml_tensor * b);
504
+
505
+ GGML_API struct ggml_tensor * ggml_mul(
506
+ struct ggml_context * ctx,
507
+ struct ggml_tensor * a,
508
+ struct ggml_tensor * b);
509
+
510
+ GGML_API struct ggml_tensor * ggml_div(
511
+ struct ggml_context * ctx,
512
+ struct ggml_tensor * a,
513
+ struct ggml_tensor * b);
514
+
515
+ GGML_API struct ggml_tensor * ggml_sqr(
516
+ struct ggml_context * ctx,
517
+ struct ggml_tensor * a);
518
+
519
+ GGML_API struct ggml_tensor * ggml_sqrt(
520
+ struct ggml_context * ctx,
521
+ struct ggml_tensor * a);
522
+
523
+ // return scalar
524
+ // TODO: compute sum along rows
525
+ GGML_API struct ggml_tensor * ggml_sum(
526
+ struct ggml_context * ctx,
527
+ struct ggml_tensor * a);
528
+
529
+ // mean along rows
530
+ GGML_API struct ggml_tensor * ggml_mean(
531
+ struct ggml_context * ctx,
532
+ struct ggml_tensor * a);
533
+
534
+ // if a is the same shape as b, and a is not parameter, return a
535
+ // otherwise, return a new tensor: repeat(a) to fit in b
536
+ GGML_API struct ggml_tensor * ggml_repeat(
537
+ struct ggml_context * ctx,
538
+ struct ggml_tensor * a,
539
+ struct ggml_tensor * b);
540
+
541
+ GGML_API struct ggml_tensor * ggml_abs(
542
+ struct ggml_context * ctx,
543
+ struct ggml_tensor * a);
544
+
545
+ GGML_API struct ggml_tensor * ggml_sgn(
546
+ struct ggml_context * ctx,
547
+ struct ggml_tensor * a);
548
+
549
+ GGML_API struct ggml_tensor * ggml_neg(
550
+ struct ggml_context * ctx,
551
+ struct ggml_tensor * a);
552
+
553
+ GGML_API struct ggml_tensor * ggml_step(
554
+ struct ggml_context * ctx,
555
+ struct ggml_tensor * a);
556
+
557
+ GGML_API struct ggml_tensor * ggml_relu(
558
+ struct ggml_context * ctx,
559
+ struct ggml_tensor * a);
560
+
561
+ // TODO: double-check this computation is correct
562
+ GGML_API struct ggml_tensor * ggml_gelu(
563
+ struct ggml_context * ctx,
564
+ struct ggml_tensor * a);
565
+
566
+ GGML_API struct ggml_tensor * ggml_silu(
567
+ struct ggml_context * ctx,
568
+ struct ggml_tensor * a);
569
+
570
+ // normalize along rows
571
+ // TODO: eps is hardcoded to 1e-5 for now
572
+ GGML_API struct ggml_tensor * ggml_norm(
573
+ struct ggml_context * ctx,
574
+ struct ggml_tensor * a);
575
+
576
+ GGML_API struct ggml_tensor * ggml_rms_norm(
577
+ struct ggml_context * ctx,
578
+ struct ggml_tensor * a);
579
+
580
+ // A: m rows, n columns
581
+ // B: p rows, n columns (i.e. we transpose it internally)
582
+ // result is m columns, p rows
583
+ GGML_API struct ggml_tensor * ggml_mul_mat(
584
+ struct ggml_context * ctx,
585
+ struct ggml_tensor * a,
586
+ struct ggml_tensor * b);
587
+
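
The ggml_mul_mat comment is easiest to read in terms of ne[]: with a->ne = {n, m} (rows of length n, m of them) and b->ne = {n, p}, the result has ne = {m, p}, i.e. b is multiplied by a transposed. A shape-only sketch (the sizes are arbitrary):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2); // ne = {n, m} = {3, 2}
        struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 4); // ne = {n, p} = {3, 4}

        struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);                      // ne = {m, p} = {2, 4}

        printf("c: %lld x %lld\n", (long long) c->ne[0], (long long) c->ne[1]);

        ggml_free(ctx);
        return 0;
    }
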
762
588
  //
763
- // if 0 - disabled
764
- // if > 0:
765
- // assume convergence if no cost improvement in this number of iterations
589
+ // operations on tensors without backpropagation
766
590
  //
767
- int max_no_improvement;
768
591
 
769
- bool print_forward_graph;
770
- bool print_backward_graph;
592
+ // in-place, returns view(a)
593
+ GGML_API struct ggml_tensor * ggml_scale(
594
+ struct ggml_context * ctx,
595
+ struct ggml_tensor * a,
596
+ struct ggml_tensor * b);
597
+
598
+ // a -> b, return view(b)
599
+ GGML_API struct ggml_tensor * ggml_cpy(
600
+ struct ggml_context * ctx,
601
+ struct ggml_tensor * a,
602
+ struct ggml_tensor * b);
603
+
604
+ // make contiguous
605
+ GGML_API struct ggml_tensor * ggml_cont(
606
+ struct ggml_context * ctx,
607
+ struct ggml_tensor * a);
608
+
609
+ // return view(a), b specifies the new shape
610
+ // TODO: when we start computing gradient, make a copy instead of view
611
+ GGML_API struct ggml_tensor * ggml_reshape(
612
+ struct ggml_context * ctx,
613
+ struct ggml_tensor * a,
614
+ struct ggml_tensor * b);
615
+
616
+ // return view(a)
617
+ // TODO: when we start computing gradient, make a copy instead of view
618
+ GGML_API struct ggml_tensor * ggml_reshape_2d(
619
+ struct ggml_context * ctx,
620
+ struct ggml_tensor * a,
621
+ int64_t ne0,
622
+ int64_t ne1);
623
+
624
+ // return view(a)
625
+ // TODO: when we start computing gradient, make a copy instead of view
626
+ GGML_API struct ggml_tensor * ggml_reshape_3d(
627
+ struct ggml_context * ctx,
628
+ struct ggml_tensor * a,
629
+ int64_t ne0,
630
+ int64_t ne1,
631
+ int64_t ne2);
632
+
633
+ // offset in bytes
634
+ GGML_API struct ggml_tensor * ggml_view_1d(
635
+ struct ggml_context * ctx,
636
+ struct ggml_tensor * a,
637
+ int64_t ne0,
638
+ size_t offset);
639
+
640
+ GGML_API struct ggml_tensor * ggml_view_2d(
641
+ struct ggml_context * ctx,
642
+ struct ggml_tensor * a,
643
+ int64_t ne0,
644
+ int64_t ne1,
645
+ size_t nb1, // row stride in bytes
646
+ size_t offset);
647
+
648
+ GGML_API struct ggml_tensor * ggml_view_3d(
649
+ struct ggml_context * ctx,
650
+ struct ggml_tensor * a,
651
+ int64_t ne0,
652
+ int64_t ne1,
653
+ int64_t ne2,
654
+ size_t nb1, // row stride in bytes
655
+ size_t nb2, // slice stride in bytes
656
+ size_t offset);
657
+
658
+ GGML_API struct ggml_tensor * ggml_permute(
659
+ struct ggml_context * ctx,
660
+ struct ggml_tensor * a,
661
+ int axis0,
662
+ int axis1,
663
+ int axis2,
664
+ int axis3);
665
+
666
+ // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
667
+ GGML_API struct ggml_tensor * ggml_transpose(
668
+ struct ggml_context * ctx,
669
+ struct ggml_tensor * a);
670
+
671
+ GGML_API struct ggml_tensor * ggml_get_rows(
672
+ struct ggml_context * ctx,
673
+ struct ggml_tensor * a,
674
+ struct ggml_tensor * b);
675
+
676
+ // set elements above the diagonal to -INF
677
+ // in-place, returns view(a)
678
+ GGML_API struct ggml_tensor * ggml_diag_mask_inf(
679
+ struct ggml_context * ctx,
680
+ struct ggml_tensor * a,
681
+ int n_past);
682
+
683
+ // in-place, returns view(a)
684
+ GGML_API struct ggml_tensor * ggml_soft_max(
685
+ struct ggml_context * ctx,
686
+ struct ggml_tensor * a);
687
+
688
+ // rotary position embedding
689
+ // in-place, returns view(a)
690
+ // if mode & 1 == 1, skip n_past elements
691
+ // if mode & 2 == 1, GPT-NeoX style
692
+ // TODO: avoid creating a new tensor every time
693
+ GGML_API struct ggml_tensor * ggml_rope(
694
+ struct ggml_context * ctx,
695
+ struct ggml_tensor * a,
696
+ int n_past,
697
+ int n_dims,
698
+ int mode);
699
+
700
+ // alibi position embedding
701
+ // in-place, returns view(a)
702
+ struct ggml_tensor * ggml_alibi(
703
+ struct ggml_context * ctx,
704
+ struct ggml_tensor * a,
705
+ int n_past,
706
+ int n_head);
707
+
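
ggml_alibi is new in this release; like ggml_rope it rewrites attention scores in place and returns a view. A hedged sketch of where it would sit in an attention graph (the KQ tensor shape and the n_past / n_head values are placeholders, not something this header prescribes):

    #include "ggml.h"

    // assumes: ctx comes from ggml_init, and kq holds the raw per-head attention scores
    struct ggml_tensor * apply_alibi(struct ggml_context * ctx,
                                     struct ggml_tensor  * kq,
                                     int n_past, int n_head) {
        // adds the per-head ALiBi linear bias in place and returns a view of kq
        return ggml_alibi(ctx, kq, n_past, n_head);
    }
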
708
+ // padding = 1
709
+ // TODO: we don't support extra parameters for now
710
+ // that's why we are hard-coding the stride, padding, and dilation
711
+ // not great ..
712
+ GGML_API struct ggml_tensor * ggml_conv_1d_1s(
713
+ struct ggml_context * ctx,
714
+ struct ggml_tensor * a,
715
+ struct ggml_tensor * b);
716
+
717
+ GGML_API struct ggml_tensor * ggml_conv_1d_2s(
718
+ struct ggml_context * ctx,
719
+ struct ggml_tensor * a,
720
+ struct ggml_tensor * b);
721
+
722
+ GGML_API struct ggml_tensor * ggml_flash_attn(
723
+ struct ggml_context * ctx,
724
+ struct ggml_tensor * q,
725
+ struct ggml_tensor * k,
726
+ struct ggml_tensor * v,
727
+ bool masked);
728
+
729
+ GGML_API struct ggml_tensor * ggml_flash_ff(
730
+ struct ggml_context * ctx,
731
+ struct ggml_tensor * a,
732
+ struct ggml_tensor * b0,
733
+ struct ggml_tensor * b1,
734
+ struct ggml_tensor * c0,
735
+ struct ggml_tensor * c1);
736
+
737
+ // Mapping operations
738
+ typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
739
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
740
+
741
+ GGML_API struct ggml_tensor * ggml_map_unary_f32(
742
+ struct ggml_context * ctx,
743
+ struct ggml_tensor * a,
744
+ const ggml_unary_op_f32_t fun);
745
+
746
+ GGML_API struct ggml_tensor * ggml_map_binary_f32(
747
+ struct ggml_context * ctx,
748
+ struct ggml_tensor * a,
749
+ struct ggml_tensor * b,
750
+ const ggml_binary_op_f32_t fun);
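
The mapping operations let a custom element-wise f32 kernel participate in the graph without adding a new ggml_op; the callback receives (n, dst, src). A sketch wiring in a hand-written leaky ReLU (the function name and the 0.1 slope are illustrative):

    #include <stdio.h>
    #include "ggml.h"

    // custom element-wise kernel: dst[i] = src[i] > 0 ? src[i] : 0.1*src[i]
    void leaky_relu_f32(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = src[i] > 0.0f ? src[i] : 0.1f*src[i];
        }
    }

    int main(void) {
        struct ggml_init_params params = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_f32(x, -2.0f);

        struct ggml_tensor * y = ggml_map_unary_f32(ctx, x, leaky_relu_f32);

        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_graph_compute(ctx, &gf);

        printf("y[0] = %f\n", ggml_get_f32_1d(y, 0)); // -0.2

        ggml_free(ctx);
        return 0;
    }
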
771
751
 
772
- // ADAM parameters
773
- struct {
774
- int n_iter;
752
+ //
753
+ // automatic differentiation
754
+ //
775
755
 
776
- float alpha; // learning rate
777
- float beta1;
778
- float beta2;
779
- float eps; // epsilon for numerical stability
780
- float eps_f; // epsilon for convergence test
781
- float eps_g; // epsilon for convergence test
782
- } adam;
756
+ GGML_API void ggml_set_param(
757
+ struct ggml_context * ctx,
758
+ struct ggml_tensor * tensor);
783
759
 
784
- // LBFGS parameters
785
- struct {
786
- int m; // number of corrections to approximate the inv. Hessian
787
- int n_iter;
788
- int max_linesearch;
760
+ GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
789
761
 
790
- float eps; // convergence tolerance
791
- float ftol; // line search tolerance
792
- float wolfe;
793
- float min_step;
794
- float max_step;
762
+ GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
763
+ GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
795
764
 
796
- enum ggml_linesearch linesearch;
797
- } lbfgs;
798
- };
765
+ GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
766
+ GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
799
767
 
800
- struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
768
+ // print info and performance information for the graph
769
+ GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
801
770
 
802
- // optimize the function defined by the tensor f
803
- enum ggml_opt_result ggml_opt(
804
- struct ggml_context * ctx,
805
- struct ggml_opt_params params,
806
- struct ggml_tensor * f);
771
+ // dump the graph into a file using the dot format
772
+ GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
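
Putting the pieces together: the forward graph is built from the last tensor, computed, and can then be printed or dumped as Graphviz. A minimal end-to-end sketch (shapes, values and the dot file name are arbitrary):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2); // ne = {4, 2}
        struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3); // ne = {4, 3}
        ggml_set_f32(a, 1.0f);
        ggml_set_f32(x, 2.0f);
        ggml_set_name(a, "a");
        ggml_set_name(x, "x");

        struct ggml_tensor * y = ggml_mul_mat(ctx, a, x); // ne = {2, 3}

        struct ggml_cgraph gf = ggml_build_forward(y);
        gf.n_threads = 4;
        ggml_graph_compute(ctx, &gf);

        printf("y[0] = %f\n", ggml_get_f32_1d(y, 0)); // 4 * (1 * 2) = 8
        ggml_graph_print(&gf);                         // per-node timing summary
        ggml_graph_dump_dot(&gf, NULL, "y.dot");       // Graphviz dump of the forward graph

        ggml_free(ctx);
        return 0;
    }
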
807
773
 
808
- //
809
- // quantization
810
- //
774
+ //
775
+ // optimization
776
+ //
811
777
 
812
- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
813
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
814
- size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
815
- size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
778
+ // optimization methods
779
+ enum ggml_opt_type {
780
+ GGML_OPT_ADAM,
781
+ GGML_OPT_LBFGS,
782
+ };
783
+
784
+ // linesearch methods
785
+ enum ggml_linesearch {
786
+ GGML_LINESEARCH_DEFAULT = 1,
787
+
788
+ GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
789
+ GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
790
+ GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
791
+ };
792
+
793
+ // optimization return values
794
+ enum ggml_opt_result {
795
+ GGML_OPT_OK = 0,
796
+ GGML_OPT_DID_NOT_CONVERGE,
797
+ GGML_OPT_NO_CONTEXT,
798
+ GGML_OPT_INVALID_WOLFE,
799
+ GGML_OPT_FAIL,
800
+
801
+ GGML_LINESEARCH_FAIL = -128,
802
+ GGML_LINESEARCH_MINIMUM_STEP,
803
+ GGML_LINESEARCH_MAXIMUM_STEP,
804
+ GGML_LINESEARCH_MAXIMUM_ITERATIONS,
805
+ GGML_LINESEARCH_INVALID_PARAMETERS,
806
+ };
807
+
808
+ // optimization parameters
809
+ //
810
+ // see ggml.c (ggml_opt_default_params) for default values
811
+ //
812
+ struct ggml_opt_params {
813
+ enum ggml_opt_type type;
814
+
815
+ int n_threads;
816
+
817
+ // delta-based convergence test
818
+ //
819
+ // if past == 0 - disabled
820
+ // if past > 0:
821
+ // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
822
+ //
823
+ int past;
824
+ float delta;
825
+
826
+ // maximum number of iterations without improvement
827
+ //
828
+ // if 0 - disabled
829
+ // if > 0:
830
+ // assume convergence if no cost improvement in this number of iterations
831
+ //
832
+ int max_no_improvement;
833
+
834
+ bool print_forward_graph;
835
+ bool print_backward_graph;
836
+
837
+ // ADAM parameters
838
+ struct {
839
+ int n_iter;
840
+
841
+ float alpha; // learning rate
842
+ float beta1;
843
+ float beta2;
844
+ float eps; // epsilon for numerical stability
845
+ float eps_f; // epsilon for convergence test
846
+ float eps_g; // epsilon for convergence test
847
+ } adam;
848
+
849
+ // LBFGS parameters
850
+ struct {
851
+ int m; // number of corrections to approximate the inv. Hessian
852
+ int n_iter;
853
+ int max_linesearch;
854
+
855
+ float eps; // convergence tolerance
856
+ float ftol; // line search tolerance
857
+ float wolfe;
858
+ float min_step;
859
+ float max_step;
860
+
861
+ enum ggml_linesearch linesearch;
862
+ } lbfgs;
863
+ };
864
+
865
+ GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
866
+
867
+ // optimize the function defined by the tensor f
868
+ GGML_API enum ggml_opt_result ggml_opt(
869
+ struct ggml_context * ctx,
870
+ struct ggml_opt_params params,
871
+ struct ggml_tensor * f);
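
ggml_opt minimizes the scalar tensor f over every tensor previously flagged with ggml_set_param, building the forward and backward graphs internally. A toy sketch minimizing (x - 3)^2 with the default ADAM settings (the target value and pool size are arbitrary):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params ip = { .mem_size = 16*1024*1024, .mem_buffer = NULL, .no_alloc = false };
        struct ggml_context * ctx = ggml_init(ip);

        struct ggml_tensor * x = ggml_new_f32(ctx, 0.0f);
        ggml_set_param(ctx, x); // x is the variable being optimized

        struct ggml_tensor * t = ggml_new_f32(ctx, 3.0f);
        struct ggml_tensor * f = ggml_sqr(ctx, ggml_sub(ctx, x, t)); // f = (x - 3)^2

        struct ggml_opt_params opt = ggml_opt_default_params(GGML_OPT_ADAM);
        enum ggml_opt_result res  = ggml_opt(ctx, opt, f);

        printf("result = %d, x = %f\n", (int) res, ggml_get_f32_1d(x, 0)); // x should approach 3

        ggml_free(ctx);
        return 0;
    }
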
816
872
 
817
- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
873
+ //
874
+ // quantization
875
+ //
818
876
 
819
- //
820
- // system info
821
- //
877
+ GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
878
+ GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
879
+ GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
880
+ GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
881
+ GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
882
+ GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
822
883
 
823
- int ggml_cpu_has_avx(void);
824
- int ggml_cpu_has_avx2(void);
825
- int ggml_cpu_has_avx512(void);
826
- int ggml_cpu_has_avx512_vbmi(void);
827
- int ggml_cpu_has_avx512_vnni(void);
828
- int ggml_cpu_has_fma(void);
829
- int ggml_cpu_has_neon(void);
830
- int ggml_cpu_has_arm_fma(void);
831
- int ggml_cpu_has_f16c(void);
832
- int ggml_cpu_has_fp16_va(void);
833
- int ggml_cpu_has_wasm_simd(void);
834
- int ggml_cpu_has_blas(void);
835
- int ggml_cpu_has_cublas(void);
836
- int ggml_cpu_has_sse3(void);
837
- int ggml_cpu_has_vsx(void);
884
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
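
ggml_quantize_chunk dispatches to the matching ggml_quantize_q* routine for the given type and returns the number of bytes written; hist accumulates a 16-bucket histogram of the quantized values. A sketch quantizing one buffer to the new Q5_0 format (the element count is arbitrary but must be a multiple of the 32-element block size):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include "ggml.h"

    int main(void) {
        const int n = 256; // multiple of the 32-element Q5_0 block size

        float * src = malloc(n*sizeof(float));
        for (int i = 0; i < n; ++i) src[i] = (float) i / n;

        void    * dst      = malloc(n*sizeof(float)); // an f32-sized buffer is a safe upper bound
        int64_t   hist[16] = {0};

        size_t bytes = ggml_quantize_chunk(GGML_TYPE_Q5_0, src, dst, /*start =*/ 0, n, hist);
        printf("q5_0: %d floats -> %zu bytes\n", n, bytes);

        free(src);
        free(dst);
        return 0;
    }
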
838
885
 
886
+ //
887
+ // system info
888
+ //
839
889
 
840
- //
841
- // Internal types and functions exposed for tests and benchmarks
842
- //
890
+ GGML_API int ggml_cpu_has_avx (void);
891
+ GGML_API int ggml_cpu_has_avx2 (void);
892
+ GGML_API int ggml_cpu_has_avx512 (void);
893
+ GGML_API int ggml_cpu_has_avx512_vbmi(void);
894
+ GGML_API int ggml_cpu_has_avx512_vnni(void);
895
+ GGML_API int ggml_cpu_has_fma (void);
896
+ GGML_API int ggml_cpu_has_neon (void);
897
+ GGML_API int ggml_cpu_has_arm_fma (void);
898
+ GGML_API int ggml_cpu_has_f16c (void);
899
+ GGML_API int ggml_cpu_has_fp16_va (void);
900
+ GGML_API int ggml_cpu_has_wasm_simd (void);
901
+ GGML_API int ggml_cpu_has_blas (void);
902
+ GGML_API int ggml_cpu_has_cublas (void);
903
+ GGML_API int ggml_cpu_has_clblast (void);
904
+ GGML_API int ggml_cpu_has_gpublas (void);
905
+ GGML_API int ggml_cpu_has_sse3 (void);
906
+ GGML_API int ggml_cpu_has_vsx (void);
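
The system-info getters return 0 or 1 according to compile-time SIMD and backend support; clblast and gpublas are the newly reported entries. A tiny reporting sketch:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        printf("AVX     = %d\n", ggml_cpu_has_avx());
        printf("AVX2    = %d\n", ggml_cpu_has_avx2());
        printf("NEON    = %d\n", ggml_cpu_has_neon());
        printf("F16C    = %d\n", ggml_cpu_has_f16c());
        printf("BLAS    = %d\n", ggml_cpu_has_blas());
        printf("cuBLAS  = %d\n", ggml_cpu_has_cublas());
        printf("CLBlast = %d\n", ggml_cpu_has_clblast());
        return 0;
    }
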
907
+
908
+ //
909
+ // Internal types and functions exposed for tests and benchmarks
910
+ //
843
911
 
844
912
  #ifdef __cplusplus
845
- // restrict not standard in C++
913
+ // restrict not standard in C++
846
914
  #define GGML_RESTRICT
847
915
  #else
848
916
  #define GGML_RESTRICT restrict
849
917
  #endif
850
- typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
851
- typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
852
- typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
853
-
854
- typedef struct {
855
- dequantize_row_q_t dequantize_row_q;
856
- quantize_row_q_t quantize_row_q;
857
- quantize_row_q_t quantize_row_q_reference;
858
- quantize_row_q_t quantize_row_q_dot;
859
- vec_dot_q_t vec_dot_q;
860
- } quantize_fns_t;
861
-
862
- quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
918
+ typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
919
+ typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
920
+ typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
921
+
922
+ typedef struct {
923
+ dequantize_row_q_t dequantize_row_q;
924
+ quantize_row_q_t quantize_row_q;
925
+ quantize_row_q_t quantize_row_q_reference;
926
+ quantize_row_q_t quantize_row_q_dot;
927
+ vec_dot_q_t vec_dot_q;
928
+ enum ggml_type vec_dot_type;
929
+ } quantize_fns_t;
930
+
931
+ quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
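
ggml_internal_get_quantize_fn returns the per-type function table (indexed by the ggml_type value); the new vec_dot_type field records which quantized type the dot-product kernel expects for its second operand. A test-style round-trip sketch for Q4_0 (buffer sizes are arbitrary but block-aligned):

    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"

    int main(void) {
        const int n = 128; // multiple of the 32-element Q4_0 block size

        float * src = malloc(n*sizeof(float));
        float * dst = malloc(n*sizeof(float));
        void  * q   = malloc(n);                 // quantized data; generous upper bound
        for (int i = 0; i < n; ++i) src[i] = 0.01f*i;

        quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);

        fns.quantize_row_q  (src, q, n);  // f32  -> q4_0
        fns.dequantize_row_q(q, dst, n);  // q4_0 -> f32 (lossy)

        printf("src[10] = %f, round-trip = %f\n", src[10], dst[10]);
        printf("vec_dot second operand type: %s\n", ggml_type_name(fns.vec_dot_type));

        free(src); free(dst); free(q);
        return 0;
    }
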
863
932
 
864
933
  #ifdef __cplusplus
865
934
  }