llama_cpp 0.0.3 → 0.0.5

@@ -177,11 +177,12 @@ extern "C" {
 #include <stddef.h>
 #include <stdbool.h>
 
-#define GGML_MAX_DIMS 4
-#define GGML_MAX_NODES 4096
-#define GGML_MAX_PARAMS 16
-#define GGML_MAX_CONTEXTS 64
-#define GGML_MAX_OPT 4
+#define GGML_MAX_DIMS 4
+#define GGML_MAX_NODES 4096
+#define GGML_MAX_PARAMS 16
+#define GGML_MAX_CONTEXTS 64
+#define GGML_MAX_OPT 4
+#define GGML_DEFAULT_N_THREADS 4
 
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
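The new GGML_DEFAULT_N_THREADS constant gives callers a fallback thread count. A minimal usage sketch, assuming the ggml_cgraph of this era still carries an n_threads field that ggml_graph_compute reads (the tensor and variable names below are hypothetical, not part of the diff):

    // illustrative only: use the default unless the caller asked for a count
    struct ggml_cgraph gf = ggml_build_forward(result);  // 'result' is some output tensor
    gf.n_threads = n_threads > 0 ? n_threads : GGML_DEFAULT_N_THREADS;
    ggml_graph_compute(ctx, &gf);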
@@ -198,13 +199,15 @@ struct ggml_object;
 struct ggml_context;
 
 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
+    // explicitly numbered values are used in llama.cpp files
+    GGML_TYPE_F32 = 0,
+    GGML_TYPE_F16 = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
+    GGML_TYPE_Q8_0 = 4,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
-    GGML_TYPE_F16,
-    GGML_TYPE_F32,
     GGML_TYPE_COUNT,
 };
 
 
@@ -236,6 +239,7 @@ enum ggml_op {
 
     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,
@@ -250,9 +254,25 @@ enum ggml_op {
     GGML_OP_FLASH_ATTN,
     GGML_OP_FLASH_FF,
 
+    GGML_OP_MAP_UNARY,
+    GGML_OP_MAP_BINARY,
+
     GGML_OP_COUNT,
 };
 
+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
@@ -335,6 +355,8 @@ int ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 
+const char * ggml_type_name(enum ggml_type type);
+
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 
 struct ggml_context * ggml_init(struct ggml_init_params params);
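ggml_type_name is a small convenience for logging and error messages, and it pairs with the explicit numbering added to enum ggml_type above, since those integers are what model files record on disk. An illustrative sketch (not from the diff) that prints every type the library knows about:

    #include <stdio.h>

    // enumerate all tensor types, e.g. when reporting an unsupported file type
    for (int i = 0; i < GGML_TYPE_COUNT; ++i) {
        printf("type %d: %s\n", i, ggml_type_name((enum ggml_type) i));
    }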
@@ -344,13 +366,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
@@ -415,6 +430,12 @@ struct ggml_tensor * ggml_add(
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+
+struct ggml_tensor * ggml_add_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b);
+
 struct ggml_tensor * ggml_sub(
         struct ggml_context * ctx,
         struct ggml_tensor * a,
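ggml_add_inplace mirrors ggml_add but, as the name suggests, is expected to reuse a's buffer for the result rather than allocating a fresh tensor, which saves context memory in hot paths such as adding a bias at every layer. A hedged sketch with hypothetical tensor names:

    // out-of-place: allocates a new result tensor in the context
    struct ggml_tensor * y  = ggml_add(ctx, x, bias);

    // in-place variant added in this version: result shares x's data buffer
    struct ggml_tensor * y2 = ggml_add_inplace(ctx, x, bias);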
@@ -519,6 +540,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor * a,
         struct ggml_tensor * b);
 
+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
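ggml_cont materializes a tensor whose memory layout is no longer dense, typically the output of ggml_permute or ggml_transpose, into a contiguous copy that layout-sensitive ops can consume. A sketch with hypothetical tensor names:

    // permuting only rewrites strides, so 'p' is not contiguous in memory
    struct ggml_tensor * p = ggml_permute(ctx, t, 1, 0, 2, 3);

    // copy into a dense buffer so ops that assume contiguity (e.g. a
    // following ggml_reshape) can operate on it
    struct ggml_tensor * c = ggml_cont(ctx, p);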
@@ -638,6 +664,21 @@ struct ggml_tensor * ggml_flash_ff(
         struct ggml_tensor * c0,
         struct ggml_tensor * c1);
 
+// Mapping operations
+typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+struct ggml_tensor * ggml_map_unary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        const ggml_unary_op_f32_t fun);
+
+struct ggml_tensor * ggml_map_binary_f32(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        const ggml_binary_op_f32_t fun);
+
 //
 // automatic differentiation
 //
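The new mapping operations let user code run an arbitrary element-wise float function as a graph node without defining a dedicated ggml_op. A minimal sketch of the unary variant, assuming the callback arguments are the element count, the destination buffer, and the source buffer in the order given by ggml_unary_op_f32_t above (the function and tensor names are hypothetical):

    // hypothetical element-wise function matching ggml_unary_op_f32_t
    static void my_square_f32(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = src[i] * src[i];
        }
    }

    // ... later, during graph construction:
    struct ggml_tensor * squared = ggml_map_unary_f32(ctx, a, my_square_f32);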
@@ -773,6 +814,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
 int ggml_cpu_has_avx(void);
 int ggml_cpu_has_avx2(void);
 int ggml_cpu_has_avx512(void);
+int ggml_cpu_has_avx512_vbmi(void);
+int ggml_cpu_has_avx512_vnni(void);
 int ggml_cpu_has_fma(void);
 int ggml_cpu_has_neon(void);
 int ggml_cpu_has_arm_fma(void);
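The two new probes report AVX-512 VBMI and VNNI support alongside the existing feature checks; like the others they return 0 or 1. For example:

    #include <stdio.h>

    // print which AVX-512 extensions this build/CPU combination provides
    printf("AVX512      : %d\n", ggml_cpu_has_avx512());
    printf("AVX512 VBMI : %d\n", ggml_cpu_has_avx512_vbmi());
    printf("AVX512 VNNI : %d\n", ggml_cpu_has_avx512_vnni());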
@@ -783,6 +826,31 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);
 
+
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    quantize_row_q_t   quantize_row_q_dot;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef __cplusplus
 }
 #endif
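quantize_fns_t bundles the per-type quantization kernels so that tests and benchmarks can call them directly, and ggml_internal_get_quantize_fn looks a bundle up by index. A hedged round-trip sketch, assuming the index is the ggml_type value and that the element count must be a multiple of the type's block size (32 for Q4_0):

    // round-trip 64 floats through Q4_0 and back (illustrative only; the
    // quantized buffer is sized generously rather than exactly)
    float src[64], dst[64];
    unsigned char quantized[128];

    for (int i = 0; i < 64; ++i) src[i] = 0.01f * i;

    quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
    fns.quantize_row_q  (src, quantized, 64);   // float -> 4-bit blocks
    fns.dequantize_row_q(quantized, dst, 64);   // 4-bit blocks -> approximate floats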