llama_cpp 0.3.5 → 0.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -183,6 +183,15 @@
 # define GGML_API
 #endif
 
+// TODO: support for clang
+#ifdef __GNUC__
+# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+# define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+# define GGML_DEPRECATED(func, hint) func
+#endif
+
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
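The new GGML_DEPRECATED macro attaches a compiler-specific deprecation attribute carrying a migration hint: __attribute__((deprecated(hint))) on GCC, __declspec(deprecated(hint)) on MSVC, and a no-op elsewhere. A minimal sketch of how a declaration would be wrapped (old_api and new_api below are hypothetical names, not part of this diff):

    // On GCC this expands to:
    //   int old_api(int x) __attribute__((deprecated("use new_api instead")));
    // On MSVC the attribute is emitted in front of the declaration instead.
    GGML_DEPRECATED(int old_api(int x), "use new_api instead");

    // Any later call to old_api() now triggers a deprecation warning that
    // includes the hint string.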
@@ -374,6 +383,10 @@ extern "C" {
         GGML_OP_MAP_UNARY,
         GGML_OP_MAP_BINARY,
 
+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
         GGML_OP_MAP_CUSTOM1,
         GGML_OP_MAP_CUSTOM2,
         GGML_OP_MAP_CUSTOM3,
@@ -570,6 +583,8 @@ extern "C" {
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
 
+    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
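ggml_are_same_shape, now exposed through the public header, reports whether two tensors have identical dimensions in all four ne[] entries, a common precondition for element-wise operations. A small hypothetical sketch (ctx, a and b are assumed to exist already):

    // Only build the element-wise sum when the shapes actually match.
    struct ggml_tensor * sum = NULL;
    if (ggml_are_same_shape(a, b)) {
        sum = ggml_add(ctx, a, b);
    }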
 
@@ -1170,7 +1185,18 @@ extern "C" {
             int mode,
             int n_ctx);
 
-    // custom RoPE, in-place, returns view(a)
+    // custom RoPE
+    GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            int n_past,
+            int n_dims,
+            int mode,
+            int n_ctx,
+            float freq_base,
+            float freq_scale);
+
+    // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
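The new non-inplace ggml_rope_custom exposes the RoPE frequency base and frequency scale directly, which is the hook used for linear/NTK-style context extension; ggml_rope_custom_inplace keeps returning a view of a. A hypothetical call (ctx, cur, n_past, n_rot and n_ctx are placeholders for whatever the caller already has):

    // Standard RoPE base of 10000.0f, with positions compressed by 0.5f to
    // stretch the effective context window.
    struct ggml_tensor * cur_rope = ggml_rope_custom(
            ctx, cur,
            n_past, n_rot, /*mode=*/0, n_ctx,
            /*freq_base=*/10000.0f, /*freq_scale=*/0.5f);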
@@ -1229,7 +1255,7 @@ extern "C" {
 
     // conv_1d with padding = half
     // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
@@ -1242,7 +1268,7 @@ extern "C" {
         GGML_OP_POOL_COUNT,
     };
 
-    GGML_API struct ggml_tensor* ggml_pool_1d(
+    GGML_API struct ggml_tensor * ggml_pool_1d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             enum ggml_op_pool op,
@@ -1250,7 +1276,7 @@ extern "C" {
             int s0, // stride
             int p0); // padding
 
-    GGML_API struct ggml_tensor* ggml_pool_2d(
+    GGML_API struct ggml_tensor * ggml_pool_2d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             enum ggml_op_pool op,
@@ -1304,15 +1330,6 @@ extern "C" {
             int h0,
             int w);
 
-    // custom operators
-
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
     GGML_API struct ggml_tensor * ggml_unary(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
@@ -1323,63 +1340,138 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_unary_op op);
 
-    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+    // custom operators
+
+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t fun),
+        "use ggml_map_custom1 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            ggml_unary_op_f32_t fun);
+            ggml_unary_op_f32_t fun),
+        "use ggml_map_custom1_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t fun),
+        "use ggml_map_custom2 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b,
-            ggml_binary_op_f32_t fun);
+            ggml_binary_op_f32_t fun),
+        "use ggml_map_custom2_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            ggml_custom1_op_f32_t fun);
+            ggml_custom1_op_f32_t fun),
+        "use ggml_map_custom1 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            ggml_custom1_op_f32_t fun);
+            ggml_custom1_op_f32_t fun),
+        "use ggml_map_custom1_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            ggml_custom2_op_f32_t fun);
+            ggml_custom2_op_f32_t fun),
+        "use ggml_map_custom2 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            ggml_custom2_op_f32_t fun);
+            ggml_custom2_op_f32_t fun),
+        "use ggml_map_custom2_inplace instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             struct ggml_tensor * c,
-            ggml_custom3_op_f32_t fun);
+            ggml_custom3_op_f32_t fun),
+        "use ggml_map_custom3 instead");
 
-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             struct ggml_tensor * c,
-            ggml_custom3_op_f32_t fun);
+            ggml_custom3_op_f32_t fun),
+        "use ggml_map_custom3_inplace instead");
+
+    // custom operators v2
+
+    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+    #define GGML_N_TASKS_MAX -1
+
+    GGML_API struct ggml_tensor * ggml_map_custom1(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_custom1_op_t fun,
+            int n_tasks,
+            void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            ggml_custom1_op_t fun,
+            int n_tasks,
+            void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_custom2_op_t fun,
+            int n_tasks,
+            void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            ggml_custom2_op_t fun,
+            int n_tasks,
+            void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            ggml_custom3_op_t fun,
+            int n_tasks,
+            void * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b,
+            struct ggml_tensor * c,
+            ggml_custom3_op_t fun,
+            int n_tasks,
+            void * userdata);
 
     // loss function
 
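A minimal sketch of the new "custom operators v2" API (assuming a context ctx and an F32 tensor a created elsewhere): the callback receives the destination tensor, the source tensor(s), its worker index ith out of nth workers, and the userdata pointer supplied at graph-build time; passing GGML_N_TASKS_MAX lets ggml choose the number of tasks.

    // Callback that negates every element of a into dst.
    // Each of the nth workers handles rows [ith, nrows) with stride nth.
    static void neg_f32(struct ggml_tensor * dst, const struct ggml_tensor * a,
                        int ith, int nth, void * userdata) {
        (void) userdata;
        const int64_t nrows = ggml_nrows(a);
        const int64_t ncols = a->ne[0];
        for (int64_t r = ith; r < nrows; r += nth) {
            const float * src = (const float *)((const char *)a->data   + r*a->nb[1]);
            float       * out = (float       *)((char       *)dst->data + r*dst->nb[1]);
            for (int64_t c = 0; c < ncols; ++c) {
                out[c] = -src[c];
            }
        }
    }

    // Graph build: let ggml pick the task count, no user data.
    struct ggml_tensor * neg = ggml_map_custom1(ctx, a, neg_f32, GGML_N_TASKS_MAX, NULL);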
 
@@ -39,6 +39,8 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
 //
 // 2-6 bit quantization in super-blocks
 //
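MM256_SET_M128I(a, b) builds a __m256i whose low 128-bit lane is b and whose high lane is a, the same result as the _mm256_set_m128i intrinsic it replaces throughout the hunks below, but spelled with cast/insert intrinsics that older GCC releases also provide. A hypothetical illustration (assuming immintrin.h is available, as in this translation unit):

    __m128i lo = _mm_set1_epi32(1);
    __m128i hi = _mm_set1_epi32(2);
    // Equivalent to _mm256_set_m128i(hi, lo): 32-bit lanes 0..3 hold 1, lanes 4..7 hold 2.
    __m256i v  = MM256_SET_M128I(hi, lo);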
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
     const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
     const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-    const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+    const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
     __m256i sumi = _mm256_setzero_si256();
 
@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));
 
     // sumf += -dmin * summs in 32bits*8
-    acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+    acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);
 
     const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
     const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     }
 
     // sumf += dall * isum - dmin * summs in 32bits
-    __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+    __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
     acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
     }
 
@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     summs += dmin * smin;
 
     const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
-    const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
-    const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+    const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+    const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
 
     const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
     const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
     const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));
 
-    const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
-    const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
-    const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
-    const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+    const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+    const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+    const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+    const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
 
     acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
     acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
     const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
     const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
-    const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+    const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};
 
     // high bit
     const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     }
 
     // multiply with block scale and accumulate
-    __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+    __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
     acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
 
     }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     aux16[0] = a & 0x0f0f;
     aux16[1] = (a >> 4) & 0x0f0f;
 
-    const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
-    const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+    const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+    const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
 
     memcpy(&aux64, x[i].hmask, 8);
 
     const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
-    __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+    __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
     __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
     q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
     q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);
 
     // prepare low and high bits
-    const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+    const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
     const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
     const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);
 
@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     p16_0 = _mm_add_epi32(p16_0, p16_2);
     p16_1 = _mm_add_epi32(p16_1, p16_3);
-    __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+    __m256i p16 = MM256_SET_M128I(p16_1, p16_0);
 
     // multiply with block scale and accumulate
     acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);
 
     const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-    const __m256i scales = _mm256_set_m128i(sc128, sc128);
+    const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
     __m256i sumi = _mm256_setzero_si256();
 
@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     }
 
     __m256 vd = _mm256_set1_ps(d);
-    __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+    __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
     acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
     }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
     const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
     const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
-    acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+    acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);
 
     const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
     const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
-    acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+    acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);
 
     }
 
@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     summs += dmin * _mm_extract_epi32(hsum, 0);
 
     const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
-    const __m256i scales = _mm256_set_m128i(sc128, sc128);
+    const __m256i scales = MM256_SET_M128I(sc128, sc128);
 
     const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
     __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     }
 
     __m256 vd = _mm256_set1_ps(d);
-    __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+    __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
     acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
 
     }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 
     const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);
 
-    const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
-    const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+    const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+    const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
 
     int64_t aux64;
     memcpy(&aux64, x[i].qh, 8);
     const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
-    const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+    const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);
 
     const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
     const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
     const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));
 
-    acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+    acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);
 
     }
 
@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
 
     }
 
-    __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+    __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
     acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 
@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
     const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);
 
-    const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
-    const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+    const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+    const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
 
     const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
     const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
     sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
 
-    acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+    acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
     }
 
     *s = hsum_float_8(acc);
@@ -149,6 +149,46 @@ struct llama_file {
     }
 };
 
+// llama_context_data
+struct llama_data_context {
+    virtual void write(const void * src, size_t size) = 0;
+    virtual size_t get_size_written() = 0;
+    virtual ~llama_data_context() = default;
+};
+
+struct llama_data_buffer_context : llama_data_context {
+    uint8_t * ptr;
+    size_t size_written = 0;
+
+    llama_data_buffer_context(uint8_t * p) : ptr(p) {}
+
+    void write(const void * src, size_t size) override {
+        memcpy(ptr, src, size);
+        ptr += size;
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
+struct llama_data_file_context : llama_data_context {
+    llama_file * file;
+    size_t size_written = 0;
+
+    llama_data_file_context(llama_file * f) : file(f) {}
+
+    void write(const void * src, size_t size) override {
+        file->write_raw(src, size);
+        size_written += size;
+    }
+
+    size_t get_size_written() override {
+        return size_written;
+    }
+};
+
 #if defined(_WIN32)
 static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
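The llama_data_context interface lets the same state-serialization code write either into a caller-provided buffer (llama_data_buffer_context) or straight to a llama_file (llama_data_file_context), with get_size_written() reporting the byte count in both cases. A minimal hypothetical sketch of a caller that only depends on the abstract interface (write_blob and dst are illustrative, not part of this diff):

    // Hypothetical helper: length-prefixed blob, independent of the destination.
    static void write_blob(llama_data_context & dc, const void * data, size_t n) {
        dc.write(&n, sizeof(n));   // length prefix
        dc.write(data, n);         // payload
    }

    // In-memory destination:           llama_data_buffer_context buf_ctx(dst);
    // File destination, same caller:   llama_data_file_context   file_ctx(&file);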
@@ -179,7 +219,7 @@ struct llama_mmap {
     // prefetch/readahead impairs performance on NUMA systems
     if (numa) { prefetch = 0; }
 #ifdef __linux__
-    if (prefetch) { flags |= MAP_POPULATE; }
+    if (prefetch >= file->size) { flags |= MAP_POPULATE; }
 #endif
     addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
     if (addr == MAP_FAILED) {