llama_cpp 0.3.5 → 0.3.7

@@ -183,6 +183,15 @@
  # define GGML_API
  #endif

+ // TODO: support for clang
+ #ifdef __GNUC__
+ # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+ #elif defined(_MSC_VER)
+ # define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+ #else
+ # define GGML_DEPRECATED(func, hint) func
+ #endif
+
  #include <stdint.h>
  #include <stddef.h>
  #include <stdbool.h>
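The new GGML_DEPRECATED macro wraps an entire declaration together with a hint string: GCC/Clang attach `__attribute__((deprecated))`, MSVC attaches `__declspec(deprecated)`, and any other compiler simply drops the annotation. A minimal sketch of the pattern, using hypothetical names so it does not collide with the real header; calling the wrapped function still compiles, it only emits a warning:

```cpp
// Sketch only: the same three-way dispatch as GGML_DEPRECATED, under a
// hypothetical name.
#if defined(__GNUC__)
#    define EXAMPLE_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
#elif defined(_MSC_VER)
#    define EXAMPLE_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
#else
#    define EXAMPLE_DEPRECATED(func, hint) func
#endif

// The macro takes the *entire* declaration as its first argument, which is
// how the header below marks the old *_f32 map operators.
EXAMPLE_DEPRECATED(int example_old_api(int x), "use example_new_api instead");

int example_new_api(int x) { return x + 1; }

int main() {
    // example_old_api(1) here would trigger a deprecation warning on
    // GCC, Clang and MSVC, while other compilers accept it silently.
    return example_new_api(1) == 2 ? 0 : 1;
}
```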
@@ -374,6 +383,10 @@ extern "C" {
  GGML_OP_MAP_UNARY,
  GGML_OP_MAP_BINARY,

+ GGML_OP_MAP_CUSTOM1_F32,
+ GGML_OP_MAP_CUSTOM2_F32,
+ GGML_OP_MAP_CUSTOM3_F32,
+
  GGML_OP_MAP_CUSTOM1,
  GGML_OP_MAP_CUSTOM2,
  GGML_OP_MAP_CUSTOM3,
@@ -570,6 +583,8 @@ extern "C" {
  GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);

+ GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
  // use this to compute the memory overhead of a tensor
  GGML_API size_t ggml_tensor_overhead(void);

@@ -1170,7 +1185,18 @@ extern "C" {
  int mode,
  int n_ctx);

- // custom RoPE, in-place, returns view(a)
+ // custom RoPE
+ GGML_API struct ggml_tensor * ggml_rope_custom(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_past,
+ int n_dims,
+ int mode,
+ int n_ctx,
+ float freq_base,
+ float freq_scale);
+
+ // in-place, returns view(a)
  GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
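ggml_rope_custom is the out-of-place counterpart of the existing ggml_rope_custom_inplace and exposes freq_base/freq_scale directly, which is what linear and NTK-aware context scaling need. A sketch of building such a node under this ggml revision's context API; the tensor shape and parameter values are illustrative, not library defaults:

```cpp
#include "ggml.h"

int main() {
    // mem_size / mem_buffer / no_alloc
    struct ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
    struct ggml_context * ctx = ggml_init(params);

    // A toy activation tensor: 128 dims per head, 4 heads, 8 positions.
    struct ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 4, 8);

    // RoPE with an explicit frequency base and scale; freq_scale = 0.5f
    // roughly corresponds to stretching the trained context window by 2x.
    struct ggml_tensor * rotated = ggml_rope_custom(
        ctx, cur,
        /*n_past=*/ 0, /*n_dims=*/ 128, /*mode=*/ 0, /*n_ctx=*/ 4096,
        /*freq_base=*/ 10000.0f, /*freq_scale=*/ 0.5f);

    // This only builds the graph node; evaluating it still goes through the
    // usual forward-graph compute path.
    (void) rotated;
    ggml_free(ctx);
    return 0;
}
```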
@@ -1229,7 +1255,7 @@ extern "C" {

  // conv_1d with padding = half
  // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
- GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+ GGML_API struct ggml_tensor * ggml_conv_1d_ph(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
@@ -1242,7 +1268,7 @@ extern "C" {
  GGML_OP_POOL_COUNT,
  };

- GGML_API struct ggml_tensor* ggml_pool_1d(
+ GGML_API struct ggml_tensor * ggml_pool_1d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  enum ggml_op_pool op,
@@ -1250,7 +1276,7 @@ extern "C" {
  int s0, // stride
  int p0); // padding

- GGML_API struct ggml_tensor* ggml_pool_2d(
+ GGML_API struct ggml_tensor * ggml_pool_2d(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  enum ggml_op_pool op,
@@ -1304,15 +1330,6 @@ extern "C" {
  int h0,
  int w);

- // custom operators
-
- typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
- typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
- typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
- typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
- typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
  GGML_API struct ggml_tensor * ggml_unary(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
@@ -1323,63 +1340,138 @@ extern "C" {
  struct ggml_tensor * a,
  enum ggml_unary_op op);

- GGML_API struct ggml_tensor * ggml_map_unary_f32(
+ // custom operators
+
+ typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+ typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+ typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+ typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+ typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- ggml_unary_op_f32_t fun);
+ ggml_unary_op_f32_t fun),
+ "use ggml_map_custom1 instead");

- GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- ggml_unary_op_f32_t fun);
+ ggml_unary_op_f32_t fun),
+ "use ggml_map_custom1_inplace instead");

- GGML_API struct ggml_tensor * ggml_map_binary_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- ggml_binary_op_f32_t fun);
+ ggml_binary_op_f32_t fun),
+ "use ggml_map_custom2 instead");

- GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- ggml_binary_op_f32_t fun);
+ ggml_binary_op_f32_t fun),
+ "use ggml_map_custom2_inplace instead");

- GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- ggml_custom1_op_f32_t fun);
+ ggml_custom1_op_f32_t fun),
+ "use ggml_map_custom1 instead");

- GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
- ggml_custom1_op_f32_t fun);
+ ggml_custom1_op_f32_t fun),
+ "use ggml_map_custom1_inplace instead");

- GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- ggml_custom2_op_f32_t fun);
+ ggml_custom2_op_f32_t fun),
+ "use ggml_map_custom2 instead");

- GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
- ggml_custom2_op_f32_t fun);
+ ggml_custom2_op_f32_t fun),
+ "use ggml_map_custom2_inplace instead");

- GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  struct ggml_tensor * c,
- ggml_custom3_op_f32_t fun);
+ ggml_custom3_op_f32_t fun),
+ "use ggml_map_custom3 instead");

- GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+ GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
  struct ggml_context * ctx,
  struct ggml_tensor * a,
  struct ggml_tensor * b,
  struct ggml_tensor * c,
- ggml_custom3_op_f32_t fun);
+ ggml_custom3_op_f32_t fun),
+ "use ggml_map_custom3_inplace instead");
+
+ // custom operators v2
+
+ typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+ typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+ typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+ #define GGML_N_TASKS_MAX -1
+
+ GGML_API struct ggml_tensor * ggml_map_custom1(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ ggml_custom1_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ ggml_custom1_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom2(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ ggml_custom2_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ ggml_custom2_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom3(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ ggml_custom3_op_t fun,
+ int n_tasks,
+ void * userdata);
+
+ GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b,
+ struct ggml_tensor * c,
+ ggml_custom3_op_t fun,
+ int n_tasks,
+ void * userdata);

  // loss function

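The "custom operators v2" block replaces the deprecated *_f32 map operators with callbacks that receive the thread index `ith`, the thread count `nth`, and an opaque `userdata` pointer, so user code can split the work itself and carry state without globals. A usage sketch, assuming the forward-graph helpers present in this ggml revision (ggml_build_forward, ggml_graph_compute_with_ctx); the callback name and split strategy are illustrative:

```cpp
#include "ggml.h"
#include <cmath>

// Hypothetical element-wise op: each thread handles an interleaved slice.
static void example_tanh_gelu(struct ggml_tensor * dst, const struct ggml_tensor * a,
                              int ith, int nth, void * userdata) {
    (void) userdata;                      // passed through from ggml_map_custom1
    const int64_t n = ggml_nelements(dst);
    for (int64_t i = ith; i < n; i += nth) {
        const float x = ((const float *) a->data)[i];
        ((float *) dst->data)[i] = 0.5f * x * (1.0f + std::tanh(0.79788456f * x));
    }
}

int main() {
    struct ggml_init_params params = { 16 * 1024 * 1024, nullptr, false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    // GGML_N_TASKS_MAX lets ggml use as many threads as the graph provides.
    struct ggml_tensor * y = ggml_map_custom1(ctx, a, example_tanh_gelu,
                                              GGML_N_TASKS_MAX, nullptr);

    struct ggml_cgraph gf = ggml_build_forward(y);
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads=*/ 4);

    ggml_free(ctx);
    return 0;
}
```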
@@ -39,6 +39,8 @@
  #define MIN(a, b) ((a) < (b) ? (a) : (b))
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+
  //
  // 2-6 bit quantization in super-blocks
  //
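MM256_SET_M128I(a, b) rebuilds `_mm256_set_m128i(a, b)` from two intrinsics that every AVX-capable toolchain provides, which matters because some older compilers do not ship `_mm256_set_m128i` itself: `b` becomes the low 128 bits and `a` the high 128 bits. A standalone check of that layout (requires an x86 build with AVX enabled, e.g. `-mavx2`):

```cpp
#include <immintrin.h>
#include <cstdio>

// Same definition as in the diff above.
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main() {
    const __m128i lo = _mm_set1_epi32(1);
    const __m128i hi = _mm_set1_epi32(2);

    // Equivalent to _mm256_set_m128i(hi, lo): low half from `lo`, high from `hi`.
    const __m256i v = MM256_SET_M128I(hi, lo);

    alignas(32) int out[8];
    _mm256_store_si256((__m256i *) out, v);
    for (int i = 0; i < 8; ++i) {
        std::printf("%d ", out[i]);       // expected: 1 1 1 1 2 2 2 2
    }
    std::printf("\n");
    return 0;
}
```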
@@ -1353,7 +1355,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales8);
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
- const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

  __m256i sumi = _mm256_setzero_si256();

@@ -1421,7 +1423,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i summs_1 = _mm_madd_epi16(mins_1, _mm_loadu_si128((const __m128i*)&y[i].bsums[8]));

  // sumf += -dmin * summs in 32bits*8
- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(_mm256_set_m128i(summs_1, summs_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dmin), _mm256_cvtepi32_ps(MM256_SET_M128I(summs_1, summs_0))), acc);

  const __m128i scales_0 = _mm_cvtepi8_epi16(scales16);
  const __m128i scales_1 = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(scales16, scales16));
@@ -1493,7 +1495,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  }

  // sumf += dall * isum - dmin * summs in 32bits
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&dall), _mm256_cvtepi32_ps(sumi)), acc);
  }

@@ -1644,8 +1646,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  summs += dmin * smin;

  const __m128i q2bits = _mm_loadu_si128((const __m128i*)q2);
- const __m256i q2_0 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 2), q2bits), m3);
- const __m256i q2_1 = _mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);
+ const __m256i q2_0 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 2), q2bits), m3);
+ const __m256i q2_1 = _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q2bits, 6), _mm_srli_epi16(q2bits, 4)), m3);

  const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)(q8+ 0));
  const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)(q8+32));
@@ -1709,10 +1711,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i p2 = _mm_maddubs_epi16(q2_2, _mm256_extractf128_si256(q8_1, 0));
  const __m128i p3 = _mm_maddubs_epi16(q2_3, _mm256_extractf128_si256(q8_1, 1));

- const __m256i p_0 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
- const __m256i p_1 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
- const __m256i p_2 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
- const __m256i p_3 = _mm256_set_m128i(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));
+ const __m256i p_0 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p0, p0)), _mm_cvtepi16_epi32(p0));
+ const __m256i p_1 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p1, p1)), _mm_cvtepi16_epi32(p1));
+ const __m256i p_2 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p2, p2)), _mm_cvtepi16_epi32(p2));
+ const __m256i p_3 = MM256_SET_M128I(_mm_cvtepi16_epi32(_mm_unpackhi_epi64(p3, p3)), _mm_cvtepi16_epi32(p3));

  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[0]), _mm256_cvtepi32_ps(p_0)), acc);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d * db[1]), _mm256_cvtepi32_ps(p_1)), acc);
@@ -1917,7 +1919,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
  const __m128i l_scales = _mm256_extracti128_si256(all_scales, 0);
  const __m128i h_scales = _mm256_extracti128_si256(all_scales, 1);
- const __m256i scales[2] = {_mm256_set_m128i(l_scales, l_scales), _mm256_set_m128i(h_scales, h_scales)};
+ const __m256i scales[2] = {MM256_SET_M128I(l_scales, l_scales), MM256_SET_M128I(h_scales, h_scales)};

  // high bit
  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].hmask);
@@ -2128,7 +2130,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  }

  // multiply with block scale and accumulate
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -2303,13 +2305,13 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  aux16[0] = a & 0x0f0f;
  aux16[1] = (a >> 4) & 0x0f0f;

- const __m256i scale_0 = _mm256_set_m128i(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
- const __m256i scale_1 = _mm256_set_m128i(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));
+ const __m256i scale_0 = MM256_SET_M128I(_mm_set1_epi16(aux8[2] - 8), _mm_set1_epi16(aux8[0] - 8));
+ const __m256i scale_1 = MM256_SET_M128I(_mm_set1_epi16(aux8[3] - 8), _mm_set1_epi16(aux8[1] - 8));

  memcpy(&aux64, x[i].hmask, 8);

  const __m128i haux = _mm_set_epi64x(aux64 >> 1, aux64 >> 0);
- __m256i q3h_0 = _mm256_set_m128i(_mm_srli_epi16(haux, 2), haux);
+ __m256i q3h_0 = MM256_SET_M128I(_mm_srli_epi16(haux, 2), haux);
  __m256i q3h_1 = _mm256_srli_epi16(q3h_0, 4);
  q3h_0 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_0, m1), 2);
  q3h_1 = _mm256_slli_epi16(_mm256_andnot_si256(q3h_1, m1), 2);
@@ -2318,7 +2320,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i q3bits = _mm_loadu_si128((const __m128i*)q3);

  // prepare low and high bits
- const __m256i q3aux = _mm256_set_m128i(_mm_srli_epi16(q3bits, 2), q3bits);
+ const __m256i q3aux = MM256_SET_M128I(_mm_srli_epi16(q3bits, 2), q3bits);
  const __m256i q3l_0 = _mm256_and_si256(q3aux, m3);
  const __m256i q3l_1 = _mm256_and_si256(_mm256_srli_epi16(q3aux, 4), m3);

@@ -2429,7 +2431,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri

  p16_0 = _mm_add_epi32(p16_0, p16_2);
  p16_1 = _mm_add_epi32(p16_1, p16_3);
- __m256i p16 = _mm256_set_m128i(p16_1, p16_0);
+ __m256i p16 = MM256_SET_M128I(p16_1, p16_0);

  // multiply with block scale and accumulate
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(p16)), acc);
@@ -2620,7 +2622,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  acc_m = _mm_fmadd_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod), acc_m);

  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
- const __m256i scales = _mm256_set_m128i(sc128, sc128);
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);

  __m256i sumi = _mm256_setzero_si256();

@@ -2727,7 +2729,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
  }

  __m256 vd = _mm256_set1_ps(d);
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -2968,11 +2970,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

  const __m128i p32_0 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_0);
  const __m128i p32_1 = _mm_madd_epi16(_mm_set1_epi16(scales[0]), p16_1);
- acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_1, p32_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_1, p32_0))), acc);

  const __m128i p32_2 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_2);
  const __m128i p32_3 = _mm_madd_epi16(_mm_set1_epi16(scales[1]), p16_3);
- acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(_mm256_set_m128i(p32_3, p32_2))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(MM256_SET_M128I(p32_3, p32_2))), acc);

  }

@@ -3160,7 +3162,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  summs += dmin * _mm_extract_epi32(hsum, 0);

  const __m128i sc128 = _mm256_extracti128_si256(mins_and_scales, 0);
- const __m256i scales = _mm256_set_m128i(sc128, sc128);
+ const __m256i scales = MM256_SET_M128I(sc128, sc128);

  const __m256i hbits = _mm256_loadu_si256((const __m256i*)x[i].qh);
  __m256i hmask = mone;
@@ -3299,7 +3301,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  }

  __m256 vd = _mm256_set1_ps(d);
- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);

  }
@@ -3462,13 +3464,13 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

  const __m256i q5bits = _mm256_loadu_si256((const __m256i*)q5);

- const __m256i scale_l = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
- const __m256i scale_h = _mm256_set_m128i(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));
+ const __m256i scale_l = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[1]), _mm_set1_epi16(x[i].scales[0]));
+ const __m256i scale_h = MM256_SET_M128I(_mm_set1_epi16(x[i].scales[3]), _mm_set1_epi16(x[i].scales[2]));

  int64_t aux64;
  memcpy(&aux64, x[i].qh, 8);
  const __m128i haux128 = _mm_set_epi64x(aux64 >> 1, aux64);
- const __m256i haux256 = _mm256_set_m128i(_mm_srli_epi16(haux128, 2), haux128);
+ const __m256i haux256 = MM256_SET_M128I(_mm_srli_epi16(haux128, 2), haux128);

  const __m256i q5h_0 = _mm256_slli_epi16(_mm256_andnot_si256(haux256, mone), 4);
  const __m256i q5h_1 = _mm256_slli_epi16(_mm256_andnot_si256(_mm256_srli_epi16(haux256, 4), mone), 4);
@@ -3543,7 +3545,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
  const __m128i dot_0 = _mm_sub_epi32(_mm_add_epi32(p16_0, p16_2), _mm_add_epi32(s16_0, s16_2));
  const __m128i dot_1 = _mm_sub_epi32(_mm_add_epi32(p16_1, p16_3), _mm_add_epi32(s16_1, s16_3));

- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_set_m128i(dot_1, dot_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(MM256_SET_M128I(dot_1, dot_0))), acc);

  }

@@ -3925,7 +3927,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

  }

- __m256i sumi = _mm256_set_m128i(sumi_1, sumi_0);
+ __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
  acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
  }

@@ -4083,8 +4085,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4);
  const __m128i q4bitsH = _mm_loadu_si128((const __m128i*)qh);

- const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
- const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_set_m128i(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);
+ const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 2), q4bitsH), m2), 4);
+ const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(q4bitsH, 6), _mm_srli_epi16(q4bitsH, 4)), m2), 4);

  const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
  const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_1);
@@ -4177,7 +4179,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
  sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
  sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));

- acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(_mm256_set_m128i(sumi_1, sumi_0))), acc);
+ acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(MM256_SET_M128I(sumi_1, sumi_0))), acc);
  }

  *s = hsum_float_8(acc);
@@ -149,6 +149,46 @@ struct llama_file {
  }
  };

+ // llama_context_data
+ struct llama_data_context {
+ virtual void write(const void * src, size_t size) = 0;
+ virtual size_t get_size_written() = 0;
+ virtual ~llama_data_context() = default;
+ };
+
+ struct llama_data_buffer_context : llama_data_context {
+ uint8_t* ptr;
+ size_t size_written = 0;
+
+ llama_data_buffer_context(uint8_t * p) : ptr(p) {}
+
+ void write(const void * src, size_t size) override {
+ memcpy(ptr, src, size);
+ ptr += size;
+ size_written += size;
+ }
+
+ size_t get_size_written() override {
+ return size_written;
+ }
+ };
+
+ struct llama_data_file_context : llama_data_context {
+ llama_file* file;
+ size_t size_written = 0;
+
+ llama_data_file_context(llama_file * f) : file(f) {}
+
+ void write(const void * src, size_t size) override {
+ file->write_raw(src, size);
+ size_written += size;
+ }
+
+ size_t get_size_written() override {
+ return size_written;
+ }
+ };
+
  #if defined(_WIN32)
  static std::string llama_format_win_err(DWORD err) {
  LPSTR buf;
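llama_data_context abstracts "write these bytes somewhere", so the same state-serialization code can target either a caller-supplied buffer or a llama_file. A sketch of the buffer back-end in isolation; the serializer function and the values it writes are stand-ins, not the real state format:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Re-declared here only to keep the sketch self-contained; the real
// definitions are the ones added in the diff above.
struct llama_data_context {
    virtual void write(const void * src, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~llama_data_context() = default;
};

struct llama_data_buffer_context : llama_data_context {
    uint8_t * ptr;
    size_t size_written = 0;

    llama_data_buffer_context(uint8_t * p) : ptr(p) {}

    void write(const void * src, size_t size) override {
        std::memcpy(ptr, src, size);   // advance through the caller's buffer
        ptr += size;
        size_written += size;
    }
    size_t get_size_written() override { return size_written; }
};

// Stand-in for code that streams some state through the abstraction.
static void example_serialize(llama_data_context & out) {
    const uint32_t version = 1;        // illustrative values only
    const float    some_param = 1.5f;
    out.write(&version, sizeof(version));
    out.write(&some_param, sizeof(some_param));
}

int main() {
    std::vector<uint8_t> buf(64);
    llama_data_buffer_context ctx(buf.data());
    example_serialize(ctx);
    // get_size_written() lets callers size or validate the destination buffer.
    return ctx.get_size_written() == sizeof(uint32_t) + sizeof(float) ? 0 : 1;
}
```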
@@ -179,7 +219,7 @@ struct llama_mmap {
  // prefetch/readahead impairs performance on NUMA systems
  if (numa) { prefetch = 0; }
  #ifdef __linux__
- if (prefetch) { flags |= MAP_POPULATE; }
+ if (prefetch >= file->size) { flags |= MAP_POPULATE; }
  #endif
  addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
  if (addr == MAP_FAILED) {