llama_cpp 0.12.0 → 0.12.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -2340,6 +2340,322 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t *
2340
2340
  return (n/QK_K*sizeof(block_q6_K));
2341
2341
  }
2342
2342
 
2343
+ // ====================== "True" 2-bit (de)-quantization
2344
+
2345
+ static const uint64_t iq2xxs_grid[256] = {
2346
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
2347
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
2348
+ 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
2349
+ 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
2350
+ 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
2351
+ 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
2352
+ 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
2353
+ 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
2354
+ 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
2355
+ 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
2356
+ 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
2357
+ 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
2358
+ 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
2359
+ 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
2360
+ 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
2361
+ 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
2362
+ 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
2363
+ 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
2364
+ 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
2365
+ 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
2366
+ 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
2367
+ 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
2368
+ 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
2369
+ 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
2370
+ 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
2371
+ 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
2372
+ 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
2373
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
2374
+ 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
2375
+ 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
2376
+ 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
2377
+ 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
2378
+ 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
2379
+ 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
2380
+ 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
2381
+ 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
2382
+ 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
2383
+ 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
2384
+ 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
2385
+ 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
2386
+ 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
2387
+ 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
2388
+ 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
2389
+ 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
2390
+ 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
2391
+ 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
2392
+ 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
2393
+ 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
2394
+ 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
2395
+ 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
2396
+ 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
2397
+ 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
2398
+ 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
2399
+ 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
2400
+ 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
2401
+ 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
2402
+ 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
2403
+ 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
2404
+ 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
2405
+ 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
2406
+ 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
2407
+ 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
2408
+ 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
2409
+ 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
2410
+ };
2411
+
2412
+ static const uint64_t iq2xs_grid[512] = {
2413
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
2414
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
2415
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
2416
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
2417
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
2418
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
2419
+ 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
2420
+ 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
2421
+ 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
2422
+ 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
2423
+ 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
2424
+ 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
2425
+ 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
2426
+ 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
2427
+ 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
2428
+ 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
2429
+ 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
2430
+ 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
2431
+ 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
2432
+ 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
2433
+ 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
2434
+ 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
2435
+ 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
2436
+ 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
2437
+ 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
2438
+ 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
2439
+ 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
2440
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
2441
+ 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
2442
+ 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
2443
+ 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
2444
+ 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
2445
+ 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
2446
+ 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
2447
+ 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
2448
+ 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
2449
+ 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
2450
+ 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
2451
+ 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
2452
+ 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
2453
+ 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
2454
+ 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
2455
+ 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
2456
+ 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
2457
+ 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
2458
+ 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
2459
+ 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
2460
+ 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
2461
+ 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
2462
+ 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
2463
+ 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
2464
+ 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
2465
+ 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
2466
+ 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
2467
+ 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
2468
+ 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
2469
+ 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
2470
+ 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
2471
+ 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
2472
+ 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
2473
+ 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
2474
+ 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
2475
+ 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
2476
+ 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
2477
+ 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
2478
+ 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
2479
+ 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
2480
+ 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
2481
+ 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
2482
+ 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
2483
+ 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
2484
+ 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
2485
+ 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
2486
+ 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
2487
+ 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
2488
+ 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
2489
+ 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
2490
+ 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
2491
+ 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
2492
+ 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
2493
+ 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
2494
+ 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
2495
+ 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
2496
+ 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
2497
+ 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
2498
+ 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
2499
+ 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
2500
+ 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
2501
+ 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
2502
+ 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
2503
+ 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
2504
+ 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
2505
+ 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
2506
+ 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
2507
+ 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
2508
+ 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
2509
+ 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
2510
+ 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
2511
+ 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
2512
+ 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
2513
+ 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
2514
+ 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
2515
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
2516
+ 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
2517
+ 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
2518
+ 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
2519
+ 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
2520
+ 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
2521
+ 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
2522
+ 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
2523
+ 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
2524
+ 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
2525
+ 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
2526
+ 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
2527
+ 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
2528
+ 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
2529
+ 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
2530
+ 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
2531
+ 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
2532
+ 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
2533
+ 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
2534
+ 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
2535
+ 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
2536
+ 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
2537
+ 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
2538
+ 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
2539
+ 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
2540
+ 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
2541
+ };
2542
+
2543
// Sign-pattern lookup indexed by a 7-bit selector (0..127). Each entry is an
// 8-bit mask in which bit j set marks element j as negated (tested against
// kmask_iq2xs[j] by the dequantization and dot-product code in this file).
// The low 7 bits of entry i equal i; bit 7 is set when i has an odd number of
// set bits (e.g. 1 -> 129, 3 -> 3, 7 -> 135), so every mask has an even
// number of set bits.
+ static const uint8_t ksigns_iq2xs[128] = {
2544
+ 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
2545
+ 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
2546
+ 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
2547
+ 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
2548
+ 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
2549
+ 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
2550
+ 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
2551
+ 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
2552
+ };
2553
+
2554
// Single-bit masks: kmask_iq2xs[j] == 1 << j. Used to test bit j of a sign mask.
+ static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2555
+
2556
+ void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
2557
+ (void)x;
2558
+ (void)y;
2559
+ (void)k;
2560
+ assert(k % QK_K == 0);
2561
+ //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2562
+ }
2563
+
2564
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
2565
+ assert(k % QK_K == 0);
2566
+ const int nb = k / QK_K;
2567
+
2568
+ uint32_t aux32[2];
2569
+ const uint8_t * aux8 = (const uint8_t *)aux32;
2570
+
2571
+ for (int i = 0; i < nb; i++) {
2572
+
2573
+ const float d = GGML_FP16_TO_FP32(x[i].d);
2574
+
2575
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2576
+ memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t));
2577
+ const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f;
2578
+ for (int l = 0; l < 4; ++l) {
2579
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
2580
+ const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
2581
+ for (int j = 0; j < 8; ++j) {
2582
+ y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
2583
+ }
2584
+ y += 8;
2585
+ }
2586
+ }
2587
+ }
2588
+ }
2589
+
2590
+ void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
2591
+ assert(k % QK_K == 0);
2592
+ block_iq2_xxs * restrict y = vy;
2593
+ quantize_row_iq2_xxs_reference(x, y, k);
2594
+ }
2595
+
2596
+ size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
2597
+ assert(k % QK_K == 0);
2598
+ (void)hist; // TODO: collect histograms
2599
+
2600
+ for (int j = 0; j < n; j += k) {
2601
+ block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
2602
+ quantize_row_iq2_xxs_reference(src + j, y, k);
2603
+ }
2604
+ return (n/QK_K*sizeof(block_iq2_xxs));
2605
+ }
2606
+
2607
+ // ====================== 2.3125 bpw (de)-quantization
2608
+
2609
+ void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
2610
+ (void)x;
2611
+ (void)y;
2612
+ (void)k;
2613
+ assert(k % QK_K == 0);
2614
+ //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2615
+ }
2616
+
2617
+ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
2618
+ assert(k % QK_K == 0);
2619
+ const int nb = k / QK_K;
2620
+
2621
+ float db[2];
2622
+
2623
+ for (int i = 0; i < nb; i++) {
2624
+
2625
+ const float d = GGML_FP16_TO_FP32(x[i].d);
2626
+
2627
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2628
+ db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
2629
+ db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
2630
+ for (int l = 0; l < 4; ++l) {
2631
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511));
2632
+ const uint8_t signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9];
2633
+ for (int j = 0; j < 8; ++j) {
2634
+ y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
2635
+ }
2636
+ y += 8;
2637
+ }
2638
+ }
2639
+ }
2640
+ }
2641
+
2642
+ void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
2643
+ assert(k % QK_K == 0);
2644
+ block_iq2_xs * restrict y = vy;
2645
+ quantize_row_iq2_xs_reference(x, y, k);
2646
+ }
2647
+
2648
+ size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
2649
+ assert(k % QK_K == 0);
2650
+ (void)hist; // TODO: collect histograms
2651
+
2652
+ for (int j = 0; j < n; j += k) {
2653
+ block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
2654
+ quantize_row_iq2_xs_reference(src + j, y, k);
2655
+ }
2656
+ return (n/QK_K*sizeof(block_iq2_xs));
2657
+ }
2658
+
2343
2659
  //===================================== Q8_K ==============================================
2344
2660
 
2345
2661
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -2362,7 +2678,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
2362
2678
  x += QK_K;
2363
2679
  continue;
2364
2680
  }
2365
- const float iscale = -128.f/max;
2681
+ //const float iscale = -128.f/max;
2682
+ // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward
2683
+ const float iscale = -127.f/max;
2366
2684
  for (int j = 0; j < QK_K; ++j) {
2367
2685
  int v = nearest_int(iscale*x[j]);
2368
2686
  y[i].qs[j] = MIN(127, v);
@@ -7065,3 +7383,319 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
7065
7383
  }
7066
7384
 
7067
7385
  #endif
7386
+
7387
+ static const int8_t keven_signs_q2xs[1024] = {
7388
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
7389
+ 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
7390
+ 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
7391
+ 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
7392
+ 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
7393
+ 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
7394
+ 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
7395
+ 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
7396
+ 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
7397
+ 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
7398
+ 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
7399
+ 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
7400
+ 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
7401
+ 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
7402
+ 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
7403
+ 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
7404
+ 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
7405
+ 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
7406
+ 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
7407
+ 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
7408
+ 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
7409
+ 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
7410
+ 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
7411
+ 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
7412
+ 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
7413
+ 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
7414
+ 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
7415
+ 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
7416
+ 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
7417
+ 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
7418
+ 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
7419
+ 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
7420
+ };
7421
+
7422
+ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
7423
+ assert(n % QK_K == 0);
7424
+
7425
+ const block_iq2_xxs * restrict x = vx;
7426
+ const block_q8_K * restrict y = vy;
7427
+
7428
+ const int nb = n / QK_K;
7429
+
7430
+ #if defined(__ARM_NEON)
7431
+
7432
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
7433
+
7434
+ uint32_t aux32[4];
7435
+ const uint8_t * aux8 = (const uint8_t *)aux32;
7436
+
7437
+ ggml_int8x16x4_t q2u;
7438
+ ggml_int8x16x4_t q2s;
7439
+ ggml_int8x16x4_t q8b;
7440
+
7441
+ float sumf = 0;
7442
+ for (int i = 0; i < nb; ++i) {
7443
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7444
+ const uint16_t * restrict q2 = x[i].qs;
7445
+ const int8_t * restrict q8 = y[i].qs;
7446
+ float sumf1 = 0, sumf2 = 0;
7447
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
7448
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
7449
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
7450
+ q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
7451
+ q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
7452
+ q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
7453
+ q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
7454
+ q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
7455
+ q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
7456
+ q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127))));
7457
+ q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
7458
+ q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
7459
+ q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
7460
+ q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
7461
+ q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
7462
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
7463
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
7464
+ sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
7465
+ sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
7466
+ }
7467
+ sumf += d*(sumf1 + sumf2);
7468
+ }
7469
+ *s = 0.25f * sumf;
7470
+
7471
+ #elif defined(__AVX2__)
7472
+
7473
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
7474
+
7475
+ uint32_t aux32[4];
7476
+ const uint8_t * aux8 = (const uint8_t *)aux32;
7477
+
7478
+ __m256 accumf = _mm256_setzero_ps();
7479
+ for (int i = 0; i < nb; ++i) {
7480
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7481
+ const uint16_t * restrict q2 = x[i].qs;
7482
+ const int8_t * restrict q8 = y[i].qs;
7483
+ __m256i sumi1 = _mm256_setzero_si256();
7484
+ __m256i sumi2 = _mm256_setzero_si256();
7485
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
7486
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
7487
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
7488
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
7489
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
7490
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
7491
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
7492
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
7493
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
7494
+ signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
7495
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
7496
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
7497
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
7498
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
7499
+ const uint16_t ls1 = aux32[1] >> 28;
7500
+ const uint16_t ls2 = aux32[3] >> 28;
7501
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
7502
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
7503
+ sumi1 = _mm256_add_epi32(sumi1, p1);
7504
+ sumi2 = _mm256_add_epi32(sumi2, p2);
7505
+ }
7506
+
7507
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
7508
+
7509
+ }
7510
+
7511
+ *s = 0.125f * hsum_float_8(accumf);
7512
+
7513
+ #else
7514
+
7515
+ uint32_t aux32[2];
7516
+ const uint8_t * aux8 = (const uint8_t *)aux32;
7517
+
7518
+ float sumf = 0.f;
7519
+ for (int i = 0; i < nb; ++i) {
7520
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7521
+ const uint16_t * restrict q2 = x[i].qs;
7522
+ const int8_t * restrict q8 = y[i].qs;
7523
+ int32_t bsum = 0;
7524
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
7525
+ memcpy(aux32, q2, 2*sizeof(uint32_t));
7526
+ q2 += 4;
7527
+ const uint32_t ls = 2*(aux32[1] >> 28) + 1;
7528
+ int32_t sumi = 0;
7529
+ for (int l = 0; l < 4; ++l) {
7530
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
7531
+ const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
7532
+ for (int j = 0; j < 8; ++j) {
7533
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
7534
+ }
7535
+ q8 += 8;
7536
+ }
7537
+ bsum += sumi * ls;
7538
+ }
7539
+ sumf += d * bsum;
7540
+ }
7541
+ *s = 0.125f * sumf;
7542
+ #endif
7543
+ }
7544
+
7545
// Dot product between a row of block_iq2_xs blocks (vx) and a row of block_q8_K blocks (vy).
//
//   n  - number of values in each row; asserted to be a multiple of QK_K
//   s  - output: receives the single float result
//   vx - pointer to n/QK_K block_iq2_xs blocks
//   vy - pointer to n/QK_K block_q8_K blocks
//
// As read by the code below:
//   * each 16-bit entry of x[i].qs selects an 8-byte entry of iq2xs_grid with its low 9 bits
//     and a sign-table entry with its upper 7 bits;
//   * each byte of x[i].scales packs two 4-bit values; each is used as (2*v + 1) and applied
//     to 16 consecutive products;
//   * per-block sums are weighted by GGML_FP16_TO_FP32(x[i].d) * y[i].d and the total is
//     multiplied by 0.125f before being stored in *s.
//
// Three implementations are selected at compile time: ARM NEON, AVX2, and a scalar fallback.
+ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
7546
+ assert(n % QK_K == 0);
7547
+
7548
+ const block_iq2_xs * restrict x = vx;
7549
+ const block_q8_K * restrict y = vy;
7550
+
7551
+ const int nb = n / QK_K; // number of blocks to process
7552
+
7553
+ #if defined(__ARM_NEON)
7554
+
7555
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // sign table viewed as 8-byte entries
7556
+
7557
+ int8x16x4_t q2u;
7558
+ int8x16x4_t q2s;
7559
+ int8x16x4_t q8b;
7560
+
7561
+ int32x4x4_t scales32;
7562
+
7563
+ float sumf = 0;
7564
+ for (int i = 0; i < nb; ++i) {
7565
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
7566
+ const uint16_t * restrict q2 = x[i].qs;
7567
+ const int8_t * restrict q8 = y[i].qs;
7568
+ const uint8x8_t scales8 = vld1_u8(x[i].scales); // 8 bytes = 16 packed 4-bit scales
7569
+ const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); // low nibbles
7570
+ const uint8x8_t scales_h = vshr_n_u8(scales8, 4); // high nibbles
7571
+ uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); // interleave: low0, high0, low1, high1, ...
7572
+ scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); // v -> 2*v + 1
7573
+ const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); // widen 8 -> 16 bits
7574
+ const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales));
7575
+ scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); // widen 16 -> 32 bits, four scales per vector
7576
+ scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
7577
+ scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
7578
+ scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
7579
+ int32x4_t sumi = vdupq_n_s32(0);
7580
+ for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { // 64 values (8 entries of q2) per iteration
7581
+ q8b = vld1q_s8_x4(q8); q8 += 64;
7582
+ q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); // two 8-byte grid entries per vector
7583
+ q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
7584
+ q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
7585
+ q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
7586
+ q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); // matching 8-byte sign entries
7587
+ q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
7588
+ q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
7589
+ q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
7590
+ q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); // multiply grid bytes by sign-table bytes (presumably +1/-1; table not visible here)
7591
+ q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
7592
+ q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
7593
+ q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
7594
+ const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); // NOTE(review): helper defined elsewhere; presumably 4-way int8 dot products per 32-bit lane
7595
+ const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
7596
+ const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
7597
+ const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
7598
+ const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); // lane k = sum of all lanes of p(k+1)
7599
+ sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); // sumi += p * scales for this group of 64 values
7600
+ q2 += 8;
7601
+ }
7602
+ sumf += d*vaddvq_s32(sumi); // horizontal sum, weighted by the block scale
7603
+ }
7604
+ *s = 0.125f * sumf;
7605
+
7606
+ #elif defined(__AVX2__)
7607
+
7608
+ const __m128i m4 = _mm_set1_epi8(0xf); // nibble mask
7609
+ const __m128i m1 = _mm_set1_epi8(1);
7610
+ const __m128i m511 = _mm_set1_epi16(511); // low 9 bits: grid index
7611
+ const __m128i m127 = _mm_set1_epi16(127); // 7 bits: sign index
7612
+
7613
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // sign table viewed as 8-byte entries
7614
+
7615
+ uint64_t aux64;
7616
+
7617
+ // somewhat hacky, but gives a significant boost in performance
7618
+ __m128i aux_gindex, aux_sindex;
7619
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex; // reads the eight 16-bit lanes of aux_gindex directly
7620
+ const uint16_t * sindex = (const uint16_t *)&aux_sindex; // reads the eight 16-bit lanes of aux_sindex directly
7621
+
7622
+ __m256 accumf = _mm256_setzero_ps();
7623
+ for (int i = 0; i < nb; ++i) {
7624
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
7625
+ const uint16_t * restrict q2 = x[i].qs;
7626
+ const int8_t * restrict q8 = y[i].qs;
7627
+
7628
+ memcpy(&aux64, x[i].scales, 8); // 8 bytes = 16 packed 4-bit scales
7629
+ __m128i stmp = _mm_set1_epi64x(aux64);
7630
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); // interleave low and high nibbles, one per byte
7631
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); // v -> 2*v + 1
7632
+
7633
+ __m256i sumi1 = _mm256_setzero_si256();
7634
+ __m256i sumi2 = _mm256_setzero_si256();
7635
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { // two groups of 32 values per iteration
7636
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
7637
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
7638
+ const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8; // eight 16-bit entries
7639
+ aux_gindex = _mm_and_si128(q2_data, m511); // grid indices for all eight entries
7640
+ aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127); // sign indices for all eight entries
7641
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
7642
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
7643
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]);
7644
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
7645
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); // signs are applied to q8: maddubs below treats its first operand (the grid) as unsigned
7646
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
7647
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // 16-bit sums of adjacent byte products
7648
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
7649
+
7650
+ const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); // NOTE(review): get_scale_shuffle is defined elsewhere; presumably selects the scales for group ib32
7651
+ const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
7652
+
7653
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); // scale and widen to 32-bit partial sums
7654
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
7655
+ }
7656
+
7657
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); // accumf += d * (sumi1 + sumi2)
7658
+
7659
+ }
7660
+
7661
+ *s = 0.125f * hsum_float_8(accumf);
7662
+
7663
+ #else
7664
+
7665
+ float sumf = 0.f;
7666
+ for (int i = 0; i < nb; ++i) {
7667
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
7668
+ const uint16_t * restrict q2 = x[i].qs;
7669
+ const uint8_t * restrict sc = x[i].scales;
7670
+ const int8_t * restrict q8 = y[i].qs;
7671
+ int32_t bsum = 0;
7672
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { // 32 values (4 entries of q2) per iteration
7673
+ const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; // scale for the first 16 values (low nibble)
7674
+ const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; // scale for the second 16 values (high nibble)
7675
+ int32_t sumi = 0;
7676
+ for (int l = 0; l < 2; ++l) {
7677
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); // 8 bytes of the selected grid entry
7678
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; // sign bits for those 8 values
7679
+ for (int j = 0; j < 8; ++j) {
7680
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); // negate when the bit selected by kmask_iq2xs[j] is set
7681
+ }
7682
+ q8 += 8;
7683
+ }
7684
+ bsum += sumi * ls1;
7685
+ sumi = 0;
7686
+ for (int l = 2; l < 4; ++l) {
7687
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
7688
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
7689
+ for (int j = 0; j < 8; ++j) {
7690
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
7691
+ }
7692
+ q8 += 8;
7693
+ }
7694
+ bsum += sumi * ls2;
7695
+ q2 += 4;
7696
+ }
7697
+ sumf += d * bsum;
7698
+ }
7699
+ *s = 0.125f * sumf;
7700
+ #endif
7701
+ }