llama_cpp 0.12.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2340,6 +2340,322 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t *
2340
2340
  return (n/QK_K*sizeof(block_q6_K));
2341
2341
  }
2342
2342
 
2343
+ // ====================== "True" 2-bit (de)-quantization
2344
+
2345
+ static const uint64_t iq2xxs_grid[256] = {
2346
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
2347
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
2348
+ 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
2349
+ 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
2350
+ 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
2351
+ 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
2352
+ 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
2353
+ 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
2354
+ 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
2355
+ 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
2356
+ 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
2357
+ 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
2358
+ 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
2359
+ 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
2360
+ 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
2361
+ 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
2362
+ 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
2363
+ 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
2364
+ 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
2365
+ 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
2366
+ 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
2367
+ 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
2368
+ 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
2369
+ 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
2370
+ 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
2371
+ 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
2372
+ 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
2373
+ 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
2374
+ 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
2375
+ 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
2376
+ 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
2377
+ 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
2378
+ 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
2379
+ 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
2380
+ 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
2381
+ 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
2382
+ 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
2383
+ 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
2384
+ 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
2385
+ 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
2386
+ 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
2387
+ 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
2388
+ 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
2389
+ 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
2390
+ 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
2391
+ 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
2392
+ 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
2393
+ 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
2394
+ 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
2395
+ 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
2396
+ 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
2397
+ 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
2398
+ 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
2399
+ 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
2400
+ 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
2401
+ 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
2402
+ 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
2403
+ 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
2404
+ 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
2405
+ 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
2406
+ 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
2407
+ 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
2408
+ 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
2409
+ 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
2410
+ };
2411
+
2412
+ static const uint64_t iq2xs_grid[512] = {
2413
+ 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
2414
+ 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
2415
+ 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
2416
+ 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
2417
+ 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
2418
+ 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
2419
+ 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
2420
+ 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
2421
+ 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
2422
+ 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
2423
+ 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
2424
+ 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
2425
+ 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
2426
+ 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
2427
+ 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
2428
+ 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
2429
+ 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
2430
+ 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
2431
+ 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
2432
+ 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
2433
+ 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
2434
+ 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
2435
+ 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
2436
+ 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
2437
+ 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
2438
+ 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
2439
+ 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
2440
+ 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
2441
+ 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
2442
+ 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
2443
+ 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
2444
+ 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
2445
+ 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
2446
+ 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
2447
+ 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
2448
+ 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
2449
+ 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
2450
+ 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
2451
+ 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
2452
+ 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
2453
+ 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
2454
+ 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
2455
+ 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
2456
+ 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
2457
+ 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
2458
+ 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
2459
+ 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
2460
+ 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
2461
+ 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
2462
+ 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
2463
+ 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
2464
+ 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
2465
+ 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
2466
+ 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
2467
+ 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
2468
+ 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
2469
+ 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
2470
+ 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
2471
+ 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
2472
+ 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
2473
+ 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
2474
+ 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
2475
+ 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
2476
+ 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
2477
+ 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
2478
+ 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
2479
+ 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
2480
+ 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
2481
+ 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
2482
+ 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
2483
+ 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
2484
+ 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
2485
+ 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
2486
+ 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
2487
+ 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
2488
+ 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
2489
+ 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
2490
+ 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
2491
+ 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
2492
+ 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
2493
+ 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
2494
+ 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
2495
+ 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
2496
+ 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
2497
+ 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
2498
+ 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
2499
+ 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
2500
+ 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
2501
+ 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
2502
+ 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
2503
+ 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
2504
+ 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
2505
+ 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
2506
+ 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
2507
+ 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
2508
+ 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
2509
+ 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
2510
+ 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
2511
+ 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
2512
+ 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
2513
+ 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
2514
+ 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
2515
+ 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
2516
+ 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
2517
+ 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
2518
+ 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
2519
+ 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
2520
+ 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
2521
+ 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
2522
+ 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
2523
+ 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
2524
+ 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
2525
+ 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
2526
+ 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
2527
+ 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
2528
+ 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
2529
+ 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
2530
+ 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
2531
+ 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
2532
+ 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
2533
+ 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
2534
+ 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
2535
+ 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
2536
+ 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
2537
+ 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
2538
+ 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
2539
+ 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
2540
+ 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
2541
+ };
2542
+
2543
// Sign patterns for groups of 8 quants, indexed by a 7-bit code.
// Entry i keeps i in bits 0..6 and stores the parity of i in bit 7, so every
// entry has an even number of set bits.
#define IQ2_KSIGN_PARITY(i) ((((i) >> 0) ^ ((i) >> 1) ^ ((i) >> 2) ^ ((i) >> 3) ^ \
                              ((i) >> 4) ^ ((i) >> 5) ^ ((i) >> 6)) & 1)
#define IQ2_KSIGN_1(i)  ((i) | (IQ2_KSIGN_PARITY(i) << 7))
#define IQ2_KSIGN_4(i)  IQ2_KSIGN_1(i), IQ2_KSIGN_1((i) + 1), IQ2_KSIGN_1((i) + 2), IQ2_KSIGN_1((i) + 3)
#define IQ2_KSIGN_16(i) IQ2_KSIGN_4(i), IQ2_KSIGN_4((i) + 4), IQ2_KSIGN_4((i) + 8), IQ2_KSIGN_4((i) + 12)

static const uint8_t ksigns_iq2xs[128] = {
    IQ2_KSIGN_16(  0),
    IQ2_KSIGN_16( 16),
    IQ2_KSIGN_16( 32),
    IQ2_KSIGN_16( 48),
    IQ2_KSIGN_16( 64),
    IQ2_KSIGN_16( 80),
    IQ2_KSIGN_16( 96),
    IQ2_KSIGN_16(112),
};

#undef IQ2_KSIGN_16
#undef IQ2_KSIGN_4
#undef IQ2_KSIGN_1
#undef IQ2_KSIGN_PARITY
2553
+
2554
// Single-bit masks: kmask_iq2xs[j] selects bit j (j = 0..7) of a sign byte.
static const uint8_t kmask_iq2xs[8] = {
    1u << 0, 1u << 1, 1u << 2, 1u << 3,
    1u << 4, 1u << 5, 1u << 6, 1u << 7,
};
2555
+
2556
+ void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
2557
+ (void)x;
2558
+ (void)y;
2559
+ (void)k;
2560
+ assert(k % QK_K == 0);
2561
+ //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2562
+ }
2563
+
2564
+ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
2565
+ assert(k % QK_K == 0);
2566
+ const int nb = k / QK_K;
2567
+
2568
+ uint32_t aux32[2];
2569
+ const uint8_t * aux8 = (const uint8_t *)aux32;
2570
+
2571
+ for (int i = 0; i < nb; i++) {
2572
+
2573
+ const float d = GGML_FP16_TO_FP32(x[i].d);
2574
+
2575
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2576
+ memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t));
2577
+ const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f;
2578
+ for (int l = 0; l < 4; ++l) {
2579
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
2580
+ const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
2581
+ for (int j = 0; j < 8; ++j) {
2582
+ y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
2583
+ }
2584
+ y += 8;
2585
+ }
2586
+ }
2587
+ }
2588
+ }
2589
+
2590
+ void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
2591
+ assert(k % QK_K == 0);
2592
+ block_iq2_xxs * restrict y = vy;
2593
+ quantize_row_iq2_xxs_reference(x, y, k);
2594
+ }
2595
+
2596
+ size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
2597
+ assert(k % QK_K == 0);
2598
+ (void)hist; // TODO: collect histograms
2599
+
2600
+ for (int j = 0; j < n; j += k) {
2601
+ block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
2602
+ quantize_row_iq2_xxs_reference(src + j, y, k);
2603
+ }
2604
+ return (n/QK_K*sizeof(block_iq2_xxs));
2605
+ }
2606
+
2607
+ // ====================== 2.3125 bpw (de)-quantization
2608
+
2609
+ void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
2610
+ (void)x;
2611
+ (void)y;
2612
+ (void)k;
2613
+ assert(k % QK_K == 0);
2614
+ //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2615
+ }
2616
+
2617
+ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
2618
+ assert(k % QK_K == 0);
2619
+ const int nb = k / QK_K;
2620
+
2621
+ float db[2];
2622
+
2623
+ for (int i = 0; i < nb; i++) {
2624
+
2625
+ const float d = GGML_FP16_TO_FP32(x[i].d);
2626
+
2627
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
2628
+ db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
2629
+ db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
2630
+ for (int l = 0; l < 4; ++l) {
2631
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511));
2632
+ const uint8_t signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9];
2633
+ for (int j = 0; j < 8; ++j) {
2634
+ y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
2635
+ }
2636
+ y += 8;
2637
+ }
2638
+ }
2639
+ }
2640
+ }
2641
+
2642
+ void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
2643
+ assert(k % QK_K == 0);
2644
+ block_iq2_xs * restrict y = vy;
2645
+ quantize_row_iq2_xs_reference(x, y, k);
2646
+ }
2647
+
2648
+ size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
2649
+ assert(k % QK_K == 0);
2650
+ (void)hist; // TODO: collect histograms
2651
+
2652
+ for (int j = 0; j < n; j += k) {
2653
+ block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
2654
+ quantize_row_iq2_xs_reference(src + j, y, k);
2655
+ }
2656
+ return (n/QK_K*sizeof(block_iq2_xs));
2657
+ }
2658
+
2343
2659
  //===================================== Q8_K ==============================================
2344
2660
 
2345
2661
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -2362,7 +2678,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
2362
2678
  x += QK_K;
2363
2679
  continue;
2364
2680
  }
2365
- const float iscale = -128.f/max;
2681
+ //const float iscale = -128.f/max;
2682
+ // We need this change for IQ2_XXS, else the AVX implementation becomes very awkward
2683
+ const float iscale = -127.f/max;
2366
2684
  for (int j = 0; j < QK_K; ++j) {
2367
2685
  int v = nearest_int(iscale*x[j]);
2368
2686
  y[i].qs[j] = MIN(127, v);
@@ -7065,3 +7383,319 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
7065
7383
  }
7066
7384
 
7067
7385
  #endif
7386
+
7387
+ static const int8_t keven_signs_q2xs[1024] = {
7388
+ 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
7389
+ 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
7390
+ 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
7391
+ 1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
7392
+ 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
7393
+ 1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
7394
+ 1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
7395
+ 1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
7396
+ 1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
7397
+ 1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
7398
+ 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
7399
+ 1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
7400
+ 1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
7401
+ 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
7402
+ 1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
7403
+ 1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
7404
+ 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
7405
+ 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
7406
+ 1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
7407
+ 1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
7408
+ 1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
7409
+ 1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
7410
+ 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
7411
+ 1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
7412
+ 1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
7413
+ 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
7414
+ 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
7415
+ 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
7416
+ 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
7417
+ 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
7418
+ 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
7419
+ 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
7420
+ };
7421
+
7422
+ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
7423
+ assert(n % QK_K == 0);
7424
+
7425
+ const block_iq2_xxs * restrict x = vx;
7426
+ const block_q8_K * restrict y = vy;
7427
+
7428
+ const int nb = n / QK_K;
7429
+
7430
+ #if defined(__ARM_NEON)
7431
+
7432
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
7433
+
7434
+ uint32_t aux32[4];
7435
+ const uint8_t * aux8 = (const uint8_t *)aux32;
7436
+
7437
+ ggml_int8x16x4_t q2u;
7438
+ ggml_int8x16x4_t q2s;
7439
+ ggml_int8x16x4_t q8b;
7440
+
7441
+ float sumf = 0;
7442
+ for (int i = 0; i < nb; ++i) {
7443
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7444
+ const uint16_t * restrict q2 = x[i].qs;
7445
+ const int8_t * restrict q8 = y[i].qs;
7446
+ float sumf1 = 0, sumf2 = 0;
7447
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
7448
+ q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
7449
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
7450
+ q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
7451
+ q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
7452
+ q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
7453
+ q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
7454
+ q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
7455
+ q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
7456
+ q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127))));
7457
+ q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
7458
+ q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
7459
+ q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
7460
+ q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
7461
+ q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
7462
+ const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
7463
+ const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
7464
+ sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
7465
+ sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
7466
+ }
7467
+ sumf += d*(sumf1 + sumf2);
7468
+ }
7469
+ *s = 0.25f * sumf;
7470
+
7471
+ #elif defined(__AVX2__)
7472
+
7473
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
7474
+
7475
+ uint32_t aux32[4];
7476
+ const uint8_t * aux8 = (const uint8_t *)aux32;
7477
+
7478
+ __m256 accumf = _mm256_setzero_ps();
7479
+ for (int i = 0; i < nb; ++i) {
7480
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7481
+ const uint16_t * restrict q2 = x[i].qs;
7482
+ const int8_t * restrict q8 = y[i].qs;
7483
+ __m256i sumi1 = _mm256_setzero_si256();
7484
+ __m256i sumi2 = _mm256_setzero_si256();
7485
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
7486
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
7487
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
7488
+ memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
7489
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
7490
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
7491
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
7492
+ signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
7493
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
7494
+ signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
7495
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
7496
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
7497
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
7498
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
7499
+ const uint16_t ls1 = aux32[1] >> 28;
7500
+ const uint16_t ls2 = aux32[3] >> 28;
7501
+ const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
7502
+ const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
7503
+ sumi1 = _mm256_add_epi32(sumi1, p1);
7504
+ sumi2 = _mm256_add_epi32(sumi2, p2);
7505
+ }
7506
+
7507
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
7508
+
7509
+ }
7510
+
7511
+ *s = 0.125f * hsum_float_8(accumf);
7512
+
7513
+ #else
7514
+
7515
+ uint32_t aux32[2];
7516
+ const uint8_t * aux8 = (const uint8_t *)aux32;
7517
+
7518
+ float sumf = 0.f;
7519
+ for (int i = 0; i < nb; ++i) {
7520
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
7521
+ const uint16_t * restrict q2 = x[i].qs;
7522
+ const int8_t * restrict q8 = y[i].qs;
7523
+ int32_t bsum = 0;
7524
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
7525
+ memcpy(aux32, q2, 2*sizeof(uint32_t));
7526
+ q2 += 4;
7527
+ const uint32_t ls = 2*(aux32[1] >> 28) + 1;
7528
+ int32_t sumi = 0;
7529
+ for (int l = 0; l < 4; ++l) {
7530
+ const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
7531
+ const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
7532
+ for (int j = 0; j < 8; ++j) {
7533
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
7534
+ }
7535
+ q8 += 8;
7536
+ }
7537
+ bsum += sumi * ls;
7538
+ }
7539
+ sumf += d * bsum;
7540
+ }
7541
+ *s = 0.125f * sumf;
7542
+ #endif
7543
+ }
7544
+
7545
+ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { // Dot product over n elements: vx is read as block_iq2_xs blocks, vy as block_q8_K blocks; the scalar result is written to *s.
7546
+ assert(n % QK_K == 0); // n must cover a whole number of QK_K-element blocks
7547
+ // Per block every path accumulates d * sum(grid_byte * sign * q8 * (2*scale4 + 1)); the grand total is multiplied by 0.125f before it is stored.
7548
+ const block_iq2_xs * restrict x = vx;
7549
+ const block_q8_K * restrict y = vy;
7550
+
7551
+ const int nb = n / QK_K; // number of blocks to process
7552
+
7553
+ #if defined(__ARM_NEON) // NEON path: consumes 64 q8 values and 8 q2 entries per inner iteration
7554
+
7555
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // view the sign table as one 8-byte entry per sign index
7556
+
7557
+ int8x16x4_t q2u; // grid bytes; multiplied by the sign bytes in place further down
7558
+ int8x16x4_t q2s; // sign bytes fetched from signs64 (presumably +1/-1 factors -- confirm against the table definition)
7559
+ int8x16x4_t q8b; // 64 q8 values of the current chunk
7560
+
7561
+ int32x4x4_t scales32; // the 16 per-block scales, widened to 32 bits
7562
+
7563
+ float sumf = 0;
7564
+ for (int i = 0; i < nb; ++i) {
7565
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
7566
+ const uint16_t * restrict q2 = x[i].qs;
7567
+ const int8_t * restrict q8 = y[i].qs;
7568
+ const uint8x8_t scales8 = vld1_u8(x[i].scales); // 8 bytes, two 4-bit scales each
7569
+ const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); // low nibbles
7570
+ const uint8x8_t scales_h = vshr_n_u8(scales8, 4); // high nibbles
7571
+ uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); // interleave: low0, high0, low1, high1, ...
7572
+ scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); // map each 4-bit value v to 2*v + 1
7573
+ const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); // widen u8 -> u16 (scales 0..7)
7574
+ const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); // widen u8 -> u16 (scales 8..15)
7575
+ scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); // widen u16 -> 32 bits, four scales per vector
7576
+ scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
7577
+ scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
7578
+ scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
7579
+ int32x4_t sumi = vdupq_n_s32(0); // integer accumulator for this block
7580
+ for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
7581
+ q8b = vld1q_s8_x4(q8); q8 += 64;
7582
+ q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); // low 9 bits of each q2 entry select an 8-byte iq2xs_grid entry
7583
+ q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
7584
+ q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
7585
+ q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
7586
+ q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); // upper 7 bits select the matching 8-byte sign entry
7587
+ q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
7588
+ q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
7589
+ q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
7590
+ q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); // apply the sign bytes to the grid bytes
7591
+ q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
7592
+ q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
7593
+ q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
7594
+ const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); // ggml_vdotq_s32 is defined elsewhere; presumably a vdotq_s32-style dot product of 4-byte groups -- confirm
7595
+ const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
7596
+ const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
7597
+ const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
7598
+ const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); // two pairwise-add levels: lanes 0..3 hold the lane totals of p1, p2, p3, p4
7599
+ sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); // weight each 16-element total by its scale and accumulate
7600
+ q2 += 8; // 8 q2 entries consumed per 64 elements
7601
+ }
7602
+ sumf += d*vaddvq_s32(sumi); // horizontal sum of the block, scaled by d
7603
+ }
7604
+ *s = 0.125f * sumf;
7605
+
7606
+ #elif defined(__AVX2__) // AVX2 path: consumes two 32-element groups (8 q2 entries) per inner iteration
7607
+
7608
+ const __m128i m4 = _mm_set1_epi8(0xf); // nibble mask
7609
+ const __m128i m1 = _mm_set1_epi8(1);
7610
+ const __m128i m511 = _mm_set1_epi16(511); // 9-bit grid-index mask
7611
+ const __m128i m127 = _mm_set1_epi16(127); // 7-bit sign-index mask
7612
+
7613
+ const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; // view the sign table as one 8-byte entry per sign index
7614
+
7615
+ uint64_t aux64; // receives the 8 scale bytes of the current block
7616
+
7617
+ // somewhat hacky, but gives a significant boost in performance
7618
+ __m128i aux_gindex, aux_sindex; // vectors holding 8 grid indices / 8 sign indices
7619
+ const uint16_t * gindex = (const uint16_t *)&aux_gindex; // the vectors are read back element-wise through these uint16_t views
7620
+ const uint16_t * sindex = (const uint16_t *)&aux_sindex;
7621
+
7622
+ __m256 accumf = _mm256_setzero_ps();
7623
+ for (int i = 0; i < nb; ++i) {
7624
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
7625
+ const uint16_t * restrict q2 = x[i].qs;
7626
+ const int8_t * restrict q8 = y[i].qs;
7627
+
7628
+ memcpy(&aux64, x[i].scales, 8); // copy the 8 packed scale bytes
7629
+ __m128i stmp = _mm_set1_epi64x(aux64);
7630
+ stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); // interleave nibbles: low0, high0, low1, high1, ...
7631
+ const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); // map each 4-bit value v to 2*v + 1
7632
+
7633
+ __m256i sumi1 = _mm256_setzero_si256(); // accumulator for the first group of each pair
7634
+ __m256i sumi2 = _mm256_setzero_si256(); // accumulator for the second group of each pair
7635
+ for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
7636
+ const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
7637
+ const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
7638
+ const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8; // 8 q2 entries cover both 32-element groups
7639
+ aux_gindex = _mm_and_si128(q2_data, m511); // low 9 bits: grid indices
7640
+ aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127); // upper 7 bits: sign indices
7641
+ const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); // gather four 8-byte grid entries (entry 0 in the lowest lane)
7642
+ const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
7643
+ const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]); // gather the matching sign entries
7644
+ const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
7645
+ const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); // signs are applied to the q8 side, leaving the grid bytes as the unsigned maddubs operand
7646
+ const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
7647
+ const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); // adjacent byte products summed into 16-bit lanes
7648
+ const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
7649
+
7650
+ const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); // get_scale_shuffle is defined elsewhere; presumably spreads this group's two scales over the 16-bit lanes -- confirm
7651
+ const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));
7652
+
7653
+ sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); // weight by the scales and accumulate in 32-bit lanes
7654
+ sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
7655
+ }
7656
+
7657
+ accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); // accumf += d * (sumi1 + sumi2), lane-wise in float
7658
+
7659
+ }
7660
+
7661
+ *s = 0.125f * hsum_float_8(accumf); // hsum_float_8 is defined elsewhere; presumably the horizontal sum of the 8 float lanes
7662
+
7663
+ #else // portable scalar path
7664
+
7665
+ float sumf = 0.f;
7666
+ for (int i = 0; i < nb; ++i) {
7667
+ const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; // combined scale of the two blocks
7668
+ const uint16_t * restrict q2 = x[i].qs;
7669
+ const uint8_t * restrict sc = x[i].scales;
7670
+ const int8_t * restrict q8 = y[i].qs;
7671
+ int32_t bsum = 0; // integer sum for the whole block
7672
+ for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
7673
+ const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; // scale for the first 16 elements of this group (low nibble)
7674
+ const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; // scale for the second 16 elements (high nibble)
7675
+ int32_t sumi = 0;
7676
+ for (int l = 0; l < 2; ++l) { // first two q2 entries: 2 x 8 elements
7677
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); // low 9 bits select a grid entry, read as 8 bytes
7678
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; // upper 7 bits select the sign byte
7679
+ for (int j = 0; j < 8; ++j) {
7680
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); // product is negated when the bit selected by kmask_iq2xs[j] is set in signs
7681
+ }
7682
+ q8 += 8;
7683
+ }
7684
+ bsum += sumi * ls1;
7685
+ sumi = 0; // restart the partial sum for the second half of the group
7686
+ for (int l = 2; l < 4; ++l) { // remaining two q2 entries: 2 x 8 elements
7687
+ const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
7688
+ const uint8_t signs = ksigns_iq2xs[q2[l] >> 9];
7689
+ for (int j = 0; j < 8; ++j) {
7690
+ sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
7691
+ }
7692
+ q8 += 8;
7693
+ }
7694
+ bsum += sumi * ls2;
7695
+ q2 += 4; // 4 q2 entries consumed per 32 elements
7696
+ }
7697
+ sumf += d * bsum;
7698
+ }
7699
+ *s = 0.125f * sumf;
7700
+ #endif
7701
+ }