llama_cpp 0.12.0 → 0.12.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
@@ -2340,6 +2340,322 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t *
|
|
2340
2340
|
return (n/QK_K*sizeof(block_q6_K));
|
2341
2341
|
}
|
2342
2342
|
|
2343
|
+
// ====================== "True" 2-bit (de)-quantization
|
2344
|
+
|
2345
|
+
static const uint64_t iq2xxs_grid[256] = {
|
2346
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
2347
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808,
|
2348
|
+
0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819,
|
2349
|
+
0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819,
|
2350
|
+
0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b,
|
2351
|
+
0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808,
|
2352
|
+
0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08,
|
2353
|
+
0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b,
|
2354
|
+
0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819,
|
2355
|
+
0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08,
|
2356
|
+
0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808,
|
2357
|
+
0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08,
|
2358
|
+
0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808,
|
2359
|
+
0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808,
|
2360
|
+
0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919,
|
2361
|
+
0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819,
|
2362
|
+
0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08,
|
2363
|
+
0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908,
|
2364
|
+
0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819,
|
2365
|
+
0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808,
|
2366
|
+
0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808,
|
2367
|
+
0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908,
|
2368
|
+
0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808,
|
2369
|
+
0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08,
|
2370
|
+
0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819,
|
2371
|
+
0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819,
|
2372
|
+
0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819,
|
2373
|
+
0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908,
|
2374
|
+
0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19,
|
2375
|
+
0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819,
|
2376
|
+
0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b,
|
2377
|
+
0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808,
|
2378
|
+
0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908,
|
2379
|
+
0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08,
|
2380
|
+
0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08,
|
2381
|
+
0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908,
|
2382
|
+
0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819,
|
2383
|
+
0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808,
|
2384
|
+
0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808,
|
2385
|
+
0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19,
|
2386
|
+
0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819,
|
2387
|
+
0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919,
|
2388
|
+
0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b,
|
2389
|
+
0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08,
|
2390
|
+
0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808,
|
2391
|
+
0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908,
|
2392
|
+
0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b,
|
2393
|
+
0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819,
|
2394
|
+
0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08,
|
2395
|
+
0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08,
|
2396
|
+
0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808,
|
2397
|
+
0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b,
|
2398
|
+
0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b,
|
2399
|
+
0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908,
|
2400
|
+
0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819,
|
2401
|
+
0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808,
|
2402
|
+
0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908,
|
2403
|
+
0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b,
|
2404
|
+
0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808,
|
2405
|
+
0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b,
|
2406
|
+
0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b,
|
2407
|
+
0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808,
|
2408
|
+
0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19,
|
2409
|
+
0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908,
|
2410
|
+
};
|
2411
|
+
|
2412
|
+
static const uint64_t iq2xs_grid[512] = {
|
2413
|
+
0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08,
|
2414
|
+
0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b,
|
2415
|
+
0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919,
|
2416
|
+
0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b,
|
2417
|
+
0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919,
|
2418
|
+
0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808,
|
2419
|
+
0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819,
|
2420
|
+
0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819,
|
2421
|
+
0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808,
|
2422
|
+
0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b,
|
2423
|
+
0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b,
|
2424
|
+
0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908,
|
2425
|
+
0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908,
|
2426
|
+
0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919,
|
2427
|
+
0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808,
|
2428
|
+
0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919,
|
2429
|
+
0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908,
|
2430
|
+
0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b,
|
2431
|
+
0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908,
|
2432
|
+
0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08,
|
2433
|
+
0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808,
|
2434
|
+
0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808,
|
2435
|
+
0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819,
|
2436
|
+
0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908,
|
2437
|
+
0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819,
|
2438
|
+
0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808,
|
2439
|
+
0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b,
|
2440
|
+
0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819,
|
2441
|
+
0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819,
|
2442
|
+
0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808,
|
2443
|
+
0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908,
|
2444
|
+
0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19,
|
2445
|
+
0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b,
|
2446
|
+
0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b,
|
2447
|
+
0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919,
|
2448
|
+
0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808,
|
2449
|
+
0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819,
|
2450
|
+
0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819,
|
2451
|
+
0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b,
|
2452
|
+
0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908,
|
2453
|
+
0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808,
|
2454
|
+
0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819,
|
2455
|
+
0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808,
|
2456
|
+
0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919,
|
2457
|
+
0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808,
|
2458
|
+
0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808,
|
2459
|
+
0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908,
|
2460
|
+
0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908,
|
2461
|
+
0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808,
|
2462
|
+
0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b,
|
2463
|
+
0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819,
|
2464
|
+
0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919,
|
2465
|
+
0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908,
|
2466
|
+
0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808,
|
2467
|
+
0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908,
|
2468
|
+
0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919,
|
2469
|
+
0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08,
|
2470
|
+
0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19,
|
2471
|
+
0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b,
|
2472
|
+
0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b,
|
2473
|
+
0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808,
|
2474
|
+
0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08,
|
2475
|
+
0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b,
|
2476
|
+
0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908,
|
2477
|
+
0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b,
|
2478
|
+
0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908,
|
2479
|
+
0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08,
|
2480
|
+
0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808,
|
2481
|
+
0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808,
|
2482
|
+
0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08,
|
2483
|
+
0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819,
|
2484
|
+
0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919,
|
2485
|
+
0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808,
|
2486
|
+
0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808,
|
2487
|
+
0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819,
|
2488
|
+
0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819,
|
2489
|
+
0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908,
|
2490
|
+
0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908,
|
2491
|
+
0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b,
|
2492
|
+
0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908,
|
2493
|
+
0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908,
|
2494
|
+
0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908,
|
2495
|
+
0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808,
|
2496
|
+
0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819,
|
2497
|
+
0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819,
|
2498
|
+
0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819,
|
2499
|
+
0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808,
|
2500
|
+
0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b,
|
2501
|
+
0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819,
|
2502
|
+
0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819,
|
2503
|
+
0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08,
|
2504
|
+
0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808,
|
2505
|
+
0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19,
|
2506
|
+
0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919,
|
2507
|
+
0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808,
|
2508
|
+
0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19,
|
2509
|
+
0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b,
|
2510
|
+
0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808,
|
2511
|
+
0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b,
|
2512
|
+
0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b,
|
2513
|
+
0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08,
|
2514
|
+
0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b,
|
2515
|
+
0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808,
|
2516
|
+
0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819,
|
2517
|
+
0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808,
|
2518
|
+
0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808,
|
2519
|
+
0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08,
|
2520
|
+
0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b,
|
2521
|
+
0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19,
|
2522
|
+
0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08,
|
2523
|
+
0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919,
|
2524
|
+
0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08,
|
2525
|
+
0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08,
|
2526
|
+
0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908,
|
2527
|
+
0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908,
|
2528
|
+
0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b,
|
2529
|
+
0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908,
|
2530
|
+
0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808,
|
2531
|
+
0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b,
|
2532
|
+
0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808,
|
2533
|
+
0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808,
|
2534
|
+
0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19,
|
2535
|
+
0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08,
|
2536
|
+
0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808,
|
2537
|
+
0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b,
|
2538
|
+
0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808,
|
2539
|
+
0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b,
|
2540
|
+
0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b,
|
2541
|
+
};
|
2542
|
+
|
2543
|
+
static const uint8_t ksigns_iq2xs[128] = {
|
2544
|
+
0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15,
|
2545
|
+
144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159,
|
2546
|
+
160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175,
|
2547
|
+
48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63,
|
2548
|
+
192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207,
|
2549
|
+
80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95,
|
2550
|
+
96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111,
|
2551
|
+
240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255,
|
2552
|
+
};
|
2553
|
+
|
2554
|
+
static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
2555
|
+
|
2556
|
+
void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
|
2557
|
+
(void)x;
|
2558
|
+
(void)y;
|
2559
|
+
(void)k;
|
2560
|
+
assert(k % QK_K == 0);
|
2561
|
+
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
|
2562
|
+
}
|
2563
|
+
|
2564
|
+
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
2565
|
+
assert(k % QK_K == 0);
|
2566
|
+
const int nb = k / QK_K;
|
2567
|
+
|
2568
|
+
uint32_t aux32[2];
|
2569
|
+
const uint8_t * aux8 = (const uint8_t *)aux32;
|
2570
|
+
|
2571
|
+
for (int i = 0; i < nb; i++) {
|
2572
|
+
|
2573
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
2574
|
+
|
2575
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
2576
|
+
memcpy(aux32, x[i].qs + 4*ib32, 2*sizeof(uint32_t));
|
2577
|
+
const float db = d * (0.5f + (aux32[1] >> 28)) * 0.25f;
|
2578
|
+
for (int l = 0; l < 4; ++l) {
|
2579
|
+
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
2580
|
+
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
2581
|
+
for (int j = 0; j < 8; ++j) {
|
2582
|
+
y[j] = db * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
2583
|
+
}
|
2584
|
+
y += 8;
|
2585
|
+
}
|
2586
|
+
}
|
2587
|
+
}
|
2588
|
+
}
|
2589
|
+
|
2590
|
+
void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
|
2591
|
+
assert(k % QK_K == 0);
|
2592
|
+
block_iq2_xxs * restrict y = vy;
|
2593
|
+
quantize_row_iq2_xxs_reference(x, y, k);
|
2594
|
+
}
|
2595
|
+
|
2596
|
+
size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
|
2597
|
+
assert(k % QK_K == 0);
|
2598
|
+
(void)hist; // TODO: collect histograms
|
2599
|
+
|
2600
|
+
for (int j = 0; j < n; j += k) {
|
2601
|
+
block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
|
2602
|
+
quantize_row_iq2_xxs_reference(src + j, y, k);
|
2603
|
+
}
|
2604
|
+
return (n/QK_K*sizeof(block_iq2_xxs));
|
2605
|
+
}
|
2606
|
+
|
2607
|
+
// ====================== 2.3125 bpw (de)-quantization
|
2608
|
+
|
2609
|
+
void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
|
2610
|
+
(void)x;
|
2611
|
+
(void)y;
|
2612
|
+
(void)k;
|
2613
|
+
assert(k % QK_K == 0);
|
2614
|
+
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
|
2615
|
+
}
|
2616
|
+
|
2617
|
+
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
|
2618
|
+
assert(k % QK_K == 0);
|
2619
|
+
const int nb = k / QK_K;
|
2620
|
+
|
2621
|
+
float db[2];
|
2622
|
+
|
2623
|
+
for (int i = 0; i < nb; i++) {
|
2624
|
+
|
2625
|
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
2626
|
+
|
2627
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
2628
|
+
db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f;
|
2629
|
+
db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f;
|
2630
|
+
for (int l = 0; l < 4; ++l) {
|
2631
|
+
const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511));
|
2632
|
+
const uint8_t signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9];
|
2633
|
+
for (int j = 0; j < 8; ++j) {
|
2634
|
+
y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
|
2635
|
+
}
|
2636
|
+
y += 8;
|
2637
|
+
}
|
2638
|
+
}
|
2639
|
+
}
|
2640
|
+
}
|
2641
|
+
|
2642
|
+
void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
|
2643
|
+
assert(k % QK_K == 0);
|
2644
|
+
block_iq2_xs * restrict y = vy;
|
2645
|
+
quantize_row_iq2_xs_reference(x, y, k);
|
2646
|
+
}
|
2647
|
+
|
2648
|
+
size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
|
2649
|
+
assert(k % QK_K == 0);
|
2650
|
+
(void)hist; // TODO: collect histograms
|
2651
|
+
|
2652
|
+
for (int j = 0; j < n; j += k) {
|
2653
|
+
block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
|
2654
|
+
quantize_row_iq2_xs_reference(src + j, y, k);
|
2655
|
+
}
|
2656
|
+
return (n/QK_K*sizeof(block_iq2_xs));
|
2657
|
+
}
|
2658
|
+
|
2343
2659
|
//===================================== Q8_K ==============================================
|
2344
2660
|
|
2345
2661
|
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
@@ -2362,7 +2678,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict
|
|
2362
2678
|
x += QK_K;
|
2363
2679
|
continue;
|
2364
2680
|
}
|
2365
|
-
const float iscale = -128.f/max;
|
2681
|
+
//const float iscale = -128.f/max;
|
2682
|
+
// We need this change for IQ2_XXS, else the AVX implementation becomes very awkward
|
2683
|
+
const float iscale = -127.f/max;
|
2366
2684
|
for (int j = 0; j < QK_K; ++j) {
|
2367
2685
|
int v = nearest_int(iscale*x[j]);
|
2368
2686
|
y[i].qs[j] = MIN(127, v);
|
@@ -7065,3 +7383,319 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|
7065
7383
|
}
|
7066
7384
|
|
7067
7385
|
#endif
|
7386
|
+
|
7387
|
+
static const int8_t keven_signs_q2xs[1024] = {
|
7388
|
+
1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1,
|
7389
|
+
1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1,
|
7390
|
+
1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, -1,
|
7391
|
+
1, 1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1,
|
7392
|
+
1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, -1,
|
7393
|
+
1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, 1,
|
7394
|
+
1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, 1,
|
7395
|
+
1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, -1,
|
7396
|
+
1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, -1,
|
7397
|
+
1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, 1,
|
7398
|
+
1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, 1,
|
7399
|
+
1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1,
|
7400
|
+
1, 1, 1, 1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, 1,
|
7401
|
+
1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1,
|
7402
|
+
1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, 1, 1, 1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1,
|
7403
|
+
1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1,
|
7404
|
+
1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1,
|
7405
|
+
1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1,
|
7406
|
+
1, 1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, 1,
|
7407
|
+
1, 1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1,
|
7408
|
+
1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, 1,
|
7409
|
+
1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, -1,
|
7410
|
+
1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1,
|
7411
|
+
1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1,
|
7412
|
+
1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1,
|
7413
|
+
1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1,
|
7414
|
+
1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, -1,
|
7415
|
+
1, 1, -1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1,
|
7416
|
+
1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, -1, -1,
|
7417
|
+
1, 1, -1, 1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1,
|
7418
|
+
1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1,
|
7419
|
+
1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1,
|
7420
|
+
};
|
7421
|
+
|
7422
|
+
void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
|
7423
|
+
assert(n % QK_K == 0);
|
7424
|
+
|
7425
|
+
const block_iq2_xxs * restrict x = vx;
|
7426
|
+
const block_q8_K * restrict y = vy;
|
7427
|
+
|
7428
|
+
const int nb = n / QK_K;
|
7429
|
+
|
7430
|
+
#if defined(__ARM_NEON)
|
7431
|
+
|
7432
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
7433
|
+
|
7434
|
+
uint32_t aux32[4];
|
7435
|
+
const uint8_t * aux8 = (const uint8_t *)aux32;
|
7436
|
+
|
7437
|
+
ggml_int8x16x4_t q2u;
|
7438
|
+
ggml_int8x16x4_t q2s;
|
7439
|
+
ggml_int8x16x4_t q8b;
|
7440
|
+
|
7441
|
+
float sumf = 0;
|
7442
|
+
for (int i = 0; i < nb; ++i) {
|
7443
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
7444
|
+
const uint16_t * restrict q2 = x[i].qs;
|
7445
|
+
const int8_t * restrict q8 = y[i].qs;
|
7446
|
+
float sumf1 = 0, sumf2 = 0;
|
7447
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
7448
|
+
q8b = ggml_vld1q_s8_x4(q8); q8 += 64;
|
7449
|
+
memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
|
7450
|
+
q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 0])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 1])));
|
7451
|
+
q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 2])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 3])));
|
7452
|
+
q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[ 8])), vld1_s8((const void *)(iq2xxs_grid + aux8[ 9])));
|
7453
|
+
q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xxs_grid + aux8[10])), vld1_s8((const void *)(iq2xxs_grid + aux8[11])));
|
7454
|
+
q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 7) & 127))));
|
7455
|
+
q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[1] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[1] >> 21) & 127))));
|
7456
|
+
q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 0) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 7) & 127))));
|
7457
|
+
q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + ((aux32[3] >> 14) & 127))), vld1_s8((const void *)(signs64 + ((aux32[3] >> 21) & 127))));
|
7458
|
+
q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
|
7459
|
+
q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
|
7460
|
+
q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
|
7461
|
+
q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
|
7462
|
+
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]), q2u.val[1], q8b.val[1]);
|
7463
|
+
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]), q2u.val[3], q8b.val[3]);
|
7464
|
+
sumf1 += vaddvq_s32(p1) * (0.5f + (aux32[1] >> 28));
|
7465
|
+
sumf2 += vaddvq_s32(p2) * (0.5f + (aux32[3] >> 28));
|
7466
|
+
}
|
7467
|
+
sumf += d*(sumf1 + sumf2);
|
7468
|
+
}
|
7469
|
+
*s = 0.25f * sumf;
|
7470
|
+
|
7471
|
+
#elif defined(__AVX2__)
|
7472
|
+
|
7473
|
+
const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;
|
7474
|
+
|
7475
|
+
uint32_t aux32[4];
|
7476
|
+
const uint8_t * aux8 = (const uint8_t *)aux32;
|
7477
|
+
|
7478
|
+
__m256 accumf = _mm256_setzero_ps();
|
7479
|
+
for (int i = 0; i < nb; ++i) {
|
7480
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
7481
|
+
const uint16_t * restrict q2 = x[i].qs;
|
7482
|
+
const int8_t * restrict q8 = y[i].qs;
|
7483
|
+
__m256i sumi1 = _mm256_setzero_si256();
|
7484
|
+
__m256i sumi2 = _mm256_setzero_si256();
|
7485
|
+
for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
|
7486
|
+
const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
7487
|
+
const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
|
7488
|
+
memcpy(aux32, q2, 4*sizeof(uint32_t)); q2 += 8;
|
7489
|
+
const __m256i q2_1 = _mm256_set_epi64x(iq2xxs_grid[aux8[ 3]], iq2xxs_grid[aux8[ 2]], iq2xxs_grid[aux8[1]], iq2xxs_grid[aux8[0]]);
|
7490
|
+
const __m256i q2_2 = _mm256_set_epi64x(iq2xxs_grid[aux8[11]], iq2xxs_grid[aux8[10]], iq2xxs_grid[aux8[9]], iq2xxs_grid[aux8[8]]);
|
7491
|
+
const __m256i s2_1 = _mm256_set_epi64x(signs64[(aux32[1] >> 21) & 127], signs64[(aux32[1] >> 14) & 127],
|
7492
|
+
signs64[(aux32[1] >> 7) & 127], signs64[(aux32[1] >> 0) & 127]);
|
7493
|
+
const __m256i s2_2 = _mm256_set_epi64x(signs64[(aux32[3] >> 21) & 127], signs64[(aux32[3] >> 14) & 127],
|
7494
|
+
signs64[(aux32[3] >> 7) & 127], signs64[(aux32[3] >> 0) & 127]);
|
7495
|
+
const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
|
7496
|
+
const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
|
7497
|
+
const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
|
7498
|
+
const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
|
7499
|
+
const uint16_t ls1 = aux32[1] >> 28;
|
7500
|
+
const uint16_t ls2 = aux32[3] >> 28;
|
7501
|
+
const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1));
|
7502
|
+
const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1));
|
7503
|
+
sumi1 = _mm256_add_epi32(sumi1, p1);
|
7504
|
+
sumi2 = _mm256_add_epi32(sumi2, p2);
|
7505
|
+
}
|
7506
|
+
|
7507
|
+
accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);
|
7508
|
+
|
7509
|
+
}
|
7510
|
+
|
7511
|
+
*s = 0.125f * hsum_float_8(accumf);
|
7512
|
+
|
7513
|
+
#else
|
7514
|
+
|
7515
|
+
uint32_t aux32[2];
|
7516
|
+
const uint8_t * aux8 = (const uint8_t *)aux32;
|
7517
|
+
|
7518
|
+
float sumf = 0.f;
|
7519
|
+
for (int i = 0; i < nb; ++i) {
|
7520
|
+
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
|
7521
|
+
const uint16_t * restrict q2 = x[i].qs;
|
7522
|
+
const int8_t * restrict q8 = y[i].qs;
|
7523
|
+
int32_t bsum = 0;
|
7524
|
+
for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
|
7525
|
+
memcpy(aux32, q2, 2*sizeof(uint32_t));
|
7526
|
+
q2 += 4;
|
7527
|
+
const uint32_t ls = 2*(aux32[1] >> 28) + 1;
|
7528
|
+
int32_t sumi = 0;
|
7529
|
+
for (int l = 0; l < 4; ++l) {
|
7530
|
+
const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]);
|
7531
|
+
const uint8_t signs = ksigns_iq2xs[(aux32[1] >> 7*l) & 127];
|
7532
|
+
for (int j = 0; j < 8; ++j) {
|
7533
|
+
sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
|
7534
|
+
}
|
7535
|
+
q8 += 8;
|
7536
|
+
}
|
7537
|
+
bsum += sumi * ls;
|
7538
|
+
}
|
7539
|
+
sumf += d * bsum;
|
7540
|
+
}
|
7541
|
+
*s = 0.125f * sumf;
|
7542
|
+
#endif
|
7543
|
+
}
|
7544
|
+
|
7545
|
+
// Dot product of an iq2_xs-quantized row (vx) with a q8_K-quantized row (vy).
// n is the number of values and must be a multiple of QK_K; the scalar result
// is stored to *s. Three implementations are selected at compile time:
// ARM NEON, AVX2, and a portable scalar fallback.
//
// As read by this function, each 16-bit entry of x[i].qs packs a 9-bit index
// into the iq2xs_grid codebook (low 9 bits, masked with 511) and a 7-bit
// sign-pattern index (the bits above, obtained with >> 9). x[i].scales packs
// two 4-bit sub-block scales per byte; each scale ls is applied as 2*ls + 1.
// NOTE(review): the final 0.125f factor presumably compensates for the grid
// values being stored pre-scaled -- confirm against the quantization code.
void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
    assert(n % QK_K == 0);

    const block_iq2_xs * restrict x = vx;
    const block_q8_K * restrict y = vy;

    // Number of QK_K-sized super-blocks to process.
    const int nb = n / QK_K;

#if defined(__ARM_NEON)

    // keven_signs_q2xs viewed as 128 precomputed 8-byte sign vectors (+1/-1).
    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

    int8x16x4_t q2u; // dequantized grid bytes, later multiplied by the signs
    int8x16x4_t q2s; // sign vectors looked up from signs64
    int8x16x4_t q8b; // 64 q8 activations per inner iteration

    int32x4x4_t scales32; // 16 sub-block scales widened to int32, 4 per vector

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        // Combined super-block scale: fp16 iq2_xs scale times q8_K scale.
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * restrict q2 = x[i].qs;
        const int8_t * restrict q8 = y[i].qs;
        // Unpack the packed 4-bit scales (low nibble, high nibble), interleave
        // them into sub-block order, and map each scale ls to 2*ls + 1.
        const uint8x8_t scales8 = vld1_u8(x[i].scales);
        const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf));
        const uint8x8_t scales_h = vshr_n_u8(scales8, 4);
        uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
        scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1));
        // Widen u8 -> u16 -> s32 so the scales can feed vmlaq_s32 below.
        const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales));
        const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales));
        scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1)));
        scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1)));
        scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2)));
        scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2)));
        int32x4_t sumi = vdupq_n_s32(0);
        for (int ib64 = 0; ib64 < QK_K/64; ++ib64) {
            q8b = vld1q_s8_x4(q8); q8 += 64;
            // Each q2 entry selects an 8-byte grid row (low 9 bits) ...
            q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511))));
            q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511))));
            q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511))));
            q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511))));
            // ... and an 8-byte sign vector (remaining high bits).
            q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9))));
            q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9))));
            q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9))));
            q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9))));
            // Apply the signs to the grid values.
            q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]);
            q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]);
            q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]);
            q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]);
            // One s32 dot product per 16-byte sub-block.
            const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]);
            const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]);
            const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]);
            const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]);
            // Pairwise-add so each lane of p holds one sub-block's partial sum,
            // then multiply-accumulate with the matching 4 scales.
            const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4));
            sumi = vmlaq_s32(sumi, p, scales32.val[ib64]);
            q2 += 8;
        }
        sumf += d*vaddvq_s32(sumi);
    }
    *s = 0.125f * sumf;

#elif defined(__AVX2__)

    const __m128i m4 = _mm_set1_epi8(0xf);      // low-nibble mask for the scales
    const __m128i m1 = _mm_set1_epi8(1);        // the "+1" in 2*ls + 1
    const __m128i m511 = _mm_set1_epi16(511);   // 9-bit grid-index mask
    const __m128i m127 = _mm_set1_epi16(127);   // 7-bit sign-index mask

    const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs;

    uint64_t aux64;

    // somewhat hacky, but gives a significant boost in performance
    // (the vectors are re-read element-wise through uint16_t pointers).
    __m128i aux_gindex, aux_sindex;
    const uint16_t * gindex = (const uint16_t *)&aux_gindex;
    const uint16_t * sindex = (const uint16_t *)&aux_sindex;

    __m256 accumf = _mm256_setzero_ps();
    for (int i = 0; i < nb; ++i) {
        // Combined super-block scale.
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * restrict q2 = x[i].qs;
        const int8_t * restrict q8 = y[i].qs;

        // Unpack the 16 packed 4-bit scales and map each ls to 2*ls + 1.
        memcpy(&aux64, x[i].scales, 8);
        __m128i stmp = _mm_set1_epi64x(aux64);
        stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4));
        const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1);

        __m256i sumi1 = _mm256_setzero_si256();
        __m256i sumi2 = _mm256_setzero_si256();
        // Two 32-value sub-blocks per iteration (64 values).
        for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) {
            const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
            // Split each of the 8 q2 entries into grid index and sign index.
            const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2);  q2 += 8;
            aux_gindex = _mm_and_si128(q2_data, m511);
            aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127);
            // Gather 8-byte grid rows and sign vectors.
            const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]);
            const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]);
            const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]);
            const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]);
            // Apply the signs to q8 (not to q2) so q2 stays non-negative for
            // the unsigned*signed multiply in _mm256_maddubs_epi16.
            const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1);
            const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2);
            const __m256i dot1  = _mm256_maddubs_epi16(q2_1, q8s_1);
            const __m256i dot2  = _mm256_maddubs_epi16(q2_2, q8s_2);

            // Broadcast the two sub-blocks' scales into 16-bit lanes.
            const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0)));
            const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1)));

            sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1));
            sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2));
        }

        accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf);

    }

    *s = 0.125f * hsum_float_8(accumf);

#else
    // Portable scalar reference implementation.

    float sumf = 0.f;
    for (int i = 0; i < nb; ++i) {
        const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
        const uint16_t * restrict q2 = x[i].qs;
        const uint8_t  * restrict sc = x[i].scales;
        const int8_t   * restrict q8 = y[i].qs;
        int32_t bsum = 0;
        for (int ib32 = 0; ib32 < QK_K/32; ++ib32) {
            // Two 4-bit scales per byte; each applied as 2*ls + 1.
            const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1;
            const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1;
            int32_t sumi = 0;
            // First 16 values of the sub-block (entries 0..1).
            for (int l = 0; l < 2; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    // kmask_iq2xs selects the sign bit for element j.
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls1;
            sumi = 0;
            // Second 16 values of the sub-block (entries 2..3).
            for (int l = 2; l < 4; ++l) {
                const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511));
                const uint8_t  signs = ksigns_iq2xs[q2[l] >> 9];
                for (int j = 0; j < 8; ++j) {
                    sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1);
                }
                q8 += 8;
            }
            bsum += sumi * ls2;
            q2 += 4;
        }
        sumf += d * bsum;
    }
    *s = 0.125f * sumf;
#endif
}
|