digest-xxhash 0.2.5 → 0.2.6
- checksums.yaml +4 -4
- data/ext/digest/xxhash/ext.c +1 -1
- data/ext/digest/xxhash/xxhash.h +820 -480
- data/lib/digest/xxhash/version.rb +1 -1
- metadata +3 -3
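
Nearly all of this release is the refresh of the vendored data/ext/digest/xxhash/xxhash.h (the +820/-480 entry above), which appears to track the upstream xxHash v0.8.2 header: public prototypes gain the new XXH_NOESCAPE qualifier, SVE and reworked NEON paths are added, and XXH_ASSERT is rebuilt on XXH_ASSUME. A minimal C sketch of calling the re-annotated streaming API follows; it is illustrative only (not part of this gem's Ruby API), the signatures are taken from the diff below, and call sites are unchanged since XXH_NOESCAPE expands to nothing outside Clang:

    #include <stdio.h>
    #include <string.h>
    #include "xxhash.h"   /* the vendored header updated by this release */

    int main(void)
    {
        /* Streaming XXH3: prototypes now carry XXH_NOESCAPE, callers are unchanged. */
        XXH3_state_t* const state = XXH3_createState();
        const char msg[] = "hello xxhash";
        if (state == NULL) return 1;
        if (XXH3_64bits_reset(state) == XXH_ERROR) return 1;
        if (XXH3_64bits_update(state, msg, strlen(msg)) == XXH_ERROR) return 1;
        printf("%016llx\n", (unsigned long long)XXH3_64bits_digest(state));
        XXH3_freeState(state);
        return 0;
    }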
data/ext/digest/xxhash/xxhash.h
CHANGED
@@ -716,8 +716,15 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni
 #  define XXH_HAS_ATTRIBUTE(x) 0
 #endif
 
+/*
+ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
+ * leave as `201711L` (C17 + 1).
+ * TODO: Update to correct value when its been specified.
+ */
+#define XXH_C23_VN 201711L
+
 /* C-language Attributes are added in C23. */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
 #  define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
 #else
 #  define XXH_HAS_C_ATTRIBUTE(x) 0
@@ -743,6 +750,18 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni
 #  define XXH_FALLTHROUGH /* fallthrough */
 #endif
 
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((noescape))
+#else
+# define XXH_NOESCAPE
+#endif
+
+
 /*!
  * @}
  * @ingroup public
@@ -813,7 +832,7 @@ typedef uint64_t XXH64_hash_t;
  * @see
  *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
  */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
 
 /*******   Streaming   *******/
 #ifndef XXH_NO_STREAM
@@ -825,16 +844,16 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XX
 typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
 XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
 XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
-XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
 
-XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
 #endif /* !XXH_NO_STREAM */
 /*******   Canonical representation   *******/
 typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
-XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
 
 #ifndef XXH_NO_XXH3
 
@@ -872,7 +891,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canoni
  *
  * XXH3 implementation is portable:
  * it has a generic C90 formulation that can be compiled on any platform,
- * all implementations generate exactly the same hash value.
+ * all implementations generate exactly the same hash value on all platforms.
 * Starting from v0.8.0, it's also labelled "stable", meaning that
 * any future version will also generate the same hash value.
 *
@@ -902,7 +921,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canoni
  * @see
  *    XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
  */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
 
 /*!
  * @brief 64-bit seeded variant of XXH3
@@ -919,7 +938,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t leng
  * @param length The length
  * @param seed The 64-bit seed to alter the state.
  */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
 
 /*!
  * The bare minimum size for a custom secret.
@@ -948,7 +967,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, si
  * This is not necessarily the case when using the blob of bytes directly
  * because, when hashing _small_ inputs, only a portion of the secret is employed.
  */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
 
 
 /*******   Streaming   *******/
@@ -968,20 +987,20 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, s
 typedef struct XXH3_state_s XXH3_state_t;
 XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
-XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
 
 /*
  * XXH3_64bits_reset():
  * Initialize with default parameters.
 * digest will be equivalent to `XXH3_64bits()`.
 */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
 /*
  * XXH3_64bits_reset_withSeed():
  * Generate a custom secret from `seed`, and store it into `statePtr`.
 * digest will be equivalent to `XXH3_64bits_withSeed()`.
 */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
 /*!
  * XXH3_64bits_reset_withSecret():
  * `secret` is referenced, it _must outlive_ the hash streaming session.
@@ -991,10 +1010,10 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr,
  * When in doubt about the randomness of a candidate `secret`,
  * consider employing `XXH3_generateSecret()` instead (see below).
  */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
 
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (const XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t  XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
 #endif /* !XXH_NO_STREAM */
 
 /* note : canonical representation of XXH3 is the same as XXH64
@@ -1033,11 +1052,11 @@ typedef struct {
  * @see
  *    XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
  */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
 /*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
 /*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
 
 /*******   Streaming   *******/
 #ifndef XXH_NO_STREAM
@@ -1053,12 +1072,12 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data,
  * All reset and streaming functions have same meaning as their 64-bit counterpart.
  */
 
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
 
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
 #endif /* !XXH_NO_STREAM */
 
 /* Following helper functions make it possible to compare XXH128_hast_t values.
@@ -1079,13 +1098,13 @@ XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
  * =0 if *h128_1 == *h128_2
  * <0 if *h128_1 < *h128_2
  */
-XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(const void* h128_1, const void* h128_2);
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
 
 
 /*******   Canonical representation   *******/
 typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
-XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
 
 
 #endif /* !XXH_NO_XXH3 */
@@ -1266,13 +1285,18 @@ struct XXH3_state_s {
  * Note that this doesn't prepare the state for a streaming operation,
  * it's still necessary to use XXH3_NNbits_reset*() afterwards.
  */
-#define XXH3_INITSTATE(XXH3_state_ptr)   { (XXH3_state_ptr)->seed = 0; }
+#define XXH3_INITSTATE(XXH3_state_ptr)                       \
+    do {                                                     \
+        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
+        tmp_xxh3_state_ptr->seed = 0;                        \
+        tmp_xxh3_state_ptr->extSecret = NULL;                \
+    } while(0)
 
 
 /*!
  * simple alias to pre-selected XXH3_128bits variant
 */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
 
 
 /* ===   Experimental API   === */
@@ -1329,7 +1353,7 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH6
  * }
  * @endcode
  */
-XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
 
 /*!
  * @brief Generate the same secret as the _withSeed() variants.
@@ -1368,7 +1392,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secr
  * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
  * @param seed The seed to seed the state.
  */
-XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
 
 /*!
  * These variants generate hash values using either
@@ -1397,24 +1421,24 @@ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_
  * because only portions of the secret are employed for small data.
  */
 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
-XXH3_64bits_withSecretandSeed(const void* data, size_t len,
-                              const void* secret, size_t secretSize,
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
                               XXH64_hash_t seed);
 /*! @copydoc XXH3_64bits_withSecretandSeed() */
 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
-XXH3_128bits_withSecretandSeed(const void* input, size_t length,
-                               const void* secret, size_t secretSize,
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
                                XXH64_hash_t seed64);
 #ifndef XXH_NO_STREAM
 /*! @copydoc XXH3_64bits_withSecretandSeed() */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
-                                    const void* secret, size_t secretSize,
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
                                     XXH64_hash_t seed64);
 /*! @copydoc XXH3_64bits_withSecretandSeed() */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
-                                     const void* secret, size_t secretSize,
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
                                      XXH64_hash_t seed64);
 #endif /* !XXH_NO_STREAM */
 
@@ -1522,7 +1546,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
  * care, as what works on one compiler/platform/optimization level may cause
  * another to read garbage data or even crash.
  *
- * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
 *
 * Prefer these methods in priority order (0 > 3 > 1 > 2)
 */
@@ -1608,6 +1632,23 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
  */
 #  define XXH_NO_INLINE_HINTS 0
 
+/*!
+ * @def XXH3_INLINE_SECRET
+ * @brief Determines whether to inline the XXH3 withSecret code.
+ *
+ * When the secret size is known, the compiler can improve the performance
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
+ *
+ * However, if the secret size is not known, it doesn't have any benefit. This
+ * happens when xxHash is compiled into a global symbol. Therefore, if
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
+ *
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
+ * that are *sometimes* force inline on -Og, and it is impossible to automatically
+ * detect this optimization level.
+ */
+#  define XXH3_INLINE_SECRET 0
+
 /*!
  * @def XXH32_ENDJMP
  * @brief Whether to use a jump for `XXH32_finalize`.
@@ -1682,6 +1723,15 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
 #  endif
 #endif
 
+#ifndef XXH3_INLINE_SECRET
+#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+     || !defined(XXH_INLINE_ALL)
+#    define XXH3_INLINE_SECRET 0
+#  else
+#    define XXH3_INLINE_SECRET 1
+#  endif
+#endif
+
 #ifndef XXH32_ENDJMP
 /* generally preferable for performance */
 #  define XXH32_ENDJMP 0
@@ -1778,6 +1828,11 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
 #  define XXH_NO_INLINE static
 #endif
 
+#if XXH3_INLINE_SECRET
+#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
+#else
+#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
+#endif
 
 
 /* *************************************
@@ -1803,7 +1858,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
 #  include <assert.h>   /* note: can still be disabled with NDEBUG */
 #  define XXH_ASSERT(c)   assert(c)
 #else
-#  define XXH_ASSERT(c)   ((void)0)
+#  define XXH_ASSERT(c)   XXH_ASSUME(c)
 #endif
 
 /* note: use after variable declarations */
@@ -1835,11 +1890,17 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
  * XXH3_initCustomSecret_scalar().
  */
 #if defined(__GNUC__) || defined(__clang__)
-#  define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
+#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
 #else
 #  define XXH_COMPILER_GUARD(var) ((void)0)
 #endif
 
+#if defined(__clang__)
+#  define XXH_COMPILER_GUARD_W(var) __asm__("" : "+w" (var))
+#else
+#  define XXH_COMPILER_GUARD_W(var) ((void)0)
+#endif
+
 /* *************************************
 *  Basic Types
 ***************************************/
@@ -1946,7 +2007,7 @@ static xxh_u32 XXH_read32(const void* ptr)
 
 /*
  * Portable and safe solution. Generally efficient.
- * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
 */
 static xxh_u32 XXH_read32(const void* memPtr)
 {
@@ -2022,6 +2083,51 @@ static int XXH_isLittleEndian(void)
 #  define XXH_HAS_BUILTIN(x) 0
 #endif
 
+
+
+/*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
+ * #  include <stddef.h>
+ * #  ifdef unreachable
+ * #    define XXH_UNREACHABLE() unreachable()
+ * #  endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * #  include <utility>
+ * #  define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC12
+ */
+
+#if XXH_HAS_BUILTIN(__builtin_unreachable)
+#  define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+#  define XXH_UNREACHABLE() __assume(0)
+
+#else
+#  define XXH_UNREACHABLE()
+#endif
+
+#if XXH_HAS_BUILTIN(__builtin_assume)
+#  define XXH_ASSUME(c) __builtin_assume(c)
+#else
+#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+#endif
+
 /*!
  * @internal
  * @def XXH_rotl32(x,r)
@@ -2211,9 +2317,9 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
  *   can load data, while v3 can multiply. SSE forces them to operate
  *   together.
  *
- * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
- * and it is pointless writing a NEON implementation that is basically the
- * same speed as scalar for XXH32.
+ * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+ * the loop. NEON is only faster on the A53, and with the newer cores, it is less
+ * than half the speed.
  */
 XXH_COMPILER_GUARD(acc);
 #endif
@@ -2288,41 +2394,41 @@ XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
     } else {
          switch(len&15) /* or switch(bEnd - p) */ {
            case 12:      XXH_PROCESS4;
-                         XXH_FALLTHROUGH;
+                         XXH_FALLTHROUGH;  /* fallthrough */
            case 8:       XXH_PROCESS4;
-                         XXH_FALLTHROUGH;
+                         XXH_FALLTHROUGH;  /* fallthrough */
            case 4:       XXH_PROCESS4;
                          return XXH32_avalanche(hash);
 
            case 13:      XXH_PROCESS4;
-                         XXH_FALLTHROUGH;
+                         XXH_FALLTHROUGH;  /* fallthrough */
            case 9:       XXH_PROCESS4;
-                         XXH_FALLTHROUGH;
+                         XXH_FALLTHROUGH;  /* fallthrough */
            case 5:       XXH_PROCESS4;
                          XXH_PROCESS1;
                          return XXH32_avalanche(hash);
 
           case 14:      XXH_PROCESS4;
-                         XXH_FALLTHROUGH;
+                         XXH_FALLTHROUGH;  /* fallthrough */
           case 10:      XXH_PROCESS4;
-                         XXH_FALLTHROUGH;
+                         XXH_FALLTHROUGH;  /* fallthrough */
           case 6:       XXH_PROCESS4;
                         XXH_PROCESS1;
                         XXH_PROCESS1;
                         return XXH32_avalanche(hash);
 
           case 15:      XXH_PROCESS4;
-                        XXH_FALLTHROUGH;
+                        XXH_FALLTHROUGH;  /* fallthrough */
           case 11:      XXH_PROCESS4;
-                        XXH_FALLTHROUGH;
+                        XXH_FALLTHROUGH;  /* fallthrough */
           case 7:       XXH_PROCESS4;
-                        XXH_FALLTHROUGH;
+                        XXH_FALLTHROUGH;  /* fallthrough */
           case 3:       XXH_PROCESS1;
-                        XXH_FALLTHROUGH;
+                        XXH_FALLTHROUGH;  /* fallthrough */
           case 2:       XXH_PROCESS1;
-                        XXH_FALLTHROUGH;
+                        XXH_FALLTHROUGH;  /* fallthrough */
          case 1:       XXH_PROCESS1;
-                        XXH_FALLTHROUGH;
+                        XXH_FALLTHROUGH;  /* fallthrough */
          case 0:       return XXH32_avalanche(hash);
        }
        XXH_ASSERT(0);
@@ -2590,7 +2696,7 @@ static xxh_u64 XXH_read64(const void* ptr)
 
 /*
  * Portable and safe solution. Generally efficient.
- * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
 */
 static xxh_u64 XXH_read64(const void* memPtr)
 {
@@ -2823,7 +2929,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
 
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
 {
 #if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
@@ -2857,13 +2963,13 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
 }
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
 {
     XXH_memcpy(dstState, srcState, sizeof(*dstState));
 }
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
 {
     XXH_ASSERT(statePtr != NULL);
     memset(statePtr, 0, sizeof(*statePtr));
@@ -2876,7 +2982,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s
 
 /*! @ingroup XXH64_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
 {
     if (input==NULL) {
         XXH_ASSERT(len == 0);
@@ -2927,7 +3033,7 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
 
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
 {
     xxh_u64 h64;
 
@@ -2950,7 +3056,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
 /*******   Canonical representation   *******/
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
 {
     XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
     if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
@@ -2958,7 +3064,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t
 }
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
 {
     return XXH_readBE64(src);
 }
@@ -2979,11 +3085,19 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
 /* ===   Compiler specifics   === */
 
 #if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
-#  define XXH_RESTRICT
+#  define XXH_RESTRICT   /* disable */
 #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
 #  define XXH_RESTRICT   restrict
+#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
+   || (defined (__clang__)) \
+   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
+   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
+/*
+ * There are a LOT more compilers that recognize __restrict but this
+ * covers the major ones.
+ */
+#  define XXH_RESTRICT   __restrict
 #else
-/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
 #  define XXH_RESTRICT   /* disable */
 #endif
 
@@ -2998,9 +3112,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
 #endif
 
 #if defined(__GNUC__) || defined(__clang__)
+#  if defined(__ARM_FEATURE_SVE)
+#    include <arm_sve.h>
+#  endif
 #  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
-   || defined(__aarch64__)  || defined(_M_ARM) \
-   || defined(_M_ARM64)
+   || (defined(_M_ARM) && _M_ARM >= 7) \
+   || defined(_M_ARM64) || defined(_M_ARM64EC)
 #    define inline __inline__  /* circumvent a clang bug */
 #    include <arm_neon.h>
 #    undef inline
@@ -3125,12 +3242,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
     XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
     XXH_NEON   = 4,  /*!< NEON for most ARMv7-A and all AArch64 */
     XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+    XXH_SVE    = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
 };
 /*!
  * @ingroup tuning
  * @brief Selects the minimum alignment for XXH3's accumulators.
 *
- * When using SIMD, this should match the alignment reqired for said vector
+ * When using SIMD, this should match the alignment required for said vector
  * type, so, for example, 32 for AVX2.
 *
 * Default: Auto detected.
@@ -3146,10 +3264,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #  define XXH_AVX512 3
 #  define XXH_NEON   4
 #  define XXH_VSX    5
+#  define XXH_SVE    6
 #endif
 
 #ifndef XXH_VECTOR    /* can be defined on command line */
-#  if ( \
+#  if defined(__ARM_FEATURE_SVE)
+#    define XXH_VECTOR XXH_SVE
+#  elif ( \
         defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
      || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
    ) && ( \
@@ -3172,6 +3293,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #  endif
 #endif
 
+/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
+#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
+#  ifdef _MSC_VER
+#    pragma warning(once : 4606)
+#  else
+#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
+#  endif
+#  undef XXH_VECTOR
+#  define XXH_VECTOR XXH_SCALAR
+#endif
+
 /*
  * Controls the alignment of the accumulator,
  * for compatibility with aligned vector loads, which are usually faster.
@@ -3191,16 +3323,26 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #     define XXH_ACC_ALIGN 16
 #  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
 #     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_SVE   /* sve */
+#     define XXH_ACC_ALIGN 64
 #  endif
 #endif
 
 #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
   || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
 #  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_SVE
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #else
 #  define XXH_SEC_ALIGN 8
 #endif
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_ALIASING __attribute__((may_alias))
+#else
+#  define XXH_ALIASING /* nothing */
+#endif
+
 
 /*
  * UGLY HACK:
 * GCC usually generates the best code with -O3 for xxHash.
@@ -3229,107 +3371,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #  pragma GCC optimize("-O2")
 #endif
 
-
 #if XXH_VECTOR == XXH_NEON
+
 /*
- * NEON's setup for vmlal_u32 is a little more complicated than it is on
- * SSE2, AVX2, and VSX.
- *
- * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
- *
- * To do the same operation, the 128-bit 'Q' register needs to be split into
- * two 64-bit 'D' registers, performing this operation::
- *
- *   [                a                 |                 b                ]
- *            |              '---------. .--------'              |
- *            |                         x                        |
- *            |              .---------' '--------.              |
- *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[    a >> 32     |     b >> 32    ]
- *
- * Due to significant changes in aarch64, the fastest method for aarch64 is
- * completely different than the fastest method for ARMv7-A.
- *
- * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
- * D11 will modify the high half of Q5. This is similar to how modifying AH
- * will only affect bits 8-15 of AX on x86.
- *
- * VZIP takes two registers, and puts even lanes in one register and odd lanes
- * in the other.
- *
- * On ARMv7-A, this strangely modifies both parameters in place instead of
- * taking the usual 3-operand form.
- *
- * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
- * lower and upper halves of the Q register to end up with the high and low
- * halves where we want - all in one instruction.
- *
- *   vzip.32   d10, d11       @ d10 = { d10[0], d11[0] };   d11 = { d10[1], d11[1] }
- *
- * Unfortunately we need inline assembly for this: Instructions modifying two
- * registers at once is not possible in GCC or Clang's IR, and they have to
- * create a copy.
+ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
+ * optimizes out the entire hashLong loop because of the aliasing violation.
  *
- * aarch64 requires a different approach.
- *
- * In order to make it easier to write a decent compiler for aarch64, many
- * quirks were removed, such as conditional execution.
- *
- * NEON was also affected by this.
- *
- * aarch64 cannot access the high bits of a Q-form register, and writes to a
- * D-form register zero the high bits, similar to how writes to W-form scalar
- * registers (or DWORD registers on x86_64) work.
- *
- * The formerly free vget_high intrinsics now require a vext (with a few
- * exceptions)
- *
- * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
- * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
- * operand.
- *
- * The equivalent of the VZIP.32 on the lower and upper halves would be this
- * mess:
- *
- *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
- *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
- *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
- *
- * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
- *
- *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
- *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
- *
- * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
+ * so the only option is to mark it as aliasing.
  */
-
-/*!
- * Function-like macro:
- * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
- * {
- *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
- *     outHi = (uint32x2_t)(in >> 32);
- *     in = UNDEFINED;
- * }
- */
-#  if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
-   && (defined(__GNUC__) || defined(__clang__)) \
-   && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
-#    define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
-    do { \
-      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
-      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
-      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
-      __asm__("vzip.32  %e0, %f0" : "+w" (in)); \
-      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
-      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
-    } while (0)
-#  else
-#    define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
-    do { \
-      (outLo) = vmovn_u64    (in); \
-      (outHi) = vshrn_n_u64  ((in), 32); \
-    } while (0)
-#  endif
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
 
 /*!
  * @internal
@@ -3347,7 +3398,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
 XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
 {
-    return *(uint64x2_t const*)ptr;
+    return *(xxh_aliasing_uint64x2_t const *)ptr;
 }
 #else
 XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
@@ -3355,38 +3406,75 @@ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
     return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
 }
 #endif
+
+/*!
+ * @internal
+ * @brief `vmlal_u32` on low and high halves of a vector.
+ *
+ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
+ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* Inline assembly is the only way */
+    __asm__("umlal   %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
+    return acc;
+}
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* This intrinsic works as expected */
+    return vmlal_high_u32(acc, lhs, rhs);
+}
+#else
+/* Portable intrinsic versions */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
+}
+/*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
+}
+#endif
+
 /*!
  * @ingroup tuning
  * @brief Controls the NEON to scalar ratio for XXH3
  *
- * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
- * 2 lanes on scalar by default.
+ * This can be set to 2, 4, 6, or 8.
 *
- * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
- * emulated 64-bit arithmetic is too slow.
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
 *
- * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
 *
- * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
- * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
- * you are only using 2/3 of the CPU bandwidth.
- *
- * This is even more noticable on the more advanced cores like the A76 which
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
  * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
 *
- * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
- * remaining lanes will use scalar instructions. This improves the bandwidth
- * and also gives the integer pipelines something to do besides twiddling loop
- * counters and pointers.
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
 *
 * This change benefits CPUs with large micro-op buffers without negatively affecting
- * other CPUs:
+ * most other CPUs:
 *
 * | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
 * |:----------------------|:--------------------|----------:|-----------:|------:|
 * | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
 * | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
 * | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ * | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
 *
 * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
 *
@@ -3394,7 +3482,7 @@ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
  */
 # ifndef XXH3_NEON_LANES
 #  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
-   && XXH_SIZE_OPT <= 0
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
 #   define XXH3_NEON_LANES 6
 #  else
 #   define XXH3_NEON_LANES XXH_ACC_NB
@@ -3442,6 +3530,11 @@ typedef __vector unsigned long long xxh_u64x2;
 typedef __vector unsigned char xxh_u8x16;
 typedef __vector unsigned xxh_u32x4;
 
+/*
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
+ */
+typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
 # ifndef XXH_VSX_BE
 #  if defined(__BIG_ENDIAN__) \
   || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
@@ -3516,6 +3609,20 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
 # endif /* XXH_vec_mulo, XXH_vec_mule */
 #endif /* XXH_VECTOR == XXH_VSX */
 
+#if XXH_VECTOR == XXH_SVE
+#define ACCRND(acc, offset) \
+do { \
+    svuint64_t input_vec = svld1_u64(mask, xinput + offset);  \
+    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \
+    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \
+    svuint64_t swapped = svtbl_u64(input_vec, kSwap); \
+    svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \
+    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \
+    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
+    acc = svadd_u64_x(mask, acc, mul); \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE */
+
 
 /* prefetch
  * can be disabled, by declaring XXH_NO_PREFETCH build macro */
@@ -3952,31 +4059,33 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
     XXH_ASSERT(16 < len && len <= 128);
 
-    {   xxh_u64 acc = len * XXH_PRIME64_1;
+    {   xxh_u64 acc = len * XXH_PRIME64_1, acc_end;
 #if XXH_SIZE_OPT >= 1
         /* Smaller and cleaner, but slightly slower. */
-        size_t i = (len - 1) / 32;
+        unsigned int i = (unsigned int)(len - 1) / 32;
         do {
             acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
             acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
         } while (i-- != 0);
+        acc_end = 0;
 #else
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc_end = XXH3_mix16B(input+len-16, secret+16, seed);
         if (len > 32) {
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc_end += XXH3_mix16B(input+len-32, secret+48, seed);
            if (len > 64) {
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc_end += XXH3_mix16B(input+len-48, secret+80, seed);
+
                if (len > 96) {
                     acc += XXH3_mix16B(input+48, secret+96, seed);
-                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                    acc_end += XXH3_mix16B(input+len-64, secret+112, seed);
                 }
-                acc += XXH3_mix16B(input+32, secret+64, seed);
-                acc += XXH3_mix16B(input+len-48, secret+80, seed);
             }
-            acc += XXH3_mix16B(input+16, secret+32, seed);
-            acc += XXH3_mix16B(input+len-32, secret+48, seed);
         }
-        acc += XXH3_mix16B(input+0, secret+0, seed);
-        acc += XXH3_mix16B(input+len-16, secret+16, seed);
 #endif
-        return XXH3_avalanche(acc);
+        return XXH3_avalanche(acc + acc_end);
     }
 }
 
@@ -3994,13 +4103,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
 #define XXH3_MIDSIZE_LASTOFFSET  17
 
     {   xxh_u64 acc = len * XXH_PRIME64_1;
-        int const nbRounds = (int)len / 16;
-        int i;
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
         for (i=0; i<8; i++) {
             acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
         }
-        acc = XXH3_avalanche(acc);
+        /* last bytes */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
         XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);
 #if defined(__clang__)                                /* Clang */ \
     && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
     && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
@@ -4027,11 +4140,13 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
 #pragma clang loop vectorize(disable)
 #endif
         for (i=8 ; i < nbRounds; i++) {
-            acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+            /*
+             * Prevents clang for unrolling the acc loop and interleaving with this one.
+             */
+            XXH_COMPILER_GUARD(acc);
+            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
         }
-
-        acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
-        return XXH3_avalanche(acc);
+        return XXH3_avalanche(acc + acc_end);
     }
 }
 
@@ -4047,6 +4162,47 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
 #  define ACC_NB XXH_ACC_NB
 #endif
 
+#ifndef XXH_PREFETCH_DIST
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * These macros are to generate an XXH3_accumulate() function.
+ * The two arguments select the name suffix and target attribute.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>().
+ *
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
 {
     if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
@@ -4115,7 +4271,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
         /* data_key    = data_vec ^ key_vec; */
         __m512i const data_key    = _mm512_xor_si512   (data_vec, key_vec);
         /* data_key_lo = data_key >> 32; */
-        __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
         /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
         __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
         /* xacc[0] += swap(data_vec); */
@@ -4125,6 +4281,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
         *xacc = _mm512_add_epi64(product, sum);
     }
 }
+XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
 
 /*
  * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
@@ -4158,13 +4315,12 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
         /* xacc[0] ^= (xacc[0] >> 47) */
         __m512i const acc_vec     = *xacc;
         __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
-        __m512i const data_vec    = _mm512_xor_si512     (acc_vec, shifted);
         /* xacc[0] ^= secret; */
         __m512i const key_vec     = _mm512_loadu_si512   (secret);
-        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
 
         /* xacc[0] *= XXH_PRIME32_1; */
-        __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
         __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
         __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
         *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
@@ -4179,7 +4335,8 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
     XXH_ASSERT(((size_t)customSecret & 63) == 0);
     (void)(&XXH_writeLE64);
     {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
-        __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
+        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
+        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
 
         const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
               __m512i* const dest = (      __m512i*) customSecret;
@@ -4187,14 +4344,7 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
         XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
         XXH_ASSERT(((size_t)dest & 63) == 0);
         for (i=0; i < nbRounds; ++i) {
-            /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
-             * this will warn "discards 'const' qualifier". */
-            union {
-                const __m512i* cp;
-                void* p;
-            } remote_const_void;
-            remote_const_void.cp = src + i;
-            dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
+            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
     }   }
 }
 
@@ -4230,7 +4380,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
             /* data_key    = data_vec ^ key_vec; */
             __m256i const data_key    = _mm256_xor_si256   (data_vec, key_vec);
             /* data_key_lo = data_key >> 32; */
-            __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
             /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
             __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
             /* xacc[i] += swap(data_vec); */
@@ -4240,6 +4390,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
             xacc[i] = _mm256_add_epi64(product, sum);
     }   }
 }
+XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
 
 XXH_FORCE_INLINE XXH_TARGET_AVX2 void
 XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -4262,7 +4413,7 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
             __m256i const data_key   = _mm256_xor_si256   (data_vec, key_vec);
 
             /* xacc[i] *= XXH_PRIME32_1; */
-            __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
             __m256i const prod_lo    = _mm256_mul_epu32   (data_key, prime32);
             __m256i const prod_hi    = _mm256_mul_epu32   (data_key_hi, prime32);
             xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
@@ -4294,12 +4445,12 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR
|
|
4294
4445
|
XXH_ASSERT(((size_t)dest & 31) == 0);
|
4295
4446
|
|
4296
4447
|
/* GCC -O2 need unroll loop manually */
|
4297
|
-
dest[0] = _mm256_add_epi64(
|
4298
|
-
dest[1] = _mm256_add_epi64(
|
4299
|
-
dest[2] = _mm256_add_epi64(
|
4300
|
-
dest[3] = _mm256_add_epi64(
|
4301
|
-
dest[4] = _mm256_add_epi64(
|
4302
|
-
dest[5] = _mm256_add_epi64(
|
4448
|
+
dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
|
4449
|
+
dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
|
4450
|
+
dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
|
4451
|
+
dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
|
4452
|
+
dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
|
4453
|
+
dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
|
4303
4454
|
}
|
4304
4455
|
}
|
4305
4456
|
|
@@ -4346,6 +4497,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
         xacc[i] = _mm_add_epi64(product, sum);
     }   }
 }
+XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
 
 XXH_FORCE_INLINE XXH_TARGET_SSE2 void
 XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -4431,6 +4583,16 @@ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
  * CPU, and it also mitigates some GCC codegen issues.
  *
  * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ *
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
+ * integers instead of the other platforms which mask full 64-bit vectors,
+ * so the setup is more complicated than just shifting right.
+ *
+ * Additionally, there is an optimization for 4 lanes at once noted below.
+ *
+ * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
+ * there needs to be *three* versions of the accumulate operation used
+ * for the remaining 2 lanes.
  */
 XXH_FORCE_INLINE void
 XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
@@ -4439,49 +4601,113 @@ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
 {
     XXH_ASSERT((((size_t)acc) & 15) == 0);
     XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
-    {
-
+    {   /* GCC for darwin arm64 does not like aliasing here */
+        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
         /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
         uint8_t const* const xinput = (const uint8_t *) input;
         uint8_t const* const xsecret = (const uint8_t *) secret;
 
         size_t i;
-        /*
+        /* Scalar lanes use the normal scalarRound routine */
         for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
             XXH3_scalarRound(acc, input, secret, i);
         }
-
-
+        i = 0;
+        /* 4 NEON lanes at a time. */
+        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
+            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+            /*
+             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+             * get one vector with the low 32 bits of each lane, and one vector
+             * with the high 32 bits of each lane.
+             *
+             * This compiles to two instructions on AArch64 and has a paired vector
+             * result, which is an artifact from ARMv7a's version which modified both
+             * vectors in place.
+             *
+             * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+             * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+             */
+            uint32x4x2_t unzipped = vuzpq_u32(
+                vreinterpretq_u32_u64(data_key_1),
+                vreinterpretq_u32_u64(data_key_2)
+            );
+            /* data_key_lo = data_key & 0xFFFFFFFF */
+            uint32x4_t data_key_lo = unzipped.val[0];
+            /* data_key_hi = data_key >> 32 */
+            uint32x4_t data_key_hi = unzipped.val[1];
+            /*
+             * Then, we can split the vectors horizontally and multiply which, as for most
+             * widening intrinsics, have a variant that works on both high half vectors
+             * for free on AArch64.
+             *
+             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+             */
+            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
+            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
+            /*
+             * Clang reorders
+             *    a += b * c;     // umlal   swap.2d, dkl.2s, dkh.2s
+             *    c += a;         // add     acc.2d, acc.2d, swap.2d
+             * to
+             *    c += a;         // add     acc.2d, acc.2d, swap.2d
+             *    c += b * c;     // umlal   acc.2d, dkl.2s, dkh.2s
+             *
+             * While it would make sense in theory since the addition is faster,
+             * for reasons likely related to umlal being limited to certain NEON
+             * pipelines, this is worse. A compiler guard fixes this.
+             */
+            XXH_COMPILER_GUARD_W(sum_1);
+            XXH_COMPILER_GUARD_W(sum_2);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i]   = vaddq_u64(xacc[i], sum_1);
+            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
+        }
+        /* Operate on the remaining NEON lanes 2 at a time. */
+        for (; i < XXH3_NEON_LANES / 2; i++) {
             /* data_vec = xinput[i]; */
             uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
             /* key_vec  = xsecret[i];  */
             uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
-            uint64x2_t data_key;
-            uint32x2_t data_key_lo, data_key_hi;
             /* acc_vec_2 = swap(data_vec) */
-            uint64x2_t
+            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
             /* data_key = data_vec ^ key_vec; */
-            data_key = veorq_u64(data_vec, key_vec);
-            /*
-
-
-
-
-
-
-
-
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* For two lanes, just use VMOVN and VSHRN. */
+            /* data_key_lo = data_key & 0xFFFFFFFF; */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* data_key_hi = data_key >> 32; */
+            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
+            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
+            /* Same Clang workaround as before */
+            XXH_COMPILER_GUARD_W(sum);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i] = vaddq_u64 (xacc[i], sum);
         }
-
     }
 }
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)
 
 XXH_FORCE_INLINE void
 XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 {
     XXH_ASSERT((((size_t)acc) & 15) == 0);
 
-    {
+    {   xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc;
         uint8_t const* xsecret = (uint8_t const*) secret;
         uint32x2_t prime       = vdup_n_u32 (XXH_PRIME32_1);
 
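The 4-lane path leans on two NEON tricks: `vuzpq_u32` de-interleaves four 64-bit `data_key` values into one vector of low halves and one of high halves in a single step, and the `vmlal` low/high pair performs the widening 32x32->64 multiply-accumulate with no explicit masking. Per pair of 64-bit lanes, the arithmetic is the same round used by every backend; a scalar model (hypothetical helper name):

    #include <stdint.h>

    /* One accumulate round over two 64-bit lanes:
     * acc += swap(data) + (u64)(dk & 0xFFFFFFFF) * (u64)(dk >> 32). */
    static void neon_round_model(uint64_t acc[2], const uint64_t data[2],
                                 const uint64_t key[2])
    {
        uint64_t const swapped[2] = { data[1], data[0] };  /* vextq_u64(v, v, 1) */
        for (int lane = 0; lane < 2; lane++) {
            uint64_t const dk = data[lane] ^ key[lane];
            uint64_t const lo = dk & 0xFFFFFFFFu;          /* unzipped.val[0] */
            uint64_t const hi = dk >> 32;                  /* unzipped.val[1] */
            acc[lane] += swapped[lane] + lo * hi;          /* vmlal + vaddq   */
        }
    }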
@@ -4493,47 +4719,42 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
         for (i=0; i < XXH3_NEON_LANES / 2; i++) {
             /* xacc[i] ^= (xacc[i] >> 47); */
             uint64x2_t acc_vec  = xacc[i];
-            uint64x2_t shifted  = vshrq_n_u64
-            uint64x2_t data_vec = veorq_u64
+            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
+            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
 
             /* xacc[i] ^= xsecret[i]; */
-            uint64x2_t key_vec  = XXH_vld1q_u64
-            uint64x2_t data_key = veorq_u64
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
 
             /* xacc[i] *= XXH_PRIME32_1 */
-            uint32x2_t data_key_lo
-
-
-             *
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            prod_hi = vshlq_n_u64(prod_hi, 32);
-            /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
-            xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
-        }
+            uint32x2_t data_key_lo = vmovn_u64   (data_key);
+            uint32x2_t data_key_hi = vshrn_n_u64 (data_key, 32);
+            /*
+             * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
+             *
+             * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
+             * incorrectly "optimize" this:
+             *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
+             *   shifted = vshll_n_u32(tmp, 32);
+             * to this:
+             *   tmp     = "vmulq_u64"(a, b); // no such thing!
+             *   shifted = vshlq_n_u64(tmp, 32);
+             *
+             * However, unlike SSE, Clang lacks a 64-bit multiply routine
+             * for NEON, and it scalarizes two 64-bit multiplies instead.
+             *
+             * vmull_u32 has the same timing as vmul_u32, and it avoids
+             * this bug completely.
+             * See https://bugs.llvm.org/show_bug.cgi?id=39967
+             */
+            uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
+            /* xacc[i] = prod_hi << 32; */
+            prod_hi = vshlq_n_u64(prod_hi, 32);
+            /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
+            xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
         }
     }
 }
-
 #endif
 
 #if (XXH_VECTOR == XXH_VSX)
@@ -4544,23 +4765,23 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
                          const void* XXH_RESTRICT secret)
 {
     /* presumed aligned */
-
-
-
+    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+    xxh_u8 const* const xinput  = (xxh_u8 const*) input;   /* no alignment restriction */
+    xxh_u8 const* const xsecret = (xxh_u8 const*) secret;  /* no alignment restriction */
     xxh_u64x2 const v32 = { 32, 32 };
     size_t i;
     for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
         /* data_vec = xinput[i]; */
-        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
+        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
         /* key_vec = xsecret[i]; */
-        xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
+        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
         xxh_u64x2 const data_key = data_vec ^ key_vec;
         /* shuffled = (data_key << 32) | (data_key >> 32); */
         xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
         /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
         xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
         /* acc_vec = xacc[i]; */
-        xxh_u64x2 acc_vec =
+        xxh_u64x2 acc_vec = xacc[i];
         acc_vec += product;
 
         /* swap high and low halves */
@@ -4569,18 +4790,18 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
 #else
         acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
 #endif
-
-        vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
+        xacc[i] = acc_vec;
     }
 }
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)
 
 XXH_FORCE_INLINE void
 XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 {
     XXH_ASSERT((((size_t)acc) & 15) == 0);
 
-    {
-        const
+    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+        const xxh_u8* const xsecret = (const xxh_u8*) secret;
         /* constants */
         xxh_u64x2 const v32  = { 32, 32 };
         xxh_u64x2 const v47 = { 47, 47 };
@@ -4606,8 +4827,148 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 
 #endif
 
+#if (XXH_VECTOR == XXH_SVE)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{
+    uint64_t *xacc = (uint64_t *)acc;
+    const uint64_t *xinput = (const uint64_t *)(const void *)input;
+    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+    uint64_t element_count = svcntd();
+    if (element_count >= 8) {
+        svbool_t mask = svptrue_pat_b64(SV_VL8);
+        svuint64_t vacc = svld1_u64(mask, xacc);
+        ACCRND(vacc, 0);
+        svst1_u64(mask, xacc, vacc);
+    } else if (element_count == 2) {   /* sve128 */
+        svbool_t mask = svptrue_pat_b64(SV_VL2);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 2);
+        ACCRND(acc2, 4);
+        ACCRND(acc3, 6);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 2, acc1);
+        svst1_u64(mask, xacc + 4, acc2);
+        svst1_u64(mask, xacc + 6, acc3);
+    } else {
+        svbool_t mask = svptrue_pat_b64(SV_VL4);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 4);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 4, acc1);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
+               const xxh_u8* XXH_RESTRICT input,
+               const xxh_u8* XXH_RESTRICT secret,
+               size_t nbStripes)
+{
+    if (nbStripes != 0) {
+        uint64_t *xacc = (uint64_t *)acc;
+        const uint64_t *xinput = (const uint64_t *)(const void *)input;
+        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+        uint64_t element_count = svcntd();
+        if (element_count >= 8) {
+            svbool_t mask = svptrue_pat_b64(SV_VL8);
+            svuint64_t vacc = svld1_u64(mask, xacc + 0);
+            do {
+                /* svprfd(svbool_t, void *, enum svfprop); */
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(vacc, 0);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+            } while (nbStripes != 0);
+
+            svst1_u64(mask, xacc + 0, vacc);
+        } else if (element_count == 2) { /* sve128 */
+            svbool_t mask = svptrue_pat_b64(SV_VL2);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 2);
+                ACCRND(acc2, 4);
+                ACCRND(acc3, 6);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+            } while (nbStripes != 0);
+
+            svst1_u64(mask, xacc + 0, acc0);
+            svst1_u64(mask, xacc + 2, acc1);
+            svst1_u64(mask, xacc + 4, acc2);
+            svst1_u64(mask, xacc + 6, acc3);
+        } else {
+            svbool_t mask = svptrue_pat_b64(SV_VL4);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 4);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+            } while (nbStripes != 0);
+
+            svst1_u64(mask, xacc + 0, acc0);
+            svst1_u64(mask, xacc + 4, acc1);
+        }
+    }
+}
+
+#endif
+
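The `ACCRND` macro used throughout the new SVE code is defined in an earlier hunk not shown here; `kSwap` (`svindex_u64(0,1) ^ 1` = { 1, 0, 3, 2, ... }) is a permutation table that swaps adjacent lanes, the SVE counterpart of `vextq_u64` and `vec_permi` in the other backends. Assuming `ACCRND` implements the same round as those backends, a hypothetical scalar model of one round is:

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical scalar model of one ACCRND(acc, offset) step. */
    static void sve_round_model(uint64_t* acc, const uint64_t* in,
                                const uint64_t* key, size_t lanes)
    {
        for (size_t i = 0; i < lanes; i++) {
            uint64_t const dk = in[i] ^ key[i];
            acc[i] += in[i ^ 1];                            /* swap via kSwap */
            acc[i] += (dk & 0xFFFFFFFFu) * (dk >> 32);      /* widening MAC   */
        }
    }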
 /* scalar variants - universal */
 
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
+/*
+ * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
+ * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
+ *
+ * While this might not seem like much, as AArch64 is a 64-bit architecture, only
+ * big Cortex designs have a full 64-bit multiplier.
+ *
+ * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
+ * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
+ * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
+ *
+ * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
+ * not have this penalty and does the mask automatically.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    xxh_u64 ret;
+    /* note: %x = 64-bit register, %w = 32-bit register */
+    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
+    return ret;
+}
+#else
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
+}
+#endif
+
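The inline asm exists purely to pin the codegen to a single UMADDL; its arithmetic is identical to the portable `#else` branch. In plain C (hypothetical helper name):

    #include <stdint.h>

    /* umaddl Xd, Wn, Wm, Xa : Xd = (u64)Wn * (u64)Wm + Xa.
     * The truncation to 32 bits replaces the "excess mask" the
     * compilers would otherwise emit. */
    static uint64_t umaddl_model(uint64_t lhs, uint64_t rhs, uint64_t acc)
    {
        return (uint64_t)(uint32_t)lhs * (uint64_t)(uint32_t)rhs + acc;
    }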
 /*!
  * @internal
  * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
@@ -4630,7 +4991,7 @@ XXH3_scalarRound(void* XXH_RESTRICT acc,
     xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
     xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
     xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
-    xacc[lane]
+    xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
     }
 }
 
@@ -4655,6 +5016,7 @@ XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
         XXH3_scalarRound(acc, input, secret, i);
     }
 }
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)
 
 /*!
  * @internal
@@ -4706,10 +5068,10 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
     const xxh_u8* kSecretPtr = XXH3_kSecret;
     XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
 
-#if defined(
+#if defined(__GNUC__) && defined(__aarch64__)
     /*
      * UGLY HACK:
-     * Clang
+     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
      * placed sequentially, in order, at the top of the unrolled loop.
      *
      * While MOVK is great for generating constants (2 cycles for a 64-bit
@@ -4724,7 +5086,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
      *   ADD
      *   SUB      STR
      *            STR
-     * By forcing loads from memory (as the asm line causes
+     * By forcing loads from memory (as the asm line causes the compiler to assume
      * that XXH3_kSecretPtr has been changed), the pipelines are used more
      * efficiently:
      *   I   L   S
@@ -4741,17 +5103,11 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
      */
     XXH_COMPILER_GUARD(kSecretPtr);
 #endif
-    /*
-     * Note: in debug mode, this overrides the asm optimization
-     * and Clang will emit MOVK chains again.
-     */
-    XXH_ASSERT(kSecretPtr == XXH3_kSecret);
-
     {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
         int i;
         for (i=0; i < nbRounds; i++) {
             /*
-             * The asm hack causes
+             * The asm hack causes the compiler to assume that kSecretPtr aliases with
              * customSecret, and on aarch64, this prevented LDP from merging two
              * loads together for free. Putting the loads together before the stores
             * properly generates LDP.
@@ -4764,7 +5120,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
 }
 
 
-typedef void (*
+typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
 typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
 typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 
@@ -4772,36 +5128,48 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #if (XXH_VECTOR == XXH_AVX512)
 
 #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_accumulate     XXH3_accumulate_avx512
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
 
 #elif (XXH_VECTOR == XXH_AVX2)
 
 #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_accumulate     XXH3_accumulate_avx2
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
 
 #elif (XXH_VECTOR == XXH_SSE2)
 
 #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_accumulate     XXH3_accumulate_sse2
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
 #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
 
 #elif (XXH_VECTOR == XXH_NEON)
 
 #define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_accumulate     XXH3_accumulate_neon
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
 #elif (XXH_VECTOR == XXH_VSX)
 
 #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_accumulate     XXH3_accumulate_vsx
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
+#elif (XXH_VECTOR == XXH_SVE)
+#define XXH3_accumulate_512 XXH3_accumulate_512_sve
+#define XXH3_accumulate     XXH3_accumulate_sve
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
 #else /* scalar */
 
 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_accumulate     XXH3_accumulate_scalar
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
@@ -4812,45 +5180,11 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 #endif
 
-#ifndef XXH_PREFETCH_DIST
-#  ifdef __clang__
-#    define XXH_PREFETCH_DIST 320
-#  else
-#    if (XXH_VECTOR == XXH_AVX512)
-#      define XXH_PREFETCH_DIST 512
-#    else
-#      define XXH_PREFETCH_DIST 384
-#    endif
-#  endif  /* __clang__ */
-#endif  /* XXH_PREFETCH_DIST */
-
-/*
- * XXH3_accumulate()
- * Loops over XXH3_accumulate_512().
- * Assumption: nbStripes will not overflow the secret size
- */
-XXH_FORCE_INLINE void
-XXH3_accumulate(     xxh_u64* XXH_RESTRICT acc,
-                const xxh_u8* XXH_RESTRICT input,
-                const xxh_u8* XXH_RESTRICT secret,
-                      size_t nbStripes,
-                      XXH3_f_accumulate_512 f_acc512)
-{
-    size_t n;
-    for (n = 0; n < nbStripes; n++ ) {
-        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
-        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
-        f_acc512(acc,
-                 in,
-                 secret + n*XXH_SECRET_CONSUME_RATE);
-    }
-}
-
 XXH_FORCE_INLINE void
 XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
                       const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-
+                            XXH3_f_accumulate f_acc,
                             XXH3_f_scrambleAcc f_scramble)
 {
     size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
@@ -4862,7 +5196,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
 
     for (n = 0; n < nb_blocks; n++) {
-
+        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
         f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
     }
 
@@ -4870,12 +5204,12 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
     XXH_ASSERT(len > XXH_STRIPE_LEN);
     {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
         XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
-
+        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
 
         /* last stripe */
         {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
 #define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
-
+            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
     }   }
 }
 
@@ -4920,12 +5254,12 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre
 XXH_FORCE_INLINE XXH64_hash_t
 XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
                            const void* XXH_RESTRICT secret, size_t secretSize,
-
+                           XXH3_f_accumulate f_acc,
                            XXH3_f_scrambleAcc f_scramble)
 {
     XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
 
-    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize,
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
 
     /* converge into final hash */
     XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -4939,13 +5273,15 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
  * It's important for performance to transmit secret's size (when it's static)
  * so that the compiler can properly optimize the vectorized loop.
  * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
  */
-
+XXH3_WITH_SECRET_INLINE XXH64_hash_t
 XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
                              XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
 {
     (void)seed64;
-    return XXH3_hashLong_64b_internal(input, len, secret, secretLen,
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
 }
 
 /*
@@ -4959,7 +5295,7 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
 {
     (void)seed64; (void)secret; (void)secretLen;
-    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
 }
 
 /*
@@ -4976,7 +5312,7 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
 XXH_FORCE_INLINE XXH64_hash_t
 XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
                                     XXH64_hash_t seed,
-
+                                    XXH3_f_accumulate f_acc,
                                     XXH3_f_scrambleAcc f_scramble,
                                     XXH3_f_initCustomSecret f_initSec)
 {
@@ -4984,12 +5320,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
     if (seed == 0)
         return XXH3_hashLong_64b_internal(input, len,
                                           XXH3_kSecret, sizeof(XXH3_kSecret),
-
+                                          f_acc, f_scramble);
 #endif
     {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
         f_initSec(secret, seed);
         return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
-
+                                          f_acc, f_scramble);
     }
 }
 
@@ -4997,12 +5333,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
  * It's important for performance that XXH3_hashLong is not inlined.
  */
 XXH_NO_INLINE XXH64_hash_t
-XXH3_hashLong_64b_withSeed(const void* input, size_t len,
-                           XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
 {
     (void)secret; (void)secretLen;
     return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
-
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
 }
 
 
@@ -5035,27 +5371,27 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
 /* === Public entry point === */
 
 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t length)
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
 {
     return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSecret(const void* input, size_t length, const void* secret, size_t secretSize)
+XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
 {
     return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed)
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
 {
     return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
 }
 
 XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSecretandSeed(const void* input, size_t length, const void* secret, size_t secretSize, XXH64_hash_t seed)
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
 {
     if (length <= XXH3_MIDSIZE_MAX)
         return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
@@ -5148,7 +5484,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API void
-XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
 {
     XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
 }
@@ -5182,7 +5518,7 @@ XXH3_reset_internal(XXH3_state_t* statePtr,
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset(XXH3_state_t* statePtr)
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
 {
     if (statePtr == NULL) return XXH_ERROR;
     XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
@@ -5191,7 +5527,7 @@ XXH3_64bits_reset(XXH3_state_t* statePtr)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
 {
     if (statePtr == NULL) return XXH_ERROR;
     XXH3_reset_internal(statePtr, 0, secret, secretSize);
@@ -5202,7 +5538,7 @@ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
 {
     if (statePtr == NULL) return XXH_ERROR;
     if (seed==0) return XXH3_64bits_reset(statePtr);
@@ -5214,7 +5550,7 @@ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
 {
     if (statePtr == NULL) return XXH_ERROR;
     if (secret == NULL) return XXH_ERROR;
@@ -5224,31 +5560,57 @@ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret,
     return XXH_OK;
 }
 
-
- *
- *
-
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * @param acc                Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr  In/out pointer to the number of leftover stripes in the block*
+ * @param nbStripesPerBlock  Number of stripes in a block
+ * @param input              Input pointer
+ * @param nbStripes          Number of stripes to process
+ * @param secret             Secret pointer
+ * @param secretLimit        Offset of the last block in @p secret
+ * @param f_acc              Pointer to an XXH3_accumulate implementation
+ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
+ * @return                   Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
 XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
                     size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
                     const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
-
+                    XXH3_f_accumulate f_acc,
                     XXH3_f_scrambleAcc f_scramble)
 {
-
-
-    if (nbStripesPerBlock - *nbStripesSoFarPtr
-        /*
-        size_t
-
-
-
-
-
-
-
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
         *nbStripesSoFarPtr += nbStripes;
     }
+    /* Return end pointer */
+    return input;
 }
 
 #ifndef XXH3_STREAM_USE_STACK
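The rewritten XXH3_consumeStripes() now owns all block-boundary bookkeeping that XXH3_update() previously did by hand. A worked trace, assuming nbStripesPerBlock = 16, *nbStripesSoFarPtr = 10 and nbStripes = 40 on entry (hypothetical values):

    /*  iteration 1: finish the open block -> f_acc 6 stripes with
     *               initialSecret at offset 10*RATE, scramble; 34 left
     *  iteration 2: full block            -> f_acc 16, scramble; 18 left
     *  iteration 3: full block            -> f_acc 16, scramble;  2 left
     *  loop exits (2 < 16), *nbStripesSoFarPtr = 0
     *  partial    : f_acc 2 stripes, *nbStripesSoFarPtr = 2,
     *               return input advanced by 40 * XXH_STRIPE_LEN.  */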
@@ -5262,7 +5624,7 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
 XXH_FORCE_INLINE XXH_errorcode
 XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
             const xxh_u8* XXH_RESTRICT input, size_t len,
-
+            XXH3_f_accumulate f_acc,
             XXH3_f_scrambleAcc f_scramble)
 {
     if (input==NULL) {
@@ -5278,7 +5640,8 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
      * when operating accumulators directly into state.
      * Operating into stack space seems to enable proper optimization.
      * clang, on the other hand, doesn't seem to need this trick */
-    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+    XXH_memcpy(acc, state->acc, sizeof(acc));
 #else
     xxh_u64* XXH_RESTRICT const acc = state->acc;
 #endif
@@ -5286,7 +5649,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
     XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
 
     /* small input : just fill in tmp buffer */
-    if (
+    if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
         XXH_memcpy(state->buffer + state->bufferedSize, input, len);
         state->bufferedSize += (XXH32_hash_t)len;
         return XXH_OK;
@@ -5308,57 +5671,20 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
                             &state->nbStripesSoFar, state->nbStripesPerBlock,
                             state->buffer, XXH3_INTERNALBUFFER_STRIPES,
                             secret, state->secretLimit,
-
+                            f_acc, f_scramble);
         state->bufferedSize = 0;
     }
     XXH_ASSERT(input < bEnd);
-
-    /* large input to consume : ingest per full block */
-    if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
+    if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
         size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
-
-        /* join to current block's end */
-        {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
-            XXH_ASSERT(nbStripesToEnd <= nbStripes);
-            XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
-            f_scramble(acc, secret + state->secretLimit);
-            state->nbStripesSoFar = 0;
-            input += nbStripesToEnd * XXH_STRIPE_LEN;
-            nbStripes -= nbStripesToEnd;
-        }
-        /* consume per entire blocks */
-        while(nbStripes >= state->nbStripesPerBlock) {
-            XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
-            f_scramble(acc, secret + state->secretLimit);
-            input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
-            nbStripes -= state->nbStripesPerBlock;
-        }
-        /* consume last partial block */
-        XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
-        input += nbStripes * XXH_STRIPE_LEN;
-        XXH_ASSERT(input < bEnd);  /* at least some bytes left */
-        state->nbStripesSoFar = nbStripes;
-        /* buffer predecessor of last partial stripe */
-        XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
-        XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
-    } else {
-        /* content to consume <= block size */
-        /* Consume input by a multiple of internal buffer size */
-        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
-            const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
-            do {
-                XXH3_consumeStripes(acc,
+        input = XXH3_consumeStripes(acc,
                                    &state->nbStripesSoFar, state->nbStripesPerBlock,
-
-
-
-
-            } while (input<limit);
-            /* buffer predecessor of last partial stripe */
-            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
-        }
-    }
+                                   input, nbStripes,
+                                   secret, state->secretLimit,
+                                   f_acc, f_scramble);
+        XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
 
+    }
     /* Some remaining input (always) : buffer it */
     XXH_ASSERT(input < bEnd);
     XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
@@ -5367,7 +5693,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
     state->bufferedSize = (XXH32_hash_t)(bEnd-input);
 #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
     /* save stack accumulators into state */
-
+    XXH_memcpy(state->acc, acc, sizeof(acc));
 #endif
 }
 
@@ -5376,10 +5702,10 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
 {
     return XXH3_update(state, (const xxh_u8*)input, len,
-
+                       XXH3_accumulate, XXH3_scrambleAcc);
 }
 
 
@@ -5388,37 +5714,40 @@ XXH3_digest_long (XXH64_hash_t* acc,
                   const XXH3_state_t* state,
                   const unsigned char* secret)
 {
+    xxh_u8 lastStripe[XXH_STRIPE_LEN];
+    const xxh_u8* lastStripePtr;
+
     /*
      * Digest on a local copy. This way, the state remains unaltered, and it can
      * continue ingesting more input afterwards.
     */
     XXH_memcpy(acc, state->acc, sizeof(state->acc));
     if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        /* Consume remaining stripes then point to remaining data in buffer */
         size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
         size_t nbStripesSoFar = state->nbStripesSoFar;
         XXH3_consumeStripes(acc,
                            &nbStripesSoFar, state->nbStripesPerBlock,
                            state->buffer, nbStripes,
                            secret, state->secretLimit,
-
-
-        XXH3_accumulate_512(acc,
-                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
-                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+                           XXH3_accumulate, XXH3_scrambleAcc);
+        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
     } else {  /* bufferedSize < XXH_STRIPE_LEN */
-
+        /* Copy to temp buffer */
        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
         XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
         XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
         XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
-
-                            lastStripe,
-                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+        lastStripePtr = lastStripe;
     }
+    /* Last stripe */
+    XXH3_accumulate_512(acc,
+                        lastStripePtr,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
 }
 
 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
 {
     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
     if (state->totalLen > XXH3_MIDSIZE_MAX) {
@@ -5631,7 +5960,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
 #if XXH_SIZE_OPT >= 1
     {
         /* Smaller, but slightly slower. */
-
+        unsigned int i = (unsigned int)(len - 1) / 32;
         do {
             acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
         } while (i-- != 0);
@@ -5669,25 +5998,34 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
     XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
 
     {   XXH128_hash_t acc;
-
-        int i;
+        unsigned i;
         acc.low64 = len * XXH_PRIME64_1;
         acc.high64 = 0;
-
+        /*
+         * We set as `i` as offset + 32. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple agen and good codegen
+         * for the loop.
+         */
+        for (i = 32; i < 160; i += 32) {
             acc = XXH128_mix32B(acc,
-                                input +
-                                input +
-                                secret +
+                                input + i - 32,
+                                input + i - 16,
+                                secret + i - 32,
                                 seed);
         }
         acc.low64 = XXH3_avalanche(acc.low64);
         acc.high64 = XXH3_avalanche(acc.high64);
-
-
+        /*
+         * NB: `i <= len` will duplicate the last 32-bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {
            acc = XXH128_mix32B(acc,
-                                input +
-                                input +
-                                secret + XXH3_MIDSIZE_STARTOFFSET +
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
                                 seed);
         }
         /* last bytes */
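Writing the loop counter as offset + 32 lets the unmodified `len` serve directly as the bound. A worked example for len = 200 (hypothetical input size):

    /*  first loop : i = 32, 64, 96, 128          (4 rounds, covering
     *                                             input[0..127] against
     *                                             secret[0..127])
     *  second loop: i = 160, 192  (both <= 200)  (2 rounds, covering
     *                                             input[128..191] against
     *                                             secret[STARTOFFSET..])
     *  last bytes : input[len-32 .. len-1], mixed with the dedicated
     *               end-of-secret material in the hunk below.          */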
@@ -5695,7 +6033,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
                               input + len - 16,
                               input + len - 32,
                               secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
-
+                              (XXH64_hash_t)0 - seed);
 
         {   XXH128_hash_t h128;
             h128.low64  = acc.low64 + acc.high64;
@@ -5712,12 +6050,12 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
 XXH_FORCE_INLINE XXH128_hash_t
 XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
                             const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-
+                            XXH3_f_accumulate f_acc,
                             XXH3_f_scrambleAcc f_scramble)
 {
     XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
 
-    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize,
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
 
     /* converge into final hash */
     XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -5744,38 +6082,41 @@ XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
 {
     (void)seed64; (void)secret; (void)secretLen;
     return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
-
+                                       XXH3_accumulate, XXH3_scrambleAcc);
 }
 
 /*
  * It's important for performance to pass @p secretLen (when it's static)
 * to the compiler, so that it can properly optimize the vectorized loop.
+ *
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
  */
-
+XXH3_WITH_SECRET_INLINE XXH128_hash_t
 XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
                               XXH64_hash_t seed64,
                               const void* XXH_RESTRICT secret, size_t secretLen)
 {
     (void)seed64;
     return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
-
+                                       XXH3_accumulate, XXH3_scrambleAcc);
 }
 
 XXH_FORCE_INLINE XXH128_hash_t
 XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
                                      XXH64_hash_t seed64,
-
+                                     XXH3_f_accumulate f_acc,
                                      XXH3_f_scrambleAcc f_scramble,
                                      XXH3_f_initCustomSecret f_initSec)
 {
     if (seed64 == 0)
         return XXH3_hashLong_128b_internal(input, len,
                                            XXH3_kSecret, sizeof(XXH3_kSecret),
-
+                                           f_acc, f_scramble);
     {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
         f_initSec(secret, seed64);
         return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
-
+                                           f_acc, f_scramble);
     }
 }
 
@@ -5788,7 +6129,7 @@ XXH3_hashLong_128b_withSeed(const void* input, size_t len,
 {
     (void)secret; (void)secretLen;
     return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
-
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
 }
 
 typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
@@ -5819,7 +6160,7 @@ XXH3_128bits_internal(const void* input, size_t len,
 /* === Public XXH128 API === */
 
 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
 {
     return XXH3_128bits_internal(input, len, 0,
                                  XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -5828,7 +6169,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH128_hash_t
-XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
+XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
 {
     return XXH3_128bits_internal(input, len, 0,
                                  (const xxh_u8*)secret, secretSize,
@@ -5837,7 +6178,7 @@ XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH128_hash_t
-XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
+XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
 {
     return XXH3_128bits_internal(input, len, seed,
                                  XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -5846,7 +6187,7 @@ XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH128_hash_t
-XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
 {
     if (len <= XXH3_MIDSIZE_MAX)
         return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
@@ -5855,7 +6196,7 @@ XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH128_hash_t
-XXH128(const void* input, size_t len, XXH64_hash_t seed)
+XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
 {
     return XXH3_128bits_withSeed(input, len, seed);
 }
@@ -5870,42 +6211,41 @@ XXH128(const void* input, size_t len, XXH64_hash_t seed)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset(XXH3_state_t* statePtr)
+XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
 {
     return XXH3_64bits_reset(statePtr);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
 {
     return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
 {
     return XXH3_64bits_reset_withSeed(statePtr, seed);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
 {
     return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
 {
-    return XXH3_update(state, (const xxh_u8*)input, len,
-                       XXH3_accumulate_512, XXH3_scrambleAcc);
+    return XXH3_64bits_update(state, input, len);
 }
 
 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
 {
     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
     if (state->totalLen > XXH3_MIDSIZE_MAX) {
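
The streaming wrappers above now simply forward to their XXH3_64bits_* counterparts, since the 64- and 128-bit variants share one XXH3_state_t. A minimal sketch of the usual reset/update/digest cycle, assuming a default build of xxhash.h (the string fragments and seed are illustrative):

    #include <stdio.h>
    #include "xxhash.h"

    /* Hash two fragments as one logical message via the streaming API. */
    int main(void)
    {
        XXH3_state_t* const state = XXH3_createState();
        if (state == NULL) return 1;

        if (XXH3_128bits_reset_withSeed(state, 42) == XXH_ERROR) return 1;
        if (XXH3_128bits_update(state, "hello ", 6) == XXH_ERROR) return 1;
        if (XXH3_128bits_update(state, "world", 5) == XXH_ERROR) return 1;

        {   XXH128_hash_t const h = XXH3_128bits_digest(state);
            /* Must equal the one-shot hash of "hello world" with seed 42. */
            XXH128_hash_t const ref = XXH3_128bits_withSeed("hello world", 11, 42);
            printf("match: %d\n", XXH128_isEqual(h, ref));
        }
        XXH3_freeState(state);
        return 0;
    }
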
@@ -5947,7 +6287,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
  * <0 if *h128_1  < *h128_2
  * =0 if *h128_1 == *h128_2 */
 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
 {
     XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
     XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
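
XXH128_cmp takes const void* parameters precisely so it matches the comparator signature expected by qsort()/bsearch(). A small illustrative helper (the name sort_hashes is ours, not part of the header):

    #include <stdlib.h>
    #include "xxhash.h"

    /* Sort an array of 128-bit digests using the library's own comparator. */
    static void sort_hashes(XXH128_hash_t* hashes, size_t count)
    {
        qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
    }
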
@@ -5961,7 +6301,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
 /*====== Canonical representation ======*/
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API void
-XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
 {
     XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
     if (XXH_CPU_LITTLE_ENDIAN) {
@@ -5974,7 +6314,7 @@ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH128_hash_t
-XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
 {
     XXH128_hash_t h;
     h.high64 = XXH_readBE64(src);
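
The canonical form fixes a big-endian byte layout, so digests can be stored or transmitted portably regardless of host endianness. A round-trip sketch under the same assumptions as the earlier examples:

    #include <stdio.h>
    #include "xxhash.h"

    /* Serialize a hash to its canonical (big-endian) form and read it back. */
    int main(void)
    {
        XXH128_hash_t const h = XXH128("payload", 7, 0);
        XXH128_canonical_t canon;

        XXH128_canonicalFromHash(&canon, h);          /* serialize */
        {   XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
            printf("round-trip ok: %d\n", XXH128_isEqual(h, back));
        }
        return 0;
    }
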
@@ -5998,7 +6338,7 @@ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
+XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
 {
 #if (XXH_DEBUGLEVEL >= 1)
     XXH_ASSERT(secretBuffer != NULL);
@@ -6043,7 +6383,7 @@ XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSee
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API void
-XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
+XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
 {
     XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
     XXH3_initCustomSecret(secret, seed);
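
XXH3_generateSecret stretches arbitrary, possibly low-entropy seed material into a full-entropy secret that can then be passed to the *_withSecret entry points. A minimal sketch (buffer sizing uses XXH3_SECRET_SIZE_MIN from the header; the seed string and payload are illustrative):

    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        unsigned char secret[XXH3_SECRET_SIZE_MIN];
        const char seedMaterial[] = "not very random";

        /* Derive a custom secret from the seed material. */
        if (XXH3_generateSecret(secret, sizeof(secret),
                                seedMaterial, strlen(seedMaterial)) == XXH_ERROR)
            return 1;

        /* Hash with the derived secret. */
        {   XXH128_hash_t const h =
                XXH3_128bits_withSecret("payload", 7, secret, sizeof(secret));
            (void)h;
        }
        return 0;
    }
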
@@ -6071,5 +6411,5 @@ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
 
 
 #if defined (__cplusplus)
-}
+} /* extern "C" */
 #endif