digest-xxhash 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/digest/xxhash/ext.c +1 -1
- data/ext/digest/xxhash/xxhash.h +820 -480
- data/lib/digest/xxhash/version.rb +1 -1
- metadata +3 -3
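
Most of the xxhash.h changes below annotate public-API pointer parameters with a new XXH_NOESCAPE macro, which expands to Clang's __attribute__((noescape)) and to nothing on other compilers. As a rough sketch of what the annotated prototypes mean for callers (the caller function below is hypothetical, not part of the package):

    /* Under Clang, XXH64's prototype effectively becomes: */
    XXH64_hash_t XXH64(__attribute__((noescape)) const void* input,
                       size_t length, XXH64_hash_t seed);

    /* noescape promises the callee does not retain the pointer past the
     * call, so a caller may pass stack buffers without the optimizer
     * having to assume the pointer escapes: */
    XXH64_hash_t hash_greeting(void)
    {
        char buf[8] = "hello";
        return XXH64(buf, 5, 0);  /* pointer provably does not escape */
    }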
data/ext/digest/xxhash/xxhash.h
CHANGED
@@ -716,8 +716,15 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni
 # define XXH_HAS_ATTRIBUTE(x) 0
 #endif
 
+/*
+ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
+ * leave as `201711L` (C17 + 1).
+ * TODO: Update to correct value when its been specified.
+ */
+#define XXH_C23_VN 201711L
+
 /* C-language Attributes are added in C23. */
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
 # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
 #else
 # define XXH_HAS_C_ATTRIBUTE(x) 0
@@ -743,6 +750,18 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni
 # define XXH_FALLTHROUGH /* fallthrough */
 #endif
 
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((noescape))
+#else
+# define XXH_NOESCAPE
+#endif
+
+
 /*!
  * @}
  * @ingroup public
@@ -813,7 +832,7 @@ typedef uint64_t XXH64_hash_t;
  * @see
  *    XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
  */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
 
 /*******   Streaming   *******/
 #ifndef XXH_NO_STREAM
@@ -825,16 +844,16 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XX
 typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
 XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
 XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
-XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
 
-XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
 #endif /* !XXH_NO_STREAM */
 /*******   Canonical representation   *******/
 typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
-XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
 
 #ifndef XXH_NO_XXH3
 
@@ -872,7 +891,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canoni
  *
  * XXH3 implementation is portable:
  * it has a generic C90 formulation that can be compiled on any platform,
- * all implementations
+ * all implementations generate exactly the same hash value on all platforms.
  * Starting from v0.8.0, it's also labelled "stable", meaning that
  * any future version will also generate the same hash value.
  *
@@ -902,7 +921,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canoni
  * @see
  *    XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
  */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
 
 /*!
  * @brief 64-bit seeded variant of XXH3
@@ -919,7 +938,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t leng
  * @param length The length
  * @param seed The 64-bit seed to alter the state.
  */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
 
 /*!
  * The bare minimum size for a custom secret.
@@ -948,7 +967,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, si
  * This is not necessarily the case when using the blob of bytes directly
  * because, when hashing _small_ inputs, only a portion of the secret is employed.
  */
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
 
 
 /*******   Streaming   *******/
@@ -968,20 +987,20 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, s
 typedef struct XXH3_state_s XXH3_state_t;
 XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
 XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
-XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
 
 /*
  * XXH3_64bits_reset():
  * Initialize with default parameters.
  * digest will be equivalent to `XXH3_64bits()`.
  */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
 /*
  * XXH3_64bits_reset_withSeed():
  * Generate a custom secret from `seed`, and store it into `statePtr`.
  * digest will be equivalent to `XXH3_64bits_withSeed()`.
  */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
 /*!
  * XXH3_64bits_reset_withSecret():
  * `secret` is referenced, it _must outlive_ the hash streaming session.
@@ -991,10 +1010,10 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr,
  * When in doubt about the randomness of a candidate `secret`,
  * consider employing `XXH3_generateSecret()` instead (see below).
  */
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
 
-XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
 #endif /* !XXH_NO_STREAM */
 
 /* note : canonical representation of XXH3 is the same as XXH64
@@ -1033,11 +1052,11 @@ typedef struct {
  * @see
  *    XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
  */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
 /*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
 /*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
 
 /*******   Streaming   *******/
 #ifndef XXH_NO_STREAM
@@ -1053,12 +1072,12 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data,
  * All reset and streaming functions have same meaning as their 64-bit counterpart.
  */
 
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
 
-XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
 #endif /* !XXH_NO_STREAM */
 
 /* Following helper functions make it possible to compare XXH128_hast_t values.
@@ -1079,13 +1098,13 @@ XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
  * =0 if *h128_1 == *h128_2
  * <0 if *h128_1  < *h128_2
  */
-XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(const void* h128_1, const void* h128_2);
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
 
 
 /*******   Canonical representation   *******/
 typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
-XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
 
 
 #endif  /* !XXH_NO_XXH3 */
@@ -1266,13 +1285,18 @@ struct XXH3_state_s {
  * Note that this doesn't prepare the state for a streaming operation,
  * it's still necessary to use XXH3_NNbits_reset*() afterwards.
  */
-#define XXH3_INITSTATE(XXH3_state_ptr)
+#define XXH3_INITSTATE(XXH3_state_ptr)                       \
+    do {                                                     \
+        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
+        tmp_xxh3_state_ptr->seed = 0;                        \
+        tmp_xxh3_state_ptr->extSecret = NULL;                \
+    } while(0)
 
 
 /*!
  * simple alias to pre-selected XXH3_128bits variant
  */
-XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
 
 
 /* ===   Experimental API   === */
@@ -1329,7 +1353,7 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH6
  * }
  * @endcode
  */
-XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
 
 /*!
  * @brief Generate the same secret as the _withSeed() variants.
@@ -1368,7 +1392,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secr
  * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
  * @param seed The seed to seed the state.
  */
-XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
 
 /*!
  * These variants generate hash values using either
@@ -1397,24 +1421,24 @@ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_
  * because only portions of the secret are employed for small data.
  */
 XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
-XXH3_64bits_withSecretandSeed(const void* data, size_t len,
-                              const void* secret, size_t secretSize,
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
                               XXH64_hash_t seed);
 /*! @copydoc XXH3_64bits_withSecretandSeed() */
 XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
-XXH3_128bits_withSecretandSeed(const void* input, size_t length,
-                               const void* secret, size_t secretSize,
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
                                XXH64_hash_t seed64);
 #ifndef XXH_NO_STREAM
 /*! @copydoc XXH3_64bits_withSecretandSeed() */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
-                                    const void* secret, size_t secretSize,
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
                                     XXH64_hash_t seed64);
 /*! @copydoc XXH3_64bits_withSecretandSeed() */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
-                                     const void* secret, size_t secretSize,
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
                                      XXH64_hash_t seed64);
 #endif /* !XXH_NO_STREAM */
 
@@ -1522,7 +1546,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
  * care, as what works on one compiler/platform/optimization level may cause
  * another to read garbage data or even crash.
  *
- * See
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
  *
  * Prefer these methods in priority order (0 > 3 > 1 > 2)
  */
@@ -1608,6 +1632,23 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
  */
 # define XXH_NO_INLINE_HINTS 0
 
+/*!
+ * @def XXH3_INLINE_SECRET
+ * @brief Determines whether to inline the XXH3 withSecret code.
+ *
+ * When the secret size is known, the compiler can improve the performance
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
+ *
+ * However, if the secret size is not known, it doesn't have any benefit. This
+ * happens when xxHash is compiled into a global symbol. Therefore, if
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
+ *
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
+ * that are *sometimes* force inline on -Og, and it is impossible to automatically
+ * detect this optimization level.
+ */
+# define XXH3_INLINE_SECRET 0
+
 /*!
  * @def XXH32_ENDJMP
  * @brief Whether to use a jump for `XXH32_finalize`.
@@ -1682,6 +1723,15 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
 # endif
 #endif
 
+#ifndef XXH3_INLINE_SECRET
+# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+     || !defined(XXH_INLINE_ALL)
+#   define XXH3_INLINE_SECRET 0
+# else
+#   define XXH3_INLINE_SECRET 1
+# endif
+#endif
+
 #ifndef XXH32_ENDJMP
 /* generally preferable for performance */
 # define XXH32_ENDJMP 0
@@ -1778,6 +1828,11 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
 # define XXH_NO_INLINE static
 #endif
 
+#if XXH3_INLINE_SECRET
+# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
+#else
+# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
+#endif
 
 
 /* *************************************
@@ -1803,7 +1858,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
 #  include <assert.h>   /* note: can still be disabled with NDEBUG */
 #  define XXH_ASSERT(c)   assert(c)
 #else
-#  define XXH_ASSERT(c)   ((void)0)
+#  define XXH_ASSERT(c)   XXH_ASSUME(c)
 #endif
 
 /* note: use after variable declarations */
@@ -1835,11 +1890,17 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
  * XXH3_initCustomSecret_scalar().
  */
 #if defined(__GNUC__) || defined(__clang__)
-# define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
+# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
 #else
 # define XXH_COMPILER_GUARD(var) ((void)0)
 #endif
 
+#if defined(__clang__)
+# define XXH_COMPILER_GUARD_W(var) __asm__("" : "+w" (var))
+#else
+# define XXH_COMPILER_GUARD_W(var) ((void)0)
+#endif
+
 
 /* *************************************
 *  Basic Types
 ***************************************/
@@ -1946,7 +2007,7 @@ static xxh_u32 XXH_read32(const void* ptr)
 
 /*
  * Portable and safe solution. Generally efficient.
- * see:
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
 */
 static xxh_u32 XXH_read32(const void* memPtr)
 {
@@ -2022,6 +2083,51 @@ static int XXH_isLittleEndian(void)
 # define XXH_HAS_BUILTIN(x) 0
 #endif
 
+
+
+/*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
+ * #  include <stddef.h>
+ * #  ifdef unreachable
+ * #    define XXH_UNREACHABLE() unreachable()
+ * #  endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * #  include <utility>
+ * #  define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC12
+ */
+
+#if XXH_HAS_BUILTIN(__builtin_unreachable)
+# define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+# define XXH_UNREACHABLE() __assume(0)
+
+#else
+# define XXH_UNREACHABLE()
+#endif
+
+#if XXH_HAS_BUILTIN(__builtin_assume)
+# define XXH_ASSUME(c) __builtin_assume(c)
+#else
+# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+#endif
+
 /*!
  * @internal
  * @def XXH_rotl32(x,r)
@@ -2211,9 +2317,9 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
      * can load data, while v3 can multiply. SSE forces them to operate
      * together.
      *
-     * This is also enabled on AArch64, as Clang
-     *
-     *
+     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
+     * than half the speed.
      */
     XXH_COMPILER_GUARD(acc);
 #endif
@@ -2288,41 +2394,41 @@ XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
     } else {
          switch(len&15) /* or switch(bEnd - p) */ {
            case 12: XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
            case 8:  XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
            case 4:  XXH_PROCESS4;
                     return XXH32_avalanche(hash);
 
            case 13: XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 9:  XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 5:  XXH_PROCESS4;
                     XXH_PROCESS1;
                     return XXH32_avalanche(hash);
 
           case 14: XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 10: XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 6:  XXH_PROCESS4;
                     XXH_PROCESS1;
                     XXH_PROCESS1;
                     return XXH32_avalanche(hash);
 
           case 15: XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 11: XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 7:  XXH_PROCESS4;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 3:  XXH_PROCESS1;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 2:  XXH_PROCESS1;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 1:  XXH_PROCESS1;
-                    XXH_FALLTHROUGH;
+                    XXH_FALLTHROUGH;  /* fallthrough */
           case 0:  return XXH32_avalanche(hash);
         }
         XXH_ASSERT(0);
@@ -2590,7 +2696,7 @@ static xxh_u64 XXH_read64(const void* ptr)
 
 /*
  * Portable and safe solution. Generally efficient.
- * see:
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
 */
 static xxh_u64 XXH_read64(const void* memPtr)
 {
@@ -2823,7 +2929,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
 
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
 {
 #if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
     /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
@@ -2857,13 +2963,13 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
 }
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
 {
     XXH_memcpy(dstState, srcState, sizeof(*dstState));
 }
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
 {
     XXH_ASSERT(statePtr != NULL);
     memset(statePtr, 0, sizeof(*statePtr));
@@ -2876,7 +2982,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s
 
 /*! @ingroup XXH64_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
 {
     if (input==NULL) {
         XXH_ASSERT(len == 0);
@@ -2927,7 +3033,7 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
 
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
 {
     xxh_u64 h64;
 
@@ -2950,7 +3056,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
 /*******   Canonical representation   *******/
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
 {
     XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
     if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
@@ -2958,7 +3064,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t
 }
 
 /*! @ingroup XXH64_family */
-XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
 {
     return XXH_readBE64(src);
 }
@@ -2979,11 +3085,19 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
 /* ===   Compiler specifics   === */
 
 #if ((defined(sun) || defined(__sun)) && __cplusplus)  /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
-#  define XXH_RESTRICT
+#  define XXH_RESTRICT   /* disable */
 #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
 #  define XXH_RESTRICT   restrict
+#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
+   || (defined (__clang__)) \
+   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
+   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
+/*
+ * There are a LOT more compilers that recognize __restrict but this
+ * covers the major ones.
+ */
+#  define XXH_RESTRICT   __restrict
 #else
-/* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
 #  define XXH_RESTRICT   /* disable */
 #endif
 
@@ -2998,9 +3112,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
 #endif
 
 #if defined(__GNUC__) || defined(__clang__)
+#  if defined(__ARM_FEATURE_SVE)
+#    include <arm_sve.h>
+#  endif
 #  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
-   || defined(
-   || defined(_M_ARM64)
+   || (defined(_M_ARM) && _M_ARM >= 7) \
+   || defined(_M_ARM64) || defined(_M_ARM64EC)
 #    define inline __inline__  /* circumvent a clang bug */
 #    include <arm_neon.h>
 #    undef inline
@@ -3125,12 +3242,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
     XXH_AVX512 = 3,  /*!< AVX512 for Skylake and Icelake */
     XXH_NEON   = 4,  /*!< NEON for most ARMv7-A and all AArch64 */
     XXH_VSX    = 5,  /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+    XXH_SVE    = 6,  /*!< SVE for some ARMv8-A and ARMv9-A */
 };
 /*!
  * @ingroup tuning
  * @brief Selects the minimum alignment for XXH3's accumulators.
  *
- * When using SIMD, this should match the alignment
+ * When using SIMD, this should match the alignment required for said vector
  * type, so, for example, 32 for AVX2.
  *
  * Default: Auto detected.
@@ -3146,10 +3264,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #  define XXH_AVX512 3
 #  define XXH_NEON   4
 #  define XXH_VSX    5
+#  define XXH_SVE    6
 #endif
 
 #ifndef XXH_VECTOR    /* can be defined on command line */
-#  if (
+#  if defined(__ARM_FEATURE_SVE)
+#    define XXH_VECTOR XXH_SVE
+#  elif ( \
         defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
      || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
    ) && ( \
@@ -3172,6 +3293,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #  endif
 #endif
 
+/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
+#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
+#  ifdef _MSC_VER
+#    pragma warning(once : 4606)
+#  else
+#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
+#  endif
+#  undef XXH_VECTOR
+#  define XXH_VECTOR XXH_SCALAR
+#endif
+
 /*
  * Controls the alignment of the accumulator,
  * for compatibility with aligned vector loads, which are usually faster.
@@ -3191,16 +3323,26 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #    define XXH_ACC_ALIGN 16
 #  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
 #    define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_SVE   /* sve */
+#    define XXH_ACC_ALIGN 64
 #  endif
 #endif
 
 #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
     || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
 #  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_SVE
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
 #else
 #  define XXH_SEC_ALIGN 8
 #endif
 
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_ALIASING __attribute__((may_alias))
+#else
+#  define XXH_ALIASING /* nothing */
+#endif
+
 /*
  * UGLY HACK:
  * GCC usually generates the best code with -O3 for xxHash.
@@ -3229,107 +3371,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #   pragma GCC optimize("-O2")
 #endif
 
-
 #if XXH_VECTOR == XXH_NEON
+
 /*
- *
- *
- *
- * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
- *
- * To do the same operation, the 128-bit 'Q' register needs to be split into
- * two 64-bit 'D' registers, performing this operation::
- *
- *   [ a | b ]
- *   | '---------. .--------' |
- *   |         x              |
- *   | .---------' '--------. |
- *   [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
- *
- * Due to significant changes in aarch64, the fastest method for aarch64 is
- * completely different than the fastest method for ARMv7-A.
- *
- * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
- * D11 will modify the high half of Q5. This is similar to how modifying AH
- * will only affect bits 8-15 of AX on x86.
- *
- * VZIP takes two registers, and puts even lanes in one register and odd lanes
- * in the other.
- *
- * On ARMv7-A, this strangely modifies both parameters in place instead of
- * taking the usual 3-operand form.
- *
- * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
- * lower and upper halves of the Q register to end up with the high and low
- * halves where we want - all in one instruction.
- *
- *   vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
- *
- * Unfortunately we need inline assembly for this: Instructions modifying two
- * registers at once is not possible in GCC or Clang's IR, and they have to
- * create a copy.
+ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
+ * optimizes out the entire hashLong loop because of the aliasing violation.
 *
- *
- *
- * In order to make it easier to write a decent compiler for aarch64, many
- * quirks were removed, such as conditional execution.
- *
- * NEON was also affected by this.
- *
- * aarch64 cannot access the high bits of a Q-form register, and writes to a
- * D-form register zero the high bits, similar to how writes to W-form scalar
- * registers (or DWORD registers on x86_64) work.
- *
- * The formerly free vget_high intrinsics now require a vext (with a few
- * exceptions)
- *
- * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
- * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
- * operand.
- *
- * The equivalent of the VZIP.32 on the lower and upper halves would be this
- * mess:
- *
- *   ext     v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
- *   zip1    v1.2s, v0.2s, v2.2s     // v1 = { v0[0], v2[0] }
- *   zip2    v0.2s, v0.2s, v1.2s     // v0 = { v0[1], v2[1] }
- *
- * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
- *
- *   shrn    v1.2s, v0.2d, #32  // v1 = (uint32x2_t)(v0 >> 32);
- *   xtn     v0.2s, v0.2d       // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
- *
- * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
+ * so the only option is to mark it as aliasing.
  */
-
-/*!
- * Function-like macro:
- * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
- * {
- *     outLo = (uint32x2_t)(in & 0xFFFFFFFF);
- *     outHi = (uint32x2_t)(in >> 32);
- *     in = UNDEFINED;
- * }
- */
-# if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
-   && (defined(__GNUC__) || defined(__clang__)) \
-   && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
-#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
-    do { \
-      /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
-      /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
-      /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
-      __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
-      (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
-      (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
-    } while (0)
-# else
-#  define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
-    do { \
-      (outLo) = vmovn_u64 (in); \
-      (outHi) = vshrn_n_u64 ((in), 32); \
-    } while (0)
-# endif
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
 
 /*!
  * @internal
@@ -3347,7 +3398,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
 #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
 XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
 {
-    return *(
+    return *(xxh_aliasing_uint64x2_t const *)ptr;
 }
 #else
 XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
@@ -3355,38 +3406,75 @@ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
     return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
 }
 #endif
+
+/*!
+ * @internal
+ * @brief `vmlal_u32` on low and high halves of a vector.
+ *
+ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
+ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* Inline assembly is the only way */
+    __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
+    return acc;
+}
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* This intrinsic works as expected */
+    return vmlal_high_u32(acc, lhs, rhs);
+}
+#else
+/* Portable intrinsic versions */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
+}
+/*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
+}
+#endif
 
 /*!
  * @ingroup tuning
  * @brief Controls the NEON to scalar ratio for XXH3
- *
- * 2 lanes on scalar by default.
+ * This can be set to 2, 4, 6, or 8.
  *
- *
- * emulated 64-bit arithmetic is too slow.
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
 *
- *
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
 *
- *
- * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
- * you are only using 2/3 of the CPU bandwidth.
- *
- * This is even more noticable on the more advanced cores like the A76 which
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
  * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
 *
- * Therefore,
- *
- *
- *
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
 *
 * This change benefits CPUs with large micro-op buffers without negatively affecting
- * other CPUs:
+ * most other CPUs:
 *
 *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
 *  |:----------------------|:--------------------|----------:|-----------:|------:|
 *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
 *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
 *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
 *
 * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
 *
@@ -3394,7 +3482,7 @@ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
 */
 # ifndef XXH3_NEON_LANES
 #  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
-   && XXH_SIZE_OPT <= 0
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
 #   define XXH3_NEON_LANES 6
 #  else
 #   define XXH3_NEON_LANES XXH_ACC_NB
@@ -3442,6 +3530,11 @@ typedef __vector unsigned long long xxh_u64x2;
 typedef __vector unsigned char xxh_u8x16;
 typedef __vector unsigned xxh_u32x4;
 
+/*
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
+ */
+typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
 # ifndef XXH_VSX_BE
 #  if defined(__BIG_ENDIAN__) \
   || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
@@ -3516,6 +3609,20 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
 # endif /* XXH_vec_mulo, XXH_vec_mule */
 #endif /* XXH_VECTOR == XXH_VSX */
 
+#if XXH_VECTOR == XXH_SVE
+#define ACCRND(acc, offset) \
+do { \
+    svuint64_t input_vec = svld1_u64(mask, xinput + offset); \
+    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \
+    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \
+    svuint64_t swapped = svtbl_u64(input_vec, kSwap); \
+    svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \
+    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \
+    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
+    acc = svadd_u64_x(mask, acc, mul); \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE */
+
 
 /* prefetch
  * can be disabled, by declaring XXH_NO_PREFETCH build macro */
@@ -3952,31 +4059,33 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
     XXH_ASSERT(16 < len && len <= 128);
 
-    {   xxh_u64 acc = len * XXH_PRIME64_1;
+    {   xxh_u64 acc = len * XXH_PRIME64_1, acc_end;
 #if XXH_SIZE_OPT >= 1
         /* Smaller and cleaner, but slightly slower. */
-
+        unsigned int i = (unsigned int)(len - 1) / 32;
         do {
            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
        } while (i-- != 0);
+        acc_end = 0;
 #else
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc_end = XXH3_mix16B(input+len-16, secret+16, seed);
        if (len > 32) {
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc_end += XXH3_mix16B(input+len-32, secret+48, seed);
            if (len > 64) {
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc_end += XXH3_mix16B(input+len-48, secret+80, seed);
+
                if (len > 96) {
                    acc += XXH3_mix16B(input+48, secret+96, seed);
-
+                    acc_end += XXH3_mix16B(input+len-64, secret+112, seed);
                }
-                acc += XXH3_mix16B(input+32, secret+64, seed);
-                acc += XXH3_mix16B(input+len-48, secret+80, seed);
            }
-            acc += XXH3_mix16B(input+16, secret+32, seed);
-            acc += XXH3_mix16B(input+len-32, secret+48, seed);
        }
-        acc += XXH3_mix16B(input+0, secret+0, seed);
-        acc += XXH3_mix16B(input+len-16, secret+16, seed);
 #endif
-        return XXH3_avalanche(acc);
+        return XXH3_avalanche(acc + acc_end);
     }
 }
@@ -3994,13 +4103,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
 #define XXH3_MIDSIZE_LASTOFFSET  17
 
     {   xxh_u64 acc = len * XXH_PRIME64_1;
-
-        int
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
        for (i=0; i<8; i++) {
            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
        }
-
+        /* last bytes */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
        XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);
 #if defined(__clang__)                                /* Clang */ \
    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
|
|
4027
4140
|
#pragma clang loop vectorize(disable)
|
4028
4141
|
#endif
|
4029
4142
|
for (i=8 ; i < nbRounds; i++) {
|
4030
|
-
|
4143
|
+
/*
|
4144
|
+
* Prevents clang for unrolling the acc loop and interleaving with this one.
|
4145
|
+
*/
|
4146
|
+
XXH_COMPILER_GUARD(acc);
|
4147
|
+
acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
|
4031
4148
|
}
|
4032
|
-
|
4033
|
-
acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
|
4034
|
-
return XXH3_avalanche(acc);
|
4149
|
+
return XXH3_avalanche(acc + acc_end);
|
4035
4150
|
}
|
4036
4151
|
}
|
4037
4152
|
|
@@ -4047,6 +4162,47 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
 #  define ACC_NB XXH_ACC_NB
 #endif
 
+#ifndef XXH_PREFETCH_DIST
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * These macros are to generate an XXH3_accumulate() function.
+ * The two arguments select the name suffix and target attribute.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>().
+ *
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
 XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
 {
     if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
@@ -4115,7 +4271,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
         /* data_key    = data_vec ^ key_vec; */
         __m512i const data_key    = _mm512_xor_si512   (data_vec, key_vec);
         /* data_key_lo = data_key >> 32; */
-        __m512i const data_key_lo =
+        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
         /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
         __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
         /* xacc[0] += swap(data_vec); */
|
|
4125
4281
|
*xacc = _mm512_add_epi64(product, sum);
|
4126
4282
|
}
|
4127
4283
|
}
|
4284
|
+
XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
|
4128
4285
|
|
4129
4286
|
/*
|
4130
4287
|
* XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
|
@@ -4158,13 +4315,12 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
         /* xacc[0] ^= (xacc[0] >> 47) */
         __m512i const acc_vec     = *xacc;
         __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
-        __m512i const data_vec    = _mm512_xor_si512     (acc_vec, shifted);
         /* xacc[0] ^= secret; */
         __m512i const key_vec     = _mm512_loadu_si512   (secret);
-        __m512i const data_key    =
+        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
 
         /* xacc[0] *= XXH_PRIME32_1; */
-        __m512i const data_key_hi =
+        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
         __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
         __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
         *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
|
|
4179
4335
|
XXH_ASSERT(((size_t)customSecret & 63) == 0);
|
4180
4336
|
(void)(&XXH_writeLE64);
|
4181
4337
|
{ int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
|
4182
|
-
__m512i const
|
4338
|
+
__m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
|
4339
|
+
__m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
|
4183
4340
|
|
4184
4341
|
const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret);
|
4185
4342
|
__m512i* const dest = ( __m512i*) customSecret;
|
@@ -4187,14 +4344,7 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
         XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
         XXH_ASSERT(((size_t)dest & 63) == 0);
         for (i=0; i < nbRounds; ++i) {
-
-             * this will warn "discards 'const' qualifier". */
-            union {
-                const __m512i* cp;
-                void* p;
-            } remote_const_void;
-            remote_const_void.cp = src + i;
-            dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
+            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
     }   }
 }
 
@@ -4230,7 +4380,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
             /* data_key    = data_vec ^ key_vec; */
             __m256i const data_key    = _mm256_xor_si256   (data_vec, key_vec);
             /* data_key_lo = data_key >> 32; */
-            __m256i const data_key_lo =
+            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
             /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
             __m256i const product     = _mm256_mul_epu32   (data_key, data_key_lo);
             /* xacc[i] += swap(data_vec); */
@@ -4240,6 +4390,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
             xacc[i] = _mm256_add_epi64(product, sum);
     }   }
 }
+XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
 
 XXH_FORCE_INLINE XXH_TARGET_AVX2 void
 XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -4262,7 +4413,7 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
             __m256i const data_key   = _mm256_xor_si256   (data_vec, key_vec);
 
             /* xacc[i] *= XXH_PRIME32_1; */
-            __m256i const data_key_hi =
+            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
             __m256i const prod_lo    = _mm256_mul_epu32   (data_key, prime32);
             __m256i const prod_hi    = _mm256_mul_epu32   (data_key_hi, prime32);
             xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
@@ -4294,12 +4445,12 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR
|
|
4294
4445
|
XXH_ASSERT(((size_t)dest & 31) == 0);
|
4295
4446
|
|
4296
4447
|
/* GCC -O2 need unroll loop manually */
|
4297
|
-
dest[0] = _mm256_add_epi64(
|
4298
|
-
dest[1] = _mm256_add_epi64(
|
4299
|
-
dest[2] = _mm256_add_epi64(
|
4300
|
-
dest[3] = _mm256_add_epi64(
|
4301
|
-
dest[4] = _mm256_add_epi64(
|
4302
|
-
dest[5] = _mm256_add_epi64(
|
4448
|
+
dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
|
4449
|
+
dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
|
4450
|
+
dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
|
4451
|
+
dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
|
4452
|
+
dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
|
4453
|
+
dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
|
4303
4454
|
}
|
4304
4455
|
}
|
4305
4456
|
|
@@ -4346,6 +4497,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
|
|
4346
4497
|
xacc[i] = _mm_add_epi64(product, sum);
|
4347
4498
|
} }
|
4348
4499
|
}
|
4500
|
+
XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
|
4349
4501
|
|
4350
4502
|
XXH_FORCE_INLINE XXH_TARGET_SSE2 void
|
4351
4503
|
XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
|
@@ -4431,6 +4583,16 @@ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
  * CPU, and it also mitigates some GCC codegen issues.
  *
  * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ *
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
+ * integers instead of the other platforms which mask full 64-bit vectors,
+ * so the setup is more complicated than just shifting right.
+ *
+ * Additionally, there is an optimization for 4 lanes at once noted below.
+ *
+ * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
+ * there needs to be *three* versions of the accumulate operation used
+ * for the remaining 2 lanes.
  */
 XXH_FORCE_INLINE void
 XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
@@ -4439,49 +4601,113 @@ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
 {
     XXH_ASSERT((((size_t)acc) & 15) == 0);
     XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
-    {
-
+    { /* GCC for darwin arm64 does not like aliasing here */
+        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
         /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
         uint8_t const* const xinput = (const uint8_t *) input;
         uint8_t const* const xsecret = (const uint8_t *) secret;

         size_t i;
-        /*
+        /* Scalar lanes use the normal scalarRound routine */
         for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
             XXH3_scalarRound(acc, input, secret, i);
         }
-
-
+        i = 0;
+        /* 4 NEON lanes at a time. */
+        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
+            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+            /* key_vec = xsecret[i]; */
+            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+            /*
+             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+             * get one vector with the low 32 bits of each lane, and one vector
+             * with the high 32 bits of each lane.
+             *
+             * This compiles to two instructions on AArch64 and has a paired vector
+             * result, which is an artifact from ARMv7a's version which modified both
+             * vectors in place.
+             *
+             * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+             * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+             */
+            uint32x4x2_t unzipped = vuzpq_u32(
+                vreinterpretq_u32_u64(data_key_1),
+                vreinterpretq_u32_u64(data_key_2)
+            );
+            /* data_key_lo = data_key & 0xFFFFFFFF */
+            uint32x4_t data_key_lo = unzipped.val[0];
+            /* data_key_hi = data_key >> 32 */
+            uint32x4_t data_key_hi = unzipped.val[1];
+            /*
+             * Then, we can split the vectors horizontally and multiply which, as for most
+             * widening intrinsics, have a variant that works on both high half vectors
+             * for free on AArch64.
+             *
+             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+             */
+            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
+            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
+            /*
+             * Clang reorders
+             *    a += b * c;   // umlal swap.2d, dkl.2s, dkh.2s
+             *    c += a;       // add   acc.2d, acc.2d, swap.2d
+             * to
+             *    c += a;       // add   acc.2d, acc.2d, swap.2d
+             *    c += b * c;   // umlal acc.2d, dkl.2s, dkh.2s
+             *
+             * While it would make sense in theory since the addition is faster,
+             * for reasons likely related to umlal being limited to certain NEON
+             * pipelines, this is worse. A compiler guard fixes this.
+             */
+            XXH_COMPILER_GUARD_W(sum_1);
+            XXH_COMPILER_GUARD_W(sum_2);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i]   = vaddq_u64(xacc[i], sum_1);
+            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
+        }
+        /* Operate on the remaining NEON lanes 2 at a time. */
+        for (; i < XXH3_NEON_LANES / 2; i++) {
             /* data_vec = xinput[i]; */
             uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16));
             /* key_vec = xsecret[i]; */
             uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
-            uint64x2_t data_key;
-            uint32x2_t data_key_lo, data_key_hi;
             /* acc_vec_2 = swap(data_vec) */
-            uint64x2_t
+            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
             /* data_key = data_vec ^ key_vec; */
-            data_key = veorq_u64(data_vec, key_vec);
-            /*
-
-
-
-
-
-
-
-
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* For two lanes, just use VMOVN and VSHRN. */
+            /* data_key_lo = data_key & 0xFFFFFFFF; */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* data_key_hi = data_key >> 32; */
+            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
+            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
+            /* Same Clang workaround as before */
+            XXH_COMPILER_GUARD_W(sum);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i] = vaddq_u64 (xacc[i], sum);
         }
-
     }
 }
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)

 XXH_FORCE_INLINE void
 XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 {
     XXH_ASSERT((((size_t)acc) & 15) == 0);

-    {
+    {   xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc;
         uint8_t const* xsecret = (uint8_t const*) secret;
         uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);

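Whether lanes are processed 4 at a time (the `vuzpq_u32` de-interleave) or 2 at a time (`vmovn`/`vshrn`), every 64-bit lane computes the same recurrence as the scalar path. A self-contained scalar model of one lane pair (names are illustrative):

    #include <stdint.h>

    /* Per-lane accumulate round: data_key = data ^ key;
     * acc[i] += swap(data) + lo32(data_key) * hi32(data_key).
     * "swap" exchanges adjacent 64-bit lanes (vextq_u64(v, v, 1)). */
    static void accumulate_lane_pair(uint64_t acc[2],
                                     const uint64_t data[2],
                                     const uint64_t key[2])
    {
        for (int i = 0; i < 2; i++) {
            uint64_t const data_key = data[i] ^ key[i];
            uint64_t const swapped  = data[i ^ 1];
            acc[i] += swapped + (data_key & 0xFFFFFFFFULL) * (data_key >> 32);
        }
    }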
@@ -4493,47 +4719,42 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
         for (i=0; i < XXH3_NEON_LANES / 2; i++) {
             /* xacc[i] ^= (xacc[i] >> 47); */
             uint64x2_t acc_vec  = xacc[i];
-            uint64x2_t shifted  = vshrq_n_u64
-            uint64x2_t data_vec = veorq_u64
+            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
+            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);

             /* xacc[i] ^= xsecret[i]; */
-            uint64x2_t key_vec  = XXH_vld1q_u64
-            uint64x2_t data_key = veorq_u64
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);

             /* xacc[i] *= XXH_PRIME32_1 */
-            uint32x2_t data_key_lo
-
-
-            *
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            prod_hi = vshlq_n_u64(prod_hi, 32);
-            /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
-            xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
-        }
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+            /*
+             * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
+             *
+             * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
+             * incorrectly "optimize" this:
+             *   tmp     = vmul_u32(vmovn_u64(a), vmovn_u64(b));
+             *   shifted = vshll_n_u32(tmp, 32);
+             * to this:
+             *   tmp     = "vmulq_u64"(a, b); // no such thing!
+             *   shifted = vshlq_n_u64(tmp, 32);
+             *
+             * However, unlike SSE, Clang lacks a 64-bit multiply routine
+             * for NEON, and it scalarizes two 64-bit multiplies instead.
+             *
+             * vmull_u32 has the same timing as vmul_u32, and it avoids
+             * this bug completely.
+             * See https://bugs.llvm.org/show_bug.cgi?id=39967
+             */
+            uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
+            /* xacc[i] = prod_hi << 32; */
+            prod_hi = vshlq_n_u64(prod_hi, 32);
+            /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
+            xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
         }
     }
 }
-
 #endif

 #if (XXH_VECTOR == XXH_VSX)
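The scramble step is equally unchanged in meaning: xorshift by 47, XOR with the secret, multiply by XXH_PRIME32_1, with the multiply split into 32-bit halves to sidestep the Clang misoptimization documented in the comment above. One lane modeled in portable C (0x9E3779B1 is XXH_PRIME32_1):

    #include <stdint.h>

    #define PRIME32_1 0x9E3779B1ULL  /* XXH_PRIME32_1 */

    /* Since the prime fits in 32 bits, ((hi*p) << 32) + lo*p equals the
     * full 64-bit product, which is what vmull_u32 + vmlal_u32 compute. */
    static uint64_t scramble_lane(uint64_t acc, uint64_t secret_word)
    {
        acc ^= acc >> 47;
        acc ^= secret_word;
        {
            uint64_t const lo = acc & 0xFFFFFFFFULL;
            uint64_t const hi = acc >> 32;
            acc = ((hi * PRIME32_1) << 32) + lo * PRIME32_1;
        }
        return acc;
    }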
@@ -4544,23 +4765,23 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
                          const void* XXH_RESTRICT secret)
 {
     /* presumed aligned */
-
-
-
+    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+    xxh_u8 const* const xinput  = (xxh_u8 const*) input;   /* no alignment restriction */
+    xxh_u8 const* const xsecret = (xxh_u8 const*) secret;  /* no alignment restriction */
     xxh_u64x2 const v32 = { 32, 32 };
     size_t i;
     for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
         /* data_vec = xinput[i]; */
-        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
+        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
         /* key_vec = xsecret[i]; */
-        xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
+        xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);
         xxh_u64x2 const data_key = data_vec ^ key_vec;
         /* shuffled = (data_key << 32) | (data_key >> 32); */
         xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
         /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
         xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
         /* acc_vec = xacc[i]; */
-        xxh_u64x2 acc_vec =
+        xxh_u64x2 acc_vec = xacc[i];
         acc_vec += product;

         /* swap high and low halves */
@@ -4569,18 +4790,18 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
 #else
         acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
 #endif
-
-        vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
+        xacc[i] = acc_vec;
     }
 }
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)

 XXH_FORCE_INLINE void
 XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 {
     XXH_ASSERT((((size_t)acc) & 15) == 0);

-    {
-        const
+    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+        const xxh_u8* const xsecret = (const xxh_u8*) secret;
         /* constants */
         xxh_u64x2 const v32 = { 32, 32 };
         xxh_u64x2 const v47 = { 47, 47 };
@@ -4592,7 +4813,7 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
         xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);

         /* xacc[i] ^= xsecret[i]; */
-        xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
+        xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);
         xxh_u64x2 const data_key = data_vec ^ key_vec;

         /* xacc[i] *= XXH_PRIME32_1 */
@@ -4606,8 +4827,148 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)

 #endif

+#if (XXH_VECTOR == XXH_SVE)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{
+    uint64_t *xacc = (uint64_t *)acc;
+    const uint64_t *xinput = (const uint64_t *)(const void *)input;
+    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+    uint64_t element_count = svcntd();
+    if (element_count >= 8) {
+        svbool_t mask = svptrue_pat_b64(SV_VL8);
+        svuint64_t vacc = svld1_u64(mask, xacc);
+        ACCRND(vacc, 0);
+        svst1_u64(mask, xacc, vacc);
+    } else if (element_count == 2) {   /* sve128 */
+        svbool_t mask = svptrue_pat_b64(SV_VL2);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 2);
+        ACCRND(acc2, 4);
+        ACCRND(acc3, 6);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 2, acc1);
+        svst1_u64(mask, xacc + 4, acc2);
+        svst1_u64(mask, xacc + 6, acc3);
+    } else {
+        svbool_t mask = svptrue_pat_b64(SV_VL4);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 4);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 4, acc1);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
+               const xxh_u8* XXH_RESTRICT input,
+               const xxh_u8* XXH_RESTRICT secret,
+               size_t nbStripes)
+{
+    if (nbStripes != 0) {
+        uint64_t *xacc = (uint64_t *)acc;
+        const uint64_t *xinput = (const uint64_t *)(const void *)input;
+        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+        uint64_t element_count = svcntd();
+        if (element_count >= 8) {
+            svbool_t mask = svptrue_pat_b64(SV_VL8);
+            svuint64_t vacc = svld1_u64(mask, xacc + 0);
+            do {
+                /* svprfd(svbool_t, void *, enum svfprop); */
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(vacc, 0);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+            } while (nbStripes != 0);
+
+            svst1_u64(mask, xacc + 0, vacc);
+        } else if (element_count == 2) { /* sve128 */
+            svbool_t mask = svptrue_pat_b64(SV_VL2);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 2);
+                ACCRND(acc2, 4);
+                ACCRND(acc3, 6);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+            } while (nbStripes != 0);
+
+            svst1_u64(mask, xacc + 0, acc0);
+            svst1_u64(mask, xacc + 2, acc1);
+            svst1_u64(mask, xacc + 4, acc2);
+            svst1_u64(mask, xacc + 6, acc3);
+        } else {
+            svbool_t mask = svptrue_pat_b64(SV_VL4);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 4);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+            } while (nbStripes != 0);
+
+            svst1_u64(mask, xacc + 0, acc0);
+            svst1_u64(mask, xacc + 4, acc1);
+        }
+    }
+}
+
+#endif
+
 /* scalar variants - universal */

+#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
+/*
+ * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
+ * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
+ *
+ * While this might not seem like much, as AArch64 is a 64-bit architecture, only
+ * big Cortex designs have a full 64-bit multiplier.
+ *
+ * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
+ * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
+ * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
+ *
+ * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
+ * not have this penalty and does the mask automatically.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    xxh_u64 ret;
+    /* note: %x = 64-bit register, %w = 32-bit register */
+    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
+    return ret;
+}
+#else
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
+}
+#endif
+
 /*!
  * @internal
  * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
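The new `XXH_mult32to64_add64()` maps to AArch64's UMADDL, a 32x32-to-64 multiply that adds a 64-bit accumulator in one instruction; the `#else` branch shows the portable meaning. A standalone equivalence check (illustrative):

    #include <stdint.h>
    #include <assert.h>

    /* Portable semantics of "umaddl %x0, %w1, %w2, %x3": widen the low
     * 32 bits of each multiplicand, multiply, then add the accumulator. */
    static uint64_t mult32to64_add64(uint64_t lhs, uint64_t rhs, uint64_t acc)
    {
        return (uint64_t)(uint32_t)lhs * (uint64_t)(uint32_t)rhs + acc;
    }

    int main(void)
    {
        /* the high 32 bits of either input must not affect the result */
        assert(mult32to64_add64(0xDEADBEEF00000003ULL,
                                0xFFFFFFFF00000005ULL, 7) == 3 * 5 + 7);
        return 0;
    }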
@@ -4630,7 +4991,7 @@ XXH3_scalarRound(void* XXH_RESTRICT acc,
     xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
     xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
     xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
-    xacc[lane]
+    xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
 }
 }

@@ -4655,6 +5016,7 @@ XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
         XXH3_scalarRound(acc, input, secret, i);
     }
 }
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)

 /*!
  * @internal
@@ -4706,10 +5068,10 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
     const xxh_u8* kSecretPtr = XXH3_kSecret;
     XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);

-#if defined(
+#if defined(__GNUC__) && defined(__aarch64__)
     /*
      * UGLY HACK:
-     * Clang
+     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
      * placed sequentially, in order, at the top of the unrolled loop.
      *
      * While MOVK is great for generating constants (2 cycles for a 64-bit
@@ -4724,7 +5086,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
      *   ADD
      *   SUB      STR
      *            STR
-     * By forcing loads from memory (as the asm line causes
+     * By forcing loads from memory (as the asm line causes the compiler to assume
      * that XXH3_kSecretPtr has been changed), the pipelines are used more
      * efficiently:
      *   I   L   S
@@ -4741,17 +5103,11 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
      */
     XXH_COMPILER_GUARD(kSecretPtr);
 #endif
-    /*
-     * Note: in debug mode, this overrides the asm optimization
-     * and Clang will emit MOVK chains again.
-     */
-    XXH_ASSERT(kSecretPtr == XXH3_kSecret);
-
     {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
         int i;
         for (i=0; i < nbRounds; i++) {
             /*
-             * The asm hack causes
+             * The asm hack causes the compiler to assume that kSecretPtr aliases with
              * customSecret, and on aarch64, this prevented LDP from merging two
              * loads together for free. Putting the loads together before the stores
              * properly generates LDP.
@@ -4764,7 +5120,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
 }


-typedef void (*
+typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
 typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
 typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);

@@ -4772,36 +5128,48 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #if (XXH_VECTOR == XXH_AVX512)

 #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_accumulate     XXH3_accumulate_avx512
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512

 #elif (XXH_VECTOR == XXH_AVX2)

 #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_accumulate     XXH3_accumulate_avx2
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
 #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2

 #elif (XXH_VECTOR == XXH_SSE2)

 #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_accumulate     XXH3_accumulate_sse2
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
 #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2

 #elif (XXH_VECTOR == XXH_NEON)

 #define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_accumulate     XXH3_accumulate_neon
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

 #elif (XXH_VECTOR == XXH_VSX)

 #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_accumulate     XXH3_accumulate_vsx
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

+#elif (XXH_VECTOR == XXH_SVE)
+#define XXH3_accumulate_512 XXH3_accumulate_512_sve
+#define XXH3_accumulate     XXH3_accumulate_sve
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
 #else /* scalar */

 #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_accumulate     XXH3_accumulate_scalar
 #define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
 #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

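The `XXH3_accumulate` selection above is purely compile-time, keyed off the same `XXH_VECTOR` cascade as `XXH3_accumulate_512`. A consumer can observe which path was compiled, assuming the header is vendored next to the source as "xxhash.h":

    /* Build with e.g.: cc -O2 probe.c
     * (add -DXXH_VECTOR=XXH_SCALAR to force the scalar path).
     * XXH_INLINE_ALL pulls the implementation into this translation unit. */
    #define XXH_INLINE_ALL
    #include "xxhash.h"
    #include <stdio.h>

    int main(void)
    {
    #if (XXH_VECTOR == XXH_AVX512)
        puts("AVX512 path");
    #elif (XXH_VECTOR == XXH_AVX2)
        puts("AVX2 path");
    #elif (XXH_VECTOR == XXH_SSE2)
        puts("SSE2 path");
    #elif (XXH_VECTOR == XXH_NEON)
        puts("NEON path");
    #elif (XXH_VECTOR == XXH_VSX)
        puts("VSX path");
    #elif (XXH_VECTOR == XXH_SVE)
        puts("SVE path");
    #else
        puts("scalar path");
    #endif
        return 0;
    }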
@@ -4812,45 +5180,11 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 #  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 #endif

-#ifndef XXH_PREFETCH_DIST
-#  ifdef __clang__
-#    define XXH_PREFETCH_DIST 320
-#  else
-#    if (XXH_VECTOR == XXH_AVX512)
-#      define XXH_PREFETCH_DIST 512
-#    else
-#      define XXH_PREFETCH_DIST 384
-#    endif
-#  endif  /* __clang__ */
-#endif  /* XXH_PREFETCH_DIST */
-
-/*
- * XXH3_accumulate()
- * Loops over XXH3_accumulate_512().
- * Assumption: nbStripes will not overflow the secret size
- */
-XXH_FORCE_INLINE void
-XXH3_accumulate(     xxh_u64* XXH_RESTRICT acc,
-                const xxh_u8* XXH_RESTRICT input,
-                const xxh_u8* XXH_RESTRICT secret,
-                      size_t nbStripes,
-                      XXH3_f_accumulate_512 f_acc512)
-{
-    size_t n;
-    for (n = 0; n < nbStripes; n++ ) {
-        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
-        XXH_PREFETCH(in + XXH_PREFETCH_DIST);
-        f_acc512(acc,
-                 in,
-                 secret + n*XXH_SECRET_CONSUME_RATE);
-    }
-}
-
 XXH_FORCE_INLINE void
 XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
                       const xxh_u8* XXH_RESTRICT input, size_t len,
                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
-
+                            XXH3_f_accumulate f_acc,
                             XXH3_f_scrambleAcc f_scramble)
 {
     size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
@@ -4862,7 +5196,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);

     for (n = 0; n < nb_blocks; n++) {
-
+        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
         f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
     }

@@ -4870,12 +5204,12 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
     XXH_ASSERT(len > XXH_STRIPE_LEN);
     {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
         XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
-
+        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);

         /* last stripe */
         {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
 #define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
-
+            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
     }   }
 }

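For orientation, the block geometry driving this loop: `nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE` and `block_len = XXH_STRIPE_LEN * nbStripesPerBlock`. With the header's defaults (192-byte secret, 64-byte stripe, 8-byte consume rate) that gives 16 stripes, i.e. 1024 bytes per block, with one scramble at each block boundary. A minimal check of the arithmetic:

    #include <assert.h>
    #include <stddef.h>

    enum {
        STRIPE_LEN          = 64,   /* XXH_STRIPE_LEN */
        SECRET_CONSUME_RATE = 8,    /* XXH_SECRET_CONSUME_RATE */
        SECRET_DEFAULT_SIZE = 192   /* XXH_SECRET_DEFAULT_SIZE */
    };

    int main(void)
    {
        size_t const nbStripesPerBlock =
            (SECRET_DEFAULT_SIZE - STRIPE_LEN) / SECRET_CONSUME_RATE;
        size_t const block_len = (size_t)STRIPE_LEN * nbStripesPerBlock;
        assert(nbStripesPerBlock == 16);
        assert(block_len == 1024);
        return 0;
    }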
@@ -4920,12 +5254,12 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre
 XXH_FORCE_INLINE XXH64_hash_t
 XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
                            const void* XXH_RESTRICT secret, size_t secretSize,
-
+                           XXH3_f_accumulate f_acc,
                            XXH3_f_scrambleAcc f_scramble)
 {
     XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;

-    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize,
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);

     /* converge into final hash */
     XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -4939,13 +5273,15 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
  * It's important for performance to transmit secret's size (when it's static)
  * so that the compiler can properly optimize the vectorized loop.
  * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
  */
-
+XXH3_WITH_SECRET_INLINE XXH64_hash_t
 XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
                              XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
 {
     (void)seed64;
-    return XXH3_hashLong_64b_internal(input, len, secret, secretLen,
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
 }

 /*
@@ -4959,7 +5295,7 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
                           XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
 {
     (void)seed64; (void)secret; (void)secretLen;
-    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
 }

 /*
@@ -4976,7 +5312,7 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
 XXH_FORCE_INLINE XXH64_hash_t
 XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
                                     XXH64_hash_t seed,
-
+                                    XXH3_f_accumulate f_acc,
                                     XXH3_f_scrambleAcc f_scramble,
                                     XXH3_f_initCustomSecret f_initSec)
 {
@@ -4984,12 +5320,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
     if (seed == 0)
         return XXH3_hashLong_64b_internal(input, len,
                                           XXH3_kSecret, sizeof(XXH3_kSecret),
-
+                                          f_acc, f_scramble);
 #endif
     {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
         f_initSec(secret, seed);
         return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
-
+                                          f_acc, f_scramble);
     }
 }

@@ -4997,12 +5333,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
  * It's important for performance that XXH3_hashLong is not inlined.
  */
 XXH_NO_INLINE XXH64_hash_t
-XXH3_hashLong_64b_withSeed(const void* input, size_t len,
-                           XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
 {
     (void)secret; (void)secretLen;
     return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
-
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
 }

@@ -5035,27 +5371,27 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
 /* === Public entry point === */

 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t length)
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
 {
     return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
 }

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSecret(const void* input, size_t length, const void* secret, size_t secretSize)
+XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
 {
     return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
 }

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed)
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
 {
     return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
 }

 XXH_PUBLIC_API XXH64_hash_t
-XXH3_64bits_withSecretandSeed(const void* input, size_t length, const void* secret, size_t secretSize, XXH64_hash_t seed)
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
 {
     if (length <= XXH3_MIDSIZE_MAX)
         return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
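`XXH_NOESCAPE` only annotates the prototypes (telling Clang the pointer is not retained past the call); call sites are unaffected. Typical one-shot usage of these entry points, assuming the header is included as "xxhash.h":

    #define XXH_INLINE_ALL
    #include "xxhash.h"
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char* msg = "hello, xxh3";
        XXH64_hash_t h0 = XXH3_64bits(msg, strlen(msg));
        XXH64_hash_t h1 = XXH3_64bits_withSeed(msg, strlen(msg), 42);
        printf("unseeded: %016llx\nseeded:   %016llx\n",
               (unsigned long long)h0, (unsigned long long)h1);
        return 0;
    }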
@@ -5148,7 +5484,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API void
-XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
 {
     XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
 }
@@ -5182,7 +5518,7 @@ XXH3_reset_internal(XXH3_state_t* statePtr,

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset(XXH3_state_t* statePtr)
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
 {
     if (statePtr == NULL) return XXH_ERROR;
     XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
@@ -5191,7 +5527,7 @@ XXH3_64bits_reset(XXH3_state_t* statePtr)

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
 {
     if (statePtr == NULL) return XXH_ERROR;
     XXH3_reset_internal(statePtr, 0, secret, secretSize);
@@ -5202,7 +5538,7 @@ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
 {
     if (statePtr == NULL) return XXH_ERROR;
     if (seed==0) return XXH3_64bits_reset(statePtr);
@@ -5214,7 +5550,7 @@ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
 {
     if (statePtr == NULL) return XXH_ERROR;
     if (secret == NULL) return XXH_ERROR;
@@ -5224,31 +5560,57 @@ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret,
     return XXH_OK;
 }

-
- *
- *
-
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * @param acc                Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr  In/out pointer to the number of leftover stripes in the block*
+ * @param nbStripesPerBlock  Number of stripes in a block
+ * @param input              Input pointer
+ * @param nbStripes          Number of stripes to process
+ * @param secret             Secret pointer
+ * @param secretLimit        Offset of the last block in @p secret
+ * @param f_acc              Pointer to an XXH3_accumulate implementation
+ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
+ * @return                   Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
 XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
                     size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
                     const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
                     const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
-
+                    XXH3_f_accumulate f_acc,
                     XXH3_f_scrambleAcc f_scramble)
 {
-
-
-    if (nbStripesPerBlock - *nbStripesSoFarPtr
-    /*
-        size_t
-
-
-
-
-
-
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
         *nbStripesSoFarPtr += nbStripes;
     }
+    /* Return end pointer */
+    return input;
 }

 #ifndef XXH3_STREAM_USE_STACK
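The rewritten `XXH3_consumeStripes()` folds what used to be three separate phases in `XXH3_update()` (finish the current block, whole blocks, tail) into one routine and now returns the advanced input pointer. Its bookkeeping can be modeled standalone (illustrative names; this counts scrambles instead of hashing):

    #include <assert.h>
    #include <stddef.h>

    /* Feeding n stripes with `sofar` already consumed in a block of
     * `perBlock` must scramble once per completed block and leave
     * sofar' = (sofar + n) % perBlock. */
    static size_t consume_model(size_t* sofar, size_t perBlock, size_t n)
    {
        size_t scrambles = 0;
        if (n >= perBlock - *sofar) {
            size_t thisIter = perBlock - *sofar;
            do {
                scrambles++;          /* f_scramble() at each block end */
                n -= thisIter;
                thisIter = perBlock;  /* later iterations: full blocks */
            } while (n >= perBlock);
            *sofar = 0;
        }
        *sofar += n;                  /* leftover partial block */
        return scrambles;
    }

    int main(void)
    {
        size_t sofar = 10;
        size_t s = consume_model(&sofar, 16, 40);  /* 10+40 = 3*16 + 2 */
        assert(s == 3 && sofar == 2);
        return 0;
    }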
@@ -5262,7 +5624,7 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
 XXH_FORCE_INLINE XXH_errorcode
 XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
             const xxh_u8* XXH_RESTRICT input, size_t len,
-
+            XXH3_f_accumulate f_acc,
             XXH3_f_scrambleAcc f_scramble)
 {
     if (input==NULL) {
@@ -5278,7 +5640,8 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
      * when operating accumulators directly into state.
      * Operating into stack space seems to enable proper optimization.
      * clang, on the other hand, doesn't seem to need this trick */
-    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+    XXH_memcpy(acc, state->acc, sizeof(acc));
 #else
     xxh_u64* XXH_RESTRICT const acc = state->acc;
 #endif
@@ -5286,7 +5649,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
     XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);

     /* small input : just fill in tmp buffer */
-    if (
+    if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
         XXH_memcpy(state->buffer + state->bufferedSize, input, len);
         state->bufferedSize += (XXH32_hash_t)len;
         return XXH_OK;
@@ -5308,57 +5671,20 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
                        &state->nbStripesSoFar, state->nbStripesPerBlock,
                         state->buffer, XXH3_INTERNALBUFFER_STRIPES,
                         secret, state->secretLimit,
-
+                        f_acc, f_scramble);
         state->bufferedSize = 0;
     }
     XXH_ASSERT(input < bEnd);
-
-    /* large input to consume : ingest per full block */
-    if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
+    if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
         size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
-
-        /* join to current block's end */
-        {   size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
-            XXH_ASSERT(nbStripesToEnd <= nbStripes);
-            XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
-            f_scramble(acc, secret + state->secretLimit);
-            state->nbStripesSoFar = 0;
-            input += nbStripesToEnd * XXH_STRIPE_LEN;
-            nbStripes -= nbStripesToEnd;
-        }
-        /* consume per entire blocks */
-        while(nbStripes >= state->nbStripesPerBlock) {
-            XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
-            f_scramble(acc, secret + state->secretLimit);
-            input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
-            nbStripes -= state->nbStripesPerBlock;
-        }
-        /* consume last partial block */
-        XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
-        input += nbStripes * XXH_STRIPE_LEN;
-        XXH_ASSERT(input < bEnd);  /* at least some bytes left */
-        state->nbStripesSoFar = nbStripes;
-        /* buffer predecessor of last partial stripe */
-        XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
-        XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
-    } else {
-        /* content to consume <= block size */
-        /* Consume input by a multiple of internal buffer size */
-        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
-            const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
-            do {
-                XXH3_consumeStripes(acc,
+        input = XXH3_consumeStripes(acc,
                        &state->nbStripesSoFar, state->nbStripesPerBlock,
-
-
-
-
-            } while (input<limit);
-            /* buffer predecessor of last partial stripe */
-            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
-        }
-    }
+                        input, nbStripes,
+                        secret, state->secretLimit,
+                        f_acc, f_scramble);
+        XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);

+    }
     /* Some remaining input (always) : buffer it */
     XXH_ASSERT(input < bEnd);
     XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
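With block crossings handled inside `XXH3_consumeStripes()`, `XXH3_update()` reduces to: top up and flush the 256-byte internal buffer, stream whole stripes straight from the caller's buffer, keep the predecessor of the last partial stripe, and buffer the tail. Caller-visible behavior is unchanged; typical streaming use (header path assumed):

    #define XXH_INLINE_ALL
    #include "xxhash.h"
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        XXH3_state_t* st = XXH3_createState();
        if (st == NULL || XXH3_64bits_reset(st) != XXH_OK) return 1;

        const char* chunks[] = { "split ", "across ", "updates" };
        for (size_t i = 0; i < 3; i++)
            XXH3_64bits_update(st, chunks[i], strlen(chunks[i]));

        XXH64_hash_t streamed = XXH3_64bits_digest(st);
        XXH3_freeState(st);

        /* streaming must match the one-shot result */
        XXH64_hash_t oneshot =
            XXH3_64bits("split across updates", strlen("split across updates"));
        printf("%d\n", streamed == oneshot);  /* prints 1 */
        return 0;
    }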
@@ -5367,7 +5693,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
         state->bufferedSize = (XXH32_hash_t)(bEnd-input);
 #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
         /* save stack accumulators into state */
-
+        XXH_memcpy(state->acc, acc, sizeof(acc));
 #endif
     }

@@ -5376,10 +5702,10 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
 {
     return XXH3_update(state, (const xxh_u8*)input, len,
-
+                       XXH3_accumulate, XXH3_scrambleAcc);
 }

@@ -5388,37 +5714,40 @@ XXH3_digest_long (XXH64_hash_t* acc,
                   const XXH3_state_t* state,
                   const unsigned char* secret)
 {
+    xxh_u8 lastStripe[XXH_STRIPE_LEN];
+    const xxh_u8* lastStripePtr;
+
     /*
      * Digest on a local copy. This way, the state remains unaltered, and it can
      * continue ingesting more input afterwards.
      */
     XXH_memcpy(acc, state->acc, sizeof(state->acc));
     if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        /* Consume remaining stripes then point to remaining data in buffer */
         size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
         size_t nbStripesSoFar = state->nbStripesSoFar;
         XXH3_consumeStripes(acc,
                            &nbStripesSoFar, state->nbStripesPerBlock,
                             state->buffer, nbStripes,
                             secret, state->secretLimit,
-
-
-        XXH3_accumulate_512(acc,
-                            state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
-                            secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+                            XXH3_accumulate, XXH3_scrambleAcc);
+        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
     } else {  /* bufferedSize < XXH_STRIPE_LEN */
-
+        /* Copy to temp buffer */
         size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
         XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
         XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
         XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
-
-        lastStripe,
-        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+        lastStripePtr = lastStripe;
     }
+    /* Last stripe */
+    XXH3_accumulate_512(acc,
+                        lastStripePtr,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
 }

 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
 {
     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
     if (state->totalLen > XXH3_MIDSIZE_MAX) {
@@ -5631,7 +5960,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
 #if XXH_SIZE_OPT >= 1
     {
         /* Smaller, but slightly slower. */
-
+        unsigned int i = (unsigned int)(len - 1) / 32;
         do {
             acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
         } while (i-- != 0);
@@ -5669,25 +5998,34 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
     XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);

     {   XXH128_hash_t acc;
-
-        int i;
+        unsigned i;
         acc.low64 = len * XXH_PRIME64_1;
         acc.high64 = 0;
-
+        /*
+         * We set as `i` as offset + 32. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple agen and good codegen
+         * for the loop.
+         */
+        for (i = 32; i < 160; i += 32) {
             acc = XXH128_mix32B(acc,
-                                input +
-                                input +
-                                secret +
+                                input + i - 32,
+                                input + i - 16,
+                                secret + i - 32,
                                 seed);
         }
         acc.low64 = XXH3_avalanche(acc.low64);
         acc.high64 = XXH3_avalanche(acc.high64);
-
-
+        /*
+         * NB: `i <= len` will duplicate the last 32-bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {
             acc = XXH128_mix32B(acc,
-                                input +
-                                input +
-                                secret + XXH3_MIDSIZE_STARTOFFSET +
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
                                 seed);
         }
         /* last bytes */
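The rewritten mid-size loops touch the same 32-byte windows as before; only the induction variable changed, from a chunk index with `32*j` offsets to the running end offset `i`, so `i - 32` and `i - 16` address the two chunk halves and `len` itself bounds the second loop. A quick check that both indexings agree:

    #include <assert.h>

    int main(void)
    {
        /* first 4 chunks: old style j = 0..3 vs new style i = 32..128 */
        unsigned j = 0;
        for (unsigned i = 32; i < 160; i += 32, j++) {
            assert(i - 32 == 32 * j);        /* first 16-byte half  */
            assert(i - 16 == 32 * j + 16);   /* second 16-byte half */
        }
        assert(j == 4);
        return 0;
    }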
@@ -5695,7 +6033,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
|
|
5695
6033
|
input + len - 16,
|
5696
6034
|
input + len - 32,
|
5697
6035
|
secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
|
5698
|
-
|
6036
|
+
(XXH64_hash_t)0 - seed);
|
5699
6037
|
|
5700
6038
|
{ XXH128_hash_t h128;
|
5701
6039
|
h128.low64 = acc.low64 + acc.high64;
|
@@ -5712,12 +6050,12 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
|
|
5712
6050
|
XXH_FORCE_INLINE XXH128_hash_t
|
5713
6051
|
XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
|
5714
6052
|
const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
|
5715
|
-
|
6053
|
+
XXH3_f_accumulate f_acc,
|
5716
6054
|
XXH3_f_scrambleAcc f_scramble)
|
5717
6055
|
{
|
5718
6056
|
XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
|
5719
6057
|
|
5720
|
-
XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize,
|
6058
|
+
XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
|
5721
6059
|
|
5722
6060
|
/* converge into final hash */
|
5723
6061
|
XXH_STATIC_ASSERT(sizeof(acc) == 64);
|
@@ -5744,38 +6082,41 @@ XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
|
|
5744
6082
|
{
|
5745
6083
|
(void)seed64; (void)secret; (void)secretLen;
|
5746
6084
|
return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
|
5747
|
-
|
6085
|
+
XXH3_accumulate, XXH3_scrambleAcc);
|
5748
6086
|
}
|
5749
6087
|
|
5750
6088
|
/*
|
5751
6089
|
* It's important for performance to pass @p secretLen (when it's static)
|
5752
6090
|
* to the compiler, so that it can properly optimize the vectorized loop.
|
6091
|
+
*
|
6092
|
+
* When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
|
6093
|
+
* breaks -Og, this is XXH_NO_INLINE.
|
5753
6094
|
*/
|
5754
|
-
|
6095
|
+
XXH3_WITH_SECRET_INLINE XXH128_hash_t
|
5755
6096
|
XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
|
5756
6097
|
XXH64_hash_t seed64,
|
5757
6098
|
const void* XXH_RESTRICT secret, size_t secretLen)
|
5758
6099
|
{
|
5759
6100
|
(void)seed64;
|
5760
6101
|
return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
|
5761
|
-
|
6102
|
+
XXH3_accumulate, XXH3_scrambleAcc);
|
5762
6103
|
}
|
5763
6104
|
|
5764
6105
|
XXH_FORCE_INLINE XXH128_hash_t
|
5765
6106
|
XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
|
5766
6107
|
XXH64_hash_t seed64,
|
5767
|
-
|
6108
|
+
XXH3_f_accumulate f_acc,
|
5768
6109
|
XXH3_f_scrambleAcc f_scramble,
|
5769
6110
|
XXH3_f_initCustomSecret f_initSec)
|
5770
6111
|
{
|
5771
6112
|
if (seed64 == 0)
|
5772
6113
|
return XXH3_hashLong_128b_internal(input, len,
|
5773
6114
|
XXH3_kSecret, sizeof(XXH3_kSecret),
|
5774
|
-
|
6115
|
+
f_acc, f_scramble);
|
5775
6116
|
{ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
|
5776
6117
|
f_initSec(secret, seed64);
|
5777
6118
|
return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
|
5778
|
-
|
6119
|
+
f_acc, f_scramble);
|
5779
6120
|
}
|
5780
6121
|
}
|
5781
6122
|
|
@@ -5788,7 +6129,7 @@ XXH3_hashLong_128b_withSeed(const void* input, size_t len,
|
|
5788
6129
|
{
|
5789
6130
|
(void)secret; (void)secretLen;
|
5790
6131
|
return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
|
5791
|
-
|
6132
|
+
XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
|
5792
6133
|
}
|
5793
6134
|
|
5794
6135
|
typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
|
@@ -5819,7 +6160,7 @@ XXH3_128bits_internal(const void* input, size_t len,
|
|
5819
6160
|
/* === Public XXH128 API === */
|
5820
6161
|
|
5821
6162
|
/*! @ingroup XXH3_family */
|
5822
|
-
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
|
6163
|
+
XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
|
5823
6164
|
{
|
5824
6165
|
return XXH3_128bits_internal(input, len, 0,
|
5825
6166
|
XXH3_kSecret, sizeof(XXH3_kSecret),
|
@@ -5828,7 +6169,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
|
|
5828
6169
|
|
5829
6170
|
/*! @ingroup XXH3_family */
|
5830
6171
|
XXH_PUBLIC_API XXH128_hash_t
|
5831
|
-
XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
|
6172
|
+
XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
|
5832
6173
|
{
|
5833
6174
|
return XXH3_128bits_internal(input, len, 0,
|
5834
6175
|
(const xxh_u8*)secret, secretSize,
|
@@ -5837,7 +6178,7 @@ XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_
|
|
5837
6178
|
|
5838
6179
|
/*! @ingroup XXH3_family */
|
5839
6180
|
XXH_PUBLIC_API XXH128_hash_t
|
5840
|
-
XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
|
6181
|
+
XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
|
5841
6182
|
{
|
5842
6183
|
return XXH3_128bits_internal(input, len, seed,
|
5843
6184
|
XXH3_kSecret, sizeof(XXH3_kSecret),
|
@@ -5846,7 +6187,7 @@ XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
|
|
5846
6187
|
|
5847
6188
|
/*! @ingroup XXH3_family */
|
5848
6189
|
XXH_PUBLIC_API XXH128_hash_t
|
5849
|
-
XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
|
6190
|
+
XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
|
5850
6191
|
{
|
5851
6192
|
if (len <= XXH3_MIDSIZE_MAX)
|
5852
6193
|
return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
|
@@ -5855,7 +6196,7 @@ XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret
|
|
5855
6196
|
|
5856
6197
|
/*! @ingroup XXH3_family */
|
5857
6198
|
XXH_PUBLIC_API XXH128_hash_t
|
5858
|
-
XXH128(const void* input, size_t len, XXH64_hash_t seed)
|
6199
|
+
XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
|
5859
6200
|
{
|
5860
6201
|
return XXH3_128bits_withSeed(input, len, seed);
|
5861
6202
|
}
|
@@ -5870,42 +6211,41 @@ XXH128(const void* input, size_t len, XXH64_hash_t seed)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset(XXH3_state_t* statePtr)
+XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
 {
     return XXH3_64bits_reset(statePtr);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
 {
     return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
 {
     return XXH3_64bits_reset_withSeed(statePtr, seed);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
 {
     return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
 }
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
 {
-    return XXH3_update(state, (const xxh_u8*)input, len,
-                       XXH3_accumulate_512, XXH3_scrambleAcc);
+    return XXH3_64bits_update(state, input, len);
 }
 
 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
 {
     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
     if (state->totalLen > XXH3_MIDSIZE_MAX) {
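The reset/update wrappers above now forward to their 64-bit counterparts, since XXH3-64 and XXH3-128 share a single state layout and only the digest step differs. A streaming sketch (hash_chunks is illustrative, not a library function):

    #include "xxhash.h"

    static XXH128_hash_t hash_chunks(const void* a, size_t la,
                                     const void* b, size_t lb)
    {
        XXH128_hash_t h = { 0, 0 };
        XXH3_state_t* const st = XXH3_createState();
        if (st == NULL) return h;           /* allocation failure */
        XXH3_128bits_reset(st);             /* unseeded stream */
        XXH3_128bits_update(st, a, la);     /* feed data incrementally */
        XXH3_128bits_update(st, b, lb);
        h = XXH3_128bits_digest(st);        /* digest leaves the state usable */
        XXH3_freeState(st);
        return h;
    }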
@@ -5947,7 +6287,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
  *          <0 if *h128_1  < *h128_2
  *          =0 if *h128_1 == *h128_2 */
 /*! @ingroup XXH3_family */
-XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
 {
     XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
     XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
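XXH128_cmp takes const void* so that it fits the qsort()/bsearch() comparator contract, while XXH128_isEqual covers plain equality. A sketch (sort_hashes is illustrative):

    #include <stdlib.h>
    #include "xxhash.h"

    static void sort_hashes(XXH128_hash_t* v, size_t n)
    {
        /* Total order over 128-bit hashes: high64 compared first, then low64. */
        qsort(v, n, sizeof *v, XXH128_cmp);
    }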
@@ -5961,7 +6301,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
 /*====== Canonical representation ======*/
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API void
-XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
 {
     XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
     if (XXH_CPU_LITTLE_ENDIAN) {
@@ -5974,7 +6314,7 @@ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH128_hash_t
-XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
 {
     XXH128_hash_t h;
     h.high64 = XXH_readBE64(src);
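The canonical form stores the hash big-endian, making it stable across platforms for storage or transport; XXH128_hashFromCanonical restores the native value. A round-trip sketch (illustrative):

    #include <assert.h>
    #include "xxhash.h"

    static void canonical_roundtrip(XXH128_hash_t h)
    {
        XXH128_canonical_t c;
        XXH128_canonicalFromHash(&c, h);    /* writes 16 big-endian bytes */
        XXH128_hash_t const back = XXH128_hashFromCanonical(&c);
        assert(XXH128_isEqual(h, back));    /* identity round trip */
    }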
@@ -5998,7 +6338,7 @@ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
-XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
+XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
 {
 #if (XXH_DEBUGLEVEL >= 1)
     XXH_ASSERT(secretBuffer != NULL);
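XXH3_generateSecret stretches arbitrary-quality seed material into a full-entropy secret, which can then be reused with the _withSecret entry points. A sketch, assuming placeholder key material and a minimum-size buffer:

    #include <string.h>
    #include "xxhash.h"

    static XXH128_hash_t keyed_hash(const void* data, size_t len)
    {
        static unsigned char secret[XXH3_SECRET_SIZE_MIN];
        static int ready = 0;
        if (!ready) {
            const char* const key = "application key";    /* placeholder material */
            XXH3_generateSecret(secret, sizeof secret, key, strlen(key));
            ready = 1;
        }
        return XXH3_128bits_withSecret(data, len, secret, sizeof secret);
    }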
@@ -6043,7 +6383,7 @@ XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSee
 
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API void
-XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
+XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
 {
     XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
     XXH3_initCustomSecret(secret, seed);
@@ -6071,5 +6411,5 @@ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
 
 
 #if defined (__cplusplus)
-}
+} /* extern "C" */
 #endif