digest-xxhash 0.2.5 → 0.2.6

@@ -716,8 +716,15 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni
  # define XXH_HAS_ATTRIBUTE(x) 0
  #endif
 
+ /*
+ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
+ * leave as `201711L` (C17 + 1).
+ * TODO: Update to correct value when it's been specified.
+ */
+ #define XXH_C23_VN 201711L
+
  /* C-language Attributes are added in C23. */
- #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
  # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
  #else
  # define XXH_HAS_C_ATTRIBUTE(x) 0
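
The hunk above replaces a hard-coded `201710L` comparison with the named `XXH_C23_VN` constant. Seen in isolation, the detection idiom it feeds looks like the following sketch (hypothetical `MY_FALLTHROUGH` name, not xxHash API):

    /* Probe for a C23 attribute, falling back to a plain comment. */
    #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201711L) && defined(__has_c_attribute)
    # if __has_c_attribute(fallthrough)
    #  define MY_FALLTHROUGH [[fallthrough]]
    # endif
    #endif
    #ifndef MY_FALLTHROUGH
    # define MY_FALLTHROUGH /* fallthrough */
    #endif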
@@ -743,6 +750,18 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni
  # define XXH_FALLTHROUGH /* fallthrough */
  #endif
 
+ /*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+ #if XXH_HAS_ATTRIBUTE(noescape)
+ # define XXH_NOESCAPE __attribute__((noescape))
+ #else
+ # define XXH_NOESCAPE
+ #endif
+
+
  /*!
  * @}
  * @ingroup public
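
`noescape` tells the compiler that the callee will not retain the pointer after the call returns, which improves escape analysis at call sites. A minimal sketch of the same detection pattern outside xxHash (hypothetical `MY_NOESCAPE` name; as the comment notes, the attribute is clang-only at the time of this diff):

    #if defined(__has_attribute)
    # if __has_attribute(noescape)
    #  define MY_NOESCAPE __attribute__((noescape))
    # endif
    #endif
    #ifndef MY_NOESCAPE
    # define MY_NOESCAPE /* unsupported: expands to nothing */
    #endif

    /* Promise: buf is only read during the call and never stored. */
    unsigned long long sum_bytes(MY_NOESCAPE const unsigned char* buf, unsigned long n);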
@@ -813,7 +832,7 @@ typedef uint64_t XXH64_hash_t;
  * @see
  * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
  */
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
 
  /******* Streaming *******/
  #ifndef XXH_NO_STREAM
@@ -825,16 +844,16 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XX
  typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
  XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
  XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
- XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+ XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
 
- XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed);
- XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
+ XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
  #endif /* !XXH_NO_STREAM */
  /******* Canonical representation *******/
  typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
- XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
 
  #ifndef XXH_NO_XXH3
 
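All of the streaming declarations above gain `XXH_NOESCAPE` on their pointer parameters; their usage is unchanged. For reference, a minimal streaming round trip (a sketch assuming `xxhash.h` is on the include path; error handling kept deliberately terse):

    #include <stdio.h>
    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        XXH64_state_t* const state = XXH64_createState();
        const char* chunks[] = { "hello, ", "world" };
        size_t i;
        if (state == NULL) return 1;
        if (XXH64_reset(state, 0) == XXH_ERROR) return 1;
        for (i = 0; i < 2; i++)
            if (XXH64_update(state, chunks[i], strlen(chunks[i])) == XXH_ERROR) return 1;
        /* Equals the one-shot XXH64("hello, world", 12, 0). */
        printf("%016llx\n", (unsigned long long)XXH64_digest(state));
        XXH64_freeState(state);
        return 0;
    }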
@@ -872,7 +891,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canoni
  *
  * XXH3 implementation is portable:
  * it has a generic C90 formulation that can be compiled on any platform,
- * all implementations generage exactly the same hash value on all platforms.
+ * all implementations generate exactly the same hash value on all platforms.
  * Starting from v0.8.0, it's also labelled "stable", meaning that
  * any future version will also generate the same hash value.
  *
@@ -902,7 +921,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canoni
  * @see
  * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
  */
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t length);
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
 
  /*!
  * @brief 64-bit seeded variant of XXH3
@@ -919,7 +938,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t leng
  * @param length The length
  * @param seed The 64-bit seed to alter the state.
  */
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
 
  /*!
  * The bare minimum size for a custom secret.
@@ -948,7 +967,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, si
  * This is not necessarily the case when using the blob of bytes directly
  * because, when hashing _small_ inputs, only a portion of the secret is employed.
  */
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
 
 
  /******* Streaming *******/
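
The doc block above stresses that a custom secret must look like uniformly random bytes. In practice that means deriving it with `XXH3_generateSecret()` rather than passing raw key material; a sketch (assuming `xxhash.h` on the include path; the 192-byte size is an arbitrary choice above `XXH3_SECRET_SIZE_MIN`):

    #include <stdio.h>
    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        unsigned char secret[192]; /* anything >= XXH3_SECRET_SIZE_MIN */
        const char seedMaterial[] = "any blob of bytes, any length";
        const char data[] = "payload";
        /* Condense low-entropy material into a random-looking secret. */
        if (XXH3_generateSecret(secret, sizeof(secret),
                                seedMaterial, sizeof(seedMaterial)) == XXH_ERROR)
            return 1;
        printf("%016llx\n", (unsigned long long)
               XXH3_64bits_withSecret(data, strlen(data), secret, sizeof(secret)));
        return 0;
    }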
@@ -968,20 +987,20 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, s
  typedef struct XXH3_state_s XXH3_state_t;
  XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
  XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
- XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
+ XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
 
  /*
  * XXH3_64bits_reset():
  * Initialize with default parameters.
  * digest will be equivalent to `XXH3_64bits()`.
  */
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
  /*
  * XXH3_64bits_reset_withSeed():
  * Generate a custom secret from `seed`, and store it into `statePtr`.
  * digest will be equivalent to `XXH3_64bits_withSeed()`.
  */
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
  /*!
  * XXH3_64bits_reset_withSecret():
  * `secret` is referenced, it _must outlive_ the hash streaming session.
@@ -991,10 +1010,10 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr,
  * When in doubt about the randomness of a candidate `secret`,
  * consider employing `XXH3_generateSecret()` instead (see below).
  */
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
 
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr);
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
  #endif /* !XXH_NO_STREAM */
 
  /* note : canonical representation of XXH3 is the same as XXH64
@@ -1033,11 +1052,11 @@ typedef struct {
  * @see
  * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
  */
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(const void* data, size_t len);
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
  /*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
  /*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
 
  /******* Streaming *******/
  #ifndef XXH_NO_STREAM
@@ -1053,12 +1072,12 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data,
  * All reset and streaming functions have same meaning as their 64-bit counterpart.
  */
 
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
 
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
  #endif /* !XXH_NO_STREAM */
 
  /* Following helper functions make it possible to compare XXH128_hash_t values.
@@ -1079,13 +1098,13 @@ XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
  * =0 if *h128_1 == *h128_2
  * <0 if *h128_1 < *h128_2
  */
- XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(const void* h128_1, const void* h128_2);
+ XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
 
 
  /******* Canonical representation *******/
  typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
- XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
+ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
 
 
  #endif /* !XXH_NO_XXH3 */
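
The canonical helpers annotated above exist so hashes can be stored or transmitted in a fixed big-endian layout regardless of host endianness. A round-trip sketch:

    #include <stdio.h>
    #include "xxhash.h"

    int main(void)
    {
        XXH64_hash_t const h = XXH64("abc", 3, 0);
        XXH64_canonical_t canon;
        XXH64_canonicalFromHash(&canon, h); /* big-endian bytes, portable */
        /* canon.digest can be written to disk or the network as-is. */
        printf("%s\n", XXH64_hashFromCanonical(&canon) == h ? "round-trip ok"
                                                            : "mismatch");
        return 0;
    }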
@@ -1266,13 +1285,18 @@ struct XXH3_state_s {
  * Note that this doesn't prepare the state for a streaming operation,
  * it's still necessary to use XXH3_NNbits_reset*() afterwards.
  */
- #define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
+ #define XXH3_INITSTATE(XXH3_state_ptr) \
+ do { \
+ XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
+ tmp_xxh3_state_ptr->seed = 0; \
+ tmp_xxh3_state_ptr->extSecret = NULL; \
+ } while(0)
 
 
  /*!
  * simple alias to pre-selected XXH3_128bits variant
  */
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
 
 
  /* === Experimental API === */
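
The `XXH3_INITSTATE` rewrite above is the classic `do { ... } while(0)` idiom: the macro now expands to a single statement, and evaluating the argument once into a temporary guards against side effects. Why the brace-only form is fragile, in miniature (illustrative macros, not xxHash API):

    /* Brace block: the caller's trailing ';' breaks if/else pairing. */
    #define INIT_BAD(p)  { (p)->seed = 0; }

    /* do/while(0): swallows the ';' and composes like one statement. */
    #define INIT_GOOD(p) do { (p)->seed = 0; } while (0)

    /*
     * if (cond) INIT_BAD(s);  else other();   // syntax error
     * if (cond) INIT_GOOD(s); else other();   // fine
     */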
@@ -1329,7 +1353,7 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH6
  * }
  * @endcode
  */
- XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
+ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
 
  /*!
  * @brief Generate the same secret as the _withSeed() variants.
@@ -1368,7 +1392,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secr
  * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
  * @param seed The seed to seed the state.
  */
- XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
+ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
 
  /*!
  * These variants generate hash values using either
@@ -1397,24 +1421,24 @@ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_
  * because only portions of the secret are employed for small data.
  */
  XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
- XXH3_64bits_withSecretandSeed(const void* data, size_t len,
- const void* secret, size_t secretSize,
+ XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+ XXH_NOESCAPE const void* secret, size_t secretSize,
  XXH64_hash_t seed);
  /*! @copydoc XXH3_64bits_withSecretandSeed() */
  XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
- XXH3_128bits_withSecretandSeed(const void* input, size_t length,
- const void* secret, size_t secretSize,
+ XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+ XXH_NOESCAPE const void* secret, size_t secretSize,
  XXH64_hash_t seed64);
  #ifndef XXH_NO_STREAM
  /*! @copydoc XXH3_64bits_withSecretandSeed() */
  XXH_PUBLIC_API XXH_errorcode
- XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
- const void* secret, size_t secretSize,
+ XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+ XXH_NOESCAPE const void* secret, size_t secretSize,
  XXH64_hash_t seed64);
  /*! @copydoc XXH3_64bits_withSecretandSeed() */
  XXH_PUBLIC_API XXH_errorcode
- XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
- const void* secret, size_t secretSize,
+ XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+ XXH_NOESCAPE const void* secret, size_t secretSize,
  XXH64_hash_t seed64);
  #endif /* !XXH_NO_STREAM */
 
@@ -1522,7 +1546,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
  * care, as what works on one compiler/platform/optimization level may cause
  * another to read garbage data or even crash.
  *
- * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
  *
  * Prefer these methods in priority order (0 > 3 > 1 > 2)
  */
@@ -1608,6 +1632,23 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
  */
  # define XXH_NO_INLINE_HINTS 0
 
+ /*!
+ * @def XXH3_INLINE_SECRET
+ * @brief Determines whether to inline the XXH3 withSecret code.
+ *
+ * When the secret size is known, the compiler can improve the performance
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
+ *
+ * However, if the secret size is not known, it doesn't have any benefit. This
+ * happens when xxHash is compiled into a global symbol. Therefore, if
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
+ *
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
+ * that are *sometimes* force inline on -Og, and it is impossible to automatically
+ * detect this optimization level.
+ */
+ # define XXH3_INLINE_SECRET 0
+
  /*!
  * @def XXH32_ENDJMP
  * @brief Whether to use a jump for `XXH32_finalize`.
@@ -1682,6 +1723,15 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
  # endif
  #endif
 
+ #ifndef XXH3_INLINE_SECRET
+ # if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+ || !defined(XXH_INLINE_ALL)
+ # define XXH3_INLINE_SECRET 0
+ # else
+ # define XXH3_INLINE_SECRET 1
+ # endif
+ #endif
+
  #ifndef XXH32_ENDJMP
  /* generally preferable for performance */
  # define XXH32_ENDJMP 0
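
Note the shape of the default: `XXH3_INLINE_SECRET` only turns on when `XXH_INLINE_ALL` is defined, because that is the build mode where the compiler can see the call site and propagate a constant `secretSize` into the withSecret code path. A sketch of that usage (standard xxHash build macro; the wrapper function is illustrative):

    /* Consume xxhash.h as an all-inline library. */
    #define XXH_INLINE_ALL
    #include "xxhash.h"

    static unsigned char g_secret[192]; /* size is a compile-time constant */

    XXH64_hash_t hash_with_fixed_secret(const void* p, size_t n)
    {
        /* sizeof(g_secret) is visible here, so the force-inlined
         * withSecret path can specialize on it. */
        return XXH3_64bits_withSecret(p, n, g_secret, sizeof(g_secret));
    }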
@@ -1778,6 +1828,11 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
  # define XXH_NO_INLINE static
  #endif
 
+ #if XXH3_INLINE_SECRET
+ # define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
+ #else
+ # define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
+ #endif
 
 
  /* *************************************
@@ -1803,7 +1858,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
  # include <assert.h> /* note: can still be disabled with NDEBUG */
  # define XXH_ASSERT(c) assert(c)
  #else
- # define XXH_ASSERT(c) ((void)0)
+ # define XXH_ASSERT(c) XXH_ASSUME(c)
  #endif
 
  /* note: use after variable declarations */
@@ -1835,11 +1890,17 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
  * XXH3_initCustomSecret_scalar().
  */
  #if defined(__GNUC__) || defined(__clang__)
- # define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
+ # define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
  #else
  # define XXH_COMPILER_GUARD(var) ((void)0)
  #endif
 
+ #if defined(__clang__)
+ # define XXH_COMPILER_GUARD_W(var) __asm__("" : "+w" (var))
+ #else
+ # define XXH_COMPILER_GUARD_W(var) ((void)0)
+ #endif
+
  /* *************************************
  * Basic Types
  ***************************************/
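
`XXH_COMPILER_GUARD` is an optimization barrier: the empty asm with a "+r" constraint makes `var` look like it is read and rewritten by opaque assembly, so the optimizer cannot fold, reorder, or vectorize across that point. Dropping `__volatile__`, as the diff does, keeps that data dependency while still allowing dead guards to be eliminated. The idiom in isolation:

    static unsigned long mix(unsigned long x)
    {
        x *= 0x9E3779B97F4A7C15UL;
    #if defined(__GNUC__) || defined(__clang__)
        /* Opaque to the optimizer: splits the dependency chain here. */
        __asm__("" : "+r" (x));
    #endif
        x ^= x >> 29;
        return x;
    }

The new `XXH_COMPILER_GUARD_W` is the same trick with the "w" constraint, which keeps the value in a SIMD/FP register on ARM targets (hence clang-only here).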
@@ -1946,7 +2007,7 @@ static xxh_u32 XXH_read32(const void* ptr)
 
  /*
  * Portable and safe solution. Generally efficient.
- * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
  */
  static xxh_u32 XXH_read32(const void* memPtr)
  {
@@ -2022,6 +2083,51 @@ static int XXH_isLittleEndian(void)
  # define XXH_HAS_BUILTIN(x) 0
  #endif
 
+
+
+ /*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
+ * # include <stddef.h>
+ * # ifdef unreachable
+ * # define XXH_UNREACHABLE() unreachable()
+ * # endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * # include <utility>
+ * # define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC 12.
+ */
+
+ #if XXH_HAS_BUILTIN(__builtin_unreachable)
+ # define XXH_UNREACHABLE() __builtin_unreachable()
+
+ #elif defined(_MSC_VER)
+ # define XXH_UNREACHABLE() __assume(0)
+
+ #else
+ # define XXH_UNREACHABLE()
+ #endif
+
+ #if XXH_HAS_BUILTIN(__builtin_assume)
+ # define XXH_ASSUME(c) __builtin_assume(c)
+ #else
+ # define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+ #endif
+
  /*!
  * @internal
  * @def XXH_rotl32(x,r)
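
The effect of the earlier `XXH_ASSERT(c)` to `XXH_ASSUME(c)` change now becomes clear: in release builds the condition is no longer checked, it is promised, and the compiler may delete code paths where it would be false. If the promise is ever wrong at runtime, behavior is undefined, which is exactly why the checked `assert` form remains in debug builds. A sketch of the optimization this enables (hypothetical `MY_ASSUME` name):

    #if defined(__clang__)
    # define MY_ASSUME(c) __builtin_assume(c)
    #elif defined(__GNUC__)
    # define MY_ASSUME(c) do { if (!(c)) __builtin_unreachable(); } while (0)
    #else
    # define MY_ASSUME(c) ((void)0)
    #endif

    unsigned fold(const unsigned* v, unsigned len)
    {
        unsigned acc = 0, i;
        /* Promising len % 4 == 0 lets the compiler drop the scalar
         * remainder loop when it unrolls or vectorizes. */
        MY_ASSUME(len % 4 == 0);
        for (i = 0; i < len; i++) acc += v[i];
        return acc;
    }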
@@ -2211,9 +2317,9 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
  * can load data, while v3 can multiply. SSE forces them to operate
  * together.
  *
- * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
- * and it is pointless writing a NEON implementation that is basically the
- * same speed as scalar for XXH32.
+ * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+ * the loop. NEON is only faster on the A53, and with the newer cores, it is less
+ * than half the speed.
  */
  XXH_COMPILER_GUARD(acc);
  #endif
@@ -2288,41 +2394,41 @@ XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
  } else {
  switch(len&15) /* or switch(bEnd - p) */ {
  case 12: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 8: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 4: XXH_PROCESS4;
  return XXH32_avalanche(hash);
 
  case 13: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 9: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 5: XXH_PROCESS4;
  XXH_PROCESS1;
  return XXH32_avalanche(hash);
 
  case 14: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 10: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 6: XXH_PROCESS4;
  XXH_PROCESS1;
  XXH_PROCESS1;
  return XXH32_avalanche(hash);
 
  case 15: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 11: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 7: XXH_PROCESS4;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 3: XXH_PROCESS1;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 2: XXH_PROCESS1;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 1: XXH_PROCESS1;
- XXH_FALLTHROUGH;
+ XXH_FALLTHROUGH; /* fallthrough */
  case 0: return XXH32_avalanche(hash);
  }
  XXH_ASSERT(0);
@@ -2590,7 +2696,7 @@ static xxh_u64 XXH_read64(const void* ptr)
 
  /*
  * Portable and safe solution. Generally efficient.
- * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
  */
  static xxh_u64 XXH_read64(const void* memPtr)
  {
@@ -2823,7 +2929,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
 
 
  /*! @ingroup XXH64_family */
- XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
+ XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
  {
  #if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
  /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
@@ -2857,13 +2963,13 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
  }
 
  /*! @ingroup XXH64_family */
- XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
+ XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
  {
  XXH_memcpy(dstState, srcState, sizeof(*dstState));
  }
 
  /*! @ingroup XXH64_family */
- XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
+ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
  {
  XXH_ASSERT(statePtr != NULL);
  memset(statePtr, 0, sizeof(*statePtr));
@@ -2876,7 +2982,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s
 
  /*! @ingroup XXH64_family */
  XXH_PUBLIC_API XXH_errorcode
- XXH64_update (XXH64_state_t* state, const void* input, size_t len)
+ XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
  {
  if (input==NULL) {
  XXH_ASSERT(len == 0);
@@ -2927,7 +3033,7 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
 
 
  /*! @ingroup XXH64_family */
- XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
+ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
  {
  xxh_u64 h64;
 
@@ -2950,7 +3056,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
  /******* Canonical representation *******/
 
  /*! @ingroup XXH64_family */
- XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
  {
  XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
  if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
@@ -2958,7 +3064,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t
  }
 
  /*! @ingroup XXH64_family */
- XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
  {
  return XXH_readBE64(src);
  }
@@ -2979,11 +3085,19 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
  /* === Compiler specifics === */
 
  #if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
- # define XXH_RESTRICT /* disable */
+ # define XXH_RESTRICT /* disable */
  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
  # define XXH_RESTRICT restrict
+ #elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
+ || (defined (__clang__)) \
+ || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
+ || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
+ /*
+ * There are a LOT more compilers that recognize __restrict but this
+ * covers the major ones.
+ */
+ # define XXH_RESTRICT __restrict
  #else
- /* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
  # define XXH_RESTRICT /* disable */
  #endif
 
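The payoff of `XXH_RESTRICT`: when pointers are declared non-aliasing, loads do not have to be re-issued after stores through the other pointers, and loops vectorize cleanly. A minimal sketch (C99 `restrict`; the pre-C99 compilers listed above spell it `__restrict`):

    void add_arrays(float* restrict dst, const float* restrict a,
                    const float* restrict b, unsigned n)
    {
        unsigned i;
        /* dst cannot alias a or b, so a[i] and b[i] stay in registers
         * across the store to dst[i]. */
        for (i = 0; i < n; i++)
            dst[i] = a[i] + b[i];
    }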
@@ -2998,9 +3112,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
  #endif
 
  #if defined(__GNUC__) || defined(__clang__)
+ # if defined(__ARM_FEATURE_SVE)
+ # include <arm_sve.h>
+ # endif
  # if defined(__ARM_NEON__) || defined(__ARM_NEON) \
- || defined(__aarch64__) || defined(_M_ARM) \
- || defined(_M_ARM64) || defined(_M_ARM64EC)
+ || (defined(_M_ARM) && _M_ARM >= 7) \
+ || defined(_M_ARM64) || defined(_M_ARM64EC)
  # define inline __inline__ /* circumvent a clang bug */
  # include <arm_neon.h>
  # undef inline
@@ -3125,12 +3242,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
  XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */
  XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */
  XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+ XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
  };
  /*!
  * @ingroup tuning
  * @brief Selects the minimum alignment for XXH3's accumulators.
  *
- * When using SIMD, this should match the alignment reqired for said vector
+ * When using SIMD, this should match the alignment required for said vector
  * type, so, for example, 32 for AVX2.
  *
  * Default: Auto detected.
3146
3264
  # define XXH_AVX512 3
3147
3265
  # define XXH_NEON 4
3148
3266
  # define XXH_VSX 5
3267
+ # define XXH_SVE 6
3149
3268
  #endif
3150
3269
 
3151
3270
  #ifndef XXH_VECTOR /* can be defined on command line */
3152
- # if ( \
3271
+ # if defined(__ARM_FEATURE_SVE)
3272
+ # define XXH_VECTOR XXH_SVE
3273
+ # elif ( \
3153
3274
  defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
3154
3275
  || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
3155
3276
  ) && ( \
@@ -3172,6 +3293,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
  # endif
  #endif
 
+ /* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
+ #if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
+ # ifdef _MSC_VER
+ # pragma warning(once : 4606)
+ # else
+ # warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
+ # endif
+ # undef XXH_VECTOR
+ # define XXH_VECTOR XXH_SCALAR
+ #endif
+
  /*
  * Controls the alignment of the accumulator,
  * for compatibility with aligned vector loads, which are usually faster.
@@ -3191,16 +3323,26 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
  # define XXH_ACC_ALIGN 16
  # elif XXH_VECTOR == XXH_AVX512 /* avx512 */
  # define XXH_ACC_ALIGN 64
+ # elif XXH_VECTOR == XXH_SVE /* sve */
+ # define XXH_ACC_ALIGN 64
  # endif
  #endif
 
  #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
  || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
  # define XXH_SEC_ALIGN XXH_ACC_ALIGN
+ #elif XXH_VECTOR == XXH_SVE
+ # define XXH_SEC_ALIGN XXH_ACC_ALIGN
  #else
  # define XXH_SEC_ALIGN 8
  #endif
 
+ #if defined(__GNUC__) || defined(__clang__)
+ # define XXH_ALIASING __attribute__((may_alias))
+ #else
+ # define XXH_ALIASING /* nothing */
+ #endif
+
  /*
  * UGLY HACK:
  * GCC usually generates the best code with -O3 for xxHash.
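
`XXH_ALIASING` wraps GCC/Clang's `may_alias` attribute: a type marked with it is exempt from strict-aliasing assumptions, so reading memory through it is legal even if the bytes were written through an unrelated type. A scalar-sized sketch of the same idea (illustrative type name; `may_alias` relaxes aliasing rules only, alignment requirements still apply):

    /* A 32-bit view type allowed to alias anything. */
    typedef unsigned int aliasing_u32 __attribute__((may_alias));

    unsigned int read_u32_view(const unsigned int* words)
    {
        /* The compiler must assume *words may have been written
         * through any other type, e.g. a byte buffer. */
        return *(const aliasing_u32*)words;
    }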
@@ -3229,107 +3371,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
  # pragma GCC optimize("-O2")
  #endif
 
-
  #if XXH_VECTOR == XXH_NEON
+
  /*
- * NEON's setup for vmlal_u32 is a little more complicated than it is on
- * SSE2, AVX2, and VSX.
- *
- * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
- *
- * To do the same operation, the 128-bit 'Q' register needs to be split into
- * two 64-bit 'D' registers, performing this operation::
- *
- * [ a | b ]
- * | '---------. .--------' |
- * | x |
- * | .---------' '--------. |
- * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
- *
- * Due to significant changes in aarch64, the fastest method for aarch64 is
- * completely different than the fastest method for ARMv7-A.
- *
- * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
- * D11 will modify the high half of Q5. This is similar to how modifying AH
- * will only affect bits 8-15 of AX on x86.
- *
- * VZIP takes two registers, and puts even lanes in one register and odd lanes
- * in the other.
- *
- * On ARMv7-A, this strangely modifies both parameters in place instead of
- * taking the usual 3-operand form.
- *
- * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
- * lower and upper halves of the Q register to end up with the high and low
- * halves where we want - all in one instruction.
- *
- * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
- *
- * Unfortunately we need inline assembly for this: Instructions modifying two
- * registers at once is not possible in GCC or Clang's IR, and they have to
- * create a copy.
+ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
+ * optimizes out the entire hashLong loop because of the aliasing violation.
  *
- * aarch64 requires a different approach.
- *
- * In order to make it easier to write a decent compiler for aarch64, many
- * quirks were removed, such as conditional execution.
- *
- * NEON was also affected by this.
- *
- * aarch64 cannot access the high bits of a Q-form register, and writes to a
- * D-form register zero the high bits, similar to how writes to W-form scalar
- * registers (or DWORD registers on x86_64) work.
- *
- * The formerly free vget_high intrinsics now require a vext (with a few
- * exceptions)
- *
- * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
- * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
- * operand.
- *
- * The equivalent of the VZIP.32 on the lower and upper halves would be this
- * mess:
- *
- * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
- * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
- * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
- *
- * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
- *
- * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
- * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
- *
- * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
+ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
+ * so the only option is to mark it as aliasing.
  */
-
- /*!
- * Function-like macro:
- * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
- * {
- * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
- * outHi = (uint32x2_t)(in >> 32);
- * in = UNDEFINED;
- * }
- */
- # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
- && (defined(__GNUC__) || defined(__clang__)) \
- && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
- # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
- do { \
- /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
- /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
- /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
- __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
- (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
- (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
- } while (0)
- # else
- # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
- do { \
- (outLo) = vmovn_u64 (in); \
- (outHi) = vshrn_n_u64 ((in), 32); \
- } while (0)
- # endif
+ typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
 
  /*!
  * @internal
@@ -3347,7 +3398,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
  #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
  XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
  {
- return *(uint64x2_t const*)ptr;
+ return *(xxh_aliasing_uint64x2_t const *)ptr;
  }
  #else
  XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
@@ -3355,38 +3406,75 @@ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
  return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
  }
  #endif
+
+ /*!
+ * @internal
+ * @brief `vmlal_u32` on low and high halves of a vector.
+ *
+ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
+ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+ #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
+ XXH_FORCE_INLINE uint64x2_t
+ XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+ {
+ /* Inline assembly is the only way */
+ __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
+ return acc;
+ }
+ XXH_FORCE_INLINE uint64x2_t
+ XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+ {
+ /* This intrinsic works as expected */
+ return vmlal_high_u32(acc, lhs, rhs);
+ }
+ #else
+ /* Portable intrinsic versions */
+ XXH_FORCE_INLINE uint64x2_t
+ XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+ {
+ return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
+ }
+ /*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+ XXH_FORCE_INLINE uint64x2_t
+ XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+ {
+ return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
+ }
+ #endif
+
  /*!
  * @ingroup tuning
  * @brief Controls the NEON to scalar ratio for XXH3
  *
- * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
- * 2 lanes on scalar by default.
+ * This can be set to 2, 4, 6, or 8.
  *
- * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
- * emulated 64-bit arithmetic is too slow.
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
  *
- * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
  *
- * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
- * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
- * you are only using 2/3 of the CPU bandwidth.
- *
- * This is even more noticable on the more advanced cores like the A76 which
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
  * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
  *
- * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
- * remaining lanes will use scalar instructions. This improves the bandwidth
- * and also gives the integer pipelines something to do besides twiddling loop
- * counters and pointers.
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
  *
  * This change benefits CPUs with large micro-op buffers without negatively affecting
- * other CPUs:
+ * most other CPUs:
  *
  * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |
  * |:----------------------|:--------------------|----------:|-----------:|------:|
  * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |
  * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |
  * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |
+ * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% |
  *
  * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
  *
@@ -3394,7 +3482,7 @@ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
  */
  # ifndef XXH3_NEON_LANES
  # if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
- && XXH_SIZE_OPT <= 0
+ && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
  # define XXH3_NEON_LANES 6
  # else
  # define XXH3_NEON_LANES XXH_ACC_NB
@@ -3442,6 +3530,11 @@ typedef __vector unsigned long long xxh_u64x2;
  typedef __vector unsigned char xxh_u8x16;
  typedef __vector unsigned xxh_u32x4;
 
+ /*
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
+ */
+ typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
  # ifndef XXH_VSX_BE
  # if defined(__BIG_ENDIAN__) \
  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
@@ -3516,6 +3609,20 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
  # endif /* XXH_vec_mulo, XXH_vec_mule */
  #endif /* XXH_VECTOR == XXH_VSX */
 
+ #if XXH_VECTOR == XXH_SVE
+ #define ACCRND(acc, offset) \
+ do { \
+ svuint64_t input_vec = svld1_u64(mask, xinput + offset); \
+ svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \
+ svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \
+ svuint64_t swapped = svtbl_u64(input_vec, kSwap); \
+ svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \
+ svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \
+ svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
+ acc = svadd_u64_x(mask, acc, mul); \
+ } while (0)
+ #endif /* XXH_VECTOR == XXH_SVE */
+
 
  /* prefetch
  * can be disabled, by declaring XXH_NO_PREFETCH build macro */
@@ -3952,31 +4059,33 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
  XXH_ASSERT(16 < len && len <= 128);
 
- { xxh_u64 acc = len * XXH_PRIME64_1;
+ { xxh_u64 acc = len * XXH_PRIME64_1, acc_end;
  #if XXH_SIZE_OPT >= 1
  /* Smaller and cleaner, but slightly slower. */
- size_t i = (len - 1) / 32;
+ unsigned int i = (unsigned int)(len - 1) / 32;
  do {
  acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
  acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
  } while (i-- != 0);
+ acc_end = 0;
  #else
+ acc += XXH3_mix16B(input+0, secret+0, seed);
+ acc_end = XXH3_mix16B(input+len-16, secret+16, seed);
  if (len > 32) {
+ acc += XXH3_mix16B(input+16, secret+32, seed);
+ acc_end += XXH3_mix16B(input+len-32, secret+48, seed);
  if (len > 64) {
+ acc += XXH3_mix16B(input+32, secret+64, seed);
+ acc_end += XXH3_mix16B(input+len-48, secret+80, seed);
+
  if (len > 96) {
  acc += XXH3_mix16B(input+48, secret+96, seed);
- acc += XXH3_mix16B(input+len-64, secret+112, seed);
+ acc_end += XXH3_mix16B(input+len-64, secret+112, seed);
  }
- acc += XXH3_mix16B(input+32, secret+64, seed);
- acc += XXH3_mix16B(input+len-48, secret+80, seed);
  }
- acc += XXH3_mix16B(input+16, secret+32, seed);
- acc += XXH3_mix16B(input+len-32, secret+48, seed);
  }
- acc += XXH3_mix16B(input+0, secret+0, seed);
- acc += XXH3_mix16B(input+len-16, secret+16, seed);
  #endif
- return XXH3_avalanche(acc);
+ return XXH3_avalanche(acc + acc_end);
  }
  }
 
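The `acc`/`acc_end` split in this hunk is a dependency-chain break: two independent accumulators let neighboring `XXH3_mix16B` calls execute in parallel on a superscalar core instead of serializing on one register, and the chains are folded only once, in the final avalanche. The same idea in miniature:

    unsigned long long sum_two_chains(const unsigned long long* v, unsigned n)
    {
        unsigned long long acc = 0, acc_end = 0;
        unsigned i;
        /* Even and odd elements accumulate independently. */
        for (i = 0; i + 1 < n; i += 2) {
            acc     += v[i];
            acc_end += v[i + 1];
        }
        if (n & 1) acc += v[n - 1];
        return acc + acc_end; /* fold the chains once, at the end */
    }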
@@ -3994,13 +4103,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
  #define XXH3_MIDSIZE_LASTOFFSET 17
 
  { xxh_u64 acc = len * XXH_PRIME64_1;
- int const nbRounds = (int)len / 16;
- int i;
+ xxh_u64 acc_end;
+ unsigned int const nbRounds = (unsigned int)len / 16;
+ unsigned int i;
+ XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
  for (i=0; i<8; i++) {
  acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
  }
- acc = XXH3_avalanche(acc);
+ /* last bytes */
+ acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
  XXH_ASSERT(nbRounds >= 8);
+ acc = XXH3_avalanche(acc);
  #if defined(__clang__) /* Clang */ \
  && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
  && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
@@ -4027,11 +4140,13 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
  #pragma clang loop vectorize(disable)
  #endif
  for (i=8 ; i < nbRounds; i++) {
- acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+ /*
+ * Prevents clang from unrolling the acc loop and interleaving with this one.
+ */
+ XXH_COMPILER_GUARD(acc);
+ acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
  }
- /* last bytes */
- acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
- return XXH3_avalanche(acc);
+ return XXH3_avalanche(acc + acc_end);
  }
  }
 
@@ -4047,6 +4162,47 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
  # define ACC_NB XXH_ACC_NB
  #endif
 
+ #ifndef XXH_PREFETCH_DIST
+ # ifdef __clang__
+ # define XXH_PREFETCH_DIST 320
+ # else
+ # if (XXH_VECTOR == XXH_AVX512)
+ # define XXH_PREFETCH_DIST 512
+ # else
+ # define XXH_PREFETCH_DIST 384
+ # endif
+ # endif /* __clang__ */
+ #endif /* XXH_PREFETCH_DIST */
+
+ /*
+ * These macros are to generate an XXH3_accumulate() function.
+ * The two arguments select the name suffix and target attribute.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>().
+ *
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+ #define XXH3_ACCUMULATE_TEMPLATE(name) \
+ void \
+ XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \
+ const xxh_u8* XXH_RESTRICT input, \
+ const xxh_u8* XXH_RESTRICT secret, \
+ size_t nbStripes) \
+ { \
+ size_t n; \
+ for (n = 0; n < nbStripes; n++ ) { \
+ const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \
+ XXH_PREFETCH(in + XXH_PREFETCH_DIST); \
+ XXH3_accumulate_512_##name( \
+ acc, \
+ in, \
+ secret + n*XXH_SECRET_CONSUME_RATE); \
+ } \
+ }
+
+
  XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
  {
  if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
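
`XXH3_ACCUMULATE_TEMPLATE` uses `##` token pasting to stamp out one striped-loop wrapper per SIMD backend; each expansion in the hunks below (avx512, avx2, sse2, ...) pairs `XXH3_accumulate_<name>()` with its `XXH3_accumulate_512_<name>()` kernel. The generator pattern reduced to a toy (hypothetical names):

    #define MAKE_DOUBLER(name) \
        static int double_##name(int x) { return scale_##name(x, 2); }

    static int scale_scalar(int x, int k) { return x * k; }
    static int scale_shift (int x, int k) { return (k == 2) ? (x << 1) : x * k; }

    /* Defines double_scalar() and double_shift(), each bound to its
     * matching scale_<name>() backend at expansion time. */
    MAKE_DOUBLER(scalar)
    MAKE_DOUBLER(shift)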
@@ -4115,7 +4271,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
  /* data_key = data_vec ^ key_vec; */
  __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
  /* data_key_lo = data_key >> 32; */
- __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+ __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
  /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
  __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);
  /* xacc[0] += swap(data_vec); */
@@ -4125,6 +4281,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
  *xacc = _mm512_add_epi64(product, sum);
  }
  }
+ XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
 
  /*
  * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
@@ -4158,13 +4315,12 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
  /* xacc[0] ^= (xacc[0] >> 47) */
  __m512i const acc_vec = *xacc;
  __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);
- __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);
  /* xacc[0] ^= secret; */
  __m512i const key_vec = _mm512_loadu_si512 (secret);
- __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
+ __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
 
  /* xacc[0] *= XXH_PRIME32_1; */
- __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
+ __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
  __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);
  __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);
  *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
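
The immediate `0x96` passed to `_mm512_ternarylogic_epi32` is the truth table of three-way XOR: for input bits A, B, C, output bit number `(A<<2)|(B<<1)|C` of the immediate is the result, and A^B^C gives binary 10010110, i.e. 0x96. One instruction thus replaces the two XORs removed above. A plain-C check:

    #include <stdio.h>

    int main(void)
    {
        unsigned imm = 0, idx;
        for (idx = 0; idx < 8; idx++) {
            unsigned a = (idx >> 2) & 1, b = (idx >> 1) & 1, c = idx & 1;
            imm |= (a ^ b ^ c) << idx;
        }
        printf("0x%02X\n", imm); /* prints 0x96 */
        return 0;
    }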
@@ -4179,7 +4335,8 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
  XXH_ASSERT(((size_t)customSecret & 63) == 0);
  (void)(&XXH_writeLE64);
  { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
- __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
+ __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
+ __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
 
  const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret);
  __m512i* const dest = ( __m512i*) customSecret;
@@ -4187,14 +4344,7 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
  XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
  XXH_ASSERT(((size_t)dest & 63) == 0);
  for (i=0; i < nbRounds; ++i) {
- /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
- * this will warn "discards 'const' qualifier". */
- union {
- const __m512i* cp;
- void* p;
- } remote_const_void;
- remote_const_void.cp = src + i;
- dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
+ dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
  } }
  }
 
@@ -4230,7 +4380,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
  /* data_key = data_vec ^ key_vec; */
  __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
  /* data_key_lo = data_key >> 32; */
- __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+ __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
  /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
  __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);
  /* xacc[i] += swap(data_vec); */
@@ -4240,6 +4390,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
  xacc[i] = _mm256_add_epi64(product, sum);
  } }
  }
+ XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
 
  XXH_FORCE_INLINE XXH_TARGET_AVX2 void
  XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -4262,7 +4413,7 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
  __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
 
  /* xacc[i] *= XXH_PRIME32_1; */
- __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+ __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
  __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
  __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
  xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
@@ -4294,12 +4445,12 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR
  XXH_ASSERT(((size_t)dest & 31) == 0);
 
  /* GCC -O2 needs the loop unrolled manually */
- dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
- dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
- dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
- dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
- dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
- dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
+ dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
+ dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
+ dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
+ dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
+ dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
+ dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
  }
  }
@@ -4346,6 +4497,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
  xacc[i] = _mm_add_epi64(product, sum);
  } }
  }
+ XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
 
  XXH_FORCE_INLINE XXH_TARGET_SSE2 void
  XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -4431,6 +4583,16 @@ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
  * CPU, and it also mitigates some GCC codegen issues.
  *
  * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ *
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
+ * integers instead of the other platforms which mask full 64-bit vectors,
+ * so the setup is more complicated than just shifting right.
+ *
+ * Additionally, there is an optimization for 4 lanes at once noted below.
+ *
+ * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
+ * there needs to be *three* versions of the accumulate operation used
+ * for the remaining 2 lanes.
  */
  XXH_FORCE_INLINE void
  XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
@@ -4439,49 +4601,113 @@ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
  {
  XXH_ASSERT((((size_t)acc) & 15) == 0);
  XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
- {
- uint64x2_t* const xacc = (uint64x2_t *) acc;
+ { /* GCC for darwin arm64 does not like aliasing here */
+ xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
  /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
  uint8_t const* const xinput = (const uint8_t *) input;
  uint8_t const* const xsecret = (const uint8_t *) secret;
 
  size_t i;
- /* AArch64 uses both scalar and neon at the same time */
+ /* Scalar lanes use the normal scalarRound routine */
  for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
  XXH3_scalarRound(acc, input, secret, i);
  }
- for (i=0; i < XXH3_NEON_LANES / 2; i++) {
- uint64x2_t acc_vec = xacc[i];
+ i = 0;
+ /* 4 NEON lanes at a time. */
+ for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+ /* data_vec = xinput[i]; */
+ uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16));
+ uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16));
+ /* key_vec = xsecret[i]; */
+ uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16));
+ uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+ /* data_swap = swap(data_vec) */
+ uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+ uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+ /* data_key = data_vec ^ key_vec; */
+ uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+ uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+ /*
+ * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+ * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+ * get one vector with the low 32 bits of each lane, and one vector
+ * with the high 32 bits of each lane.
+ *
+ * This compiles to two instructions on AArch64 and has a paired vector
+ * result, which is an artifact from ARMv7a's version which modified both
+ * vectors in place.
+ *
+ * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+ * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+ */
+ uint32x4x2_t unzipped = vuzpq_u32(
+ vreinterpretq_u32_u64(data_key_1),
+ vreinterpretq_u32_u64(data_key_2)
+ );
+ /* data_key_lo = data_key & 0xFFFFFFFF */
+ uint32x4_t data_key_lo = unzipped.val[0];
+ /* data_key_hi = data_key >> 32 */
+ uint32x4_t data_key_hi = unzipped.val[1];
+ /*
+ * Then, we can split the vectors horizontally and multiply, which, as with
+ * most widening intrinsics, has a variant that works on the high half
+ * vectors for free on AArch64.
+ *
+ * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+ */
+ uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
+ uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
+ /*
+ * Clang reorders
+ * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s
+ * c += a; // add acc.2d, acc.2d, swap.2d
+ * to
+ * c += a; // add acc.2d, acc.2d, swap.2d
+ * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s
+ *
+ * While it would make sense in theory, since the addition is faster,
+ * this is worse in practice, likely because umlal is limited to certain
+ * NEON pipelines. A compiler guard fixes this.
+ */
+ XXH_COMPILER_GUARD_W(sum_1);
+ XXH_COMPILER_GUARD_W(sum_2);
+ /* xacc[i] = acc_vec + sum; */
+ xacc[i] = vaddq_u64(xacc[i], sum_1);
+ xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
+ }
+ /* Operate on the remaining NEON lanes 2 at a time. */
+ for (; i < XXH3_NEON_LANES / 2; i++) {
  /* data_vec = xinput[i]; */
  uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16));
  /* key_vec = xsecret[i]; */
  uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
- uint64x2_t data_key;
- uint32x2_t data_key_lo, data_key_hi;
  /* acc_vec_2 = swap(data_vec) */
- uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1);
+ uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
  /* data_key = data_vec ^ key_vec; */
- data_key = veorq_u64(data_vec, key_vec);
- /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
- * data_key_hi = (uint32x2_t) (data_key >> 32);
- * data_key = UNDEFINED; */
- XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
- /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
- acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi);
- /* xacc[i] += acc_vec_2; */
- acc_vec = vaddq_u64 (acc_vec, acc_vec_2);
- xacc[i] = acc_vec;
+ uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+ /* For two lanes, just use VMOVN and VSHRN. */
+ /* data_key_lo = data_key & 0xFFFFFFFF; */
+ uint32x2_t data_key_lo = vmovn_u64(data_key);
+ /* data_key_hi = data_key >> 32; */
+ uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+ /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
+ uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
+ /* Same Clang workaround as before */
+ XXH_COMPILER_GUARD_W(sum);
+ /* xacc[i] = acc_vec + sum; */
+ xacc[i] = vaddq_u64 (xacc[i], sum);
  }
-
  }
  }
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)
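The 4-lane path above leans on `vuzpq_u32` to de-interleave two 64x2 vectors into one vector of low halves and one of high halves, then feeds both to widening multiply-accumulates. A rough scalar model of the arithmetic for one pair of vectors, i.e. four 64-bit lanes (illustration only, not part of the library):

#include <stdint.h>

/* Scalar model of the 4-lane NEON accumulate above: lanes {0,1} form one
 * uint64x2_t and lanes {2,3} the other; swap() exchanges the two halves
 * of each vector, and each lane adds (low32 * high32) of data^key. */
static void neon_4lane_model(uint64_t acc[4], const uint64_t data[4], const uint64_t key[4])
{
    int i;
    for (i = 0; i < 4; i += 2) {
        uint64_t const dk0 = data[i]     ^ key[i];
        uint64_t const dk1 = data[i + 1] ^ key[i + 1];
        acc[i]     += data[i + 1] + (dk0 & 0xFFFFFFFF) * (dk0 >> 32);
        acc[i + 1] += data[i]     + (dk1 & 0xFFFFFFFF) * (dk1 >> 32);
    }
}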
 
  XXH_FORCE_INLINE void
  XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
  {
  XXH_ASSERT((((size_t)acc) & 15) == 0);
 
- { uint64x2_t* xacc = (uint64x2_t*) acc;
+ { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc;
  uint8_t const* xsecret = (uint8_t const*) secret;
  uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);
 
@@ -4493,47 +4719,42 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
  for (i=0; i < XXH3_NEON_LANES / 2; i++) {
  /* xacc[i] ^= (xacc[i] >> 47); */
  uint64x2_t acc_vec = xacc[i];
- uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
- uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
+ uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
+ uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
 
  /* xacc[i] ^= xsecret[i]; */
- uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16));
- uint64x2_t data_key = veorq_u64 (data_vec, key_vec);
+ uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
+ uint64x2_t data_key = veorq_u64(data_vec, key_vec);
 
  /* xacc[i] *= XXH_PRIME32_1 */
- uint32x2_t data_key_lo, data_key_hi;
- /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
- * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
- * xacc[i] = UNDEFINED; */
- XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
- { /*
- * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
- *
- * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
- * incorrectly "optimize" this:
- * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
- * shifted = vshll_n_u32(tmp, 32);
- * to this:
- * tmp = "vmulq_u64"(a, b); // no such thing!
- * shifted = vshlq_n_u64(tmp, 32);
- *
- * However, unlike SSE, Clang lacks a 64-bit multiply routine
- * for NEON, and it scalarizes two 64-bit multiplies instead.
- *
- * vmull_u32 has the same timing as vmul_u32, and it avoids
- * this bug completely.
- * See https://bugs.llvm.org/show_bug.cgi?id=39967
- */
- uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
- /* xacc[i] = prod_hi << 32; */
- prod_hi = vshlq_n_u64(prod_hi, 32);
- /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
- xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
- }
+ uint32x2_t data_key_lo = vmovn_u64(data_key);
+ uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+ /*
+ * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
+ *
+ * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
+ * incorrectly "optimize" this:
+ * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
+ * shifted = vshll_n_u32(tmp, 32);
+ * to this:
+ * tmp = "vmulq_u64"(a, b); // no such thing!
+ * shifted = vshlq_n_u64(tmp, 32);
+ *
+ * However, unlike SSE, Clang lacks a 64-bit multiply routine
+ * for NEON, and it scalarizes two 64-bit multiplies instead.
+ *
+ * vmull_u32 has the same timing as vmul_u32, and it avoids
+ * this bug completely.
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967
+ */
+ uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
+ /* xacc[i] = prod_hi << 32; */
+ prod_hi = vshlq_n_u64(prod_hi, 32);
+ /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
+ xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
  }
  }
  }
-
  #endif
 
  #if (XXH_VECTOR == XXH_VSX)
@@ -4544,23 +4765,23 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
  const void* XXH_RESTRICT secret)
  {
  /* presumed aligned */
- unsigned int* const xacc = (unsigned int*) acc;
- xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
- xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
+ xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+ xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */
+ xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */
  xxh_u64x2 const v32 = { 32, 32 };
  size_t i;
  for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
  /* data_vec = xinput[i]; */
- xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
+ xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
  /* key_vec = xsecret[i]; */
- xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
+ xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);
  xxh_u64x2 const data_key = data_vec ^ key_vec;
  /* shuffled = (data_key << 32) | (data_key >> 32); */
  xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
  /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
  xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
  /* acc_vec = xacc[i]; */
- xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
+ xxh_u64x2 acc_vec = xacc[i];
  acc_vec += product;
 
  /* swap high and low halves */
@@ -4569,18 +4790,18 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
  #else
  acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
  #endif
- /* xacc[i] = acc_vec; */
- vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
+ xacc[i] = acc_vec;
  }
  }
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)
 
  XXH_FORCE_INLINE void
  XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
  {
  XXH_ASSERT((((size_t)acc) & 15) == 0);
 
- { xxh_u64x2* const xacc = (xxh_u64x2*) acc;
- const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
+ { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+ const xxh_u8* const xsecret = (const xxh_u8*) secret;
  /* constants */
  xxh_u64x2 const v32 = { 32, 32 };
  xxh_u64x2 const v47 = { 47, 47 };
@@ -4592,7 +4813,7 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
  xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
 
  /* xacc[i] ^= xsecret[i]; */
- xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
+ xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);
  xxh_u64x2 const data_key = data_vec ^ key_vec;
 
  /* xacc[i] *= XXH_PRIME32_1 */
@@ -4606,8 +4827,148 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
 
  #endif
 
+ #if (XXH_VECTOR == XXH_SVE)
+
+ XXH_FORCE_INLINE void
+ XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
+ const void* XXH_RESTRICT input,
+ const void* XXH_RESTRICT secret)
+ {
+ uint64_t *xacc = (uint64_t *)acc;
+ const uint64_t *xinput = (const uint64_t *)(const void *)input;
+ const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+ svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+ uint64_t element_count = svcntd();
+ if (element_count >= 8) {
+ svbool_t mask = svptrue_pat_b64(SV_VL8);
+ svuint64_t vacc = svld1_u64(mask, xacc);
+ ACCRND(vacc, 0);
+ svst1_u64(mask, xacc, vacc);
+ } else if (element_count == 2) { /* sve128 */
+ svbool_t mask = svptrue_pat_b64(SV_VL2);
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+ svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+ svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+ svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+ ACCRND(acc0, 0);
+ ACCRND(acc1, 2);
+ ACCRND(acc2, 4);
+ ACCRND(acc3, 6);
+ svst1_u64(mask, xacc + 0, acc0);
+ svst1_u64(mask, xacc + 2, acc1);
+ svst1_u64(mask, xacc + 4, acc2);
+ svst1_u64(mask, xacc + 6, acc3);
+ } else {
+ svbool_t mask = svptrue_pat_b64(SV_VL4);
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+ svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+ ACCRND(acc0, 0);
+ ACCRND(acc1, 4);
+ svst1_u64(mask, xacc + 0, acc0);
+ svst1_u64(mask, xacc + 4, acc1);
+ }
+ }
+
+ XXH_FORCE_INLINE void
+ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
+ const xxh_u8* XXH_RESTRICT input,
+ const xxh_u8* XXH_RESTRICT secret,
+ size_t nbStripes)
+ {
+ if (nbStripes != 0) {
+ uint64_t *xacc = (uint64_t *)acc;
+ const uint64_t *xinput = (const uint64_t *)(const void *)input;
+ const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+ svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+ uint64_t element_count = svcntd();
+ if (element_count >= 8) {
+ svbool_t mask = svptrue_pat_b64(SV_VL8);
+ svuint64_t vacc = svld1_u64(mask, xacc + 0);
+ do {
+ /* svprfd(svbool_t, void *, enum svfprop); */
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
+ ACCRND(vacc, 0);
+ xinput += 8;
+ xsecret += 1;
+ nbStripes--;
+ } while (nbStripes != 0);
+
+ svst1_u64(mask, xacc + 0, vacc);
+ } else if (element_count == 2) { /* sve128 */
+ svbool_t mask = svptrue_pat_b64(SV_VL2);
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+ svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+ svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+ svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+ do {
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
+ ACCRND(acc0, 0);
+ ACCRND(acc1, 2);
+ ACCRND(acc2, 4);
+ ACCRND(acc3, 6);
+ xinput += 8;
+ xsecret += 1;
+ nbStripes--;
+ } while (nbStripes != 0);
+
+ svst1_u64(mask, xacc + 0, acc0);
+ svst1_u64(mask, xacc + 2, acc1);
+ svst1_u64(mask, xacc + 4, acc2);
+ svst1_u64(mask, xacc + 6, acc3);
+ } else {
+ svbool_t mask = svptrue_pat_b64(SV_VL4);
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+ svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+ do {
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
+ ACCRND(acc0, 0);
+ ACCRND(acc1, 4);
+ xinput += 8;
+ xsecret += 1;
+ nbStripes--;
+ } while (nbStripes != 0);
+
+ svst1_u64(mask, xacc + 0, acc0);
+ svst1_u64(mask, xacc + 4, acc1);
+ }
+ }
+ }
+
+ #endif
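The new SVE kernels pick their blocking from the runtime vector length: `svcntd()` reports how many 64-bit elements fit in one SVE register, so a 512-bit implementation holds all 8 accumulator lanes in a single register, 128-bit SVE needs four, and anything in between uses two. (`ACCRND` is defined earlier in the file and is not shown in this diff.) A minimal sketch of just that dispatch decision, for illustration only:

#include <arm_sve.h>  /* needs a compiler targeting SVE, e.g. -march=armv8-a+sve */

/* Returns how many SVE registers the code above would use for the
 * 8-lane accumulator, based on the runtime vector length. */
static int sve_acc_registers(void)
{
    uint64_t const element_count = svcntd(); /* 64-bit lanes per register */
    if (element_count >= 8) return 1;  /* >= 512-bit vectors */
    if (element_count == 2) return 4;  /* 128-bit vectors ("sve128") */
    return 2;                          /* 256-bit vectors */
}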
+
  /* scalar variants - universal */
 
+ #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
+ /*
+ * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
+ * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
+ *
+ * While this might not seem like much, as AArch64 is a 64-bit architecture, only
+ * big Cortex designs have a full 64-bit multiplier.
+ *
+ * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
+ * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
+ * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
+ *
+ * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
+ * not have this penalty and does the mask automatically.
+ */
+ XXH_FORCE_INLINE xxh_u64
+ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+ {
+ xxh_u64 ret;
+ /* note: %x = 64-bit register, %w = 32-bit register */
+ __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
+ return ret;
+ }
+ #else
+ XXH_FORCE_INLINE xxh_u64
+ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+ {
+ return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
+ }
+ #endif
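Both branches above compute the same value: a 32x32-to-64-bit multiply of the low words plus a 64-bit accumulator, which AArch64 expresses in a single UMADDL. A portable restatement with a tiny self-check (illustrative only):

#include <assert.h>
#include <stdint.h>

/* Reference semantics of XXH_mult32to64_add64: only the low 32 bits of
 * lhs and rhs participate in the multiply. */
static uint64_t mult32to64_add64_ref(uint64_t lhs, uint64_t rhs, uint64_t acc)
{
    return (uint64_t)(uint32_t)lhs * (uint64_t)(uint32_t)rhs + acc;
}

int main(void)
{
    /* high bits of lhs must be ignored: the low word of 0x100000003 is 3 */
    assert(mult32to64_add64_ref(0x100000003ULL, 5, 7) == 3 * 5 + 7);
    return 0;
}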
+
  /*!
  * @internal
  * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
@@ -4630,7 +4991,7 @@ XXH3_scalarRound(void* XXH_RESTRICT acc,
  xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
  xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
  xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
- xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
+ xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
  }
  }
 
@@ -4655,6 +5016,7 @@ XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
  XXH3_scalarRound(acc, input, secret, i);
  }
  }
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)
 
  /*!
  * @internal
@@ -4706,10 +5068,10 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
  const xxh_u8* kSecretPtr = XXH3_kSecret;
  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
 
- #if defined(__clang__) && defined(__aarch64__)
+ #if defined(__GNUC__) && defined(__aarch64__)
  /*
  * UGLY HACK:
- * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
+ * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
  * placed sequentially, in order, at the top of the unrolled loop.
  *
  * While MOVK is great for generating constants (2 cycles for a 64-bit
@@ -4724,7 +5086,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
  * ADD
  * SUB STR
  * STR
- * By forcing loads from memory (as the asm line causes Clang to assume
+ * By forcing loads from memory (as the asm line causes the compiler to assume
  * that XXH3_kSecretPtr has been changed), the pipelines are used more
  * efficiently:
  * I L S
@@ -4741,17 +5103,11 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
  */
  XXH_COMPILER_GUARD(kSecretPtr);
  #endif
- /*
- * Note: in debug mode, this overrides the asm optimization
- * and Clang will emit MOVK chains again.
- */
- XXH_ASSERT(kSecretPtr == XXH3_kSecret);
-
  { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
  int i;
  for (i=0; i < nbRounds; i++) {
  /*
- * The asm hack causes Clang to assume that kSecretPtr aliases with
+ * The asm hack causes the compiler to assume that kSecretPtr aliases with
  * customSecret, and on aarch64, this prevented LDP from merging two
  * loads together for free. Putting the loads together before the stores
  * properly generates LDP.
@@ -4764,7 +5120,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
  }
 
 
- typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
+ typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
  typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
  typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
 
@@ -4772,36 +5128,48 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
  #if (XXH_VECTOR == XXH_AVX512)
 
  #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+ #define XXH3_accumulate XXH3_accumulate_avx512
  #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
  #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
 
  #elif (XXH_VECTOR == XXH_AVX2)
 
  #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+ #define XXH3_accumulate XXH3_accumulate_avx2
  #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
  #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
 
  #elif (XXH_VECTOR == XXH_SSE2)
 
  #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+ #define XXH3_accumulate XXH3_accumulate_sse2
  #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
  #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
 
  #elif (XXH_VECTOR == XXH_NEON)
 
  #define XXH3_accumulate_512 XXH3_accumulate_512_neon
+ #define XXH3_accumulate XXH3_accumulate_neon
  #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
  #elif (XXH_VECTOR == XXH_VSX)
 
  #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+ #define XXH3_accumulate XXH3_accumulate_vsx
  #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
+ #elif (XXH_VECTOR == XXH_SVE)
+ #define XXH3_accumulate_512 XXH3_accumulate_512_sve
+ #define XXH3_accumulate XXH3_accumulate_sve
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
  #else /* scalar */
 
  #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+ #define XXH3_accumulate XXH3_accumulate_scalar
  #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
 
@@ -4812,45 +5180,11 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
  # define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
  #endif
 
- #ifndef XXH_PREFETCH_DIST
- # ifdef __clang__
- # define XXH_PREFETCH_DIST 320
- # else
- # if (XXH_VECTOR == XXH_AVX512)
- # define XXH_PREFETCH_DIST 512
- # else
- # define XXH_PREFETCH_DIST 384
- # endif
- # endif /* __clang__ */
- #endif /* XXH_PREFETCH_DIST */
-
- /*
- * XXH3_accumulate()
- * Loops over XXH3_accumulate_512().
- * Assumption: nbStripes will not overflow the secret size
- */
- XXH_FORCE_INLINE void
- XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,
- const xxh_u8* XXH_RESTRICT input,
- const xxh_u8* XXH_RESTRICT secret,
- size_t nbStripes,
- XXH3_f_accumulate_512 f_acc512)
- {
- size_t n;
- for (n = 0; n < nbStripes; n++ ) {
- const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
- XXH_PREFETCH(in + XXH_PREFETCH_DIST);
- f_acc512(acc,
- in,
- secret + n*XXH_SECRET_CONSUME_RATE);
- }
- }
-
5183
  XXH_FORCE_INLINE void
4850
5184
  XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4851
5185
  const xxh_u8* XXH_RESTRICT input, size_t len,
4852
5186
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4853
- XXH3_f_accumulate_512 f_acc512,
5187
+ XXH3_f_accumulate f_acc,
4854
5188
  XXH3_f_scrambleAcc f_scramble)
4855
5189
  {
4856
5190
  size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
@@ -4862,7 +5196,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4862
5196
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4863
5197
 
4864
5198
  for (n = 0; n < nb_blocks; n++) {
4865
- XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
5199
+ f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
4866
5200
  f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
4867
5201
  }
4868
5202
 
@@ -4870,12 +5204,12 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4870
5204
  XXH_ASSERT(len > XXH_STRIPE_LEN);
4871
5205
  { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
4872
5206
  XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
4873
- XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
5207
+ f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
4874
5208
 
4875
5209
  /* last stripe */
4876
5210
  { const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
4877
5211
  #define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */
4878
- f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
5212
+ XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
4879
5213
  } }
4880
5214
  }
4881
5215
 
@@ -4920,12 +5254,12 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre
4920
5254
  XXH_FORCE_INLINE XXH64_hash_t
4921
5255
  XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
4922
5256
  const void* XXH_RESTRICT secret, size_t secretSize,
4923
- XXH3_f_accumulate_512 f_acc512,
5257
+ XXH3_f_accumulate f_acc,
4924
5258
  XXH3_f_scrambleAcc f_scramble)
4925
5259
  {
4926
5260
  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
4927
5261
 
4928
- XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
5262
+ XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
4929
5263
 
4930
5264
  /* converge into final hash */
4931
5265
  XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -4939,13 +5273,15 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
4939
5273
  * It's important for performance to transmit secret's size (when it's static)
4940
5274
  * so that the compiler can properly optimize the vectorized loop.
4941
5275
  * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
5276
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
5277
+ * breaks -Og, this is XXH_NO_INLINE.
4942
5278
  */
4943
- XXH_FORCE_INLINE XXH64_hash_t
5279
+ XXH3_WITH_SECRET_INLINE XXH64_hash_t
4944
5280
  XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
4945
5281
  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4946
5282
  {
4947
5283
  (void)seed64;
4948
- return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
5284
+ return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
4949
5285
  }
4950
5286
 
4951
5287
  /*
@@ -4959,7 +5295,7 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
4959
5295
  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4960
5296
  {
4961
5297
  (void)seed64; (void)secret; (void)secretLen;
4962
- return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
5298
+ return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
4963
5299
  }
4964
5300
 
4965
5301
  /*
@@ -4976,7 +5312,7 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
4976
5312
  XXH_FORCE_INLINE XXH64_hash_t
4977
5313
  XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
4978
5314
  XXH64_hash_t seed,
4979
- XXH3_f_accumulate_512 f_acc512,
5315
+ XXH3_f_accumulate f_acc,
4980
5316
  XXH3_f_scrambleAcc f_scramble,
4981
5317
  XXH3_f_initCustomSecret f_initSec)
4982
5318
  {
@@ -4984,12 +5320,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
4984
5320
  if (seed == 0)
4985
5321
  return XXH3_hashLong_64b_internal(input, len,
4986
5322
  XXH3_kSecret, sizeof(XXH3_kSecret),
4987
- f_acc512, f_scramble);
5323
+ f_acc, f_scramble);
4988
5324
  #endif
4989
5325
  { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
4990
5326
  f_initSec(secret, seed);
4991
5327
  return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
4992
- f_acc512, f_scramble);
5328
+ f_acc, f_scramble);
4993
5329
  }
4994
5330
  }
4995
5331
 
@@ -4997,12 +5333,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
4997
5333
  * It's important for performance that XXH3_hashLong is not inlined.
4998
5334
  */
4999
5335
  XXH_NO_INLINE XXH64_hash_t
5000
- XXH3_hashLong_64b_withSeed(const void* input, size_t len,
5001
- XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
5336
+ XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
5337
+ XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
5002
5338
  {
5003
5339
  (void)secret; (void)secretLen;
5004
5340
  return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
5005
- XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
5341
+ XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
5006
5342
  }
5007
5343
 
5008
5344
 
@@ -5035,27 +5371,27 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
5035
5371
  /* === Public entry point === */
5036
5372
 
5037
5373
  /*! @ingroup XXH3_family */
5038
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t length)
5374
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
5039
5375
  {
5040
5376
  return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
5041
5377
  }
5042
5378
 
5043
5379
  /*! @ingroup XXH3_family */
5044
5380
  XXH_PUBLIC_API XXH64_hash_t
5045
- XXH3_64bits_withSecret(const void* input, size_t length, const void* secret, size_t secretSize)
5381
+ XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
5046
5382
  {
5047
5383
  return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
5048
5384
  }
5049
5385
 
5050
5386
  /*! @ingroup XXH3_family */
5051
5387
  XXH_PUBLIC_API XXH64_hash_t
5052
- XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed)
5388
+ XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
5053
5389
  {
5054
5390
  return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
5055
5391
  }
5056
5392
 
5057
5393
  XXH_PUBLIC_API XXH64_hash_t
5058
- XXH3_64bits_withSecretandSeed(const void* input, size_t length, const void* secret, size_t secretSize, XXH64_hash_t seed)
5394
+ XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
5059
5395
  {
5060
5396
  if (length <= XXH3_MIDSIZE_MAX)
5061
5397
  return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
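A minimal caller's view of the one-shot entry points touched above (a sketch; it assumes xxhash.h is on the include path and the library is linked, or XXH_INLINE_ALL is defined before the include):

#include <stdio.h>
#include <string.h>
#include "xxhash.h"

int main(void)
{
    const char msg[] = "an example buffer";
    XXH64_hash_t const h_plain  = XXH3_64bits(msg, strlen(msg));
    XXH64_hash_t const h_seeded = XXH3_64bits_withSeed(msg, strlen(msg), 0xC0FFEEULL);
    printf("%016llx\n%016llx\n",
           (unsigned long long)h_plain, (unsigned long long)h_seeded);
    return 0;
}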
@@ -5148,7 +5484,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
 
  /*! @ingroup XXH3_family */
  XXH_PUBLIC_API void
- XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
+ XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
  {
  XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
  }
@@ -5182,7 +5518,7 @@ XXH3_reset_internal(XXH3_state_t* statePtr,
 
  /*! @ingroup XXH3_family */
  XXH_PUBLIC_API XXH_errorcode
- XXH3_64bits_reset(XXH3_state_t* statePtr)
+ XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
  {
  if (statePtr == NULL) return XXH_ERROR;
  XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
@@ -5191,7 +5527,7 @@ XXH3_64bits_reset(XXH3_state_t* statePtr)
  return XXH_OK;
  }
 
  /*! @ingroup XXH3_family */
  XXH_PUBLIC_API XXH_errorcode
- XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+ XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
  {
  if (statePtr == NULL) return XXH_ERROR;
  XXH3_reset_internal(statePtr, 0, secret, secretSize);
@@ -5202,7 +5538,7 @@ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t
 
  /*! @ingroup XXH3_family */
  XXH_PUBLIC_API XXH_errorcode
- XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+ XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
  {
  if (statePtr == NULL) return XXH_ERROR;
  if (seed==0) return XXH3_64bits_reset(statePtr);
@@ -5214,7 +5550,7 @@ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
 
  /*! @ingroup XXH3_family */
  XXH_PUBLIC_API XXH_errorcode
- XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
+ XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
  {
  if (statePtr == NULL) return XXH_ERROR;
  if (secret == NULL) return XXH_ERROR;
@@ -5224,31 +5560,57 @@ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret,
  return XXH_OK;
  }
 
- /* Note : when XXH3_consumeStripes() is invoked,
- * there must be a guarantee that at least one more byte must be consumed from input
- * so that the function can blindly consume all stripes using the "normal" secret segment */
- XXH_FORCE_INLINE void
+ /*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * @param acc Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block
+ * @param nbStripesPerBlock Number of stripes in a block
+ * @param input Input pointer
+ * @param nbStripes Number of stripes to process
+ * @param secret Secret pointer
+ * @param secretLimit Offset of the last block in @p secret
+ * @param f_acc Pointer to an XXH3_accumulate implementation
+ * @param f_scramble Pointer to an XXH3_scrambleAcc implementation
+ * @return Pointer past the end of @p input after processing
+ */
+ XXH_FORCE_INLINE const xxh_u8 *
  XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
  size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
  const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
  const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
- XXH3_f_accumulate_512 f_acc512,
+ XXH3_f_accumulate f_acc,
  XXH3_f_scrambleAcc f_scramble)
  {
- XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */
- XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
- if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
- /* need a scrambling operation */
- size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
- size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
- XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
- f_scramble(acc, secret + secretLimit);
- XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
- *nbStripesSoFarPtr = nbStripesAfterBlock;
- } else {
- XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
+ const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+ /* Process full blocks */
+ if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+ /* Process the initial partial block... */
+ size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+ do {
+ /* Accumulate and scramble */
+ f_acc(acc, input, initialSecret, nbStripesThisIter);
+ f_scramble(acc, secret + secretLimit);
+ input += nbStripesThisIter * XXH_STRIPE_LEN;
+ nbStripes -= nbStripesThisIter;
+ /* Then continue the loop with the full block size */
+ nbStripesThisIter = nbStripesPerBlock;
+ initialSecret = secret;
+ } while (nbStripes >= nbStripesPerBlock);
+ *nbStripesSoFarPtr = 0;
+ }
+ /* Process a partial block */
+ if (nbStripes > 0) {
+ f_acc(acc, input, initialSecret, nbStripes);
+ input += nbStripes * XXH_STRIPE_LEN;
  *nbStripesSoFarPtr += nbStripes;
  }
+ /* Return end pointer */
+ return input;
  }
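The rewrite above turns the old "at most one scramble per call" contract into a do/while that can cross any number of block boundaries and returns the advanced input pointer. A scalar model of just the stripe bookkeeping, with made-up numbers as a self-check (illustration only):

#include <assert.h>
#include <stddef.h>

/* Models the counters of the new XXH3_consumeStripes(): returns the
 * leftover stripes in the current block and counts scramble rounds. */
static size_t consume_model(size_t nbStripesSoFar, size_t nbStripesPerBlock,
                            size_t nbStripes, size_t* scrambles)
{
    *scrambles = 0;
    if (nbStripes >= nbStripesPerBlock - nbStripesSoFar) {
        size_t thisIter = nbStripesPerBlock - nbStripesSoFar;
        do {
            nbStripes -= thisIter;
            (*scrambles)++;
            thisIter = nbStripesPerBlock;
        } while (nbStripes >= nbStripesPerBlock);
        nbStripesSoFar = 0;
    }
    return nbStripesSoFar + nbStripes;
}

int main(void)
{
    size_t scrambles;
    /* 10 stripes into a 16-stripe block, then 40 more stripes arrive:
     * 6 finish the block, 16 fill the next, 16 the next (3 scrambles),
     * and 2 are left over in the new current block. */
    assert(consume_model(10, 16, 40, &scrambles) == 2 && scrambles == 3);
    return 0;
}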
 
  #ifndef XXH3_STREAM_USE_STACK
@@ -5262,7 +5624,7 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
  XXH_FORCE_INLINE XXH_errorcode
  XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
  const xxh_u8* XXH_RESTRICT input, size_t len,
- XXH3_f_accumulate_512 f_acc512,
+ XXH3_f_accumulate f_acc,
  XXH3_f_scrambleAcc f_scramble)
  {
  if (input==NULL) {
@@ -5278,7 +5640,8 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
  * when operating accumulators directly into state.
  * Operating into stack space seems to enable proper optimization.
  * clang, on the other hand, doesn't seem to need this trick */
- XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+ XXH_memcpy(acc, state->acc, sizeof(acc));
  #else
  xxh_u64* XXH_RESTRICT const acc = state->acc;
  #endif
@@ -5286,7 +5649,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
  XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
 
  /* small input : just fill in tmp buffer */
- if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
+ if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
  XXH_memcpy(state->buffer + state->bufferedSize, input, len);
  state->bufferedSize += (XXH32_hash_t)len;
  return XXH_OK;
@@ -5308,57 +5671,20 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
  &state->nbStripesSoFar, state->nbStripesPerBlock,
  state->buffer, XXH3_INTERNALBUFFER_STRIPES,
  secret, state->secretLimit,
- f_acc512, f_scramble);
+ f_acc, f_scramble);
  state->bufferedSize = 0;
  }
  XXH_ASSERT(input < bEnd);
-
- /* large input to consume : ingest per full block */
- if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
+ if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
  size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
- XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
- /* join to current block's end */
- { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
- XXH_ASSERT(nbStripesToEnd <= nbStripes);
- XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
- f_scramble(acc, secret + state->secretLimit);
- state->nbStripesSoFar = 0;
- input += nbStripesToEnd * XXH_STRIPE_LEN;
- nbStripes -= nbStripesToEnd;
- }
- /* consume per entire blocks */
- while(nbStripes >= state->nbStripesPerBlock) {
- XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
- f_scramble(acc, secret + state->secretLimit);
- input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
- nbStripes -= state->nbStripesPerBlock;
- }
- /* consume last partial block */
- XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
- input += nbStripes * XXH_STRIPE_LEN;
- XXH_ASSERT(input < bEnd); /* at least some bytes left */
- state->nbStripesSoFar = nbStripes;
- /* buffer predecessor of last partial stripe */
- XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
- XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
- } else {
- /* content to consume <= block size */
- /* Consume input by a multiple of internal buffer size */
- if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
- const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
- do {
- XXH3_consumeStripes(acc,
+ input = XXH3_consumeStripes(acc,
  &state->nbStripesSoFar, state->nbStripesPerBlock,
- input, XXH3_INTERNALBUFFER_STRIPES,
- secret, state->secretLimit,
- f_acc512, f_scramble);
- input += XXH3_INTERNALBUFFER_SIZE;
- } while (input<limit);
- /* buffer predecessor of last partial stripe */
- XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
- }
- }
+ input, nbStripes,
+ secret, state->secretLimit,
+ f_acc, f_scramble);
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
 
+ }
  /* Some remaining input (always) : buffer it */
  XXH_ASSERT(input < bEnd);
  XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
@@ -5367,7 +5693,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
  state->bufferedSize = (XXH32_hash_t)(bEnd-input);
  #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
  /* save stack accumulators into state */
- memcpy(state->acc, acc, sizeof(acc));
+ XXH_memcpy(state->acc, acc, sizeof(acc));
  #endif
  }
@@ -5376,10 +5702,10 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
 
  /*! @ingroup XXH3_family */
  XXH_PUBLIC_API XXH_errorcode
- XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
+ XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
  {
  return XXH3_update(state, (const xxh_u8*)input, len,
- XXH3_accumulate_512, XXH3_scrambleAcc);
+ XXH3_accumulate, XXH3_scrambleAcc);
  }
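A minimal streaming sketch built on the state APIs that appear in this diff (XXH3_createState/XXH3_freeState, the reset family, update, and digest); illustration only:

#include <stdio.h>
#include "xxhash.h"

int main(void)
{
    XXH3_state_t* const state = XXH3_createState();
    if (state == NULL) return 1;
    if (XXH3_64bits_reset(state) == XXH_ERROR) { XXH3_freeState(state); return 1; }
    XXH3_64bits_update(state, "hello ", 6);
    XXH3_64bits_update(state, "world", 5);
    /* digest() works on a local copy, so the state can keep ingesting */
    printf("%016llx\n", (unsigned long long)XXH3_64bits_digest(state));
    XXH3_freeState(state);
    return 0;
}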
 
 
@@ -5388,37 +5714,40 @@ XXH3_digest_long (XXH64_hash_t* acc,
  const XXH3_state_t* state,
  const unsigned char* secret)
  {
+ xxh_u8 lastStripe[XXH_STRIPE_LEN];
+ const xxh_u8* lastStripePtr;
+
  /*
  * Digest on a local copy. This way, the state remains unaltered, and it can
  * continue ingesting more input afterwards.
  */
  XXH_memcpy(acc, state->acc, sizeof(state->acc));
  if (state->bufferedSize >= XXH_STRIPE_LEN) {
+ /* Consume remaining stripes then point to remaining data in buffer */
  size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
  size_t nbStripesSoFar = state->nbStripesSoFar;
  XXH3_consumeStripes(acc,
  &nbStripesSoFar, state->nbStripesPerBlock,
  state->buffer, nbStripes,
  secret, state->secretLimit,
- XXH3_accumulate_512, XXH3_scrambleAcc);
- /* last stripe */
- XXH3_accumulate_512(acc,
- state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
- secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+ XXH3_accumulate, XXH3_scrambleAcc);
+ lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
  } else { /* bufferedSize < XXH_STRIPE_LEN */
- xxh_u8 lastStripe[XXH_STRIPE_LEN];
+ /* Copy to temp buffer */
  size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
  XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
  XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
  XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
- XXH3_accumulate_512(acc,
- lastStripe,
- secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+ lastStripePtr = lastStripe;
  }
+ /* Last stripe */
+ XXH3_accumulate_512(acc,
+ lastStripePtr,
+ secret + state->secretLimit - XXH_SECRET_LASTACC_START);
  }
 
  /*! @ingroup XXH3_family */
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
  {
  const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
  if (state->totalLen > XXH3_MIDSIZE_MAX) {
@@ -5631,7 +5960,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
  #if XXH_SIZE_OPT >= 1
  {
  /* Smaller, but slightly slower. */
- size_t i = (len - 1) / 32;
+ unsigned int i = (unsigned int)(len - 1) / 32;
  do {
  acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
  } while (i-- != 0);
@@ -5669,25 +5998,34 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
 
  { XXH128_hash_t acc;
- int const nbRounds = (int)len / 32;
- int i;
+ unsigned i;
  acc.low64 = len * XXH_PRIME64_1;
  acc.high64 = 0;
- for (i=0; i<4; i++) {
+ /*
+ * We set `i` to offset + 32. We do this so that the unchanged
+ * `len` can be used as the upper bound. This reaches a sweet spot
+ * where both x86 and aarch64 get simple address generation and
+ * good codegen for the loop.
+ */
+ for (i = 32; i < 160; i += 32) {
  acc = XXH128_mix32B(acc,
- input + (32 * i),
- input + (32 * i) + 16,
- secret + (32 * i),
+ input + i - 32,
+ input + i - 16,
+ secret + i - 32,
  seed);
  }
  acc.low64 = XXH3_avalanche(acc.low64);
  acc.high64 = XXH3_avalanche(acc.high64);
- XXH_ASSERT(nbRounds >= 4);
- for (i=4 ; i < nbRounds; i++) {
+ /*
+ * NB: `i <= len` will duplicate the last 32 bytes if
+ * len % 32 is zero. This is an unfortunate necessity to keep
+ * the hash result stable.
+ */
+ for (i=160; i <= len; i += 32) {
  acc = XXH128_mix32B(acc,
- input + (32 * i),
- input + (32 * i) + 16,
- secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
+ input + i - 32,
+ input + i - 16,
+ secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
  seed);
  }
  /* last bytes */
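The re-indexing above changes only how offsets are written, not which bytes are read: the old loop ran i = 0..3 with offsets 32*i, the new one runs i = 32,64,96,128 and subtracts. A tiny equivalence check (illustration only):

#include <assert.h>

int main(void)
{
    int old_i, new_i;
    for (old_i = 0, new_i = 32; new_i < 160; old_i++, new_i += 32) {
        assert(32 * old_i == new_i - 32);       /* input/secret offset */
        assert(32 * old_i + 16 == new_i - 16);  /* second 16-byte half */
    }
    assert(old_i == 4);  /* both forms perform four rounds */
    return 0;
}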
@@ -5695,7 +6033,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
  input + len - 16,
  input + len - 32,
  secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
- 0ULL - seed);
+ (XXH64_hash_t)0 - seed);
 
  { XXH128_hash_t h128;
  h128.low64 = acc.low64 + acc.high64;
@@ -5712,12 +6050,12 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
  XXH_FORCE_INLINE XXH128_hash_t
  XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
- XXH3_f_accumulate_512 f_acc512,
+ XXH3_f_accumulate f_acc,
  XXH3_f_scrambleAcc f_scramble)
  {
  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
 
- XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
+ XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
 
  /* converge into final hash */
  XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -5744,38 +6082,41 @@ XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
  {
  (void)seed64; (void)secret; (void)secretLen;
  return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
- XXH3_accumulate_512, XXH3_scrambleAcc);
+ XXH3_accumulate, XXH3_scrambleAcc);
  }
 
  /*
  * It's important for performance to pass @p secretLen (when it's static)
  * to the compiler, so that it can properly optimize the vectorized loop.
+ *
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
  */
- XXH_FORCE_INLINE XXH128_hash_t
+ XXH3_WITH_SECRET_INLINE XXH128_hash_t
  XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
  XXH64_hash_t seed64,
  const void* XXH_RESTRICT secret, size_t secretLen)
  {
  (void)seed64;
  return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
- XXH3_accumulate_512, XXH3_scrambleAcc);
+ XXH3_accumulate, XXH3_scrambleAcc);
  }
 
  XXH_FORCE_INLINE XXH128_hash_t
  XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
  XXH64_hash_t seed64,
- XXH3_f_accumulate_512 f_acc512,
+ XXH3_f_accumulate f_acc,
  XXH3_f_scrambleAcc f_scramble,
  XXH3_f_initCustomSecret f_initSec)
  {
  if (seed64 == 0)
  return XXH3_hashLong_128b_internal(input, len,
  XXH3_kSecret, sizeof(XXH3_kSecret),
- f_acc512, f_scramble);
+ f_acc, f_scramble);
  { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
  f_initSec(secret, seed64);
  return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
- f_acc512, f_scramble);
+ f_acc, f_scramble);
  }
  }
 
@@ -5788,7 +6129,7 @@ XXH3_hashLong_128b_withSeed(const void* input, size_t len,
  {
  (void)secret; (void)secretLen;
  return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
- XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
+ XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
  }
 
  typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
@@ -5819,7 +6160,7 @@ XXH3_128bits_internal(const void* input, size_t len,
5819
6160
  /* === Public XXH128 API === */
5820
6161
 
5821
6162
  /*! @ingroup XXH3_family */
5822
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
6163
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
5823
6164
  {
5824
6165
  return XXH3_128bits_internal(input, len, 0,
5825
6166
  XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -5828,7 +6169,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
5828
6169
 
5829
6170
  /*! @ingroup XXH3_family */
5830
6171
  XXH_PUBLIC_API XXH128_hash_t
5831
- XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
6172
+ XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
5832
6173
  {
5833
6174
  return XXH3_128bits_internal(input, len, 0,
5834
6175
  (const xxh_u8*)secret, secretSize,
@@ -5837,7 +6178,7 @@ XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_
5837
6178
 
5838
6179
  /*! @ingroup XXH3_family */
5839
6180
  XXH_PUBLIC_API XXH128_hash_t
5840
- XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
6181
+ XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
5841
6182
  {
5842
6183
  return XXH3_128bits_internal(input, len, seed,
5843
6184
  XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -5846,7 +6187,7 @@ XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
5846
6187
 
5847
6188
  /*! @ingroup XXH3_family */
5848
6189
  XXH_PUBLIC_API XXH128_hash_t
5849
- XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
6190
+ XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
5850
6191
  {
5851
6192
  if (len <= XXH3_MIDSIZE_MAX)
5852
6193
  return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
@@ -5855,7 +6196,7 @@ XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret
5855
6196
 
5856
6197
  /*! @ingroup XXH3_family */
5857
6198
  XXH_PUBLIC_API XXH128_hash_t
5858
- XXH128(const void* input, size_t len, XXH64_hash_t seed)
6199
+ XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
5859
6200
  {
5860
6201
  return XXH3_128bits_withSeed(input, len, seed);
5861
6202
  }
@@ -5870,42 +6211,41 @@ XXH128(const void* input, size_t len, XXH64_hash_t seed)

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
- XXH3_128bits_reset(XXH3_state_t* statePtr)
+ XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
 {
     return XXH3_64bits_reset(statePtr);
 }

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
- XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
+ XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
 {
     return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
 }

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
- XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
+ XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
 {
     return XXH3_64bits_reset_withSeed(statePtr, seed);
 }

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
- XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
+ XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
 {
     return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
 }

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
- XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
+ XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
 {
-     return XXH3_update(state, (const xxh_u8*)input, len,
-                        XXH3_accumulate_512, XXH3_scrambleAcc);
+     return XXH3_64bits_update(state, input, len);
 }

 /*! @ingroup XXH3_family */
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
 {
     const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
     if (state->totalLen > XXH3_MIDSIZE_MAX) {
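
Since every 128-bit reset and update above simply forwards to its 64-bit counterpart (the two variants share one XXH3_state_t), a streaming caller looks like the following sketch (not part of the diff; error handling abbreviated):

    /* Streaming: feed data in chunks, then take a non-destructive digest. */
    #include <stdio.h>
    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        XXH3_state_t* const state = XXH3_createState();
        if (state == NULL) return 1;
        if (XXH3_128bits_reset(state) == XXH_ERROR) { XXH3_freeState(state); return 1; }

        const char* const chunks[] = { "hello ", "world" };
        for (size_t i = 0; i < 2; i++) {
            if (XXH3_128bits_update(state, chunks[i], strlen(chunks[i])) == XXH_ERROR) {
                XXH3_freeState(state);
                return 1;
            }
        }
        XXH128_hash_t const h = XXH3_128bits_digest(state); /* state stays usable */
        XXH3_freeState(state);
        printf("%016llx%016llx\n",
               (unsigned long long)h.high64, (unsigned long long)h.low64);
        return 0;
    }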
@@ -5947,7 +6287,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
  *  <0 if *h128_1 < *h128_2
  *  =0 if *h128_1 == *h128_2 */
 /*! @ingroup XXH3_family */
- XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
+ XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
 {
     XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
     XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
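
Because XXH128_cmp keeps the (const void*, const void*) -> <0/=0/>0 shape described in the comment above, it can be passed to qsort(3) as-is; the comparison takes high64 first, then low64. A short sketch (sort_hashes is an illustrative helper, not a library function):

    #include <stdlib.h>
    #include "xxhash.h"

    /* Sort an array of 128-bit hashes into the comparator's total order. */
    static void sort_hashes(XXH128_hash_t* hashes, size_t count)
    {
        qsort(hashes, count, sizeof(XXH128_hash_t), XXH128_cmp);
    }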
@@ -5961,7 +6301,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
 /*====== Canonical representation ======*/
 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API void
- XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
+ XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
 {
     XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
     if (XXH_CPU_LITTLE_ENDIAN) {
@@ -5974,7 +6314,7 @@ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH128_hash_t
- XXH128_hashFromCanonical(const XXH128_canonical_t* src)
+ XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
 {
     XXH128_hash_t h;
     h.high64 = XXH_readBE64(src);
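
The canonical form is a fixed big-endian byte layout (note the XXH_readBE64 above), so it is safe to store or transmit across platforms; a round-trip sketch, not part of the diff:

    #include <assert.h>
    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        const char msg[] = "payload";
        XXH128_hash_t const h = XXH3_128bits(msg, strlen(msg));

        XXH128_canonical_t canon;          /* 16 raw bytes, endian-independent */
        XXH128_canonicalFromHash(&canon, h);

        XXH128_hash_t const back = XXH128_hashFromCanonical(&canon);
        assert(XXH128_isEqual(h, back));   /* round-trip preserves the value */
        return 0;
    }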
@@ -5998,7 +6338,7 @@ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API XXH_errorcode
- XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
+ XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
 {
 #if (XXH_DEBUGLEVEL >= 1)
     XXH_ASSERT(secretBuffer != NULL);
@@ -6043,7 +6383,7 @@ XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSee

 /*! @ingroup XXH3_family */
 XXH_PUBLIC_API void
- XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
+ XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
 {
     XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
     XXH3_initCustomSecret(secret, seed);
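
A usage sketch for the two secret generators; the buffer sizes are assumptions, based on the public XXH3_SECRET_SIZE_MIN minimum for XXH3_generateSecret and on the 192-byte XXH_SECRET_DEFAULT_SIZE buffer visible above:

    #include <string.h>
    #include "xxhash.h"

    int main(void)
    {
        /* Expand low-entropy seed material into a full custom secret... */
        unsigned char secret[XXH3_SECRET_SIZE_MIN];
        const char seedMaterial[] = "application-specific seed";
        if (XXH3_generateSecret(secret, sizeof(secret),
                                seedMaterial, strlen(seedMaterial)) == XXH_ERROR)
            return 1;

        /* ...or derive a default-size (192-byte) secret from a 64-bit seed. */
        unsigned char secret2[192];
        XXH3_generateSecret_fromSeed(secret2, /* seed */ 42);

        const char msg[] = "data";
        XXH128_hash_t const h = XXH3_128bits_withSecret(msg, strlen(msg),
                                                        secret, sizeof(secret));
        (void)h;
        return 0;
    }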
@@ -6071,5 +6411,5 @@ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)


 #if defined (__cplusplus)
- }
+ } /* extern "C" */
 #endif