digest-xxhash 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -716,8 +716,15 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni
716
716
  # define XXH_HAS_ATTRIBUTE(x) 0
717
717
  #endif
718
718
 
719
+ /*
720
+ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
721
+ * leave as `201711L` (C17 + 1).
722
+ * TODO: Update to the correct value when it has been specified.
723
+ */
724
+ #define XXH_C23_VN 201711L
725
+
719
726
  /* C-language Attributes are added in C23. */
720
- #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
727
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
721
728
  # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
722
729
  #else
723
730
  # define XXH_HAS_C_ATTRIBUTE(x) 0
@@ -743,6 +750,18 @@ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canoni
743
750
  # define XXH_FALLTHROUGH /* fallthrough */
744
751
  #endif
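For illustration (not from the upstream header), assuming a C23 compiler where __has_c_attribute(fallthrough) reports support: the new XXH_C23_VN gate lets XXH_HAS_C_ATTRIBUTE resolve through __has_c_attribute, so XXH_FALLTHROUGH can become the standard attribute instead of the bare comment fallback shown above; `n` and `step()` below are placeholders:

    switch (n & 3) {
    case 3: step(); XXH_FALLTHROUGH;   /* [[fallthrough]] under C23 */
    case 2: step(); XXH_FALLTHROUGH;
    case 1: step(); XXH_FALLTHROUGH;
    default: break;
    }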
745
752
 
753
+ /*
754
+ * Define XXH_NOESCAPE for annotated pointers in public API.
755
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
756
+ * As of writing this, only supported by clang.
757
+ */
758
+ #if XXH_HAS_ATTRIBUTE(noescape)
759
+ # define XXH_NOESCAPE __attribute__((noescape))
760
+ #else
761
+ # define XXH_NOESCAPE
762
+ #endif
763
+
764
+
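For illustration (not part of the upstream header): the annotation tells clang that the annotated pointer will not be retained past the call, which is how the public prototypes are updated later in this diff, e.g.:

    XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
    XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
    /* On compilers without __attribute__((noescape)), XXH_NOESCAPE expands to
     * nothing, so the declaration is unchanged. */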
746
765
  /*!
747
766
  * @}
748
767
  * @ingroup public
@@ -813,7 +832,7 @@ typedef uint64_t XXH64_hash_t;
813
832
  * @see
814
833
  * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
815
834
  */
816
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
835
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
817
836
 
818
837
  /******* Streaming *******/
819
838
  #ifndef XXH_NO_STREAM
@@ -825,16 +844,16 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(const void* input, size_t length, XX
825
844
  typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
826
845
  XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
827
846
  XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
828
- XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
847
+ XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
829
848
 
830
- XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed);
831
- XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
832
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
849
+ XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
850
+ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
851
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
833
852
  #endif /* !XXH_NO_STREAM */
834
853
  /******* Canonical representation *******/
835
854
  typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
836
- XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
837
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
855
+ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
856
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
838
857
 
839
858
  #ifndef XXH_NO_XXH3
840
859
 
@@ -872,7 +891,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canoni
872
891
  *
873
892
  * XXH3 implementation is portable:
874
893
  * it has a generic C90 formulation that can be compiled on any platform,
875
- * all implementations generage exactly the same hash value on all platforms.
894
+ * all implementations generate exactly the same hash value on all platforms.
876
895
  * Starting from v0.8.0, it's also labelled "stable", meaning that
877
896
  * any future version will also generate the same hash value.
878
897
  *
@@ -902,7 +921,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canoni
902
921
  * @see
903
922
  * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
904
923
  */
905
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t length);
924
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
906
925
 
907
926
  /*!
908
927
  * @brief 64-bit seeded variant of XXH3
@@ -919,7 +938,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(const void* input, size_t leng
919
938
  * @param length The length
920
939
  * @param seed The 64-bit seed to alter the state.
921
940
  */
922
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed);
941
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
923
942
 
924
943
  /*!
925
944
  * The bare minimum size for a custom secret.
@@ -948,7 +967,7 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(const void* input, si
948
967
  * This is not necessarily the case when using the blob of bytes directly
949
968
  * because, when hashing _small_ inputs, only a portion of the secret is employed.
950
969
  */
951
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
970
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
952
971
 
953
972
 
954
973
  /******* Streaming *******/
@@ -968,20 +987,20 @@ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(const void* data, s
968
987
  typedef struct XXH3_state_s XXH3_state_t;
969
988
  XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
970
989
  XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
971
- XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
990
+ XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
972
991
 
973
992
  /*
974
993
  * XXH3_64bits_reset():
975
994
  * Initialize with default parameters.
976
995
  * digest will be equivalent to `XXH3_64bits()`.
977
996
  */
978
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
997
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
979
998
  /*
980
999
  * XXH3_64bits_reset_withSeed():
981
1000
  * Generate a custom secret from `seed`, and store it into `statePtr`.
982
1001
  * digest will be equivalent to `XXH3_64bits_withSeed()`.
983
1002
  */
984
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
1003
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
985
1004
  /*!
986
1005
  * XXH3_64bits_reset_withSecret():
987
1006
  * `secret` is referenced, it _must outlive_ the hash streaming session.
@@ -991,10 +1010,10 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr,
991
1010
  * When in doubt about the randomness of a candidate `secret`,
992
1011
  * consider employing `XXH3_generateSecret()` instead (see below).
993
1012
  */
994
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
1013
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
995
1014
 
996
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
997
- XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr);
1015
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
1016
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
998
1017
  #endif /* !XXH_NO_STREAM */
999
1018
 
1000
1019
  /* note : canonical representation of XXH3 is the same as XXH64
@@ -1033,11 +1052,11 @@ typedef struct {
1033
1052
  * @see
1034
1053
  * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
1035
1054
  */
1036
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(const void* data, size_t len);
1055
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
1037
1056
  /*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
1038
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
1057
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
1039
1058
  /*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
1040
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
1059
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
1041
1060
 
1042
1061
  /******* Streaming *******/
1043
1062
  #ifndef XXH_NO_STREAM
@@ -1053,12 +1072,12 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(const void* data,
1053
1072
  * All reset and streaming functions have same meaning as their 64-bit counterpart.
1054
1073
  */
1055
1074
 
1056
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
1057
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
1058
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
1075
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
1076
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
1077
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
1059
1078
 
1060
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
1061
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
1079
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
1080
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
1062
1081
  #endif /* !XXH_NO_STREAM */
1063
1082
 
1064
1083
  /* Following helper functions make it possible to compare XXH128_hast_t values.
@@ -1079,13 +1098,13 @@ XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
1079
1098
  * =0 if *h128_1 == *h128_2
1080
1099
  * <0 if *h128_1 < *h128_2
1081
1100
  */
1082
- XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(const void* h128_1, const void* h128_2);
1101
+ XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
1083
1102
 
1084
1103
 
1085
1104
  /******* Canonical representation *******/
1086
1105
  typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
1087
- XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
1088
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
1106
+ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
1107
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
1089
1108
 
1090
1109
 
1091
1110
  #endif /* !XXH_NO_XXH3 */
@@ -1266,13 +1285,18 @@ struct XXH3_state_s {
1266
1285
  * Note that this doesn't prepare the state for a streaming operation,
1267
1286
  * it's still necessary to use XXH3_NNbits_reset*() afterwards.
1268
1287
  */
1269
- #define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
1288
+ #define XXH3_INITSTATE(XXH3_state_ptr) \
1289
+ do { \
1290
+ XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
1291
+ tmp_xxh3_state_ptr->seed = 0; \
1292
+ tmp_xxh3_state_ptr->extSecret = NULL; \
1293
+ } while(0)
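For illustration (not from the upstream sources), the stack-allocation pattern this macro supports, per the surrounding comment that a reset call must still follow:

    XXH3_state_t state;          /* on the stack: contents indeterminate     */
    XXH3_INITSTATE(&state);      /* now clears extSecret as well as the seed */
    (void)XXH3_64bits_reset(&state);
    /* ... stream with XXH3_64bits_update(), finish with XXH3_64bits_digest() */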
1270
1294
 
1271
1295
 
1272
1296
  /*!
1273
1297
  * simple alias to pre-selected XXH3_128bits variant
1274
1298
  */
1275
- XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
1299
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
1276
1300
 
1277
1301
 
1278
1302
  /* === Experimental API === */
@@ -1329,7 +1353,7 @@ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(const void* data, size_t len, XXH6
1329
1353
  * }
1330
1354
  * @endcode
1331
1355
  */
1332
- XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
1356
+ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
1333
1357
 
1334
1358
  /*!
1335
1359
  * @brief Generate the same secret as the _withSeed() variants.
@@ -1368,7 +1392,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secr
1368
1392
  * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
1369
1393
  * @param seed The seed to seed the state.
1370
1394
  */
1371
- XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
1395
+ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
1372
1396
 
1373
1397
  /*!
1374
1398
  * These variants generate hash values using either
@@ -1397,24 +1421,24 @@ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_
1397
1421
  * because only portions of the secret are employed for small data.
1398
1422
  */
1399
1423
  XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
1400
- XXH3_64bits_withSecretandSeed(const void* data, size_t len,
1401
- const void* secret, size_t secretSize,
1424
+ XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
1425
+ XXH_NOESCAPE const void* secret, size_t secretSize,
1402
1426
  XXH64_hash_t seed);
1403
1427
  /*! @copydoc XXH3_64bits_withSecretandSeed() */
1404
1428
  XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
1405
- XXH3_128bits_withSecretandSeed(const void* input, size_t length,
1406
- const void* secret, size_t secretSize,
1429
+ XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
1430
+ XXH_NOESCAPE const void* secret, size_t secretSize,
1407
1431
  XXH64_hash_t seed64);
1408
1432
  #ifndef XXH_NO_STREAM
1409
1433
  /*! @copydoc XXH3_64bits_withSecretandSeed() */
1410
1434
  XXH_PUBLIC_API XXH_errorcode
1411
- XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1412
- const void* secret, size_t secretSize,
1435
+ XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
1436
+ XXH_NOESCAPE const void* secret, size_t secretSize,
1413
1437
  XXH64_hash_t seed64);
1414
1438
  /*! @copydoc XXH3_64bits_withSecretandSeed() */
1415
1439
  XXH_PUBLIC_API XXH_errorcode
1416
- XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1417
- const void* secret, size_t secretSize,
1440
+ XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
1441
+ XXH_NOESCAPE const void* secret, size_t secretSize,
1418
1442
  XXH64_hash_t seed64);
1419
1443
  #endif /* !XXH_NO_STREAM */
1420
1444
 
@@ -1522,7 +1546,7 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1522
1546
  * care, as what works on one compiler/platform/optimization level may cause
1523
1547
  * another to read garbage data or even crash.
1524
1548
  *
1525
- * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
1549
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
1526
1550
  *
1527
1551
  * Prefer these methods in priority order (0 > 3 > 1 > 2)
1528
1552
  */
@@ -1608,6 +1632,23 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1608
1632
  */
1609
1633
  # define XXH_NO_INLINE_HINTS 0
1610
1634
 
1635
+ /*!
1636
+ * @def XXH3_INLINE_SECRET
1637
+ * @brief Determines whether to inline the XXH3 withSecret code.
1638
+ *
1639
+ * When the secret size is known, the compiler can improve the performance
1640
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
1641
+ *
1642
+ * However, if the secret size is not known, it doesn't have any benefit. This
1643
+ * happens when xxHash is compiled into a global symbol. Therefore, if
1644
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
1645
+ *
1646
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
1647
+ * that are *sometimes* force inline on -Og, and it is impossible to automatically
1648
+ * detect this optimization level.
1649
+ */
1650
+ # define XXH3_INLINE_SECRET 0
1651
+
1611
1652
  /*!
1612
1653
  * @def XXH32_ENDJMP
1613
1654
  * @brief Whether to use a jump for `XXH32_finalize`.
@@ -1682,6 +1723,15 @@ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1682
1723
  # endif
1683
1724
  #endif
1684
1725
 
1726
+ #ifndef XXH3_INLINE_SECRET
1727
+ # if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
1728
+ || !defined(XXH_INLINE_ALL)
1729
+ # define XXH3_INLINE_SECRET 0
1730
+ # else
1731
+ # define XXH3_INLINE_SECRET 1
1732
+ # endif
1733
+ #endif
1734
+
1685
1735
  #ifndef XXH32_ENDJMP
1686
1736
  /* generally preferable for performance */
1687
1737
  # define XXH32_ENDJMP 0
@@ -1778,6 +1828,11 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1778
1828
  # define XXH_NO_INLINE static
1779
1829
  #endif
1780
1830
 
1831
+ #if XXH3_INLINE_SECRET
1832
+ # define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
1833
+ #else
1834
+ # define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
1835
+ #endif
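For illustration (the helper name below is hypothetical, not the upstream symbol): this selector is intended for the internal "hash long with secret" paths, so that under XXH_INLINE_ALL a known secretSize can be propagated, while GCC 12's -Og issue and non-inlined builds fall back to an ordinary non-inline function:

    XXH3_WITH_SECRET_INLINE XXH64_hash_t
    hashLong_withSecret(const xxh_u8* input, size_t len,
                        const xxh_u8* secret, size_t secretSize);
    /* XXH3_INLINE_SECRET == 1  ->  XXH_FORCE_INLINE (specializable)
     * XXH3_INLINE_SECRET == 0  ->  XXH_NO_INLINE    (plain static call) */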
1781
1836
 
1782
1837
 
1783
1838
  /* *************************************
@@ -1803,7 +1858,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1803
1858
  # include <assert.h> /* note: can still be disabled with NDEBUG */
1804
1859
  # define XXH_ASSERT(c) assert(c)
1805
1860
  #else
1806
- # define XXH_ASSERT(c) ((void)0)
1861
+ # define XXH_ASSERT(c) XXH_ASSUME(c)
1807
1862
  #endif
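For illustration (not part of the diff): with assertions compiled out, existing checks now become optimizer hints instead of no-ops, e.g. the XXH_ASSERT(statePtr != NULL) in XXH64_reset():

    /* old:  XXH_ASSERT(statePtr != NULL)  ->  ((void)0)
     * new:  XXH_ASSERT(statePtr != NULL)  ->  XXH_ASSUME(statePtr != NULL)
     *       which lowers (per the definitions added later in this diff) to
     *       __builtin_assume(...) or `if (!(...)) { XXH_UNREACHABLE(); }`,
     *       letting the compiler drop paths where statePtr could be NULL. */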
1808
1863
 
1809
1864
  /* note: use after variable declarations */
@@ -1835,11 +1890,17 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1835
1890
  * XXH3_initCustomSecret_scalar().
1836
1891
  */
1837
1892
  #if defined(__GNUC__) || defined(__clang__)
1838
- # define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
1893
+ # define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
1839
1894
  #else
1840
1895
  # define XXH_COMPILER_GUARD(var) ((void)0)
1841
1896
  #endif
1842
1897
 
1898
+ #if defined(__clang__)
1899
+ # define XXH_COMPILER_GUARD_W(var) __asm__("" : "+w" (var))
1900
+ #else
1901
+ # define XXH_COMPILER_GUARD_W(var) ((void)0)
1902
+ #endif
1903
+
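For illustration (not part of the header): the "+w" constraint is the GCC/clang inline-asm operand class for SIMD/FP registers, so this variant can pin a vector value the way XXH_COMPILER_GUARD pins a general-purpose one; the variable below is a placeholder:

    uint64x2_t acc_vec = vdupq_n_u64(0);
    XXH_COMPILER_GUARD_W(acc_vec);   /* clang must assume acc_vec was modified,
                                      * blocking unwanted re-association */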
1843
1904
  /* *************************************
1844
1905
  * Basic Types
1845
1906
  ***************************************/
@@ -1946,7 +2007,7 @@ static xxh_u32 XXH_read32(const void* ptr)
1946
2007
 
1947
2008
  /*
1948
2009
  * Portable and safe solution. Generally efficient.
1949
- * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
2010
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
1950
2011
  */
1951
2012
  static xxh_u32 XXH_read32(const void* memPtr)
1952
2013
  {
@@ -2022,6 +2083,51 @@ static int XXH_isLittleEndian(void)
2022
2083
  # define XXH_HAS_BUILTIN(x) 0
2023
2084
  #endif
2024
2085
 
2086
+
2087
+
2088
+ /*
2089
+ * C23 and future versions have standard "unreachable()".
2090
+ * Once it has been implemented reliably we can add it as an
2091
+ * additional case:
2092
+ *
2093
+ * ```
2094
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN)
2095
+ * # include <stddef.h>
2096
+ * # ifdef unreachable
2097
+ * # define XXH_UNREACHABLE() unreachable()
2098
+ * # endif
2099
+ * #endif
2100
+ * ```
2101
+ *
2102
+ * Note C++23 also has std::unreachable() which can be detected
2103
+ * as follows:
2104
+ * ```
2105
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
2106
+ * # include <utility>
2107
+ * # define XXH_UNREACHABLE() std::unreachable()
2108
+ * #endif
2109
+ * ```
2110
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
2111
+ * We don't use that as including `<utility>` in `extern "C"` blocks
2112
+ * doesn't work on GCC12
2113
+ */
2114
+
2115
+ #if XXH_HAS_BUILTIN(__builtin_unreachable)
2116
+ # define XXH_UNREACHABLE() __builtin_unreachable()
2117
+
2118
+ #elif defined(_MSC_VER)
2119
+ # define XXH_UNREACHABLE() __assume(0)
2120
+
2121
+ #else
2122
+ # define XXH_UNREACHABLE()
2123
+ #endif
2124
+
2125
+ #if XXH_HAS_BUILTIN(__builtin_assume)
2126
+ # define XXH_ASSUME(c) __builtin_assume(c)
2127
+ #else
2128
+ # define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
2129
+ #endif
2130
+
2025
2131
  /*!
2026
2132
  * @internal
2027
2133
  * @def XXH_rotl32(x,r)
@@ -2211,9 +2317,9 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
2211
2317
  * can load data, while v3 can multiply. SSE forces them to operate
2212
2318
  * together.
2213
2319
  *
2214
- * This is also enabled on AArch64, as Clang autovectorizes it incorrectly
2215
- * and it is pointless writing a NEON implementation that is basically the
2216
- * same speed as scalar for XXH32.
2320
+ * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
2321
+ * the loop. NEON is only faster on the A53, and with the newer cores, it is less
2322
+ * than half the speed.
2217
2323
  */
2218
2324
  XXH_COMPILER_GUARD(acc);
2219
2325
  #endif
@@ -2288,41 +2394,41 @@ XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
2288
2394
  } else {
2289
2395
  switch(len&15) /* or switch(bEnd - p) */ {
2290
2396
  case 12: XXH_PROCESS4;
2291
- XXH_FALLTHROUGH;
2397
+ XXH_FALLTHROUGH; /* fallthrough */
2292
2398
  case 8: XXH_PROCESS4;
2293
- XXH_FALLTHROUGH;
2399
+ XXH_FALLTHROUGH; /* fallthrough */
2294
2400
  case 4: XXH_PROCESS4;
2295
2401
  return XXH32_avalanche(hash);
2296
2402
 
2297
2403
  case 13: XXH_PROCESS4;
2298
- XXH_FALLTHROUGH;
2404
+ XXH_FALLTHROUGH; /* fallthrough */
2299
2405
  case 9: XXH_PROCESS4;
2300
- XXH_FALLTHROUGH;
2406
+ XXH_FALLTHROUGH; /* fallthrough */
2301
2407
  case 5: XXH_PROCESS4;
2302
2408
  XXH_PROCESS1;
2303
2409
  return XXH32_avalanche(hash);
2304
2410
 
2305
2411
  case 14: XXH_PROCESS4;
2306
- XXH_FALLTHROUGH;
2412
+ XXH_FALLTHROUGH; /* fallthrough */
2307
2413
  case 10: XXH_PROCESS4;
2308
- XXH_FALLTHROUGH;
2414
+ XXH_FALLTHROUGH; /* fallthrough */
2309
2415
  case 6: XXH_PROCESS4;
2310
2416
  XXH_PROCESS1;
2311
2417
  XXH_PROCESS1;
2312
2418
  return XXH32_avalanche(hash);
2313
2419
 
2314
2420
  case 15: XXH_PROCESS4;
2315
- XXH_FALLTHROUGH;
2421
+ XXH_FALLTHROUGH; /* fallthrough */
2316
2422
  case 11: XXH_PROCESS4;
2317
- XXH_FALLTHROUGH;
2423
+ XXH_FALLTHROUGH; /* fallthrough */
2318
2424
  case 7: XXH_PROCESS4;
2319
- XXH_FALLTHROUGH;
2425
+ XXH_FALLTHROUGH; /* fallthrough */
2320
2426
  case 3: XXH_PROCESS1;
2321
- XXH_FALLTHROUGH;
2427
+ XXH_FALLTHROUGH; /* fallthrough */
2322
2428
  case 2: XXH_PROCESS1;
2323
- XXH_FALLTHROUGH;
2429
+ XXH_FALLTHROUGH; /* fallthrough */
2324
2430
  case 1: XXH_PROCESS1;
2325
- XXH_FALLTHROUGH;
2431
+ XXH_FALLTHROUGH; /* fallthrough */
2326
2432
  case 0: return XXH32_avalanche(hash);
2327
2433
  }
2328
2434
  XXH_ASSERT(0);
@@ -2590,7 +2696,7 @@ static xxh_u64 XXH_read64(const void* ptr)
2590
2696
 
2591
2697
  /*
2592
2698
  * Portable and safe solution. Generally efficient.
2593
- * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
2699
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
2594
2700
  */
2595
2701
  static xxh_u64 XXH_read64(const void* memPtr)
2596
2702
  {
@@ -2823,7 +2929,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
2823
2929
 
2824
2930
 
2825
2931
  /*! @ingroup XXH64_family */
2826
- XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
2932
+ XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
2827
2933
  {
2828
2934
  #if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
2829
2935
  /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
@@ -2857,13 +2963,13 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
2857
2963
  }
2858
2964
 
2859
2965
  /*! @ingroup XXH64_family */
2860
- XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
2966
+ XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
2861
2967
  {
2862
2968
  XXH_memcpy(dstState, srcState, sizeof(*dstState));
2863
2969
  }
2864
2970
 
2865
2971
  /*! @ingroup XXH64_family */
2866
- XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
2972
+ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
2867
2973
  {
2868
2974
  XXH_ASSERT(statePtr != NULL);
2869
2975
  memset(statePtr, 0, sizeof(*statePtr));
@@ -2876,7 +2982,7 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s
2876
2982
 
2877
2983
  /*! @ingroup XXH64_family */
2878
2984
  XXH_PUBLIC_API XXH_errorcode
2879
- XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2985
+ XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
2880
2986
  {
2881
2987
  if (input==NULL) {
2882
2988
  XXH_ASSERT(len == 0);
@@ -2927,7 +3033,7 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2927
3033
 
2928
3034
 
2929
3035
  /*! @ingroup XXH64_family */
2930
- XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
3036
+ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
2931
3037
  {
2932
3038
  xxh_u64 h64;
2933
3039
 
@@ -2950,7 +3056,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
2950
3056
  /******* Canonical representation *******/
2951
3057
 
2952
3058
  /*! @ingroup XXH64_family */
2953
- XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
3059
+ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
2954
3060
  {
2955
3061
  XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2956
3062
  if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
@@ -2958,7 +3064,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t
2958
3064
  }
2959
3065
 
2960
3066
  /*! @ingroup XXH64_family */
2961
- XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
3067
+ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
2962
3068
  {
2963
3069
  return XXH_readBE64(src);
2964
3070
  }
@@ -2979,11 +3085,19 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
2979
3085
  /* === Compiler specifics === */
2980
3086
 
2981
3087
  #if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
2982
- # define XXH_RESTRICT /* disable */
3088
+ # define XXH_RESTRICT /* disable */
2983
3089
  #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */
2984
3090
  # define XXH_RESTRICT restrict
3091
+ #elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
3092
+ || (defined (__clang__)) \
3093
+ || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
3094
+ || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
3095
+ /*
3096
+ * There are a LOT more compilers that recognize __restrict but this
3097
+ * covers the major ones.
3098
+ */
3099
+ # define XXH_RESTRICT __restrict
2985
3100
  #else
2986
- /* Note: it might be useful to define __restrict or __restrict__ for some C++ compilers */
2987
3101
  # define XXH_RESTRICT /* disable */
2988
3102
  #endif
2989
3103
 
@@ -2998,9 +3112,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
2998
3112
  #endif
2999
3113
 
3000
3114
  #if defined(__GNUC__) || defined(__clang__)
3115
+ # if defined(__ARM_FEATURE_SVE)
3116
+ # include <arm_sve.h>
3117
+ # endif
3001
3118
  # if defined(__ARM_NEON__) || defined(__ARM_NEON) \
3002
- || defined(__aarch64__) || defined(_M_ARM) \
3003
- || defined(_M_ARM64) || defined(_M_ARM64EC)
3119
+ || (defined(_M_ARM) && _M_ARM >= 7) \
3120
+ || defined(_M_ARM64) || defined(_M_ARM64EC)
3004
3121
  # define inline __inline__ /* circumvent a clang bug */
3005
3122
  # include <arm_neon.h>
3006
3123
  # undef inline
@@ -3125,12 +3242,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
3125
3242
  XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */
3126
3243
  XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */
3127
3244
  XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
3245
+ XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
3128
3246
  };
3129
3247
  /*!
3130
3248
  * @ingroup tuning
3131
3249
  * @brief Selects the minimum alignment for XXH3's accumulators.
3132
3250
  *
3133
- * When using SIMD, this should match the alignment reqired for said vector
3251
+ * When using SIMD, this should match the alignment required for said vector
3134
3252
  * type, so, for example, 32 for AVX2.
3135
3253
  *
3136
3254
  * Default: Auto detected.
@@ -3146,10 +3264,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
3146
3264
  # define XXH_AVX512 3
3147
3265
  # define XXH_NEON 4
3148
3266
  # define XXH_VSX 5
3267
+ # define XXH_SVE 6
3149
3268
  #endif
3150
3269
 
3151
3270
  #ifndef XXH_VECTOR /* can be defined on command line */
3152
- # if ( \
3271
+ # if defined(__ARM_FEATURE_SVE)
3272
+ # define XXH_VECTOR XXH_SVE
3273
+ # elif ( \
3153
3274
  defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
3154
3275
  || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
3155
3276
  ) && ( \
@@ -3172,6 +3293,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
3172
3293
  # endif
3173
3294
  #endif
3174
3295
 
3296
+ /* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
3297
+ #if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
3298
+ # ifdef _MSC_VER
3299
+ # pragma warning(once : 4606)
3300
+ # else
3301
+ # warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
3302
+ # endif
3303
+ # undef XXH_VECTOR
3304
+ # define XXH_VECTOR XXH_SCALAR
3305
+ #endif
3306
+
3175
3307
  /*
3176
3308
  * Controls the alignment of the accumulator,
3177
3309
  * for compatibility with aligned vector loads, which are usually faster.
@@ -3191,16 +3323,26 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
3191
3323
  # define XXH_ACC_ALIGN 16
3192
3324
  # elif XXH_VECTOR == XXH_AVX512 /* avx512 */
3193
3325
  # define XXH_ACC_ALIGN 64
3326
+ # elif XXH_VECTOR == XXH_SVE /* sve */
3327
+ # define XXH_ACC_ALIGN 64
3194
3328
  # endif
3195
3329
  #endif
3196
3330
 
3197
3331
  #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
3198
3332
  || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
3199
3333
  # define XXH_SEC_ALIGN XXH_ACC_ALIGN
3334
+ #elif XXH_VECTOR == XXH_SVE
3335
+ # define XXH_SEC_ALIGN XXH_ACC_ALIGN
3200
3336
  #else
3201
3337
  # define XXH_SEC_ALIGN 8
3202
3338
  #endif
3203
3339
 
3340
+ #if defined(__GNUC__) || defined(__clang__)
3341
+ # define XXH_ALIASING __attribute__((may_alias))
3342
+ #else
3343
+ # define XXH_ALIASING /* nothing */
3344
+ #endif
3345
+
3204
3346
  /*
3205
3347
  * UGLY HACK:
3206
3348
  * GCC usually generates the best code with -O3 for xxHash.
@@ -3229,107 +3371,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
3229
3371
  # pragma GCC optimize("-O2")
3230
3372
  #endif
3231
3373
 
3232
-
3233
3374
  #if XXH_VECTOR == XXH_NEON
3375
+
3234
3376
  /*
3235
- * NEON's setup for vmlal_u32 is a little more complicated than it is on
3236
- * SSE2, AVX2, and VSX.
3237
- *
3238
- * While PMULUDQ and VMULEUW both perform a mask, VMLAL.U32 performs an upcast.
3239
- *
3240
- * To do the same operation, the 128-bit 'Q' register needs to be split into
3241
- * two 64-bit 'D' registers, performing this operation::
3242
- *
3243
- * [ a | b ]
3244
- * | '---------. .--------' |
3245
- * | x |
3246
- * | .---------' '--------. |
3247
- * [ a & 0xFFFFFFFF | b & 0xFFFFFFFF ],[ a >> 32 | b >> 32 ]
3248
- *
3249
- * Due to significant changes in aarch64, the fastest method for aarch64 is
3250
- * completely different than the fastest method for ARMv7-A.
3251
- *
3252
- * ARMv7-A treats D registers as unions overlaying Q registers, so modifying
3253
- * D11 will modify the high half of Q5. This is similar to how modifying AH
3254
- * will only affect bits 8-15 of AX on x86.
3255
- *
3256
- * VZIP takes two registers, and puts even lanes in one register and odd lanes
3257
- * in the other.
3258
- *
3259
- * On ARMv7-A, this strangely modifies both parameters in place instead of
3260
- * taking the usual 3-operand form.
3261
- *
3262
- * Therefore, if we want to do this, we can simply use a D-form VZIP.32 on the
3263
- * lower and upper halves of the Q register to end up with the high and low
3264
- * halves where we want - all in one instruction.
3265
- *
3266
- * vzip.32 d10, d11 @ d10 = { d10[0], d11[0] }; d11 = { d10[1], d11[1] }
3267
- *
3268
- * Unfortunately we need inline assembly for this: Instructions modifying two
3269
- * registers at once is not possible in GCC or Clang's IR, and they have to
3270
- * create a copy.
3377
+ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
3378
+ * optimizes out the entire hashLong loop because of the aliasing violation.
3271
3379
  *
3272
- * aarch64 requires a different approach.
3273
- *
3274
- * In order to make it easier to write a decent compiler for aarch64, many
3275
- * quirks were removed, such as conditional execution.
3276
- *
3277
- * NEON was also affected by this.
3278
- *
3279
- * aarch64 cannot access the high bits of a Q-form register, and writes to a
3280
- * D-form register zero the high bits, similar to how writes to W-form scalar
3281
- * registers (or DWORD registers on x86_64) work.
3282
- *
3283
- * The formerly free vget_high intrinsics now require a vext (with a few
3284
- * exceptions)
3285
- *
3286
- * Additionally, VZIP was replaced by ZIP1 and ZIP2, which are the equivalent
3287
- * of PUNPCKL* and PUNPCKH* in SSE, respectively, in order to only modify one
3288
- * operand.
3289
- *
3290
- * The equivalent of the VZIP.32 on the lower and upper halves would be this
3291
- * mess:
3292
- *
3293
- * ext v2.4s, v0.4s, v0.4s, #2 // v2 = { v0[2], v0[3], v0[0], v0[1] }
3294
- * zip1 v1.2s, v0.2s, v2.2s // v1 = { v0[0], v2[0] }
3295
- * zip2 v0.2s, v0.2s, v1.2s // v0 = { v0[1], v2[1] }
3296
- *
3297
- * Instead, we use a literal downcast, vmovn_u64 (XTN), and vshrn_n_u64 (SHRN):
3298
- *
3299
- * shrn v1.2s, v0.2d, #32 // v1 = (uint32x2_t)(v0 >> 32);
3300
- * xtn v0.2s, v0.2d // v0 = (uint32x2_t)(v0 & 0xFFFFFFFF);
3301
- *
3302
- * This is available on ARMv7-A, but is less efficient than a single VZIP.32.
3380
+ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
3381
+ * so the only option is to mark it as aliasing.
3303
3382
  */
3304
-
3305
- /*!
3306
- * Function-like macro:
3307
- * void XXH_SPLIT_IN_PLACE(uint64x2_t &in, uint32x2_t &outLo, uint32x2_t &outHi)
3308
- * {
3309
- * outLo = (uint32x2_t)(in & 0xFFFFFFFF);
3310
- * outHi = (uint32x2_t)(in >> 32);
3311
- * in = UNDEFINED;
3312
- * }
3313
- */
3314
- # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
3315
- && (defined(__GNUC__) || defined(__clang__)) \
3316
- && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
3317
- # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
3318
- do { \
3319
- /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
3320
- /* https://github.com/gcc-mirror/gcc/blob/38cf91e5/gcc/config/arm/arm.c#L22486 */ \
3321
- /* https://github.com/llvm-mirror/llvm/blob/2c4ca683/lib/Target/ARM/ARMAsmPrinter.cpp#L399 */ \
3322
- __asm__("vzip.32 %e0, %f0" : "+w" (in)); \
3323
- (outLo) = vget_low_u32 (vreinterpretq_u32_u64(in)); \
3324
- (outHi) = vget_high_u32(vreinterpretq_u32_u64(in)); \
3325
- } while (0)
3326
- # else
3327
- # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
3328
- do { \
3329
- (outLo) = vmovn_u64 (in); \
3330
- (outHi) = vshrn_n_u64 ((in), 32); \
3331
- } while (0)
3332
- # endif
3383
+ typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
3333
3384
 
3334
3385
  /*!
3335
3386
  * @internal
@@ -3347,7 +3398,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
3347
3398
  #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
3348
3399
  XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
3349
3400
  {
3350
- return *(uint64x2_t const*)ptr;
3401
+ return *(xxh_aliasing_uint64x2_t const *)ptr;
3351
3402
  }
3352
3403
  #else
3353
3404
  XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
@@ -3355,38 +3406,75 @@ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
3355
3406
  return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
3356
3407
  }
3357
3408
  #endif
3409
+
3410
+ /*!
3411
+ * @internal
3412
+ * @brief `vmlal_u32` on low and high halves of a vector.
3413
+ *
3414
+ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
3415
+ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
3416
+ * with `vmlal_u32`.
3417
+ */
3418
+ #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
3419
+ XXH_FORCE_INLINE uint64x2_t
3420
+ XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
3421
+ {
3422
+ /* Inline assembly is the only way */
3423
+ __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
3424
+ return acc;
3425
+ }
3426
+ XXH_FORCE_INLINE uint64x2_t
3427
+ XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
3428
+ {
3429
+ /* This intrinsic works as expected */
3430
+ return vmlal_high_u32(acc, lhs, rhs);
3431
+ }
3432
+ #else
3433
+ /* Portable intrinsic versions */
3434
+ XXH_FORCE_INLINE uint64x2_t
3435
+ XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
3436
+ {
3437
+ return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
3438
+ }
3439
+ /*! @copydoc XXH_vmlal_low_u32
3440
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
3441
+ XXH_FORCE_INLINE uint64x2_t
3442
+ XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
3443
+ {
3444
+ return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
3445
+ }
3446
+ #endif
3447
+
3358
3448
  /*!
3359
3449
  * @ingroup tuning
3360
3450
  * @brief Controls the NEON to scalar ratio for XXH3
3361
3451
  *
3362
- * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
3363
- * 2 lanes on scalar by default.
3452
+ * This can be set to 2, 4, 6, or 8.
3364
3453
  *
3365
- * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
3366
- * emulated 64-bit arithmetic is too slow.
3454
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
3367
3455
  *
3368
- * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
3456
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
3457
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
3458
+ * bandwidth.
3369
3459
  *
3370
- * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
3371
- * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
3372
- * you are only using 2/3 of the CPU bandwidth.
3373
- *
3374
- * This is even more noticable on the more advanced cores like the A76 which
3460
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
3375
3461
  * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
3376
3462
  *
3377
- * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
3378
- * remaining lanes will use scalar instructions. This improves the bandwidth
3379
- * and also gives the integer pipelines something to do besides twiddling loop
3380
- * counters and pointers.
3463
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
3464
+ * and 2 scalar lanes, which is chosen by default.
3465
+ *
3466
+ * This does not apply to Apple processors or 32-bit processors, which run better with
3467
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
3381
3468
  *
3382
3469
  * This change benefits CPUs with large micro-op buffers without negatively affecting
3383
- * other CPUs:
3470
+ * most other CPUs:
3384
3471
  *
3385
3472
  * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |
3386
3473
  * |:----------------------|:--------------------|----------:|-----------:|------:|
3387
3474
  * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |
3388
3475
  * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |
3389
3476
  * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |
3477
+ * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% |
3390
3478
  *
3391
3479
  * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
3392
3480
  *
@@ -3394,7 +3482,7 @@ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
3394
3482
  */
3395
3483
  # ifndef XXH3_NEON_LANES
3396
3484
  # if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
3397
- && XXH_SIZE_OPT <= 0
3485
+ && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
3398
3486
  # define XXH3_NEON_LANES 6
3399
3487
  # else
3400
3488
  # define XXH3_NEON_LANES XXH_ACC_NB
@@ -3442,6 +3530,11 @@ typedef __vector unsigned long long xxh_u64x2;
3442
3530
  typedef __vector unsigned char xxh_u8x16;
3443
3531
  typedef __vector unsigned xxh_u32x4;
3444
3532
 
3533
+ /*
3534
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
3535
+ */
3536
+ typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
3537
+
3445
3538
  # ifndef XXH_VSX_BE
3446
3539
  # if defined(__BIG_ENDIAN__) \
3447
3540
  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
@@ -3516,6 +3609,20 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
3516
3609
  # endif /* XXH_vec_mulo, XXH_vec_mule */
3517
3610
  #endif /* XXH_VECTOR == XXH_VSX */
3518
3611
 
3612
+ #if XXH_VECTOR == XXH_SVE
3613
+ #define ACCRND(acc, offset) \
3614
+ do { \
3615
+ svuint64_t input_vec = svld1_u64(mask, xinput + offset); \
3616
+ svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \
3617
+ svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \
3618
+ svuint64_t swapped = svtbl_u64(input_vec, kSwap); \
3619
+ svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \
3620
+ svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \
3621
+ svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
3622
+ acc = svadd_u64_x(mask, acc, mul); \
3623
+ } while (0)
3624
+ #endif /* XXH_VECTOR == XXH_SVE */
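For reference, a scalar sketch of one ACCRND step (under the assumption that kSwap, defined elsewhere, swaps adjacent 64-bit lanes like the other SIMD paths' swap(data_vec)):

    static void ACCRND_ref(xxh_u64* acc, const xxh_u64* xinput,
                           const xxh_u64* xsecret, size_t lanes)
    {
        size_t i;
        for (i = 0; i < lanes; i++) {
            xxh_u64 const data_key = xinput[i] ^ xsecret[i];        /* sveor            */
            acc[i] += (data_key & 0xFFFFFFFF) * (data_key >> 32)    /* svextw/svlsr + svmad */
                    + xinput[i ^ 1];                                /* svtbl with kSwap */
        }
    }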
3625
+
3519
3626
 
3520
3627
  /* prefetch
3521
3628
  * can be disabled, by declaring XXH_NO_PREFETCH build macro */
@@ -3952,31 +4059,33 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3952
4059
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
3953
4060
  XXH_ASSERT(16 < len && len <= 128);
3954
4061
 
3955
- { xxh_u64 acc = len * XXH_PRIME64_1;
4062
+ { xxh_u64 acc = len * XXH_PRIME64_1, acc_end;
3956
4063
  #if XXH_SIZE_OPT >= 1
3957
4064
  /* Smaller and cleaner, but slightly slower. */
3958
- size_t i = (len - 1) / 32;
4065
+ unsigned int i = (unsigned int)(len - 1) / 32;
3959
4066
  do {
3960
4067
  acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
3961
4068
  acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
3962
4069
  } while (i-- != 0);
4070
+ acc_end = 0;
3963
4071
  #else
4072
+ acc += XXH3_mix16B(input+0, secret+0, seed);
4073
+ acc_end = XXH3_mix16B(input+len-16, secret+16, seed);
3964
4074
  if (len > 32) {
4075
+ acc += XXH3_mix16B(input+16, secret+32, seed);
4076
+ acc_end += XXH3_mix16B(input+len-32, secret+48, seed);
3965
4077
  if (len > 64) {
4078
+ acc += XXH3_mix16B(input+32, secret+64, seed);
4079
+ acc_end += XXH3_mix16B(input+len-48, secret+80, seed);
4080
+
3966
4081
  if (len > 96) {
3967
4082
  acc += XXH3_mix16B(input+48, secret+96, seed);
3968
- acc += XXH3_mix16B(input+len-64, secret+112, seed);
4083
+ acc_end += XXH3_mix16B(input+len-64, secret+112, seed);
3969
4084
  }
3970
- acc += XXH3_mix16B(input+32, secret+64, seed);
3971
- acc += XXH3_mix16B(input+len-48, secret+80, seed);
3972
4085
  }
3973
- acc += XXH3_mix16B(input+16, secret+32, seed);
3974
- acc += XXH3_mix16B(input+len-32, secret+48, seed);
3975
4086
  }
3976
- acc += XXH3_mix16B(input+0, secret+0, seed);
3977
- acc += XXH3_mix16B(input+len-16, secret+16, seed);
3978
4087
  #endif
3979
- return XXH3_avalanche(acc);
4088
+ return XXH3_avalanche(acc + acc_end);
3980
4089
  }
3981
4090
  }
3982
4091
 
@@ -3994,13 +4103,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3994
4103
  #define XXH3_MIDSIZE_LASTOFFSET 17
3995
4104
 
3996
4105
  { xxh_u64 acc = len * XXH_PRIME64_1;
3997
- int const nbRounds = (int)len / 16;
3998
- int i;
4106
+ xxh_u64 acc_end;
4107
+ unsigned int const nbRounds = (unsigned int)len / 16;
4108
+ unsigned int i;
4109
+ XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
3999
4110
  for (i=0; i<8; i++) {
4000
4111
  acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
4001
4112
  }
4002
- acc = XXH3_avalanche(acc);
4113
+ /* last bytes */
4114
+ acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
4003
4115
  XXH_ASSERT(nbRounds >= 8);
4116
+ acc = XXH3_avalanche(acc);
4004
4117
  #if defined(__clang__) /* Clang */ \
4005
4118
  && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
4006
4119
  && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
@@ -4027,11 +4140,13 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
4027
4140
  #pragma clang loop vectorize(disable)
4028
4141
  #endif
4029
4142
  for (i=8 ; i < nbRounds; i++) {
4030
- acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
4143
+ /*
4144
+ * Prevents clang from unrolling the acc loop and interleaving it with this one.
4145
+ */
4146
+ XXH_COMPILER_GUARD(acc);
4147
+ acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
4031
4148
  }
4032
- /* last bytes */
4033
- acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
4034
- return XXH3_avalanche(acc);
4149
+ return XXH3_avalanche(acc + acc_end);
4035
4150
  }
4036
4151
  }
4037
4152
 
@@ -4047,6 +4162,47 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
4047
4162
  # define ACC_NB XXH_ACC_NB
4048
4163
  #endif
4049
4164
 
4165
+ #ifndef XXH_PREFETCH_DIST
4166
+ # ifdef __clang__
4167
+ # define XXH_PREFETCH_DIST 320
4168
+ # else
4169
+ # if (XXH_VECTOR == XXH_AVX512)
4170
+ # define XXH_PREFETCH_DIST 512
4171
+ # else
4172
+ # define XXH_PREFETCH_DIST 384
4173
+ # endif
4174
+ # endif /* __clang__ */
4175
+ #endif /* XXH_PREFETCH_DIST */
4176
+
4177
+ /*
4178
+ * These macros are to generate an XXH3_accumulate() function.
4179
+ * The two arguments select the name suffix and target attribute.
4180
+ *
4181
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
4182
+ * XXH3_accumulate_512_<name>().
4183
+ *
4184
+ * It may be useful to hand implement this function if the compiler fails to
4185
+ * optimize the inline function.
4186
+ */
4187
+ #define XXH3_ACCUMULATE_TEMPLATE(name) \
4188
+ void \
4189
+ XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \
4190
+ const xxh_u8* XXH_RESTRICT input, \
4191
+ const xxh_u8* XXH_RESTRICT secret, \
4192
+ size_t nbStripes) \
4193
+ { \
4194
+ size_t n; \
4195
+ for (n = 0; n < nbStripes; n++ ) { \
4196
+ const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \
4197
+ XXH_PREFETCH(in + XXH_PREFETCH_DIST); \
4198
+ XXH3_accumulate_512_##name( \
4199
+ acc, \
4200
+ in, \
4201
+ secret + n*XXH_SECRET_CONSUME_RATE); \
4202
+ } \
4203
+ }
4204
+
4205
+
4050
4206
  XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
4051
4207
  {
4052
4208
  if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
@@ -4115,7 +4271,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
4115
4271
  /* data_key = data_vec ^ key_vec; */
4116
4272
  __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
4117
4273
  /* data_key_lo = data_key >> 32; */
4118
- __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
4274
+ __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
4119
4275
  /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
4120
4276
  __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);
4121
4277
  /* xacc[0] += swap(data_vec); */
@@ -4125,6 +4281,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
4125
4281
  *xacc = _mm512_add_epi64(product, sum);
4126
4282
  }
4127
4283
  }
4284
+ XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
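For reference, expanding the template shown earlier, this line generates roughly the following function (shown only to illustrate the generated shape):

    XXH_FORCE_INLINE XXH_TARGET_AVX512 void
    XXH3_accumulate_avx512(xxh_u64* XXH_RESTRICT acc,
                           const xxh_u8* XXH_RESTRICT input,
                           const xxh_u8* XXH_RESTRICT secret,
                           size_t nbStripes)
    {
        size_t n;
        for (n = 0; n < nbStripes; n++) {
            const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
            XXH_PREFETCH(in + XXH_PREFETCH_DIST);
            XXH3_accumulate_512_avx512(acc, in, secret + n*XXH_SECRET_CONSUME_RATE);
        }
    }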
4128
4285
 
4129
4286
  /*
4130
4287
  * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
@@ -4158,13 +4315,12 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4158
4315
  /* xacc[0] ^= (xacc[0] >> 47) */
4159
4316
  __m512i const acc_vec = *xacc;
4160
4317
  __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);
4161
- __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);
4162
4318
  /* xacc[0] ^= secret; */
4163
4319
  __m512i const key_vec = _mm512_loadu_si512 (secret);
4164
- __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
4320
+ __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
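For reference, a note on the immediate used here:

    /* Not part of the diff: 0x96 is the truth table of a 3-way XOR. For input
     * bits A=key_vec, B=acc_vec, C=shifted, bit ((A<<2)|(B<<1)|C) of
     * 0x96 = 0b10010110 equals A ^ B ^ C, so this single VPTERNLOGD replaces
     * the two XORs removed above. */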
4165
4321
 
4166
4322
  /* xacc[0] *= XXH_PRIME32_1; */
4167
- __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
4323
+ __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
4168
4324
  __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);
4169
4325
  __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);
4170
4326
  *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
@@ -4179,7 +4335,8 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4179
4335
  XXH_ASSERT(((size_t)customSecret & 63) == 0);
4180
4336
  (void)(&XXH_writeLE64);
4181
4337
  { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
4182
- __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
4338
+ __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
4339
+ __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
4183
4340
 
4184
4341
  const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret);
4185
4342
  __m512i* const dest = ( __m512i*) customSecret;
@@ -4187,14 +4344,7 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4187
4344
  XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
4188
4345
  XXH_ASSERT(((size_t)dest & 63) == 0);
4189
4346
  for (i=0; i < nbRounds; ++i) {
4190
- /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
4191
- * this will warn "discards 'const' qualifier". */
4192
- union {
4193
- const __m512i* cp;
4194
- void* p;
4195
- } remote_const_void;
4196
- remote_const_void.cp = src + i;
4197
- dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
4347
+ dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
4198
4348
  } }
4199
4349
  }
4200
4350
 
@@ -4230,7 +4380,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
4230
4380
  /* data_key = data_vec ^ key_vec; */
4231
4381
  __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
4232
4382
  /* data_key_lo = data_key >> 32; */
4233
- __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
4383
+ __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
4234
4384
  /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
4235
4385
  __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);
4236
4386
  /* xacc[i] += swap(data_vec); */
@@ -4240,6 +4390,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
4240
4390
  xacc[i] = _mm256_add_epi64(product, sum);
4241
4391
  } }
4242
4392
  }
4393
+ XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
4243
4394
 
4244
4395
  XXH_FORCE_INLINE XXH_TARGET_AVX2 void
4245
4396
  XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -4262,7 +4413,7 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4262
4413
  __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
4263
4414
 
4264
4415
  /* xacc[i] *= XXH_PRIME32_1; */
4265
- __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
4416
+ __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
4266
4417
  __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
4267
4418
  __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
4268
4419
  xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
@@ -4294,12 +4445,12 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR
4294
4445
  XXH_ASSERT(((size_t)dest & 31) == 0);
4295
4446
 
4296
4447
  /* GCC -O2 need unroll loop manually */
4297
- dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
4298
- dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
4299
- dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
4300
- dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
4301
- dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
4302
- dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
4448
+ dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
4449
+ dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
4450
+ dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
4451
+ dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
4452
+ dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
4453
+ dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
4303
4454
  }
4304
4455
  }
4305
4456
 
@@ -4346,6 +4497,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
4346
4497
  xacc[i] = _mm_add_epi64(product, sum);
4347
4498
  } }
4348
4499
  }
4500
+ XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
4349
4501
 
4350
4502
  XXH_FORCE_INLINE XXH_TARGET_SSE2 void
4351
4503
  XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -4431,6 +4583,16 @@ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
4431
4583
  * CPU, and it also mitigates some GCC codegen issues.
4432
4584
  *
4433
4585
  * @see XXH3_NEON_LANES for configuring this and details about this optimization.
4586
+ *
4587
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
4588
+ * integers, unlike the other platforms, which mask full 64-bit vectors,
4589
+ * so the setup is more complicated than just shifting right.
4590
+ *
4591
+ * Additionally, there is an optimization for 4 lanes at once noted below.
4592
+ *
4593
+ * Since, as stated, the optimal number of lanes for Cortex cores is 6,
4594
+ * there need to be *three* versions of the accumulate operation: 4 NEON
4595
+ * lanes at a time, 2 NEON lanes at a time, and scalar rounds for the rest.
4434
4596
  */
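As a concrete reading of the three variants mentioned above (assuming the documented sweet spot of XXH3_NEON_LANES == 6 and the usual 8 accumulator lanes; this partition is inferred from the loops below, not text from the patch):

    /* Illustrative lane partition for XXH3_NEON_LANES == 6:
     *   lanes 0..3 -> 4-lanes-at-a-time NEON loop (vuzpq_u32 + the two vmlal forms)
     *   lanes 4..5 -> 2-lanes-at-a-time NEON loop (vmovn_u64/vshrn_n_u64 + vmlal_u32)
     *   lanes 6..7 -> XXH3_scalarRound()
     * so one 64-byte stripe is handled by three accumulate variants in total.
     */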
4435
4597
  XXH_FORCE_INLINE void
4436
4598
  XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
@@ -4439,49 +4601,113 @@ XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
4439
4601
  {
4440
4602
  XXH_ASSERT((((size_t)acc) & 15) == 0);
4441
4603
  XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
4442
- {
4443
- uint64x2_t* const xacc = (uint64x2_t *) acc;
4604
+ { /* GCC for darwin arm64 does not like aliasing here */
4605
+ xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
4444
4606
  /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
4445
4607
  uint8_t const* const xinput = (const uint8_t *) input;
4446
4608
  uint8_t const* const xsecret = (const uint8_t *) secret;
4447
4609
 
4448
4610
  size_t i;
4449
- /* AArch64 uses both scalar and neon at the same time */
4611
+ /* Scalar lanes use the normal scalarRound routine */
4450
4612
  for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
4451
4613
  XXH3_scalarRound(acc, input, secret, i);
4452
4614
  }
4453
- for (i=0; i < XXH3_NEON_LANES / 2; i++) {
4454
- uint64x2_t acc_vec = xacc[i];
4615
+ i = 0;
4616
+ /* 4 NEON lanes at a time. */
4617
+ for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
4618
+ /* data_vec = xinput[i]; */
4619
+ uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16));
4620
+ uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16));
4621
+ /* key_vec = xsecret[i]; */
4622
+ uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16));
4623
+ uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16));
4624
+ /* data_swap = swap(data_vec) */
4625
+ uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
4626
+ uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
4627
+ /* data_key = data_vec ^ key_vec; */
4628
+ uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
4629
+ uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
4630
+
4631
+ /*
4632
+ * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
4633
+ * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
4634
+ * get one vector with the low 32 bits of each lane, and one vector
4635
+ * with the high 32 bits of each lane.
4636
+ *
4637
+ * This compiles to two instructions on AArch64 and has a paired vector
4638
+ * result, which is an artifact of the ARMv7a version, which modified both
4639
+ * vectors in place.
4640
+ *
4641
+ * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
4642
+ * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
4643
+ */
4644
+ uint32x4x2_t unzipped = vuzpq_u32(
4645
+ vreinterpretq_u32_u64(data_key_1),
4646
+ vreinterpretq_u32_u64(data_key_2)
4647
+ );
4648
+ /* data_key_lo = data_key & 0xFFFFFFFF */
4649
+ uint32x4_t data_key_lo = unzipped.val[0];
4650
+ /* data_key_hi = data_key >> 32 */
4651
+ uint32x4_t data_key_hi = unzipped.val[1];
4652
+ /*
4653
+ * Then, we can split the vectors horizontally and multiply; as with most
4654
+ * widening intrinsics, there is a variant that works on the high half vectors
4655
+ * for free on AArch64.
4656
+ *
4657
+ * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
4658
+ */
4659
+ uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
4660
+ uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
4661
+ /*
4662
+ * Clang reorders
4663
+ * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s
4664
+ * c += a; // add acc.2d, acc.2d, swap.2d
4665
+ * to
4666
+ * c += a; // add acc.2d, acc.2d, swap.2d
4667
+ * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s
4668
+ *
4669
+ * While it would make sense in theory since the addition is faster,
4670
+ * for reasons likely related to umlal being limited to certain NEON
4671
+ * pipelines, this is worse. A compiler guard fixes this.
4672
+ */
4673
+ XXH_COMPILER_GUARD_W(sum_1);
4674
+ XXH_COMPILER_GUARD_W(sum_2);
4675
+ /* xacc[i] = acc_vec + sum; */
4676
+ xacc[i] = vaddq_u64(xacc[i], sum_1);
4677
+ xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
4678
+ }
4679
+ /* Operate on the remaining NEON lanes 2 at a time. */
4680
+ for (; i < XXH3_NEON_LANES / 2; i++) {
4455
4681
  /* data_vec = xinput[i]; */
4456
4682
  uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16));
4457
4683
  /* key_vec = xsecret[i]; */
4458
4684
  uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
4459
- uint64x2_t data_key;
4460
- uint32x2_t data_key_lo, data_key_hi;
4461
4685
  /* acc_vec_2 = swap(data_vec) */
4462
- uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1);
4686
+ uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
4463
4687
  /* data_key = data_vec ^ key_vec; */
4464
- data_key = veorq_u64(data_vec, key_vec);
4465
- /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
4466
- * data_key_hi = (uint32x2_t) (data_key >> 32);
4467
- * data_key = UNDEFINED; */
4468
- XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
4469
- /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4470
- acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi);
4471
- /* xacc[i] += acc_vec_2; */
4472
- acc_vec = vaddq_u64 (acc_vec, acc_vec_2);
4473
- xacc[i] = acc_vec;
4688
+ uint64x2_t data_key = veorq_u64(data_vec, key_vec);
4689
+ /* For two lanes, just use VMOVN and VSHRN. */
4690
+ /* data_key_lo = data_key & 0xFFFFFFFF; */
4691
+ uint32x2_t data_key_lo = vmovn_u64(data_key);
4692
+ /* data_key_hi = data_key >> 32; */
4693
+ uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
4694
+ /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
4695
+ uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
4696
+ /* Same Clang workaround as before */
4697
+ XXH_COMPILER_GUARD_W(sum);
4698
+ /* xacc[i] = acc_vec + sum; */
4699
+ xacc[i] = vaddq_u64 (xacc[i], sum);
4474
4700
  }
4475
-
4476
4701
  }
4477
4702
  }
4703
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)
4478
4704
 
4479
4705
  XXH_FORCE_INLINE void
4480
4706
  XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4481
4707
  {
4482
4708
  XXH_ASSERT((((size_t)acc) & 15) == 0);
4483
4709
 
4484
- { uint64x2_t* xacc = (uint64x2_t*) acc;
4710
+ { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc;
4485
4711
  uint8_t const* xsecret = (uint8_t const*) secret;
4486
4712
  uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);
4487
4713
 
@@ -4493,47 +4719,42 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4493
4719
  for (i=0; i < XXH3_NEON_LANES / 2; i++) {
4494
4720
  /* xacc[i] ^= (xacc[i] >> 47); */
4495
4721
  uint64x2_t acc_vec = xacc[i];
4496
- uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
4497
- uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
4722
+ uint64x2_t shifted = vshrq_n_u64(acc_vec, 47);
4723
+ uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
4498
4724
 
4499
4725
  /* xacc[i] ^= xsecret[i]; */
4500
- uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16));
4501
- uint64x2_t data_key = veorq_u64 (data_vec, key_vec);
4726
+ uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
4727
+ uint64x2_t data_key = veorq_u64(data_vec, key_vec);
4502
4728
 
4503
4729
  /* xacc[i] *= XXH_PRIME32_1 */
4504
- uint32x2_t data_key_lo, data_key_hi;
4505
- /* data_key_lo = (uint32x2_t) (xacc[i] & 0xFFFFFFFF);
4506
- * data_key_hi = (uint32x2_t) (xacc[i] >> 32);
4507
- * xacc[i] = UNDEFINED; */
4508
- XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
4509
- { /*
4510
- * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
4511
- *
4512
- * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
4513
- * incorrectly "optimize" this:
4514
- * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
4515
- * shifted = vshll_n_u32(tmp, 32);
4516
- * to this:
4517
- * tmp = "vmulq_u64"(a, b); // no such thing!
4518
- * shifted = vshlq_n_u64(tmp, 32);
4519
- *
4520
- * However, unlike SSE, Clang lacks a 64-bit multiply routine
4521
- * for NEON, and it scalarizes two 64-bit multiplies instead.
4522
- *
4523
- * vmull_u32 has the same timing as vmul_u32, and it avoids
4524
- * this bug completely.
4525
- * See https://bugs.llvm.org/show_bug.cgi?id=39967
4526
- */
4527
- uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
4528
- /* xacc[i] = prod_hi << 32; */
4529
- prod_hi = vshlq_n_u64(prod_hi, 32);
4530
- /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
4531
- xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
4532
- }
4730
+ uint32x2_t data_key_lo = vmovn_u64(data_key);
4731
+ uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
4732
+ /*
4733
+ * prod_hi = (data_key >> 32) * XXH_PRIME32_1;
4734
+ *
4735
+ * Avoid vmul_u32 + vshll_n_u32 since Clang 6 and 7 will
4736
+ * incorrectly "optimize" this:
4737
+ * tmp = vmul_u32(vmovn_u64(a), vmovn_u64(b));
4738
+ * shifted = vshll_n_u32(tmp, 32);
4739
+ * to this:
4740
+ * tmp = "vmulq_u64"(a, b); // no such thing!
4741
+ * shifted = vshlq_n_u64(tmp, 32);
4742
+ *
4743
+ * However, unlike SSE, Clang lacks a 64-bit multiply routine
4744
+ * for NEON, and it scalarizes two 64-bit multiplies instead.
4745
+ *
4746
+ * vmull_u32 has the same timing as vmul_u32, and it avoids
4747
+ * this bug completely.
4748
+ * See https://bugs.llvm.org/show_bug.cgi?id=39967
4749
+ */
4750
+ uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
4751
+ /* xacc[i] = prod_hi << 32; */
4752
+ prod_hi = vshlq_n_u64(prod_hi, 32);
4753
+ /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
4754
+ xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
4533
4755
  }
4534
4756
  }
4535
4757
  }
4536
-
4537
4758
  #endif
4538
4759
 
4539
4760
  #if (XXH_VECTOR == XXH_VSX)
@@ -4544,23 +4765,23 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
4544
4765
  const void* XXH_RESTRICT secret)
4545
4766
  {
4546
4767
  /* presumed aligned */
4547
- unsigned int* const xacc = (unsigned int*) acc;
4548
- xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
4549
- xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
4768
+ xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
4769
+ xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */
4770
+ xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */
4550
4771
  xxh_u64x2 const v32 = { 32, 32 };
4551
4772
  size_t i;
4552
4773
  for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
4553
4774
  /* data_vec = xinput[i]; */
4554
- xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + i);
4775
+ xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
4555
4776
  /* key_vec = xsecret[i]; */
4556
- xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
4777
+ xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);
4557
4778
  xxh_u64x2 const data_key = data_vec ^ key_vec;
4558
4779
  /* shuffled = (data_key << 32) | (data_key >> 32); */
4559
4780
  xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
4560
4781
  /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
4561
4782
  xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
4562
4783
  /* acc_vec = xacc[i]; */
4563
- xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
4784
+ xxh_u64x2 acc_vec = xacc[i];
4564
4785
  acc_vec += product;
4565
4786
 
4566
4787
  /* swap high and low halves */
@@ -4569,18 +4790,18 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
4569
4790
  #else
4570
4791
  acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
4571
4792
  #endif
4572
- /* xacc[i] = acc_vec; */
4573
- vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
4793
+ xacc[i] = acc_vec;
4574
4794
  }
4575
4795
  }
4796
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)
4576
4797
 
4577
4798
  XXH_FORCE_INLINE void
4578
4799
  XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4579
4800
  {
4580
4801
  XXH_ASSERT((((size_t)acc) & 15) == 0);
4581
4802
 
4582
- { xxh_u64x2* const xacc = (xxh_u64x2*) acc;
4583
- const xxh_u64x2* const xsecret = (const xxh_u64x2*) secret;
4803
+ { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
4804
+ const xxh_u8* const xsecret = (const xxh_u8*) secret;
4584
4805
  /* constants */
4585
4806
  xxh_u64x2 const v32 = { 32, 32 };
4586
4807
  xxh_u64x2 const v47 = { 47, 47 };
@@ -4592,7 +4813,7 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4592
4813
  xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
4593
4814
 
4594
4815
  /* xacc[i] ^= xsecret[i]; */
4595
- xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + i);
4816
+ xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i);
4596
4817
  xxh_u64x2 const data_key = data_vec ^ key_vec;
4597
4818
 
4598
4819
  /* xacc[i] *= XXH_PRIME32_1 */
@@ -4606,8 +4827,148 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4606
4827
 
4607
4828
  #endif
4608
4829
 
4830
+ #if (XXH_VECTOR == XXH_SVE)
4831
+
4832
+ XXH_FORCE_INLINE void
4833
+ XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
4834
+ const void* XXH_RESTRICT input,
4835
+ const void* XXH_RESTRICT secret)
4836
+ {
4837
+ uint64_t *xacc = (uint64_t *)acc;
4838
+ const uint64_t *xinput = (const uint64_t *)(const void *)input;
4839
+ const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
4840
+ svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
4841
+ uint64_t element_count = svcntd();
4842
+ if (element_count >= 8) {
4843
+ svbool_t mask = svptrue_pat_b64(SV_VL8);
4844
+ svuint64_t vacc = svld1_u64(mask, xacc);
4845
+ ACCRND(vacc, 0);
4846
+ svst1_u64(mask, xacc, vacc);
4847
+ } else if (element_count == 2) { /* sve128 */
4848
+ svbool_t mask = svptrue_pat_b64(SV_VL2);
4849
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
4850
+ svuint64_t acc1 = svld1_u64(mask, xacc + 2);
4851
+ svuint64_t acc2 = svld1_u64(mask, xacc + 4);
4852
+ svuint64_t acc3 = svld1_u64(mask, xacc + 6);
4853
+ ACCRND(acc0, 0);
4854
+ ACCRND(acc1, 2);
4855
+ ACCRND(acc2, 4);
4856
+ ACCRND(acc3, 6);
4857
+ svst1_u64(mask, xacc + 0, acc0);
4858
+ svst1_u64(mask, xacc + 2, acc1);
4859
+ svst1_u64(mask, xacc + 4, acc2);
4860
+ svst1_u64(mask, xacc + 6, acc3);
4861
+ } else {
4862
+ svbool_t mask = svptrue_pat_b64(SV_VL4);
4863
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
4864
+ svuint64_t acc1 = svld1_u64(mask, xacc + 4);
4865
+ ACCRND(acc0, 0);
4866
+ ACCRND(acc1, 4);
4867
+ svst1_u64(mask, xacc + 0, acc0);
4868
+ svst1_u64(mask, xacc + 4, acc1);
4869
+ }
4870
+ }
4871
+
4872
+ XXH_FORCE_INLINE void
4873
+ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
4874
+ const xxh_u8* XXH_RESTRICT input,
4875
+ const xxh_u8* XXH_RESTRICT secret,
4876
+ size_t nbStripes)
4877
+ {
4878
+ if (nbStripes != 0) {
4879
+ uint64_t *xacc = (uint64_t *)acc;
4880
+ const uint64_t *xinput = (const uint64_t *)(const void *)input;
4881
+ const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
4882
+ svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
4883
+ uint64_t element_count = svcntd();
4884
+ if (element_count >= 8) {
4885
+ svbool_t mask = svptrue_pat_b64(SV_VL8);
4886
+ svuint64_t vacc = svld1_u64(mask, xacc + 0);
4887
+ do {
4888
+ /* svprfd(svbool_t, void *, enum svfprop); */
4889
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
4890
+ ACCRND(vacc, 0);
4891
+ xinput += 8;
4892
+ xsecret += 1;
4893
+ nbStripes--;
4894
+ } while (nbStripes != 0);
4895
+
4896
+ svst1_u64(mask, xacc + 0, vacc);
4897
+ } else if (element_count == 2) { /* sve128 */
4898
+ svbool_t mask = svptrue_pat_b64(SV_VL2);
4899
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
4900
+ svuint64_t acc1 = svld1_u64(mask, xacc + 2);
4901
+ svuint64_t acc2 = svld1_u64(mask, xacc + 4);
4902
+ svuint64_t acc3 = svld1_u64(mask, xacc + 6);
4903
+ do {
4904
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
4905
+ ACCRND(acc0, 0);
4906
+ ACCRND(acc1, 2);
4907
+ ACCRND(acc2, 4);
4908
+ ACCRND(acc3, 6);
4909
+ xinput += 8;
4910
+ xsecret += 1;
4911
+ nbStripes--;
4912
+ } while (nbStripes != 0);
4913
+
4914
+ svst1_u64(mask, xacc + 0, acc0);
4915
+ svst1_u64(mask, xacc + 2, acc1);
4916
+ svst1_u64(mask, xacc + 4, acc2);
4917
+ svst1_u64(mask, xacc + 6, acc3);
4918
+ } else {
4919
+ svbool_t mask = svptrue_pat_b64(SV_VL4);
4920
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
4921
+ svuint64_t acc1 = svld1_u64(mask, xacc + 4);
4922
+ do {
4923
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
4924
+ ACCRND(acc0, 0);
4925
+ ACCRND(acc1, 4);
4926
+ xinput += 8;
4927
+ xsecret += 1;
4928
+ nbStripes--;
4929
+ } while (nbStripes != 0);
4930
+
4931
+ svst1_u64(mask, xacc + 0, acc0);
4932
+ svst1_u64(mask, xacc + 4, acc1);
4933
+ }
4934
+ }
4935
+ }
4936
+
4937
+ #endif
4938
+
4609
4939
  /* scalar variants - universal */
4610
4940
 
4941
+ #if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
4942
+ /*
4943
+ * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
4944
+ * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
4945
+ *
4946
+ * While this might not seem like much, as AArch64 is a 64-bit architecture, only
4947
+ * big Cortex designs have a full 64-bit multiplier.
4948
+ *
4949
+ * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
4950
+ * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
4951
+ * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
4952
+ *
4953
+ * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
4954
+ * not have this penalty and does the mask automatically.
4955
+ */
4956
+ XXH_FORCE_INLINE xxh_u64
4957
+ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
4958
+ {
4959
+ xxh_u64 ret;
4960
+ /* note: %x = 64-bit register, %w = 32-bit register */
4961
+ __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
4962
+ return ret;
4963
+ }
4964
+ #else
4965
+ XXH_FORCE_INLINE xxh_u64
4966
+ XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
4967
+ {
4968
+ return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
4969
+ }
4970
+ #endif
4971
+
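Both branches above must compute the same thing: an unsigned 32x32-to-64 multiply of the low halves of lhs and rhs, plus a full 64-bit accumulator, which is exactly what UMADDL does. A small self-contained check of that identity (hypothetical test code, not part of the header):

    #include <assert.h>
    #include <stdint.h>

    /* Reference form of XXH_mult32to64_add64: only the low 32 bits of
     * lhs and rhs participate; the add is a plain 64-bit add. */
    static uint64_t mult32to64_add64_ref(uint64_t lhs, uint64_t rhs, uint64_t acc)
    {
        return (uint64_t)(uint32_t)lhs * (uint64_t)(uint32_t)rhs + acc;
    }

    int main(void)
    {
        /* the high bits of lhs are ignored: 0x100000003 contributes only 3 */
        assert(mult32to64_add64_ref(0x100000003ULL, 5, 7) == 3 * 5 + 7);
        assert(mult32to64_add64_ref(0xFFFFFFFFULL, 0xFFFFFFFFULL, 1)
               == 0xFFFFFFFE00000002ULL);
        return 0;
    }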
4611
4972
  /*!
4612
4973
  * @internal
4613
4974
  * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
@@ -4630,7 +4991,7 @@ XXH3_scalarRound(void* XXH_RESTRICT acc,
4630
4991
  xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
4631
4992
  xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
4632
4993
  xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
4633
- xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4994
+ xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
4634
4995
  }
4635
4996
  }
4636
4997
 
@@ -4655,6 +5016,7 @@ XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
4655
5016
  XXH3_scalarRound(acc, input, secret, i);
4656
5017
  }
4657
5018
  }
5019
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)
4658
5020
 
4659
5021
  /*!
4660
5022
  * @internal
@@ -4706,10 +5068,10 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4706
5068
  const xxh_u8* kSecretPtr = XXH3_kSecret;
4707
5069
  XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
4708
5070
 
4709
- #if defined(__clang__) && defined(__aarch64__)
5071
+ #if defined(__GNUC__) && defined(__aarch64__)
4710
5072
  /*
4711
5073
  * UGLY HACK:
4712
- * Clang generates a bunch of MOV/MOVK pairs for aarch64, and they are
5074
+ * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
4713
5075
  * placed sequentially, in order, at the top of the unrolled loop.
4714
5076
  *
4715
5077
  * While MOVK is great for generating constants (2 cycles for a 64-bit
@@ -4724,7 +5086,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4724
5086
  * ADD
4725
5087
  * SUB STR
4726
5088
  * STR
4727
- * By forcing loads from memory (as the asm line causes Clang to assume
5089
+ * By forcing loads from memory (as the asm line causes the compiler to assume
4728
5090
  * that XXH3_kSecretPtr has been changed), the pipelines are used more
4729
5091
  * efficiently:
4730
5092
  * I L S
@@ -4741,17 +5103,11 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4741
5103
  */
4742
5104
  XXH_COMPILER_GUARD(kSecretPtr);
4743
5105
  #endif
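The guard's definition lies outside this hunk, so the following is an assumption: XXH_COMPILER_GUARD is typically the empty inline-asm register barrier sketched below, which is what makes the compiler believe kSecretPtr may have changed and forces the loads described in the comment above.

    /* Hypothetical sketch of the guard idiom (the real macro may differ):
     * the empty asm claims to read and update `var` in a register, so the
     * optimizer can no longer constant-fold accesses through it. */
    #define COMPILER_GUARD_SKETCH(var)  __asm__ ("" : "+r" (var))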
4744
- /*
4745
- * Note: in debug mode, this overrides the asm optimization
4746
- * and Clang will emit MOVK chains again.
4747
- */
4748
- XXH_ASSERT(kSecretPtr == XXH3_kSecret);
4749
-
4750
5106
  { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
4751
5107
  int i;
4752
5108
  for (i=0; i < nbRounds; i++) {
4753
5109
  /*
4754
- * The asm hack causes Clang to assume that kSecretPtr aliases with
5110
+ * The asm hack causes the compiler to assume that kSecretPtr aliases with
4755
5111
  * customSecret, and on aarch64, this prevented LDP from merging two
4756
5112
  * loads together for free. Putting the loads together before the stores
4757
5113
  * properly generates LDP.
@@ -4764,7 +5120,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4764
5120
  }
4765
5121
 
4766
5122
 
4767
- typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
5123
+ typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
4768
5124
  typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
4769
5125
  typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
4770
5126
 
@@ -4772,36 +5128,48 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
4772
5128
  #if (XXH_VECTOR == XXH_AVX512)
4773
5129
 
4774
5130
  #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
5131
+ #define XXH3_accumulate XXH3_accumulate_avx512
4775
5132
  #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
4776
5133
  #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
4777
5134
 
4778
5135
  #elif (XXH_VECTOR == XXH_AVX2)
4779
5136
 
4780
5137
  #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
5138
+ #define XXH3_accumulate XXH3_accumulate_avx2
4781
5139
  #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
4782
5140
  #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
4783
5141
 
4784
5142
  #elif (XXH_VECTOR == XXH_SSE2)
4785
5143
 
4786
5144
  #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
5145
+ #define XXH3_accumulate XXH3_accumulate_sse2
4787
5146
  #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
4788
5147
  #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
4789
5148
 
4790
5149
  #elif (XXH_VECTOR == XXH_NEON)
4791
5150
 
4792
5151
  #define XXH3_accumulate_512 XXH3_accumulate_512_neon
5152
+ #define XXH3_accumulate XXH3_accumulate_neon
4793
5153
  #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
4794
5154
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4795
5155
 
4796
5156
  #elif (XXH_VECTOR == XXH_VSX)
4797
5157
 
4798
5158
  #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
5159
+ #define XXH3_accumulate XXH3_accumulate_vsx
4799
5160
  #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
4800
5161
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4801
5162
 
5163
+ #elif (XXH_VECTOR == XXH_SVE)
5164
+ #define XXH3_accumulate_512 XXH3_accumulate_512_sve
5165
+ #define XXH3_accumulate XXH3_accumulate_sve
5166
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
5167
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
5168
+
4802
5169
  #else /* scalar */
4803
5170
 
4804
5171
  #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
5172
+ #define XXH3_accumulate XXH3_accumulate_scalar
4805
5173
  #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
4806
5174
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4807
5175
 
@@ -4812,45 +5180,11 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
4812
5180
  # define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4813
5181
  #endif
4814
5182
 
4815
- #ifndef XXH_PREFETCH_DIST
4816
- # ifdef __clang__
4817
- # define XXH_PREFETCH_DIST 320
4818
- # else
4819
- # if (XXH_VECTOR == XXH_AVX512)
4820
- # define XXH_PREFETCH_DIST 512
4821
- # else
4822
- # define XXH_PREFETCH_DIST 384
4823
- # endif
4824
- # endif /* __clang__ */
4825
- #endif /* XXH_PREFETCH_DIST */
4826
-
4827
- /*
4828
- * XXH3_accumulate()
4829
- * Loops over XXH3_accumulate_512().
4830
- * Assumption: nbStripes will not overflow the secret size
4831
- */
4832
- XXH_FORCE_INLINE void
4833
- XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,
4834
- const xxh_u8* XXH_RESTRICT input,
4835
- const xxh_u8* XXH_RESTRICT secret,
4836
- size_t nbStripes,
4837
- XXH3_f_accumulate_512 f_acc512)
4838
- {
4839
- size_t n;
4840
- for (n = 0; n < nbStripes; n++ ) {
4841
- const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
4842
- XXH_PREFETCH(in + XXH_PREFETCH_DIST);
4843
- f_acc512(acc,
4844
- in,
4845
- secret + n*XXH_SECRET_CONSUME_RATE);
4846
- }
4847
- }
4848
-
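The per-target XXH3_accumulate_<isa> routines declared by the new XXH3_ACCUMULATE_TEMPLATE(...) lines presumably take over the role of the generic loop removed above. Under that assumption, a sketch of what such a generated loop looks like for one target (prefetch details omitted; this is not the macro's actual expansion, and the _sketch name is made up for illustration):

    XXH_FORCE_INLINE void
    XXH3_accumulate_scalar_sketch(xxh_u64* XXH_RESTRICT acc,
                                  const xxh_u8* XXH_RESTRICT input,
                                  const xxh_u8* XXH_RESTRICT secret,
                                  size_t nbStripes)
    {
        size_t n;
        for (n = 0; n < nbStripes; n++) {
            /* one 64-byte stripe per iteration; the secret advances at the consume rate */
            XXH3_accumulate_512_scalar(acc,
                                       input  + n * XXH_STRIPE_LEN,
                                       secret + n * XXH_SECRET_CONSUME_RATE);
        }
    }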
4849
5183
  XXH_FORCE_INLINE void
4850
5184
  XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4851
5185
  const xxh_u8* XXH_RESTRICT input, size_t len,
4852
5186
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4853
- XXH3_f_accumulate_512 f_acc512,
5187
+ XXH3_f_accumulate f_acc,
4854
5188
  XXH3_f_scrambleAcc f_scramble)
4855
5189
  {
4856
5190
  size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
@@ -4862,7 +5196,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4862
5196
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4863
5197
 
4864
5198
  for (n = 0; n < nb_blocks; n++) {
4865
- XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
5199
+ f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
4866
5200
  f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
4867
5201
  }
4868
5202
 
@@ -4870,12 +5204,12 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4870
5204
  XXH_ASSERT(len > XXH_STRIPE_LEN);
4871
5205
  { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
4872
5206
  XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
4873
- XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
5207
+ f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
4874
5208
 
4875
5209
  /* last stripe */
4876
5210
  { const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
4877
5211
  #define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */
4878
- f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
5212
+ XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
4879
5213
  } }
4880
5214
  }
4881
5215
 
@@ -4920,12 +5254,12 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre
4920
5254
  XXH_FORCE_INLINE XXH64_hash_t
4921
5255
  XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
4922
5256
  const void* XXH_RESTRICT secret, size_t secretSize,
4923
- XXH3_f_accumulate_512 f_acc512,
5257
+ XXH3_f_accumulate f_acc,
4924
5258
  XXH3_f_scrambleAcc f_scramble)
4925
5259
  {
4926
5260
  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
4927
5261
 
4928
- XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
5262
+ XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
4929
5263
 
4930
5264
  /* converge into final hash */
4931
5265
  XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -4939,13 +5273,15 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
4939
5273
  * It's important for performance to transmit secret's size (when it's static)
4940
5274
  * so that the compiler can properly optimize the vectorized loop.
4941
5275
  * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
5276
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
5277
+ * breaks -Og, this is XXH_NO_INLINE.
4942
5278
  */
4943
- XXH_FORCE_INLINE XXH64_hash_t
5279
+ XXH3_WITH_SECRET_INLINE XXH64_hash_t
4944
5280
  XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
4945
5281
  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4946
5282
  {
4947
5283
  (void)seed64;
4948
- return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
5284
+ return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
4949
5285
  }
4950
5286
 
4951
5287
  /*
@@ -4959,7 +5295,7 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
4959
5295
  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4960
5296
  {
4961
5297
  (void)seed64; (void)secret; (void)secretLen;
4962
- return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
5298
+ return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
4963
5299
  }
4964
5300
 
4965
5301
  /*
@@ -4976,7 +5312,7 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
4976
5312
  XXH_FORCE_INLINE XXH64_hash_t
4977
5313
  XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
4978
5314
  XXH64_hash_t seed,
4979
- XXH3_f_accumulate_512 f_acc512,
5315
+ XXH3_f_accumulate f_acc,
4980
5316
  XXH3_f_scrambleAcc f_scramble,
4981
5317
  XXH3_f_initCustomSecret f_initSec)
4982
5318
  {
@@ -4984,12 +5320,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
4984
5320
  if (seed == 0)
4985
5321
  return XXH3_hashLong_64b_internal(input, len,
4986
5322
  XXH3_kSecret, sizeof(XXH3_kSecret),
4987
- f_acc512, f_scramble);
5323
+ f_acc, f_scramble);
4988
5324
  #endif
4989
5325
  { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
4990
5326
  f_initSec(secret, seed);
4991
5327
  return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
4992
- f_acc512, f_scramble);
5328
+ f_acc, f_scramble);
4993
5329
  }
4994
5330
  }
4995
5331
 
@@ -4997,12 +5333,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
4997
5333
  * It's important for performance that XXH3_hashLong is not inlined.
4998
5334
  */
4999
5335
  XXH_NO_INLINE XXH64_hash_t
5000
- XXH3_hashLong_64b_withSeed(const void* input, size_t len,
5001
- XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
5336
+ XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
5337
+ XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
5002
5338
  {
5003
5339
  (void)secret; (void)secretLen;
5004
5340
  return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
5005
- XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
5341
+ XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
5006
5342
  }
5007
5343
 
5008
5344
 
@@ -5035,27 +5371,27 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
5035
5371
  /* === Public entry point === */
5036
5372
 
5037
5373
  /*! @ingroup XXH3_family */
5038
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t length)
5374
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
5039
5375
  {
5040
5376
  return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
5041
5377
  }
5042
5378
 
5043
5379
  /*! @ingroup XXH3_family */
5044
5380
  XXH_PUBLIC_API XXH64_hash_t
5045
- XXH3_64bits_withSecret(const void* input, size_t length, const void* secret, size_t secretSize)
5381
+ XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
5046
5382
  {
5047
5383
  return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
5048
5384
  }
5049
5385
 
5050
5386
  /*! @ingroup XXH3_family */
5051
5387
  XXH_PUBLIC_API XXH64_hash_t
5052
- XXH3_64bits_withSeed(const void* input, size_t length, XXH64_hash_t seed)
5388
+ XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
5053
5389
  {
5054
5390
  return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
5055
5391
  }
5056
5392
 
5057
5393
  XXH_PUBLIC_API XXH64_hash_t
5058
- XXH3_64bits_withSecretandSeed(const void* input, size_t length, const void* secret, size_t secretSize, XXH64_hash_t seed)
5394
+ XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
5059
5395
  {
5060
5396
  if (length <= XXH3_MIDSIZE_MAX)
5061
5397
  return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
@@ -5148,7 +5484,7 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
5148
5484
 
5149
5485
  /*! @ingroup XXH3_family */
5150
5486
  XXH_PUBLIC_API void
5151
- XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
5487
+ XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
5152
5488
  {
5153
5489
  XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
5154
5490
  }
@@ -5182,7 +5518,7 @@ XXH3_reset_internal(XXH3_state_t* statePtr,
5182
5518
 
5183
5519
  /*! @ingroup XXH3_family */
5184
5520
  XXH_PUBLIC_API XXH_errorcode
5185
- XXH3_64bits_reset(XXH3_state_t* statePtr)
5521
+ XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
5186
5522
  {
5187
5523
  if (statePtr == NULL) return XXH_ERROR;
5188
5524
  XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
@@ -5191,7 +5527,7 @@ XXH3_64bits_reset(XXH3_state_t* statePtr)
5191
5527
 
5192
5528
  /*! @ingroup XXH3_family */
5193
5529
  XXH_PUBLIC_API XXH_errorcode
5194
- XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
5530
+ XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
5195
5531
  {
5196
5532
  if (statePtr == NULL) return XXH_ERROR;
5197
5533
  XXH3_reset_internal(statePtr, 0, secret, secretSize);
@@ -5202,7 +5538,7 @@ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t
5202
5538
 
5203
5539
  /*! @ingroup XXH3_family */
5204
5540
  XXH_PUBLIC_API XXH_errorcode
5205
- XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
5541
+ XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
5206
5542
  {
5207
5543
  if (statePtr == NULL) return XXH_ERROR;
5208
5544
  if (seed==0) return XXH3_64bits_reset(statePtr);
@@ -5214,7 +5550,7 @@ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
5214
5550
 
5215
5551
  /*! @ingroup XXH3_family */
5216
5552
  XXH_PUBLIC_API XXH_errorcode
5217
- XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
5553
+ XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
5218
5554
  {
5219
5555
  if (statePtr == NULL) return XXH_ERROR;
5220
5556
  if (secret == NULL) return XXH_ERROR;
@@ -5224,31 +5560,57 @@ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret,
5224
5560
  return XXH_OK;
5225
5561
  }
5226
5562
 
5227
- /* Note : when XXH3_consumeStripes() is invoked,
5228
- * there must be a guarantee that at least one more byte must be consumed from input
5229
- * so that the function can blindly consume all stripes using the "normal" secret segment */
5230
- XXH_FORCE_INLINE void
5563
+ /*!
5564
+ * @internal
5565
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
5566
+ *
5567
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
5568
+ *
5569
+ * @param acc Pointer to the 8 accumulator lanes
5570
+ * @param nbStripesSoFarPtr In/out pointer to the number of stripes already consumed from the current block
5571
+ * @param nbStripesPerBlock Number of stripes in a block
5572
+ * @param input Input pointer
5573
+ * @param nbStripes Number of stripes to process
5574
+ * @param secret Secret pointer
5575
+ * @param secretLimit Offset of the last block in @p secret
5576
+ * @param f_acc Pointer to an XXH3_accumulate implementation
5577
+ * @param f_scramble Pointer to an XXH3_scrambleAcc implementation
5578
+ * @return Pointer past the end of @p input after processing
5579
+ */
5580
+ XXH_FORCE_INLINE const xxh_u8 *
5231
5581
  XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
5232
5582
  size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
5233
5583
  const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
5234
5584
  const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
5235
- XXH3_f_accumulate_512 f_acc512,
5585
+ XXH3_f_accumulate f_acc,
5236
5586
  XXH3_f_scrambleAcc f_scramble)
5237
5587
  {
5238
- XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */
5239
- XXH_ASSERT(*nbStripesSoFarPtr < nbStripesPerBlock);
5240
- if (nbStripesPerBlock - *nbStripesSoFarPtr <= nbStripes) {
5241
- /* need a scrambling operation */
5242
- size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
5243
- size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
5244
- XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
5245
- f_scramble(acc, secret + secretLimit);
5246
- XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
5247
- *nbStripesSoFarPtr = nbStripesAfterBlock;
5248
- } else {
5249
- XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
5588
+ const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
5589
+ /* Process full blocks */
5590
+ if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
5591
+ /* Process the initial partial block... */
5592
+ size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
5593
+
5594
+ do {
5595
+ /* Accumulate and scramble */
5596
+ f_acc(acc, input, initialSecret, nbStripesThisIter);
5597
+ f_scramble(acc, secret + secretLimit);
5598
+ input += nbStripesThisIter * XXH_STRIPE_LEN;
5599
+ nbStripes -= nbStripesThisIter;
5600
+ /* Then continue the loop with the full block size */
5601
+ nbStripesThisIter = nbStripesPerBlock;
5602
+ initialSecret = secret;
5603
+ } while (nbStripes >= nbStripesPerBlock);
5604
+ *nbStripesSoFarPtr = 0;
5605
+ }
5606
+ /* Process a partial block */
5607
+ if (nbStripes > 0) {
5608
+ f_acc(acc, input, initialSecret, nbStripes);
5609
+ input += nbStripes * XXH_STRIPE_LEN;
5250
5610
  *nbStripesSoFarPtr += nbStripes;
5251
5611
  }
5612
+ /* Return end pointer */
5613
+ return input;
5252
5614
  }
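A worked trace of the rewritten control flow, with illustrative numbers that are not taken from the source: nbStripesPerBlock = 16, *nbStripesSoFarPtr = 10, nbStripes = 40.

    /*
     * 1st do-iteration : f_acc(6 stripes)  + f_scramble   (finishes the open block; 34 left)
     * 2nd do-iteration : f_acc(16 stripes) + f_scramble   (full block; 18 left)
     * 3rd do-iteration : f_acc(16 stripes) + f_scramble   (full block; 2 left)
     * loop exits (2 < 16), *nbStripesSoFarPtr = 0
     * partial tail     : f_acc(2 stripes), no scramble, *nbStripesSoFarPtr = 2
     * return value     : input + 40 * XXH_STRIPE_LEN
     */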
5253
5615
 
5254
5616
  #ifndef XXH3_STREAM_USE_STACK
@@ -5262,7 +5624,7 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
5262
5624
  XXH_FORCE_INLINE XXH_errorcode
5263
5625
  XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
5264
5626
  const xxh_u8* XXH_RESTRICT input, size_t len,
5265
- XXH3_f_accumulate_512 f_acc512,
5627
+ XXH3_f_accumulate f_acc,
5266
5628
  XXH3_f_scrambleAcc f_scramble)
5267
5629
  {
5268
5630
  if (input==NULL) {
@@ -5278,7 +5640,8 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
5278
5640
  * when operating accumulators directly into state.
5279
5641
  * Operating into stack space seems to enable proper optimization.
5280
5642
  * clang, on the other hand, doesn't seem to need this trick */
5281
- XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
5643
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
5644
+ XXH_memcpy(acc, state->acc, sizeof(acc));
5282
5645
  #else
5283
5646
  xxh_u64* XXH_RESTRICT const acc = state->acc;
5284
5647
  #endif
@@ -5286,7 +5649,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
5286
5649
  XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
5287
5650
 
5288
5651
  /* small input : just fill in tmp buffer */
5289
- if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
5652
+ if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
5290
5653
  XXH_memcpy(state->buffer + state->bufferedSize, input, len);
5291
5654
  state->bufferedSize += (XXH32_hash_t)len;
5292
5655
  return XXH_OK;
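The reworked condition above is presumably the overflow-safe form of the same test: bufferedSize is already known to be at most XXH3_INTERNALBUFFER_SIZE (asserted a few lines earlier), so the subtraction cannot wrap, whereas the old left-hand sum could wrap for a pathologically large len. With illustrative numbers, taking an internal buffer of 256 bytes purely for the example:

    /* bufferedSize == 200, len == SIZE_MAX - 100:
     *   old: bufferedSize + len        -> wraps around to a small value and
     *                                     wrongly takes the "small input" path
     *   new: len <= 256 - 200 (== 56)  -> false, large-input path as intended
     */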
@@ -5308,57 +5671,20 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
5308
5671
  &state->nbStripesSoFar, state->nbStripesPerBlock,
5309
5672
  state->buffer, XXH3_INTERNALBUFFER_STRIPES,
5310
5673
  secret, state->secretLimit,
5311
- f_acc512, f_scramble);
5674
+ f_acc, f_scramble);
5312
5675
  state->bufferedSize = 0;
5313
5676
  }
5314
5677
  XXH_ASSERT(input < bEnd);
5315
-
5316
- /* large input to consume : ingest per full block */
5317
- if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
5678
+ if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
5318
5679
  size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
5319
- XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
5320
- /* join to current block's end */
5321
- { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
5322
- XXH_ASSERT(nbStripesToEnd <= nbStripes);
5323
- XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
5324
- f_scramble(acc, secret + state->secretLimit);
5325
- state->nbStripesSoFar = 0;
5326
- input += nbStripesToEnd * XXH_STRIPE_LEN;
5327
- nbStripes -= nbStripesToEnd;
5328
- }
5329
- /* consume per entire blocks */
5330
- while(nbStripes >= state->nbStripesPerBlock) {
5331
- XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
5332
- f_scramble(acc, secret + state->secretLimit);
5333
- input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
5334
- nbStripes -= state->nbStripesPerBlock;
5335
- }
5336
- /* consume last partial block */
5337
- XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
5338
- input += nbStripes * XXH_STRIPE_LEN;
5339
- XXH_ASSERT(input < bEnd); /* at least some bytes left */
5340
- state->nbStripesSoFar = nbStripes;
5341
- /* buffer predecessor of last partial stripe */
5342
- XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5343
- XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
5344
- } else {
5345
- /* content to consume <= block size */
5346
- /* Consume input by a multiple of internal buffer size */
5347
- if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
5348
- const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
5349
- do {
5350
- XXH3_consumeStripes(acc,
5680
+ input = XXH3_consumeStripes(acc,
5351
5681
  &state->nbStripesSoFar, state->nbStripesPerBlock,
5352
- input, XXH3_INTERNALBUFFER_STRIPES,
5353
- secret, state->secretLimit,
5354
- f_acc512, f_scramble);
5355
- input += XXH3_INTERNALBUFFER_SIZE;
5356
- } while (input<limit);
5357
- /* buffer predecessor of last partial stripe */
5358
- XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5359
- }
5360
- }
5682
+ input, nbStripes,
5683
+ secret, state->secretLimit,
5684
+ f_acc, f_scramble);
5685
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5361
5686
 
5687
+ }
5362
5688
  /* Some remaining input (always) : buffer it */
5363
5689
  XXH_ASSERT(input < bEnd);
5364
5690
  XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
@@ -5367,7 +5693,7 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
5367
5693
  state->bufferedSize = (XXH32_hash_t)(bEnd-input);
5368
5694
  #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
5369
5695
  /* save stack accumulators into state */
5370
- memcpy(state->acc, acc, sizeof(acc));
5696
+ XXH_memcpy(state->acc, acc, sizeof(acc));
5371
5697
  #endif
5372
5698
  }
5373
5699
 
@@ -5376,10 +5702,10 @@ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
5376
5702
 
5377
5703
  /*! @ingroup XXH3_family */
5378
5704
  XXH_PUBLIC_API XXH_errorcode
5379
- XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
5705
+ XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
5380
5706
  {
5381
5707
  return XXH3_update(state, (const xxh_u8*)input, len,
5382
- XXH3_accumulate_512, XXH3_scrambleAcc);
5708
+ XXH3_accumulate, XXH3_scrambleAcc);
5383
5709
  }
5384
5710
 
5385
5711
 
@@ -5388,37 +5714,40 @@ XXH3_digest_long (XXH64_hash_t* acc,
5388
5714
  const XXH3_state_t* state,
5389
5715
  const unsigned char* secret)
5390
5716
  {
5717
+ xxh_u8 lastStripe[XXH_STRIPE_LEN];
5718
+ const xxh_u8* lastStripePtr;
5719
+
5391
5720
  /*
5392
5721
  * Digest on a local copy. This way, the state remains unaltered, and it can
5393
5722
  * continue ingesting more input afterwards.
5394
5723
  */
5395
5724
  XXH_memcpy(acc, state->acc, sizeof(state->acc));
5396
5725
  if (state->bufferedSize >= XXH_STRIPE_LEN) {
5726
+ /* Consume remaining stripes then point to remaining data in buffer */
5397
5727
  size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
5398
5728
  size_t nbStripesSoFar = state->nbStripesSoFar;
5399
5729
  XXH3_consumeStripes(acc,
5400
5730
  &nbStripesSoFar, state->nbStripesPerBlock,
5401
5731
  state->buffer, nbStripes,
5402
5732
  secret, state->secretLimit,
5403
- XXH3_accumulate_512, XXH3_scrambleAcc);
5404
- /* last stripe */
5405
- XXH3_accumulate_512(acc,
5406
- state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
5407
- secret + state->secretLimit - XXH_SECRET_LASTACC_START);
5733
+ XXH3_accumulate, XXH3_scrambleAcc);
5734
+ lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
5408
5735
  } else { /* bufferedSize < XXH_STRIPE_LEN */
5409
- xxh_u8 lastStripe[XXH_STRIPE_LEN];
5736
+ /* Copy to temp buffer */
5410
5737
  size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
5411
5738
  XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
5412
5739
  XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
5413
5740
  XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
5414
- XXH3_accumulate_512(acc,
5415
- lastStripe,
5416
- secret + state->secretLimit - XXH_SECRET_LASTACC_START);
5741
+ lastStripePtr = lastStripe;
5417
5742
  }
5743
+ /* Last stripe */
5744
+ XXH3_accumulate_512(acc,
5745
+ lastStripePtr,
5746
+ secret + state->secretLimit - XXH_SECRET_LASTACC_START);
5418
5747
  }
5419
5748
 
5420
5749
  /*! @ingroup XXH3_family */
5421
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
5750
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
5422
5751
  {
5423
5752
  const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
5424
5753
  if (state->totalLen > XXH3_MIDSIZE_MAX) {
@@ -5631,7 +5960,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5631
5960
  #if XXH_SIZE_OPT >= 1
5632
5961
  {
5633
5962
  /* Smaller, but slightly slower. */
5634
- size_t i = (len - 1) / 32;
5963
+ unsigned int i = (unsigned int)(len - 1) / 32;
5635
5964
  do {
5636
5965
  acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
5637
5966
  } while (i-- != 0);
@@ -5669,25 +5998,34 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5669
5998
  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
5670
5999
 
5671
6000
  { XXH128_hash_t acc;
5672
- int const nbRounds = (int)len / 32;
5673
- int i;
6001
+ unsigned i;
5674
6002
  acc.low64 = len * XXH_PRIME64_1;
5675
6003
  acc.high64 = 0;
5676
- for (i=0; i<4; i++) {
6004
+ /*
6005
+ * We set `i` to the offset + 32. We do this so that the unchanged
6006
+ * `len` can be used as the upper bound. This reaches a sweet spot
6007
+ * where both x86 and aarch64 get simple address generation and good codegen
6008
+ * for the loop.
6009
+ */
6010
+ for (i = 32; i < 160; i += 32) {
5677
6011
  acc = XXH128_mix32B(acc,
5678
- input + (32 * i),
5679
- input + (32 * i) + 16,
5680
- secret + (32 * i),
6012
+ input + i - 32,
6013
+ input + i - 16,
6014
+ secret + i - 32,
5681
6015
  seed);
5682
6016
  }
5683
6017
  acc.low64 = XXH3_avalanche(acc.low64);
5684
6018
  acc.high64 = XXH3_avalanche(acc.high64);
5685
- XXH_ASSERT(nbRounds >= 4);
5686
- for (i=4 ; i < nbRounds; i++) {
6019
+ /*
6020
+ * NB: `i <= len` will duplicate the last 32 bytes if
6021
+ * len % 32 was zero. This is an unfortunate necessity to keep
6022
+ * the hash result stable.
6023
+ */
6024
+ for (i=160; i <= len; i += 32) {
5687
6025
  acc = XXH128_mix32B(acc,
5688
- input + (32 * i),
5689
- input + (32 * i) + 16,
5690
- secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
6026
+ input + i - 32,
6027
+ input + i - 16,
6028
+ secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
5691
6029
  seed);
5692
6030
  }
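A worked example of the new index scheme for, say, len = 200 (a value chosen purely for illustration):

    /* First loop,  i = 32,64,96,128 : mixes input[0..31], [32..63], [64..95], [96..127]
     *                                 against secret[0..], [32..], [64..], [96..]
     * Second loop, i = 160,192      : mixes input[128..159] and input[160..191]
     *                                 against secret[XXH3_MIDSIZE_STARTOFFSET + 0 / + 32]
     * i = 224 > len stops the loop; the trailing bytes are handled by the
     * "last bytes" mix below, which always covers input[len-32..len-1].
     * For len = 192, the i = 192 pass and that final mix both cover
     * input[160..191], which is the duplication the NB above refers to.
     */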
5693
6031
  /* last bytes */
@@ -5695,7 +6033,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5695
6033
  input + len - 16,
5696
6034
  input + len - 32,
5697
6035
  secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
5698
- 0ULL - seed);
6036
+ (XXH64_hash_t)0 - seed);
5699
6037
 
5700
6038
  { XXH128_hash_t h128;
5701
6039
  h128.low64 = acc.low64 + acc.high64;
@@ -5712,12 +6050,12 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5712
6050
  XXH_FORCE_INLINE XXH128_hash_t
5713
6051
  XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
5714
6052
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
5715
- XXH3_f_accumulate_512 f_acc512,
6053
+ XXH3_f_accumulate f_acc,
5716
6054
  XXH3_f_scrambleAcc f_scramble)
5717
6055
  {
5718
6056
  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
5719
6057
 
5720
- XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
6058
+ XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
5721
6059
 
5722
6060
  /* converge into final hash */
5723
6061
  XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -5744,38 +6082,41 @@ XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
5744
6082
  {
5745
6083
  (void)seed64; (void)secret; (void)secretLen;
5746
6084
  return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
5747
- XXH3_accumulate_512, XXH3_scrambleAcc);
6085
+ XXH3_accumulate, XXH3_scrambleAcc);
5748
6086
  }
5749
6087
 
5750
6088
  /*
5751
6089
  * It's important for performance to pass @p secretLen (when it's static)
5752
6090
  * to the compiler, so that it can properly optimize the vectorized loop.
6091
+ *
6092
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
6093
+ * breaks -Og, this is XXH_NO_INLINE.
5753
6094
  */
5754
- XXH_FORCE_INLINE XXH128_hash_t
6095
+ XXH3_WITH_SECRET_INLINE XXH128_hash_t
5755
6096
  XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
5756
6097
  XXH64_hash_t seed64,
5757
6098
  const void* XXH_RESTRICT secret, size_t secretLen)
5758
6099
  {
5759
6100
  (void)seed64;
5760
6101
  return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
5761
- XXH3_accumulate_512, XXH3_scrambleAcc);
6102
+ XXH3_accumulate, XXH3_scrambleAcc);
5762
6103
  }
5763
6104
 
5764
6105
  XXH_FORCE_INLINE XXH128_hash_t
5765
6106
  XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
5766
6107
  XXH64_hash_t seed64,
5767
- XXH3_f_accumulate_512 f_acc512,
6108
+ XXH3_f_accumulate f_acc,
5768
6109
  XXH3_f_scrambleAcc f_scramble,
5769
6110
  XXH3_f_initCustomSecret f_initSec)
5770
6111
  {
5771
6112
  if (seed64 == 0)
5772
6113
  return XXH3_hashLong_128b_internal(input, len,
5773
6114
  XXH3_kSecret, sizeof(XXH3_kSecret),
5774
- f_acc512, f_scramble);
6115
+ f_acc, f_scramble);
5775
6116
  { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
5776
6117
  f_initSec(secret, seed64);
5777
6118
  return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
5778
- f_acc512, f_scramble);
6119
+ f_acc, f_scramble);
5779
6120
  }
5780
6121
  }
5781
6122
 
@@ -5788,7 +6129,7 @@ XXH3_hashLong_128b_withSeed(const void* input, size_t len,
5788
6129
  {
5789
6130
  (void)secret; (void)secretLen;
5790
6131
  return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
5791
- XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
6132
+ XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
5792
6133
  }
5793
6134
 
5794
6135
  typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
@@ -5819,7 +6160,7 @@ XXH3_128bits_internal(const void* input, size_t len,
5819
6160
  /* === Public XXH128 API === */
5820
6161
 
5821
6162
  /*! @ingroup XXH3_family */
5822
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
6163
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
5823
6164
  {
5824
6165
  return XXH3_128bits_internal(input, len, 0,
5825
6166
  XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -5828,7 +6169,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
5828
6169
 
5829
6170
  /*! @ingroup XXH3_family */
5830
6171
  XXH_PUBLIC_API XXH128_hash_t
5831
- XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
6172
+ XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
5832
6173
  {
5833
6174
  return XXH3_128bits_internal(input, len, 0,
5834
6175
  (const xxh_u8*)secret, secretSize,
@@ -5837,7 +6178,7 @@ XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_
5837
6178
 
5838
6179
  /*! @ingroup XXH3_family */
5839
6180
  XXH_PUBLIC_API XXH128_hash_t
5840
- XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
6181
+ XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
5841
6182
  {
5842
6183
  return XXH3_128bits_internal(input, len, seed,
5843
6184
  XXH3_kSecret, sizeof(XXH3_kSecret),
@@ -5846,7 +6187,7 @@ XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
5846
6187
 
5847
6188
  /*! @ingroup XXH3_family */
5848
6189
  XXH_PUBLIC_API XXH128_hash_t
5849
- XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
6190
+ XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
5850
6191
  {
5851
6192
  if (len <= XXH3_MIDSIZE_MAX)
5852
6193
  return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
@@ -5855,7 +6196,7 @@ XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret
5855
6196
 
5856
6197
  /*! @ingroup XXH3_family */
5857
6198
  XXH_PUBLIC_API XXH128_hash_t
5858
- XXH128(const void* input, size_t len, XXH64_hash_t seed)
6199
+ XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
5859
6200
  {
5860
6201
  return XXH3_128bits_withSeed(input, len, seed);
5861
6202
  }
@@ -5870,42 +6211,41 @@ XXH128(const void* input, size_t len, XXH64_hash_t seed)
5870
6211
 
5871
6212
  /*! @ingroup XXH3_family */
5872
6213
  XXH_PUBLIC_API XXH_errorcode
5873
- XXH3_128bits_reset(XXH3_state_t* statePtr)
6214
+ XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
5874
6215
  {
5875
6216
  return XXH3_64bits_reset(statePtr);
5876
6217
  }
5877
6218
 
5878
6219
  /*! @ingroup XXH3_family */
5879
6220
  XXH_PUBLIC_API XXH_errorcode
5880
- XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
6221
+ XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
5881
6222
  {
5882
6223
  return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
5883
6224
  }
5884
6225
 
5885
6226
  /*! @ingroup XXH3_family */
5886
6227
  XXH_PUBLIC_API XXH_errorcode
5887
- XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
6228
+ XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
5888
6229
  {
5889
6230
  return XXH3_64bits_reset_withSeed(statePtr, seed);
5890
6231
  }
5891
6232
 
5892
6233
  /*! @ingroup XXH3_family */
5893
6234
  XXH_PUBLIC_API XXH_errorcode
5894
- XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
6235
+ XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
5895
6236
  {
5896
6237
  return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
5897
6238
  }
5898
6239
 
5899
6240
  /*! @ingroup XXH3_family */
5900
6241
  XXH_PUBLIC_API XXH_errorcode
5901
- XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
6242
+ XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
5902
6243
  {
5903
- return XXH3_update(state, (const xxh_u8*)input, len,
5904
- XXH3_accumulate_512, XXH3_scrambleAcc);
6244
+ return XXH3_64bits_update(state, input, len);
5905
6245
  }
5906
6246
 
5907
6247
  /*! @ingroup XXH3_family */
5908
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
6248
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
5909
6249
  {
5910
6250
  const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
5911
6251
  if (state->totalLen > XXH3_MIDSIZE_MAX) {
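
Note: in the streaming hunk above, XXH3_128bits_update now simply forwards to XXH3_64bits_update, since both variants share the same XXH3_state_t until digest time; the reset/update/digest signatures only gain XXH_NOESCAPE. A minimal sketch of the streaming 128-bit flow (the helper name hash_in_chunks is illustrative only):

    #include <stddef.h>
    #include "xxhash.h"

    /* Hash a buffer fed in two pieces through the streaming 128-bit API. */
    static XXH128_hash_t hash_in_chunks(const void* data, size_t len)
    {
        XXH128_hash_t result = {0, 0};
        XXH3_state_t* st = XXH3_createState();
        if (st == NULL) return result;            /* allocation failure */
        XXH3_128bits_reset(st);                   /* or *_withSeed / *_withSecret */
        XXH3_128bits_update(st, data, len / 2);
        XXH3_128bits_update(st, (const char*)data + len / 2, len - len / 2);
        result = XXH3_128bits_digest(st);
        XXH3_freeState(st);
        return result;
    }
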
@@ -5947,7 +6287,7 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
5947
6287
  * <0 if *h128_1 < *h128_2
5948
6288
  * =0 if *h128_1 == *h128_2 */
5949
6289
  /*! @ingroup XXH3_family */
5950
- XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
6290
+ XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
5951
6291
  {
5952
6292
  XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
5953
6293
  XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
@@ -5961,7 +6301,7 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
5961
6301
  /*====== Canonical representation ======*/
5962
6302
  /*! @ingroup XXH3_family */
5963
6303
  XXH_PUBLIC_API void
5964
- XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
6304
+ XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
5965
6305
  {
5966
6306
  XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
5967
6307
  if (XXH_CPU_LITTLE_ENDIAN) {
@@ -5974,7 +6314,7 @@ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
5974
6314
 
5975
6315
  /*! @ingroup XXH3_family */
5976
6316
  XXH_PUBLIC_API XXH128_hash_t
5977
- XXH128_hashFromCanonical(const XXH128_canonical_t* src)
6317
+ XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
5978
6318
  {
5979
6319
  XXH128_hash_t h;
5980
6320
  h.high64 = XXH_readBE64(src);
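
Note: XXH128_canonicalFromHash and XXH128_hashFromCanonical convert between the in-memory struct and a fixed big-endian byte sequence, which is what keeps stored hashes portable across platforms. A round-trip sketch (the helper name canonical_roundtrip is illustrative):

    #include <assert.h>
    #include "xxhash.h"

    /* Serialize a 128-bit hash to its canonical big-endian bytes and back;
     * the round trip must yield the same value. */
    static void canonical_roundtrip(XXH128_hash_t h)
    {
        XXH128_canonical_t canon;            /* 16 raw bytes, endianness-independent */
        XXH128_canonicalFromHash(&canon, h);
        XXH128_hash_t back = XXH128_hashFromCanonical(&canon);
        assert(XXH128_isEqual(h, back));     /* non-zero when the hashes match */
    }
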
@@ -5998,7 +6338,7 @@ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
5998
6338
 
5999
6339
  /*! @ingroup XXH3_family */
6000
6340
  XXH_PUBLIC_API XXH_errorcode
6001
- XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
6341
+ XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
6002
6342
  {
6003
6343
  #if (XXH_DEBUGLEVEL >= 1)
6004
6344
  XXH_ASSERT(secretBuffer != NULL);
@@ -6043,7 +6383,7 @@ XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSee
6043
6383
 
6044
6384
  /*! @ingroup XXH3_family */
6045
6385
  XXH_PUBLIC_API void
6046
- XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
6386
+ XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
6047
6387
  {
6048
6388
  XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
6049
6389
  XXH3_initCustomSecret(secret, seed);
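
Note: XXH3_generateSecret derives a secret of the requested size from arbitrary seed material, and XXH3_generateSecret_fromSeed expands a 64-bit seed into a default-sized buffer; either result can feed the *_withSecret variants annotated above. A sketch using the header's documented minimum secret size (the helper name and seed string are illustrative):

    #include <string.h>
    #include "xxhash.h"

    static XXH128_hash_t hash_with_custom_secret(const void* data, size_t len)
    {
        unsigned char secret[XXH3_SECRET_SIZE_MIN];   /* 136-byte documented minimum */
        const char seedMaterial[] = "application-specific seed material";

        if (XXH3_generateSecret(secret, sizeof(secret),
                                seedMaterial, sizeof(seedMaterial) - 1) != XXH_OK)
            return XXH3_128bits(data, len);           /* fall back to the unseeded path */

        return XXH3_128bits_withSecret(data, len, secret, sizeof(secret));
    }
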
@@ -6071,5 +6411,5 @@ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
6071
6411
 
6072
6412
 
6073
6413
  #if defined (__cplusplus)
6074
- }
6414
+ } /* extern "C" */
6075
6415
  #endif