digest-xxhash 0.2.1 → 0.2.2 — diff of the bundled xxHash header

@@ -1,7 +1,7 @@
1
1
  /*
2
2
  * xxHash - Extremely Fast Hash algorithm
3
3
  * Header File
4
- * Copyright (C) 2012-2020 Yann Collet
4
+ * Copyright (C) 2012-2021 Yann Collet
5
5
  *
6
6
  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
7
7
  *
@@ -157,6 +157,7 @@ extern "C" {
157
157
  # undef XXH3_64bits
158
158
  # undef XXH3_64bits_withSecret
159
159
  # undef XXH3_64bits_withSeed
160
+ # undef XXH3_64bits_withSecretandSeed
160
161
  # undef XXH3_createState
161
162
  # undef XXH3_freeState
162
163
  # undef XXH3_copyState
@@ -174,6 +175,7 @@ extern "C" {
174
175
  # undef XXH3_128bits_reset
175
176
  # undef XXH3_128bits_reset_withSeed
176
177
  # undef XXH3_128bits_reset_withSecret
178
+ # undef XXH3_128bits_reset_withSecretandSeed
177
179
  # undef XXH3_128bits_update
178
180
  # undef XXH3_128bits_digest
179
181
  # undef XXH128_isEqual
@@ -284,23 +286,28 @@ extern "C" {
284
286
  # define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
285
287
  # define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
286
288
  # define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
289
+ # define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
287
290
  # define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
288
291
  # define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
289
292
  # define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
290
293
  # define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
291
294
  # define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
292
295
  # define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
296
+ # define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
293
297
  # define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
294
298
  # define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
295
299
  # define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
300
+ # define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
296
301
  /* XXH3_128bits */
297
302
  # define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
298
303
  # define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
299
304
  # define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
300
305
  # define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
306
+ # define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
301
307
  # define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
302
308
  # define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
303
309
  # define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
310
+ # define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
304
311
  # define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
305
312
  # define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
306
313
  # define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
@@ -321,16 +328,16 @@ extern "C" {
321
328
  /*!
322
329
  * @brief Obtains the xxHash version.
323
330
  *
324
- * This is only useful when xxHash is compiled as a shared library, as it is
325
- * independent of the version defined in the header.
331
+ * This is mostly useful when xxHash is compiled as a shared library,
332
+ * since the returned value comes from the library, as opposed to header file.
326
333
  *
327
- * @return `XXH_VERSION_NUMBER` as of when the libray was compiled.
334
+ * @return `XXH_VERSION_NUMBER` of the invoked library.
328
335
  */
329
336
  XXH_PUBLIC_API unsigned XXH_versionNumber (void);
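
Because the returned value comes from the linked binary, it can be checked against the header's XXH_VERSION_NUMBER macro to catch a mismatched shared library. A minimal sketch (the helper name is illustrative, not part of xxHash):

#include <stdio.h>
#include "xxhash.h"

/* Returns non-zero when the loaded library matches the header this unit saw. */
static int xxh_versions_match(void)
{
    unsigned const runtime = XXH_versionNumber();   /* version of the linked library */
    if (runtime != XXH_VERSION_NUMBER)              /* version baked in at compile time */
        fprintf(stderr, "xxHash mismatch: header %u, library %u\n",
                (unsigned)XXH_VERSION_NUMBER, runtime);
    return runtime == XXH_VERSION_NUMBER;
}
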
330
337
 
331
338
 
332
339
  /* ****************************
333
- * Definitions
340
+ * Common basic types
334
341
  ******************************/
335
342
  #include <stddef.h> /* size_t */
336
343
  typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
@@ -374,10 +381,9 @@ typedef uint32_t XXH32_hash_t;
374
381
  * Contains functions used in the classic 32-bit xxHash algorithm.
375
382
  *
376
383
  * @note
377
- * XXH32 is considered rather weak by today's standards.
378
- * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
379
- * systems, and offers true 64/128 bit hash results. It provides a superior
380
- * level of dispersion, and greatly reduces the risks of collisions.
384
+ * XXH32 is useful for older platforms, with no or poor 64-bit performance.
385
+ * Note that @ref xxh3_family provides competitive speed
386
+ * for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
381
387
  *
382
388
  * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
383
389
  * @see @ref xxh32_impl for implementation details
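
The note above positions XXH32 relative to XXH3; for orientation, a one-shot call is just a single function invocation. A minimal sketch (the wrapper name is illustrative):

#include "xxhash.h"

/* Hash a memory block with the classic 32-bit variant; 0 is a valid seed. */
static XXH32_hash_t hash_block_32(const void* data, size_t size)
{
    return XXH32(data, size, 0);
}
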
@@ -594,36 +600,39 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t
594
600
  XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
595
601
 
596
602
 
603
+ #ifdef __has_attribute
604
+ # define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
605
+ #else
606
+ # define XXH_HAS_ATTRIBUTE(x) 0
607
+ #endif
608
+
609
+ /* C-language Attributes are added in C23. */
610
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
611
+ # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
612
+ #else
613
+ # define XXH_HAS_C_ATTRIBUTE(x) 0
614
+ #endif
615
+
616
+ #if defined(__cplusplus) && defined(__has_cpp_attribute)
617
+ # define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
618
+ #else
619
+ # define XXH_HAS_CPP_ATTRIBUTE(x) 0
620
+ #endif
621
+
597
622
  /*
598
623
  Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
599
624
  introduced in CPP17 and C23.
600
625
  CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
601
626
  C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough
602
627
  */
603
-
604
- #if defined (__has_c_attribute) && defined (__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) /* C2x */
605
- # if __has_c_attribute(fallthrough)
606
- # define XXH_FALLTHROUGH [[fallthrough]]
607
- # endif
608
-
609
- #elif defined(__cplusplus) && defined(__has_cpp_attribute)
610
- # if __has_cpp_attribute(fallthrough)
611
- # define XXH_FALLTHROUGH [[fallthrough]]
612
- # endif
613
- #endif
614
-
615
- #ifndef XXH_FALLTHROUGH
616
- # if defined(__GNUC__) && __GNUC__ >= 7
617
- # define XXH_FALLTHROUGH __attribute__ ((fallthrough))
618
- # elif defined(__clang__) && (__clang_major__ >= 10) \
619
- && (!defined(__APPLE__) || (__clang_major__ >= 12))
620
- /* Apple clang 12 is effectively clang-10 ,
621
- * see https://en.wikipedia.org/wiki/Xcode for details
622
- */
623
- # define XXH_FALLTHROUGH __attribute__ ((fallthrough))
624
- # else
625
- # define XXH_FALLTHROUGH
626
- # endif
628
+ #if XXH_HAS_C_ATTRIBUTE(x)
629
+ # define XXH_FALLTHROUGH [[fallthrough]]
630
+ #elif XXH_HAS_CPP_ATTRIBUTE(x)
631
+ # define XXH_FALLTHROUGH [[fallthrough]]
632
+ #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
633
+ # define XXH_FALLTHROUGH __attribute__ ((fallthrough))
634
+ #else
635
+ # define XXH_FALLTHROUGH
627
636
  #endif
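
The macro is meant to sit at the end of a case body that intentionally falls into the next label, keeping -Wimplicit-fallthrough quiet on compilers that support the attribute and expanding to nothing elsewhere. A hedged sketch (the helper and its variables are illustrative, not taken from the library):

#include <stddef.h>
#include "xxhash.h"

static unsigned mix_tail(unsigned h, const unsigned char* p, size_t len)
{
    switch (len & 3) {
    case 3: h += (unsigned)p[2] << 16;
            XXH_FALLTHROUGH;            /* annotated intentional fall-through */
    case 2: h += (unsigned)p[1] << 8;
            XXH_FALLTHROUGH;
    case 1: h += p[0];
            break;
    default: break;
    }
    return h;
}
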
628
637
 
629
638
  /*!
@@ -669,8 +678,8 @@ typedef uint64_t XXH64_hash_t;
669
678
  *
670
679
  * @note
671
680
  * XXH3 provides competitive speed for both 32-bit and 64-bit systems,
672
- * and offers true 64/128 bit hash results. It provides a superior level of
673
- * dispersion, and greatly reduces the risks of collisions.
681
+ * and offers true 64/128 bit hash results.
682
+ * It provides better speed for systems with vector processing capabilities.
674
683
  */
675
684
 
676
685
 
@@ -719,6 +728,8 @@ typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t
719
728
  XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
720
729
  XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
721
730
 
731
+ #ifndef XXH_NO_XXH3
732
+
722
733
  /*!
723
734
  * @}
724
735
  * ************************************************************************
@@ -796,13 +807,17 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, X
796
807
  * It's possible to provide any blob of bytes as a "secret" to generate the hash.
797
808
  * This makes it more difficult for an external actor to prepare an intentional collision.
798
809
  * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
799
- * However, the quality of produced hash values depends on secret's entropy.
800
- * Technically, the secret must look like a bunch of random bytes.
810
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
811
+ * Therefore, the secret _must_ look like a bunch of random bytes.
801
812
  * Avoid "trivial" or structured data such as repeated sequences or a text document.
802
- * Whenever unsure about the "randomness" of the blob of bytes,
803
- * consider relabelling it as a "custom seed" instead,
804
- * and employ "XXH3_generateSecret()" (see below)
805
- * to generate a high entropy secret derived from the custom seed.
813
+ * Whenever in doubt about the "randomness" of the blob of bytes,
814
+ * consider employing "XXH3_generateSecret()" instead (see below).
815
+ * It will generate a proper high entropy secret derived from the blob of bytes.
816
+ * Another advantage of using XXH3_generateSecret() is that
817
+ * it guarantees that all bits within the initial blob of bytes
818
+ * will impact every bit of the output.
819
+ * This is not necessarily the case when using the blob of bytes directly
820
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
806
821
  */
807
822
  XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
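
Putting the advice above into practice, a hedged sketch that conditions an arbitrary application key through XXH3_generateSecret() before hashing with it (the function name and minimal error handling are illustrative; the generator lives in the experimental section, hence XXH_STATIC_LINKING_ONLY):

#define XXH_STATIC_LINKING_ONLY
#include "xxhash.h"

static XXH64_hash_t hash_with_app_key(const void* data, size_t len,
                                      const void* appKey, size_t appKeySize)
{
    unsigned char secret[XXH3_SECRET_SIZE_MIN];   /* smallest size the API accepts */
    if (XXH3_generateSecret(secret, sizeof(secret), appKey, appKeySize) != XXH_OK)
        return 0;   /* simplified error handling for the sketch */
    return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
}
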
808
823
 
@@ -922,6 +937,7 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_has
922
937
  XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
923
938
 
924
939
 
940
+ #endif /* !XXH_NO_XXH3 */
925
941
  #endif /* XXH_NO_LONG_LONG */
926
942
 
927
943
  /*!
@@ -962,13 +978,10 @@ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t*
962
978
  struct XXH32_state_s {
963
979
  XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
964
980
  XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
965
- XXH32_hash_t v1; /*!< First accumulator lane */
966
- XXH32_hash_t v2; /*!< Second accumulator lane */
967
- XXH32_hash_t v3; /*!< Third accumulator lane */
968
- XXH32_hash_t v4; /*!< Fourth accumulator lane */
981
+ XXH32_hash_t v[4]; /*!< Accumulator lanes */
969
982
  XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
970
983
  XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */
971
- XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may be removed. */
984
+ XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */
972
985
  }; /* typedef'd to XXH32_state_t */
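
The state is normally driven through the streaming API rather than touched field by field; a hedged sketch of that flow (the helper name is illustrative):

#include "xxhash.h"

static XXH32_hash_t hash_chunks_32(const void* const* chunks, const size_t* sizes, size_t n)
{
    XXH32_state_t* const st = XXH32_createState();
    XXH32_hash_t h = 0;
    size_t i;
    if (st == NULL) return 0;                 /* allocation failed */
    (void)XXH32_reset(st, 0 /* seed */);
    for (i = 0; i < n; i++)
        (void)XXH32_update(st, chunks[i], sizes[i]);
    h = XXH32_digest(st);
    (void)XXH32_freeState(st);
    return h;
}
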
973
986
 
974
987
 
@@ -988,16 +1001,15 @@ struct XXH32_state_s {
988
1001
  */
989
1002
  struct XXH64_state_s {
990
1003
  XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */
991
- XXH64_hash_t v1; /*!< First accumulator lane */
992
- XXH64_hash_t v2; /*!< Second accumulator lane */
993
- XXH64_hash_t v3; /*!< Third accumulator lane */
994
- XXH64_hash_t v4; /*!< Fourth accumulator lane */
1004
+ XXH64_hash_t v[4]; /*!< Accumulator lanes */
995
1005
  XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
996
1006
  XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */
997
1007
  XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/
998
- XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it may be removed. */
1008
+ XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */
999
1009
  }; /* typedef'd to XXH64_state_t */
1000
1010
 
1011
+ #ifndef XXH_NO_XXH3
1012
+
1001
1013
  #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
1002
1014
  # include <stdalign.h>
1003
1015
  # define XXH_ALIGN(n) alignas(n)
@@ -1070,7 +1082,7 @@ struct XXH3_state_s {
1070
1082
  /*!< The internal buffer. @see XXH32_state_s::mem32 */
1071
1083
  XXH32_hash_t bufferedSize;
1072
1084
  /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
1073
- XXH32_hash_t reserved32;
1085
+ XXH32_hash_t useSeed;
1074
1086
  /*!< Reserved field. Needed for padding on 64-bit. */
1075
1087
  size_t nbStripesSoFar;
1076
1088
  /*!< Number or stripes processed. */
@@ -1106,6 +1118,12 @@ struct XXH3_state_s {
1106
1118
  #define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
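
XXH3_INITSTATE() gives a stack-allocated state (available with XXH_STATIC_LINKING_ONLY) a well-defined seed field before its first reset; a hedged sketch (the wrapper name is illustrative):

#define XXH_STATIC_LINKING_ONLY   /* exposes the XXH3_state_t definition */
#include "xxhash.h"

static XXH64_hash_t hash_on_stack(const void* data, size_t len)
{
    XXH3_state_t state;
    XXH3_INITSTATE(&state);                   /* avoid reading uninitialized memory */
    (void)XXH3_64bits_reset(&state);
    (void)XXH3_64bits_update(&state, data, len);
    return XXH3_64bits_digest(&state);
}
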
1107
1119
 
1108
1120
 
1121
+ /* XXH128() :
1122
+ * simple alias to pre-selected XXH3_128bits variant
1123
+ */
1124
+ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
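
As noted, XXH128() simply forwards to the pre-selected seeded 128-bit variant; a minimal sketch of a call (the wrapper name is illustrative):

#include "xxhash.h"

static XXH128_hash_t hash_block_128(const void* data, size_t len)
{
    XXH128_hash_t const h = XXH128(data, len, 0);   /* seed 0 */
    /* h.low64 and h.high64 hold the two 64-bit halves of the result */
    return h;
}
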
1125
+
1126
+
1109
1127
  /* === Experimental API === */
1110
1128
  /* Symbols defined below must be considered tied to a specific library version. */
1111
1129
 
@@ -1118,33 +1136,92 @@ struct XXH3_state_s {
1118
1136
  * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
1119
1137
  *
1120
1138
  * The function accepts as input a custom seed of any length and any content,
1121
- * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
1122
- * into an already allocated buffer secretBuffer.
1123
- * The generated secret is _always_ XXH_SECRET_DEFAULT_SIZE bytes long.
1139
+ * and derives from it a high-entropy secret of length @secretSize
1140
+ * into an already allocated buffer @secretBuffer.
1141
+ * @secretSize must be >= XXH3_SECRET_SIZE_MIN
1124
1142
  *
1125
1143
  * The generated secret can then be used with any `*_withSecret()` variant.
1126
1144
  * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
1127
1145
  * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
1128
1146
  * are part of this list. They all accept a `secret` parameter
1129
- * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
1147
+ * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
1130
1148
  * _and_ feature very high entropy (consist of random-looking bytes).
1131
1149
  * These conditions can be a high bar to meet, so
1132
- * this function can be used to generate a secret of proper quality.
1150
+ * XXH3_generateSecret() can be employed to ensure proper quality.
1133
1151
  *
1134
1152
  * customSeed can be anything. It can have any size, even small ones,
1135
- * and its content can be anything, even stupidly "low entropy" source such as a bunch of zeroes.
1136
- * The resulting `secret` will nonetheless provide all expected qualities.
1153
+ * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes.
1154
+ * The resulting `secret` will nonetheless provide all required qualities.
1137
1155
  *
1138
- * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
1139
1156
  * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
1140
1157
  */
1141
- XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
1158
+ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
1142
1159
 
1143
1160
 
1144
- /* simple short-cut to pre-selected XXH3_128bits variant */
1145
- XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
1161
+ /*
1162
+ * XXH3_generateSecret_fromSeed():
1163
+ *
1164
+ * Generate the same secret as the _withSeed() variants.
1165
+ *
1166
+ * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily).
1167
+ * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes.
1168
+ *
1169
+ * The generated secret can be used in combination with
1170
+ *`*_withSecret()` and `_withSecretandSeed()` variants.
1171
+ * This generator is notably useful in combination with `_withSecretandSeed()`,
1172
+ * as a way to emulate a faster `_withSeed()` variant.
1173
+ */
1174
+ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
1175
+
1176
+ /*
1177
+ * *_withSecretandSeed() :
1178
+ * These variants generate hash values using either
1179
+ * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
1180
+ * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX).
1181
+ *
1182
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
1183
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
1184
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
1185
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
1186
+ * which requires more instructions than _withSeed() variants.
1187
+ * Therefore, _withSecretandSeed variant combines the best of both worlds.
1188
+ *
1189
+ * When @secret has been generated by XXH3_generateSecret_fromSeed(),
1190
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
1191
+ * hence offering only a pure speed benefit on "large" input,
1192
+ * by skipping the need to regenerate the secret for every large input.
1193
+ *
1194
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
1195
+ * for example with XXH3_64bits(), which then becomes the seed,
1196
+ * and then employ both the seed and the secret in _withSecretandSeed().
1197
+ * On top of speed, an added benefit is that each bit in the secret
1198
+ * has a 50% chance to swap each bit in the output,
1199
+ * via its impact to the seed.
1200
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
1201
+ * because only portions of the secret are employed for small data.
1202
+ */
1203
+ XXH_PUBLIC_API XXH64_hash_t
1204
+ XXH3_64bits_withSecretandSeed(const void* data, size_t len,
1205
+ const void* secret, size_t secretSize,
1206
+ XXH64_hash_t seed);
1207
+
1208
+ XXH_PUBLIC_API XXH128_hash_t
1209
+ XXH3_128bits_withSecretandSeed(const void* data, size_t len,
1210
+ const void* secret, size_t secretSize,
1211
+ XXH64_hash_t seed64);
1212
+
1213
+ XXH_PUBLIC_API XXH_errorcode
1214
+ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1215
+ const void* secret, size_t secretSize,
1216
+ XXH64_hash_t seed64);
1217
+
1218
+ XXH_PUBLIC_API XXH_errorcode
1219
+ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1220
+ const void* secret, size_t secretSize,
1221
+ XXH64_hash_t seed64);
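
The recipe described above — derive the seed's secret once, then pass both pieces on every call — can be sketched as follows (the global names and example seed are illustrative; the experimental API requires XXH_STATIC_LINKING_ONLY):

#define XXH_STATIC_LINKING_ONLY
#include "xxhash.h"

static unsigned char g_secret[XXH3_SECRET_DEFAULT_SIZE];
static XXH64_hash_t const g_seed = 0x27d4eb2f165667c5ULL;   /* arbitrary example seed */

static void init_hashing(void)
{
    XXH3_generateSecret_fromSeed(g_secret, g_seed);   /* done once */
}

static XXH64_hash_t hash_msg(const void* data, size_t len)
{
    /* same result as XXH3_64bits_withSeed(data, len, g_seed), but large inputs
     * skip the per-call secret generation */
    return XXH3_64bits_withSecretandSeed(data, len, g_secret, sizeof(g_secret), g_seed);
}
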
1146
1222
 
1147
1223
 
1224
+ #endif /* !XXH_NO_XXH3 */
1148
1225
  #endif /* XXH_NO_LONG_LONG */
1149
1226
  #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
1150
1227
  # define XXH_IMPLEMENTATION
@@ -1221,7 +1298,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1221
1298
  * Use `memcpy()`. Safe and portable. Note that most modern compilers will
1222
1299
  * eliminate the function call and treat it as an unaligned access.
1223
1300
  *
1224
- * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
1301
+ * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
1225
1302
  * @par
1226
1303
  * Depends on compiler extensions and is therefore not portable.
1227
1304
  * This method is safe _if_ your compiler supports it,
@@ -1248,22 +1325,12 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1248
1325
  * care, as what works on one compiler/platform/optimization level may cause
1249
1326
  * another to read garbage data or even crash.
1250
1327
  *
1251
- * See https://stackoverflow.com/a/32095106/646947 for details.
1328
+ * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
1252
1329
  *
1253
1330
  * Prefer these methods in priority order (0 > 3 > 1 > 2)
1254
1331
  */
1255
1332
  # define XXH_FORCE_MEMORY_ACCESS 0
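
These tuning macros are intended to be set before the header is pulled in (or on the compiler command line); a hedged sketch for an inlined build:

/* Pin the portable memcpy-based access method before compiling the inlined
 * implementation; 1 and 3 are the opt-in alternatives documented above. */
#define XXH_INLINE_ALL
#define XXH_FORCE_MEMORY_ACCESS 0
#include "xxhash.h"
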
1256
- /*!
1257
- * @def XXH_ACCEPT_NULL_INPUT_POINTER
1258
- * @brief Whether to add explicit `NULL` checks.
1259
- *
1260
- * If the input pointer is `NULL` and the length is non-zero, xxHash's default
1261
- * behavior is to dereference it, triggering a segfault.
1262
- *
1263
- * When this macro is enabled, xxHash actively checks the input for a null pointer.
1264
- * If it is, the result for null input pointers is the same as a zero-length input.
1265
- */
1266
- # define XXH_ACCEPT_NULL_INPUT_POINTER 0
1333
+
1267
1334
  /*!
1268
1335
  * @def XXH_FORCE_ALIGN_CHECK
1269
1336
  * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
@@ -1315,18 +1382,16 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1315
1382
  # define XXH_NO_INLINE_HINTS 0
1316
1383
 
1317
1384
  /*!
1318
- * @def XXH_REROLL
1319
- * @brief Whether to reroll `XXH32_finalize`.
1320
- *
1321
- * For performance, `XXH32_finalize` uses an unrolled loop
1322
- * in the form of a switch statement.
1385
+ * @def XXH32_ENDJMP
1386
+ * @brief Whether to use a jump for `XXH32_finalize`.
1323
1387
  *
1324
- * This is not always desirable, as it generates larger code,
1325
- * and depending on the architecture, may even be slower
1388
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
1389
+ * This is generally preferable for performance,
1390
+ * but depending on exact architecture, a jmp may be preferable.
1326
1391
  *
1327
- * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
1392
+ * This setting is only possibly making a difference for very small inputs.
1328
1393
  */
1329
- # define XXH_REROLL 0
1394
+ # define XXH32_ENDJMP 0
1330
1395
 
1331
1396
  /*!
1332
1397
  * @internal
@@ -1343,32 +1408,18 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1343
1408
  */
1344
1409
 
1345
1410
  #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
1346
- /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
1347
- # if !defined(__clang__) && \
1348
- ( \
1349
- (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
1350
- ( \
1351
- defined(__GNUC__) && ( \
1352
- (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
1353
- ( \
1354
- defined(__mips__) && \
1355
- (__mips <= 5 || __mips_isa_rev < 6) && \
1356
- (!defined(__mips16) || defined(__mips_mips16e2)) \
1357
- ) \
1358
- ) \
1359
- ) \
1360
- )
1411
+ /* prefer __packed__ structures (method 1) for GCC
1412
+ * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
1413
+ * which for some reason does unaligned loads. */
1414
+ # if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
1361
1415
  # define XXH_FORCE_MEMORY_ACCESS 1
1362
1416
  # endif
1363
1417
  #endif
1364
1418
 
1365
- #ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
1366
- # define XXH_ACCEPT_NULL_INPUT_POINTER 0
1367
- #endif
1368
-
1369
1419
  #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
1370
- # if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \
1371
- || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
1420
+ /* don't check on x86, aarch64, or arm when unaligned access is available */
1421
+ # if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
1422
+ || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */
1372
1423
  # define XXH_FORCE_ALIGN_CHECK 0
1373
1424
  # else
1374
1425
  # define XXH_FORCE_ALIGN_CHECK 1
@@ -1384,14 +1435,9 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1384
1435
  # endif
1385
1436
  #endif
1386
1437
 
1387
- #ifndef XXH_REROLL
1388
- # if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \
1389
- (defined(__GNUC__) && !defined(__clang__))
1390
- /* The if/then loop is preferable to switch/case on gcc (on x64) */
1391
- # define XXH_REROLL 1
1392
- # else
1393
- # define XXH_REROLL 0
1394
- # endif
1438
+ #ifndef XXH32_ENDJMP
1439
+ /* generally preferable for performance */
1440
+ # define XXH32_ENDJMP 0
1395
1441
  #endif
1396
1442
 
1397
1443
  /*!
@@ -1413,13 +1459,13 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1413
1459
  * @internal
1414
1460
  * @brief Modify this function to use a different routine than malloc().
1415
1461
  */
1416
- static void* XXH_malloc(size_t s) { return malloc(s); }
1462
+ static void* XXH_malloc(size_t s) { return ruby_xmalloc(s); }
1417
1463
 
1418
1464
  /*!
1419
1465
  * @internal
1420
1466
  * @brief Modify this function to use a different routine than free().
1421
1467
  */
1422
- static void XXH_free(void* p) { free(p); }
1468
+ static void XXH_free(void* p) { ruby_xfree(p); }
1423
1469
 
1424
1470
  #include <string.h>
1425
1471
 
@@ -1443,19 +1489,19 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1443
1489
  #endif
1444
1490
 
1445
1491
  #if XXH_NO_INLINE_HINTS /* disable inlining hints */
1446
- # if defined(__GNUC__)
1492
+ # if defined(__GNUC__) || defined(__clang__)
1447
1493
  # define XXH_FORCE_INLINE static __attribute__((unused))
1448
1494
  # else
1449
1495
  # define XXH_FORCE_INLINE static
1450
1496
  # endif
1451
1497
  # define XXH_NO_INLINE static
1452
1498
  /* enable inlining hints */
1499
+ #elif defined(__GNUC__) || defined(__clang__)
1500
+ # define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
1501
+ # define XXH_NO_INLINE static __attribute__((noinline))
1453
1502
  #elif defined(_MSC_VER) /* Visual Studio */
1454
1503
  # define XXH_FORCE_INLINE static __forceinline
1455
1504
  # define XXH_NO_INLINE static __declspec(noinline)
1456
- #elif defined(__GNUC__)
1457
- # define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
1458
- # define XXH_NO_INLINE static __attribute__((noinline))
1459
1505
  #elif defined (__cplusplus) \
1460
1506
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
1461
1507
  # define XXH_FORCE_INLINE static inline
@@ -1522,7 +1568,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1522
1568
  * We also use it to prevent unwanted constant folding for AArch64 in
1523
1569
  * XXH3_initCustomSecret_scalar().
1524
1570
  */
1525
- #ifdef __GNUC__
1571
+ #if defined(__GNUC__) || defined(__clang__)
1526
1572
  # define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
1527
1573
  #else
1528
1574
  # define XXH_COMPILER_GUARD(var) ((void)0)
@@ -1615,30 +1661,31 @@ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr;
1615
1661
  #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
1616
1662
 
1617
1663
  /*
1618
- * __pack instructions are safer but compiler specific, hence potentially
1619
- * problematic for some compilers.
1620
- *
1621
- * Currently only defined for GCC and ICC.
1664
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
1665
+ * documentation claimed that it only increased the alignment, but actually it
1666
+ * can decrease it on gcc, clang, and icc:
1667
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
1668
+ * https://gcc.godbolt.org/z/xYez1j67Y.
1622
1669
  */
1623
1670
  #ifdef XXH_OLD_NAMES
1624
1671
  typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
1625
1672
  #endif
1626
1673
  static xxh_u32 XXH_read32(const void* ptr)
1627
1674
  {
1628
- typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
1629
- return ((const xxh_unalign*)ptr)->u32;
1675
+ typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;
1676
+ return *((const xxh_unalign32*)ptr);
1630
1677
  }
1631
1678
 
1632
1679
  #else
1633
1680
 
1634
1681
  /*
1635
1682
  * Portable and safe solution. Generally efficient.
1636
- * see: https://stackoverflow.com/a/32095106/646947
1683
+ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
1637
1684
  */
1638
1685
  static xxh_u32 XXH_read32(const void* memPtr)
1639
1686
  {
1640
1687
  xxh_u32 val;
1641
- memcpy(&val, memPtr, sizeof(val));
1688
+ XXH_memcpy(&val, memPtr, sizeof(val));
1642
1689
  return val;
1643
1690
  }
1644
1691
 
@@ -1955,8 +2002,10 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
1955
2002
  h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
1956
2003
  } while (0)
1957
2004
 
1958
- /* Compact rerolled version */
1959
- if (XXH_REROLL) {
2005
+ if (ptr==NULL) XXH_ASSERT(len == 0);
2006
+
2007
+ /* Compact rerolled version; generally faster */
2008
+ if (!XXH32_ENDJMP) {
1960
2009
  len &= 15;
1961
2010
  while (len >= 4) {
1962
2011
  XXH_PROCESS4;
@@ -2024,24 +2073,19 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
2024
2073
  * @internal
2025
2074
  * @brief The implementation for @ref XXH32().
2026
2075
  *
2027
- * @param input, len, seed Directly passed from @ref XXH32().
2076
+ * @param input , len , seed Directly passed from @ref XXH32().
2028
2077
  * @param align Whether @p input is aligned.
2029
2078
  * @return The calculated hash.
2030
2079
  */
2031
2080
  XXH_FORCE_INLINE xxh_u32
2032
2081
  XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
2033
2082
  {
2034
- const xxh_u8* bEnd = input ? input + len : NULL;
2035
2083
  xxh_u32 h32;
2036
2084
 
2037
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2038
- if (input==NULL) {
2039
- len=0;
2040
- bEnd=input=(const xxh_u8*)(size_t)16;
2041
- }
2042
- #endif
2085
+ if (input==NULL) XXH_ASSERT(len == 0);
2043
2086
 
2044
2087
  if (len>=16) {
2088
+ const xxh_u8* const bEnd = input + len;
2045
2089
  const xxh_u8* const limit = bEnd - 15;
2046
2090
  xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2047
2091
  xxh_u32 v2 = seed + XXH_PRIME32_2;
@@ -2105,20 +2149,18 @@ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
2105
2149
  /*! @ingroup xxh32_family */
2106
2150
  XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
2107
2151
  {
2108
- memcpy(dstState, srcState, sizeof(*dstState));
2152
+ XXH_memcpy(dstState, srcState, sizeof(*dstState));
2109
2153
  }
2110
2154
 
2111
2155
  /*! @ingroup xxh32_family */
2112
2156
  XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
2113
2157
  {
2114
- XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
2115
- memset(&state, 0, sizeof(state));
2116
- state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2117
- state.v2 = seed + XXH_PRIME32_2;
2118
- state.v3 = seed + 0;
2119
- state.v4 = seed - XXH_PRIME32_1;
2120
- /* do not write into reserved, planned to be removed in a future version */
2121
- memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
2158
+ XXH_ASSERT(statePtr != NULL);
2159
+ memset(statePtr, 0, sizeof(*statePtr));
2160
+ statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2161
+ statePtr->v[1] = seed + XXH_PRIME32_2;
2162
+ statePtr->v[2] = seed + 0;
2163
+ statePtr->v[3] = seed - XXH_PRIME32_1;
2122
2164
  return XXH_OK;
2123
2165
  }
2124
2166
 
@@ -2127,12 +2169,10 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t s
2127
2169
  XXH_PUBLIC_API XXH_errorcode
2128
2170
  XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2129
2171
  {
2130
- if (input==NULL)
2131
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2172
+ if (input==NULL) {
2173
+ XXH_ASSERT(len == 0);
2132
2174
  return XXH_OK;
2133
- #else
2134
- return XXH_ERROR;
2135
- #endif
2175
+ }
2136
2176
 
2137
2177
  { const xxh_u8* p = (const xxh_u8*)input;
2138
2178
  const xxh_u8* const bEnd = p + len;
@@ -2149,10 +2189,10 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2149
2189
  if (state->memsize) { /* some data left from previous update */
2150
2190
  XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
2151
2191
  { const xxh_u32* p32 = state->mem32;
2152
- state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
2153
- state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
2154
- state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
2155
- state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
2192
+ state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
2193
+ state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
2194
+ state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
2195
+ state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
2156
2196
  }
2157
2197
  p += 16-state->memsize;
2158
2198
  state->memsize = 0;
@@ -2160,22 +2200,14 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2160
2200
 
2161
2201
  if (p <= bEnd-16) {
2162
2202
  const xxh_u8* const limit = bEnd - 16;
2163
- xxh_u32 v1 = state->v1;
2164
- xxh_u32 v2 = state->v2;
2165
- xxh_u32 v3 = state->v3;
2166
- xxh_u32 v4 = state->v4;
2167
2203
 
2168
2204
  do {
2169
- v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
2170
- v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
2171
- v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
2172
- v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
2205
+ state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
2206
+ state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
2207
+ state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
2208
+ state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
2173
2209
  } while (p<=limit);
2174
2210
 
2175
- state->v1 = v1;
2176
- state->v2 = v2;
2177
- state->v3 = v3;
2178
- state->v4 = v4;
2179
2211
  }
2180
2212
 
2181
2213
  if (p < bEnd) {
@@ -2194,12 +2226,12 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
2194
2226
  xxh_u32 h32;
2195
2227
 
2196
2228
  if (state->large_len) {
2197
- h32 = XXH_rotl32(state->v1, 1)
2198
- + XXH_rotl32(state->v2, 7)
2199
- + XXH_rotl32(state->v3, 12)
2200
- + XXH_rotl32(state->v4, 18);
2229
+ h32 = XXH_rotl32(state->v[0], 1)
2230
+ + XXH_rotl32(state->v[1], 7)
2231
+ + XXH_rotl32(state->v[2], 12)
2232
+ + XXH_rotl32(state->v[3], 18);
2201
2233
  } else {
2202
- h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
2234
+ h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
2203
2235
  }
2204
2236
 
2205
2237
  h32 += state->total_len_32;
@@ -2228,7 +2260,7 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t
2228
2260
  {
2229
2261
  XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
2230
2262
  if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
2231
- memcpy(dst, &hash, sizeof(*dst));
2263
+ XXH_memcpy(dst, &hash, sizeof(*dst));
2232
2264
  }
2233
2265
  /*! @ingroup xxh32_family */
2234
2266
  XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
@@ -2271,30 +2303,31 @@ static xxh_u64 XXH_read64(const void* memPtr)
2271
2303
  #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
2272
2304
 
2273
2305
  /*
2274
- * __pack instructions are safer, but compiler specific, hence potentially
2275
- * problematic for some compilers.
2276
- *
2277
- * Currently only defined for GCC and ICC.
2306
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
2307
+ * documentation claimed that it only increased the alignment, but actually it
2308
+ * can decrease it on gcc, clang, and icc:
2309
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
2310
+ * https://gcc.godbolt.org/z/xYez1j67Y.
2278
2311
  */
2279
2312
  #ifdef XXH_OLD_NAMES
2280
2313
  typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
2281
2314
  #endif
2282
2315
  static xxh_u64 XXH_read64(const void* ptr)
2283
2316
  {
2284
- typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
2285
- return ((const xxh_unalign64*)ptr)->u64;
2317
+ typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;
2318
+ return *((const xxh_unalign64*)ptr);
2286
2319
  }
2287
2320
 
2288
2321
  #else
2289
2322
 
2290
2323
  /*
2291
2324
  * Portable and safe solution. Generally efficient.
2292
- * see: https://stackoverflow.com/a/32095106/646947
2325
+ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
2293
2326
  */
2294
2327
  static xxh_u64 XXH_read64(const void* memPtr)
2295
2328
  {
2296
2329
  xxh_u64 val;
2297
- memcpy(&val, memPtr, sizeof(val));
2330
+ XXH_memcpy(&val, memPtr, sizeof(val));
2298
2331
  return val;
2299
2332
  }
2300
2333
 
@@ -2424,6 +2457,7 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64)
2424
2457
  static xxh_u64
2425
2458
  XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
2426
2459
  {
2460
+ if (ptr==NULL) XXH_ASSERT(len == 0);
2427
2461
  len &= 31;
2428
2462
  while (len >= 8) {
2429
2463
  xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
@@ -2459,18 +2493,12 @@ XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
2459
2493
  XXH_FORCE_INLINE xxh_u64
2460
2494
  XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
2461
2495
  {
2462
- const xxh_u8* bEnd = input ? input + len : NULL;
2463
2496
  xxh_u64 h64;
2464
-
2465
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2466
- if (input==NULL) {
2467
- len=0;
2468
- bEnd=input=(const xxh_u8*)(size_t)32;
2469
- }
2470
- #endif
2497
+ if (input==NULL) XXH_ASSERT(len == 0);
2471
2498
 
2472
2499
  if (len>=32) {
2473
- const xxh_u8* const limit = bEnd - 32;
2500
+ const xxh_u8* const bEnd = input + len;
2501
+ const xxh_u8* const limit = bEnd - 31;
2474
2502
  xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2475
2503
  xxh_u64 v2 = seed + XXH_PRIME64_2;
2476
2504
  xxh_u64 v3 = seed + 0;
@@ -2481,7 +2509,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
2481
2509
  v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
2482
2510
  v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
2483
2511
  v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
2484
- } while (input<=limit);
2512
+ } while (input<limit);
2485
2513
 
2486
2514
  h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
2487
2515
  h64 = XXH64_mergeRound(h64, v1);
@@ -2536,20 +2564,18 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
2536
2564
  /*! @ingroup xxh64_family */
2537
2565
  XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
2538
2566
  {
2539
- memcpy(dstState, srcState, sizeof(*dstState));
2567
+ XXH_memcpy(dstState, srcState, sizeof(*dstState));
2540
2568
  }
2541
2569
 
2542
2570
  /*! @ingroup xxh64_family */
2543
2571
  XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
2544
2572
  {
2545
- XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
2546
- memset(&state, 0, sizeof(state));
2547
- state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2548
- state.v2 = seed + XXH_PRIME64_2;
2549
- state.v3 = seed + 0;
2550
- state.v4 = seed - XXH_PRIME64_1;
2551
- /* do not write into reserved64, might be removed in a future version */
2552
- memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
2573
+ XXH_ASSERT(statePtr != NULL);
2574
+ memset(statePtr, 0, sizeof(*statePtr));
2575
+ statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2576
+ statePtr->v[1] = seed + XXH_PRIME64_2;
2577
+ statePtr->v[2] = seed + 0;
2578
+ statePtr->v[3] = seed - XXH_PRIME64_1;
2553
2579
  return XXH_OK;
2554
2580
  }
2555
2581
 
@@ -2557,12 +2583,10 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s
2557
2583
  XXH_PUBLIC_API XXH_errorcode
2558
2584
  XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2559
2585
  {
2560
- if (input==NULL)
2561
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2586
+ if (input==NULL) {
2587
+ XXH_ASSERT(len == 0);
2562
2588
  return XXH_OK;
2563
- #else
2564
- return XXH_ERROR;
2565
- #endif
2589
+ }
2566
2590
 
2567
2591
  { const xxh_u8* p = (const xxh_u8*)input;
2568
2592
  const xxh_u8* const bEnd = p + len;
@@ -2577,32 +2601,24 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2577
2601
 
2578
2602
  if (state->memsize) { /* tmp buffer is full */
2579
2603
  XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
2580
- state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
2581
- state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
2582
- state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
2583
- state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
2604
+ state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
2605
+ state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
2606
+ state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
2607
+ state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
2584
2608
  p += 32 - state->memsize;
2585
2609
  state->memsize = 0;
2586
2610
  }
2587
2611
 
2588
2612
  if (p+32 <= bEnd) {
2589
2613
  const xxh_u8* const limit = bEnd - 32;
2590
- xxh_u64 v1 = state->v1;
2591
- xxh_u64 v2 = state->v2;
2592
- xxh_u64 v3 = state->v3;
2593
- xxh_u64 v4 = state->v4;
2594
2614
 
2595
2615
  do {
2596
- v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
2597
- v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
2598
- v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
2599
- v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
2616
+ state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
2617
+ state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
2618
+ state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
2619
+ state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
2600
2620
  } while (p<=limit);
2601
2621
 
2602
- state->v1 = v1;
2603
- state->v2 = v2;
2604
- state->v3 = v3;
2605
- state->v4 = v4;
2606
2622
  }
2607
2623
 
2608
2624
  if (p < bEnd) {
@@ -2621,18 +2637,13 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
2621
2637
  xxh_u64 h64;
2622
2638
 
2623
2639
  if (state->total_len >= 32) {
2624
- xxh_u64 const v1 = state->v1;
2625
- xxh_u64 const v2 = state->v2;
2626
- xxh_u64 const v3 = state->v3;
2627
- xxh_u64 const v4 = state->v4;
2628
-
2629
- h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
2630
- h64 = XXH64_mergeRound(h64, v1);
2631
- h64 = XXH64_mergeRound(h64, v2);
2632
- h64 = XXH64_mergeRound(h64, v3);
2633
- h64 = XXH64_mergeRound(h64, v4);
2640
+ h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
2641
+ h64 = XXH64_mergeRound(h64, state->v[0]);
2642
+ h64 = XXH64_mergeRound(h64, state->v[1]);
2643
+ h64 = XXH64_mergeRound(h64, state->v[2]);
2644
+ h64 = XXH64_mergeRound(h64, state->v[3]);
2634
2645
  } else {
2635
- h64 = state->v3 /*seed*/ + XXH_PRIME64_5;
2646
+ h64 = state->v[2] /*seed*/ + XXH_PRIME64_5;
2636
2647
  }
2637
2648
 
2638
2649
  h64 += (xxh_u64) state->total_len;
@@ -2648,7 +2659,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t
2648
2659
  {
2649
2660
  XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2650
2661
  if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
2651
- memcpy(dst, &hash, sizeof(*dst));
2662
+ XXH_memcpy(dst, &hash, sizeof(*dst));
2652
2663
  }
2653
2664
 
2654
2665
  /*! @ingroup xxh64_family */
@@ -2691,17 +2702,21 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
2691
2702
  # define XXH_unlikely(x) (x)
2692
2703
  #endif
2693
2704
 
2694
- #if defined(__GNUC__)
2695
- # if defined(__AVX2__)
2696
- # include <immintrin.h>
2697
- # elif defined(__SSE2__)
2698
- # include <emmintrin.h>
2699
- # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
2705
+ #if defined(__GNUC__) || defined(__clang__)
2706
+ # if defined(__ARM_NEON__) || defined(__ARM_NEON) \
2707
+ || defined(__aarch64__) || defined(_M_ARM) \
2708
+ || defined(_M_ARM64) || defined(_M_ARM64EC)
2700
2709
  # define inline __inline__ /* circumvent a clang bug */
2701
2710
  # include <arm_neon.h>
2702
2711
  # undef inline
2712
+ # elif defined(__AVX2__)
2713
+ # include <immintrin.h>
2714
+ # elif defined(__SSE2__)
2715
+ # include <emmintrin.h>
2703
2716
  # endif
2704
- #elif defined(_MSC_VER)
2717
+ #endif
2718
+
2719
+ #if defined(_MSC_VER)
2705
2720
  # include <intrin.h>
2706
2721
  #endif
2707
2722
 
@@ -2839,17 +2854,20 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2839
2854
  #endif
2840
2855
 
2841
2856
  #ifndef XXH_VECTOR /* can be defined on command line */
2842
- # if defined(__AVX512F__)
2857
+ # if ( \
2858
+ defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
2859
+ || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
2860
+ ) && ( \
2861
+ defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
2862
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
2863
+ )
2864
+ # define XXH_VECTOR XXH_NEON
2865
+ # elif defined(__AVX512F__)
2843
2866
  # define XXH_VECTOR XXH_AVX512
2844
2867
  # elif defined(__AVX2__)
2845
2868
  # define XXH_VECTOR XXH_AVX2
2846
2869
  # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
2847
2870
  # define XXH_VECTOR XXH_SSE2
2848
- # elif defined(__GNUC__) /* msvc support maybe later */ \
2849
- && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
2850
- && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
2851
- || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
2852
- # define XXH_VECTOR XXH_NEON
2853
2871
  # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
2854
2872
  || (defined(__s390x__) && defined(__VEC__)) \
2855
2873
  && defined(__GNUC__) /* TODO: IBM XL */
@@ -2999,8 +3017,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2999
3017
  * }
3000
3018
  */
3001
3019
  # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
3002
- && defined(__GNUC__) \
3003
- && !defined(__aarch64__) && !defined(__arm64__)
3020
+ && (defined(__GNUC__) || defined(__clang__)) \
3021
+ && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
3004
3022
  # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
3005
3023
  do { \
3006
3024
  /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
@@ -3017,6 +3035,76 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
3017
3035
  (outHi) = vshrn_n_u64 ((in), 32); \
3018
3036
  } while (0)
3019
3037
  # endif
3038
+
3039
+ /*!
3040
+ * @internal
3041
+ * @brief `vld1q_u64` but faster and alignment-safe.
3042
+ *
3043
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
3044
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
3045
+ *
3046
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
3047
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
3048
+ *
3049
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
3050
+ * unaligned load.
3051
+ */
3052
+ #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
3053
+ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
3054
+ {
3055
+ return *(uint64x2_t const*)ptr;
3056
+ }
3057
+ #else
3058
+ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
3059
+ {
3060
+ return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
3061
+ }
3062
+ #endif
3063
+ /*!
3064
+ * @ingroup tuning
3065
+ * @brief Controls the NEON to scalar ratio for XXH3
3066
+ *
3067
+ * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
3068
+ * 2 lanes on scalar by default.
3069
+ *
3070
+ * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
3071
+ * emulated 64-bit arithmetic is too slow.
3072
+ *
3073
+ * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
3074
+ *
3075
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
3076
+ * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
3077
+ * you are only using 2/3 of the CPU bandwidth.
3078
+ *
3079
+ * This is even more noticable on the more advanced cores like the A76 which
3080
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
3081
+ *
3082
+ * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
3083
+ * remaining lanes will use scalar instructions. This improves the bandwidth
3084
+ * and also gives the integer pipelines something to do besides twiddling loop
3085
+ * counters and pointers.
3086
+ *
3087
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
3088
+ * other CPUs:
3089
+ *
3090
+ * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |
3091
+ * |:----------------------|:--------------------|----------:|-----------:|------:|
3092
+ * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |
3093
+ * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |
3094
+ * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |
3095
+ *
3096
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
3097
+ *
3098
+ * @see XXH3_accumulate_512_neon()
3099
+ */
3100
+ # ifndef XXH3_NEON_LANES
3101
+ # if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
3102
+ && !defined(__OPTIMIZE_SIZE__)
3103
+ # define XXH3_NEON_LANES 6
3104
+ # else
3105
+ # define XXH3_NEON_LANES XXH_ACC_NB
3106
+ # endif
3107
+ # endif
3020
3108
  #endif /* XXH_VECTOR == XXH_NEON */
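
Like the other tuning knobs, XXH3_NEON_LANES can be overridden before the implementation is compiled; a hedged sketch, worth benchmarking on the target core before adopting:

/* Force all 8 accumulator lanes onto NEON (the ARMv7 default) instead of the
 * 6:2 NEON/scalar split used on AArch64. */
#define XXH_INLINE_ALL
#define XXH3_NEON_LANES 8
#include "xxhash.h"
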
3021
3109
 
3022
3110
  /*
@@ -3083,7 +3171,7 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
3083
3171
  XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
3084
3172
  {
3085
3173
  xxh_u64x2 ret;
3086
- memcpy(&ret, ptr, sizeof(xxh_u64x2));
3174
+ XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
3087
3175
  # if XXH_VSX_BE
3088
3176
  ret = XXH_vec_revb(ret);
3089
3177
  # endif
@@ -3193,7 +3281,6 @@ XXH_mult32to64(xxh_u64 x, xxh_u64 y)
3193
3281
  return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
3194
3282
  }
3195
3283
  #elif defined(_MSC_VER) && defined(_M_IX86)
3196
- # include <intrin.h>
3197
3284
  # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
3198
3285
  #else
3199
3286
  /*
@@ -3212,7 +3299,7 @@ XXH_mult32to64(xxh_u64 x, xxh_u64 y)
3212
3299
  * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
3213
3300
  * version.
3214
3301
  *
3215
- * @param lhs, rhs The 64-bit integers to be multiplied
3302
+ * @param lhs , rhs The 64-bit integers to be multiplied
3216
3303
  * @return The 128-bit result represented in an @ref XXH128_hash_t.
3217
3304
  */
3218
3305
  static XXH128_hash_t
@@ -3233,7 +3320,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3233
3320
  * In that case it is best to use the portable one.
3234
3321
  * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
3235
3322
  */
3236
- #if defined(__GNUC__) && !defined(__wasm__) \
3323
+ #if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
3237
3324
  && defined(__SIZEOF_INT128__) \
3238
3325
  || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
3239
3326
 
@@ -3250,7 +3337,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3250
3337
  *
3251
3338
  * This compiles to single operand MUL on x64.
3252
3339
  */
3253
- #elif defined(_M_X64) || defined(_M_IA64)
3340
+ #elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
3254
3341
 
3255
3342
  #ifndef _MSC_VER
3256
3343
  # pragma intrinsic(_umul128)
@@ -3262,6 +3349,21 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3262
3349
  r128.high64 = product_high;
3263
3350
  return r128;
3264
3351
 
3352
+ /*
3353
+ * MSVC for ARM64's __umulh method.
3354
+ *
3355
+ * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
3356
+ */
3357
+ #elif defined(_M_ARM64) || defined(_M_ARM64EC)
3358
+
3359
+ #ifndef _MSC_VER
3360
+ # pragma intrinsic(__umulh)
3361
+ #endif
3362
+ XXH128_hash_t r128;
3363
+ r128.low64 = lhs * rhs;
3364
+ r128.high64 = __umulh(lhs, rhs);
3365
+ return r128;
3366
+
3265
3367
  #else
3266
3368
  /*
3267
3369
  * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
@@ -3330,7 +3432,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3330
3432
  * The reason for the separate function is to prevent passing too many structs
3331
3433
  * around by value. This will hopefully inline the multiply, but we don't force it.
3332
3434
  *
3333
- * @param lhs, rhs The 64-bit integers to multiply
3435
+ * @param lhs , rhs The 64-bit integers to multiply
3334
3436
  * @return The low 64 bits of the product XOR'd by the high 64 bits.
3335
3437
  * @see XXH_mult64to128()
3336
3438
  */
@@ -3632,7 +3734,7 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3632
3734
  XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
3633
3735
  {
3634
3736
  if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
3635
- memcpy(dst, &v64, sizeof(v64));
3737
+ XXH_memcpy(dst, &v64, sizeof(v64));
3636
3738
  }
3637
3739
 
3638
3740
  /* Several intrinsic functions below are supposed to accept __int64 as argument,
@@ -3649,6 +3751,7 @@ XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
3649
3751
  typedef long long xxh_i64;
3650
3752
  #endif
3651
3753
 
3754
+
3652
3755
  /*
3653
3756
  * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
3654
3757
  *
@@ -3684,7 +3787,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
3684
3787
  const void* XXH_RESTRICT input,
3685
3788
  const void* XXH_RESTRICT secret)
3686
3789
  {
3687
- XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
3790
+ __m512i* const xacc = (__m512i *) acc;
3688
3791
  XXH_ASSERT((((size_t)acc) & 63) == 0);
3689
3792
  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3690
3793
 
@@ -3733,7 +3836,7 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3733
3836
  {
3734
3837
  XXH_ASSERT((((size_t)acc) & 63) == 0);
3735
3838
  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3736
- { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
3839
+ { __m512i* const xacc = (__m512i*) acc;
3737
3840
  const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
3738
3841
 
3739
3842
  /* xacc[0] ^= (xacc[0] >> 47) */
@@ -3794,7 +3897,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
3794
3897
  const void* XXH_RESTRICT secret)
3795
3898
  {
3796
3899
  XXH_ASSERT((((size_t)acc) & 31) == 0);
3797
- { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
3900
+ { __m256i* const xacc = (__m256i *) acc;
3798
3901
  /* Unaligned. This is mainly for pointer arithmetic, and because
3799
3902
  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3800
3903
  const __m256i* const xinput = (const __m256i *) input;
@@ -3826,7 +3929,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void
3826
3929
  XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3827
3930
  {
3828
3931
  XXH_ASSERT((((size_t)acc) & 31) == 0);
3829
- { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
3932
+ { __m256i* const xacc = (__m256i*) acc;
3830
3933
  /* Unaligned. This is mainly for pointer arithmetic, and because
3831
3934
  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3832
3935
  const __m256i* const xsecret = (const __m256i *) secret;
@@ -3900,7 +4003,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
3900
4003
  {
3901
4004
  /* SSE2 is just a half-scale version of the AVX2 version. */
3902
4005
  XXH_ASSERT((((size_t)acc) & 15) == 0);
3903
- { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
4006
+ { __m128i* const xacc = (__m128i *) acc;
3904
4007
  /* Unaligned. This is mainly for pointer arithmetic, and because
3905
4008
  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3906
4009
  const __m128i* const xinput = (const __m128i *) input;
@@ -3932,7 +4035,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void
3932
4035
  XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3933
4036
  {
3934
4037
  XXH_ASSERT((((size_t)acc) & 15) == 0);
3935
- { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
4038
+ { __m128i* const xacc = (__m128i*) acc;
3936
4039
  /* Unaligned. This is mainly for pointer arithmetic, and because
3937
4040
  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3938
4041
  const __m128i* const xsecret = (const __m128i *) secret;
@@ -3994,40 +4097,66 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR
3994
4097
 
3995
4098
  #if (XXH_VECTOR == XXH_NEON)
3996
4099
 
4100
+ /* forward declarations for the scalar routines */
4101
+ XXH_FORCE_INLINE void
4102
+ XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
4103
+ void const* XXH_RESTRICT secret, size_t lane);
4104
+
4105
+ XXH_FORCE_INLINE void
4106
+ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
4107
+ void const* XXH_RESTRICT secret, size_t lane);
4108
+
4109
+ /*!
4110
+ * @internal
4111
+ * @brief The bulk processing loop for NEON.
4112
+ *
4113
+ * The NEON code path is actually partially scalar when running on AArch64. This
4114
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
4115
+ * CPU, and it also mitigates some GCC codegen issues.
4116
+ *
4117
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
4118
+ */
3997
4119
  XXH_FORCE_INLINE void
3998
4120
  XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
3999
4121
  const void* XXH_RESTRICT input,
4000
4122
  const void* XXH_RESTRICT secret)
4001
4123
  {
4002
4124
  XXH_ASSERT((((size_t)acc) & 15) == 0);
4125
+ XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
4003
4126
  {
4004
- XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
4127
+ uint64x2_t* const xacc = (uint64x2_t *) acc;
4005
4128
  /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
4006
4129
  uint8_t const* const xinput = (const uint8_t *) input;
4007
4130
  uint8_t const* const xsecret = (const uint8_t *) secret;
4008
4131
 
4009
4132
  size_t i;
4010
- for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
4133
+ /* AArch64 uses both scalar and neon at the same time */
4134
+ for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
4135
+ XXH3_scalarRound(acc, input, secret, i);
4136
+ }
4137
+ for (i=0; i < XXH3_NEON_LANES / 2; i++) {
4138
+ uint64x2_t acc_vec = xacc[i];
4011
4139
  /* data_vec = xinput[i]; */
4012
- uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
4140
+ uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16));
4013
4141
  /* key_vec = xsecret[i]; */
4014
- uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4142
+ uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
4015
4143
  uint64x2_t data_key;
4016
4144
  uint32x2_t data_key_lo, data_key_hi;
4017
- /* xacc[i] += swap(data_vec); */
4018
- uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
4019
- uint64x2_t const swapped = vextq_u64(data64, data64, 1);
4020
- xacc[i] = vaddq_u64 (xacc[i], swapped);
4145
+ /* acc_vec_2 = swap(data_vec) */
4146
+ uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1);
4021
4147
  /* data_key = data_vec ^ key_vec; */
4022
- data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
4148
+ data_key = veorq_u64(data_vec, key_vec);
4023
4149
  /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
4024
4150
  * data_key_hi = (uint32x2_t) (data_key >> 32);
4025
4151
  * data_key = UNDEFINED; */
4026
4152
  XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
4027
- /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4028
- xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
4029
-
4153
+ /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4154
+ acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi);
4155
+ /* xacc[i] += acc_vec_2; */
4156
+ acc_vec = vaddq_u64 (acc_vec, acc_vec_2);
4157
+ xacc[i] = acc_vec;
4030
4158
  }
4159
+
4031
4160
  }
4032
4161
  }
4033
4162
 
@@ -4041,15 +4170,19 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4041
4170
  uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);
4042
4171
 
4043
4172
  size_t i;
4044
- for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
4173
+ /* AArch64 uses both scalar and neon at the same time */
4174
+ for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
4175
+ XXH3_scalarScrambleRound(acc, secret, i);
4176
+ }
4177
+ for (i=0; i < XXH3_NEON_LANES / 2; i++) {
4045
4178
  /* xacc[i] ^= (xacc[i] >> 47); */
4046
4179
  uint64x2_t acc_vec = xacc[i];
4047
- uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
4048
- uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
4180
+ uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
4181
+ uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
4049
4182
 
4050
4183
  /* xacc[i] ^= xsecret[i]; */
4051
- uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4052
- uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
4184
+ uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16));
4185
+ uint64x2_t data_key = veorq_u64 (data_vec, key_vec);
4053
4186
 
4054
4187
  /* xacc[i] *= XXH_PRIME32_1 */
4055
4188
  uint32x2_t data_key_lo, data_key_hi;
@@ -4077,11 +4210,12 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4077
4210
  */
4078
4211
  uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
4079
4212
  /* xacc[i] = prod_hi << 32; */
4080
- xacc[i] = vshlq_n_u64(prod_hi, 32);
4213
+ prod_hi = vshlq_n_u64(prod_hi, 32);
4081
4214
  /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
4082
- xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
4215
+ xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
4083
4216
  }
4084
- } }
4217
+ }
4218
+ }
4085
4219
  }
4086
4220
 
4087
4221
  #endif
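The doc comment introducing the NEON path above describes it as deliberately hybrid: only part of the accumulator lanes go through NEON, the rest stay scalar so the integer and SIMD pipelines run in parallel. A rough, assumption-laden sketch of that lane split (names and the lane count are illustrative, and endianness handling is omitted):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define ACC_NB     8   /* 8 x 64-bit accumulators per 64-byte stripe     */
#define NEON_LANES 6   /* illustrative split; XXH3_NEON_LANES is tunable */

/* Scalar half of the hybrid loop: same shape as XXH3_scalarRound(). */
static void accumulate_hybrid_sketch(uint64_t acc[ACC_NB],
                                     const unsigned char input[64],
                                     const unsigned char secret[64])
{
    size_t lane;
    for (lane = NEON_LANES; lane < ACC_NB; lane++) {
        uint64_t data_val, key_val;
        memcpy(&data_val, input  + lane * 8, 8);
        memcpy(&key_val,  secret + lane * 8, 8);
        {   uint64_t const data_key = data_val ^ key_val;
            acc[lane ^ 1] += data_val;   /* swap adjacent lanes */
            acc[lane]     += (data_key & 0xFFFFFFFF) * (data_key >> 32);
        }
    }
    /* lanes [0, NEON_LANES) would be handled with XXH_vld1q_u64 /
     * veorq_u64 / vmlal_u32 on the vector unit, as in the hunks above */
}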
@@ -4093,7 +4227,8 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
4093
4227
  const void* XXH_RESTRICT input,
4094
4228
  const void* XXH_RESTRICT secret)
4095
4229
  {
4096
- xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */
4230
+ /* presumed aligned */
4231
+ unsigned int* const xacc = (unsigned int*) acc;
4097
4232
  xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
4098
4233
  xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
4099
4234
  xxh_u64x2 const v32 = { 32, 32 };
@@ -4108,14 +4243,18 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
4108
4243
  xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
4109
4244
  /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
4110
4245
  xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
4111
- xacc[i] += product;
4246
+ /* acc_vec = xacc[i]; */
4247
+ xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
4248
+ acc_vec += product;
4112
4249
 
4113
4250
  /* swap high and low halves */
4114
4251
  #ifdef __s390x__
4115
- xacc[i] += vec_permi(data_vec, data_vec, 2);
4252
+ acc_vec += vec_permi(data_vec, data_vec, 2);
4116
4253
  #else
4117
- xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
4254
+ acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
4118
4255
  #endif
4256
+ /* xacc[i] = acc_vec; */
4257
+ vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
4119
4258
  }
4120
4259
  }
4121
4260
 
@@ -4153,38 +4292,90 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4153
4292
 
4154
4293
  /* scalar variants - universal */
4155
4294
 
4295
+ /*!
4296
+ * @internal
4297
+ * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
4298
+ *
4299
+ * This is extracted to its own function because the NEON path uses a combination
4300
+ * of NEON and scalar.
4301
+ */
4302
+ XXH_FORCE_INLINE void
4303
+ XXH3_scalarRound(void* XXH_RESTRICT acc,
4304
+ void const* XXH_RESTRICT input,
4305
+ void const* XXH_RESTRICT secret,
4306
+ size_t lane)
4307
+ {
4308
+ xxh_u64* xacc = (xxh_u64*) acc;
4309
+ xxh_u8 const* xinput = (xxh_u8 const*) input;
4310
+ xxh_u8 const* xsecret = (xxh_u8 const*) secret;
4311
+ XXH_ASSERT(lane < XXH_ACC_NB);
4312
+ XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
4313
+ {
4314
+ xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
4315
+ xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
4316
+ xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
4317
+ xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4318
+ }
4319
+ }
4320
+
4321
+ /*!
4322
+ * @internal
4323
+ * @brief Processes a 64 byte block of data using the scalar path.
4324
+ */
4156
4325
  XXH_FORCE_INLINE void
4157
4326
  XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
4158
4327
  const void* XXH_RESTRICT input,
4159
4328
  const void* XXH_RESTRICT secret)
4160
4329
  {
4161
- XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4162
- const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
4163
- const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
4164
4330
  size_t i;
4165
- XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
4331
+ /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
4332
+ #if defined(__GNUC__) && !defined(__clang__) \
4333
+ && (defined(__arm__) || defined(__thumb2__)) \
4334
+ && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
4335
+ && !defined(__OPTIMIZE_SIZE__)
4336
+ # pragma GCC unroll 8
4337
+ #endif
4166
4338
  for (i=0; i < XXH_ACC_NB; i++) {
4167
- xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
4168
- xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
4169
- xacc[i ^ 1] += data_val; /* swap adjacent lanes */
4170
- xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4339
+ XXH3_scalarRound(acc, input, secret, i);
4171
4340
  }
4172
4341
  }
4173
4342
 
4343
+ /*!
4344
+ * @internal
4345
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
4346
+ *
4347
+ * This is extracted to its own function because the NEON path uses a combination
4348
+ * of NEON and scalar.
4349
+ */
4174
4350
  XXH_FORCE_INLINE void
4175
- XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4351
+ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
4352
+ void const* XXH_RESTRICT secret,
4353
+ size_t lane)
4176
4354
  {
4177
- XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4355
+ xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4178
4356
  const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
4179
- size_t i;
4180
4357
  XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
4181
- for (i=0; i < XXH_ACC_NB; i++) {
4182
- xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
4183
- xxh_u64 acc64 = xacc[i];
4358
+ XXH_ASSERT(lane < XXH_ACC_NB);
4359
+ {
4360
+ xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
4361
+ xxh_u64 acc64 = xacc[lane];
4184
4362
  acc64 = XXH_xorshift64(acc64, 47);
4185
4363
  acc64 ^= key64;
4186
4364
  acc64 *= XXH_PRIME32_1;
4187
- xacc[i] = acc64;
4365
+ xacc[lane] = acc64;
4366
+ }
4367
+ }
4368
+
4369
+ /*!
4370
+ * @internal
4371
+ * @brief Scrambles the accumulators after a large chunk has been read
4372
+ */
4373
+ XXH_FORCE_INLINE void
4374
+ XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4375
+ {
4376
+ size_t i;
4377
+ for (i=0; i < XXH_ACC_NB; i++) {
4378
+ XXH3_scalarScrambleRound(acc, secret, i);
4188
4379
  }
4189
4380
  }
4190
4381
 
@@ -4206,8 +4397,9 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4206
4397
  * placed sequentially, in order, at the top of the unrolled loop.
4207
4398
  *
4208
4399
  * While MOVK is great for generating constants (2 cycles for a 64-bit
4209
- * constant compared to 4 cycles for LDR), long MOVK chains stall the
4210
- * integer pipelines:
4400
+ * constant compared to 4 cycles for LDR), it fights for bandwidth with
4401
+ * the arithmetic instructions.
4402
+ *
4211
4403
  * I L S
4212
4404
  * MOVK
4213
4405
  * MOVK
@@ -4224,6 +4416,9 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4224
4416
  * ADD LDR
4225
4417
  * SUB STR
4226
4418
  * STR
4419
+ *
4420
+ * See XXH3_NEON_LANES for details on the pipeline.
4421
+ *
4227
4422
  * XXH3_64bits_withSeed, len == 256, Snapdragon 835
4228
4423
  * without hack: 2654.4 MB/s
4229
4424
  * with hack: 3202.9 MB/s
@@ -4422,9 +4617,11 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
4422
4617
  }
4423
4618
 
4424
4619
  /*
4425
- * It's important for performance that XXH3_hashLong is not inlined.
4620
+ * It's important for performance to transmit the secret's size (when it's static)
4621
+ * so that the compiler can properly optimize the vectorized loop.
4622
+ * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
4426
4623
  */
4427
- XXH_NO_INLINE XXH64_hash_t
4624
+ XXH_FORCE_INLINE XXH64_hash_t
4428
4625
  XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
4429
4626
  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4430
4627
  {
@@ -4433,11 +4630,10 @@ XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
4433
4630
  }
4434
4631
 
4435
4632
  /*
4436
- * It's important for performance that XXH3_hashLong is not inlined.
4437
- * Since the function is not inlined, the compiler may not be able to understand that,
4438
- * in some scenarios, its `secret` argument is actually a compile time constant.
4439
- * This variant enforces that the compiler can detect that,
4440
- * and uses this opportunity to streamline the generated code for better performance.
4633
+ * It's preferable for performance that XXH3_hashLong is not inlined,
4634
+ * as it results in a smaller function for small data, easier on the instruction cache.
4635
+ * Note that inside this no_inline function, we do inline the internal loop,
4636
+ * and provide a statically defined secret size to allow optimization of the vector loop.
4441
4637
  */
4442
4638
  XXH_NO_INLINE XXH64_hash_t
4443
4639
  XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
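The two comments above describe an inlining split: keep the public long-hash entry points out of line (small code, friendlier to the instruction cache) while force-inlining the core, so the secret size becomes a compile-time constant inside each entry point. A generic sketch of that pattern, with hypothetical names and GCC/Clang attributes assumed:

#include <stddef.h>
#include <stdint.h>

#if defined(__GNUC__) || defined(__clang__)
#  define SK_FORCE_INLINE static inline __attribute__((always_inline))
#  define SK_NO_INLINE    __attribute__((noinline))
#else
#  define SK_FORCE_INLINE static inline
#  define SK_NO_INLINE
#endif

/* Core loop: once inlined with a constant secretLen, the compiler can
 * specialize (and vectorize) it per call site. Stand-in arithmetic only. */
SK_FORCE_INLINE uint64_t hash_core(const uint8_t* in, size_t len,
                                   const uint8_t* secret, size_t secretLen)
{
    uint64_t acc = 0;
    size_t i;
    for (i = 0; i < len; i++)
        acc = (acc + in[i] + secret[i % secretLen]) * 0x9E3779B185EBCA87ULL;
    return acc;
}

static const uint8_t kDefaultSecret[192] = { 7 };

/* Out-of-line entry point: stays small, but the inlined core
 * sees sizeof(kDefaultSecret) as a compile-time constant. */
SK_NO_INLINE uint64_t hash_long_default(const uint8_t* in, size_t len)
{
    return hash_core(in, len, kDefaultSecret, sizeof(kDefaultSecret));
}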
@@ -4537,6 +4733,14 @@ XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
4537
4733
  return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
4538
4734
  }
4539
4735
 
4736
+ XXH_PUBLIC_API XXH64_hash_t
4737
+ XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
4738
+ {
4739
+ if (len <= XXH3_MIDSIZE_MAX)
4740
+ return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
4741
+ return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);
4742
+ }
4743
+
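A hedged usage sketch for the new one-shot entry point (not part of the diff; the buffer size and seed value are arbitrary, and XXH_INLINE_ALL is assumed so the unstable secret/seed declarations are visible). As the code just above shows, the custom secret only takes effect past XXH3_MIDSIZE_MAX; shorter inputs are hashed with the seed and the built-in secret.

#define XXH_INLINE_ALL          /* exposes the secret/seed extensions */
#include "xxhash.h"

#include <stdio.h>
#include <string.h>

int main(void)
{
    unsigned char secret[XXH3_SECRET_SIZE_MIN];   /* 136 bytes minimum */
    const char msg[] = "a message long enough to matter, or not";

    /* derive a secret from arbitrary seed material (signature from this diff) */
    if (XXH3_generateSecret(secret, sizeof(secret), "seed material", 13) != XXH_OK)
        return 1;

    {   XXH64_hash_t const h = XXH3_64bits_withSecretandSeed(
                msg, strlen(msg), secret, sizeof(secret), /* seed */ 2022);
        printf("%016llx\n", (unsigned long long)h);
    }
    return 0;
}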
4540
4744
 
4541
4745
  /* === XXH3 streaming === */
4542
4746
 
@@ -4625,13 +4829,13 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
4625
4829
  XXH_PUBLIC_API void
4626
4830
  XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
4627
4831
  {
4628
- memcpy(dst_state, src_state, sizeof(*dst_state));
4832
+ XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
4629
4833
  }
4630
4834
 
4631
4835
  static void
4632
4836
  XXH3_reset_internal(XXH3_state_t* statePtr,
4633
- XXH64_hash_t seed,
4634
- const void* secret, size_t secretSize)
4837
+ XXH64_hash_t seed,
4838
+ const void* secret, size_t secretSize)
4635
4839
  {
4636
4840
  size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
4637
4841
  size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
@@ -4648,6 +4852,7 @@ XXH3_reset_internal(XXH3_state_t* statePtr,
4648
4852
  statePtr->acc[6] = XXH_PRIME64_5;
4649
4853
  statePtr->acc[7] = XXH_PRIME32_1;
4650
4854
  statePtr->seed = seed;
4855
+ statePtr->useSeed = (seed != 0);
4651
4856
  statePtr->extSecret = (const unsigned char*)secret;
4652
4857
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4653
4858
  statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
@@ -4680,11 +4885,24 @@ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
4680
4885
  {
4681
4886
  if (statePtr == NULL) return XXH_ERROR;
4682
4887
  if (seed==0) return XXH3_64bits_reset(statePtr);
4683
- if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
4888
+ if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
4889
+ XXH3_initCustomSecret(statePtr->customSecret, seed);
4684
4890
  XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
4685
4891
  return XXH_OK;
4686
4892
  }
4687
4893
 
4894
+ /*! @ingroup xxh3_family */
4895
+ XXH_PUBLIC_API XXH_errorcode
4896
+ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
4897
+ {
4898
+ if (statePtr == NULL) return XXH_ERROR;
4899
+ if (secret == NULL) return XXH_ERROR;
4900
+ if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
4901
+ XXH3_reset_internal(statePtr, seed64, secret, secretSize);
4902
+ statePtr->useSeed = 1; /* always, even if seed64==0 */
4903
+ return XXH_OK;
4904
+ }
4905
+
4688
4906
  /* Note : when XXH3_consumeStripes() is invoked,
4689
4907
  * there must be a guarantee that at least one more byte must be consumed from input
4690
4908
  * so that the function can blindly consume all stripes using the "normal" secret segment */
@@ -4712,35 +4930,48 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
4712
4930
  }
4713
4931
  }
4714
4932
 
4933
+ #ifndef XXH3_STREAM_USE_STACK
4934
+ # ifndef __clang__ /* clang doesn't need additional stack space */
4935
+ # define XXH3_STREAM_USE_STACK 1
4936
+ # endif
4937
+ #endif
4715
4938
  /*
4716
4939
  * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
4717
4940
  */
4718
4941
  XXH_FORCE_INLINE XXH_errorcode
4719
- XXH3_update(XXH3_state_t* state,
4720
- const xxh_u8* input, size_t len,
4942
+ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
4943
+ const xxh_u8* XXH_RESTRICT input, size_t len,
4721
4944
  XXH3_f_accumulate_512 f_acc512,
4722
4945
  XXH3_f_scrambleAcc f_scramble)
4723
4946
  {
4724
- if (input==NULL)
4725
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
4947
+ if (input==NULL) {
4948
+ XXH_ASSERT(len == 0);
4726
4949
  return XXH_OK;
4727
- #else
4728
- return XXH_ERROR;
4729
- #endif
4950
+ }
4730
4951
 
4952
+ XXH_ASSERT(state != NULL);
4731
4953
  { const xxh_u8* const bEnd = input + len;
4732
4954
  const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4733
-
4955
+ #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
4956
+ /* For some reason, gcc and MSVC seem to suffer greatly
4957
+ * when operating on accumulators directly in the state.
4958
+ * Operating in stack space seems to enable proper optimization.
4959
+ * clang, on the other hand, doesn't seem to need this trick */
4960
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
4961
+ #else
4962
+ xxh_u64* XXH_RESTRICT const acc = state->acc;
4963
+ #endif
4734
4964
  state->totalLen += len;
4735
4965
  XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
4736
4966
 
4737
- if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
4967
+ /* small input : just fill in tmp buffer */
4968
+ if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
4738
4969
  XXH_memcpy(state->buffer + state->bufferedSize, input, len);
4739
4970
  state->bufferedSize += (XXH32_hash_t)len;
4740
4971
  return XXH_OK;
4741
4972
  }
4742
- /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4743
4973
 
4974
+ /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4744
4975
  #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
4745
4976
  XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */
4746
4977
 
@@ -4752,7 +4983,7 @@ XXH3_update(XXH3_state_t* state,
4752
4983
  size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
4753
4984
  XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
4754
4985
  input += loadSize;
4755
- XXH3_consumeStripes(state->acc,
4986
+ XXH3_consumeStripes(acc,
4756
4987
  &state->nbStripesSoFar, state->nbStripesPerBlock,
4757
4988
  state->buffer, XXH3_INTERNALBUFFER_STRIPES,
4758
4989
  secret, state->secretLimit,
@@ -4761,25 +4992,62 @@ XXH3_update(XXH3_state_t* state,
4761
4992
  }
4762
4993
  XXH_ASSERT(input < bEnd);
4763
4994
 
4764
- /* Consume input by a multiple of internal buffer size */
4765
- if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
4766
- const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
4767
- do {
4768
- XXH3_consumeStripes(state->acc,
4769
- &state->nbStripesSoFar, state->nbStripesPerBlock,
4770
- input, XXH3_INTERNALBUFFER_STRIPES,
4771
- secret, state->secretLimit,
4772
- f_acc512, f_scramble);
4773
- input += XXH3_INTERNALBUFFER_SIZE;
4774
- } while (input<limit);
4775
- /* for last partial stripe */
4776
- memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
4995
+ /* large input to consume : ingest per full block */
4996
+ if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
4997
+ size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
4998
+ XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
4999
+ /* join to current block's end */
5000
+ { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
5001
+ XXH_ASSERT(nbStripesToEnd <= nbStripes);
5002
+ XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
5003
+ f_scramble(acc, secret + state->secretLimit);
5004
+ state->nbStripesSoFar = 0;
5005
+ input += nbStripesToEnd * XXH_STRIPE_LEN;
5006
+ nbStripes -= nbStripesToEnd;
5007
+ }
5008
+ /* consume per entire blocks */
5009
+ while(nbStripes >= state->nbStripesPerBlock) {
5010
+ XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
5011
+ f_scramble(acc, secret + state->secretLimit);
5012
+ input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
5013
+ nbStripes -= state->nbStripesPerBlock;
5014
+ }
5015
+ /* consume last partial block */
5016
+ XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
5017
+ input += nbStripes * XXH_STRIPE_LEN;
5018
+ XXH_ASSERT(input < bEnd); /* at least some bytes left */
5019
+ state->nbStripesSoFar = nbStripes;
5020
+ /* buffer predecessor of last partial stripe */
5021
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5022
+ XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
5023
+ } else {
5024
+ /* content to consume <= block size */
5025
+ /* Consume input by a multiple of internal buffer size */
5026
+ if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
5027
+ const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
5028
+ do {
5029
+ XXH3_consumeStripes(acc,
5030
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
5031
+ input, XXH3_INTERNALBUFFER_STRIPES,
5032
+ secret, state->secretLimit,
5033
+ f_acc512, f_scramble);
5034
+ input += XXH3_INTERNALBUFFER_SIZE;
5035
+ } while (input<limit);
5036
+ /* buffer predecessor of last partial stripe */
5037
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5038
+ }
4777
5039
  }
4778
- XXH_ASSERT(input < bEnd);
4779
5040
 
4780
5041
  /* Some remaining input (always) : buffer it */
5042
+ XXH_ASSERT(input < bEnd);
5043
+ XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
5044
+ XXH_ASSERT(state->bufferedSize == 0);
4781
5045
  XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
4782
5046
  state->bufferedSize = (XXH32_hash_t)(bEnd-input);
5047
+ #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
5048
+ /* save stack accumulators into state */
5049
+ memcpy(state->acc, acc, sizeof(acc));
5050
+ #endif
4783
5051
  }
4784
5052
 
4785
5053
  return XXH_OK;
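The comment inside the hunk above notes that gcc and MSVC optimize better when the accumulators are copied to the stack, updated there, and written back once at the end. A generic illustration of that pattern, with hypothetical names (not the library's code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct { uint64_t acc[8]; /* ...other streaming fields... */ } sketch_state_t;

/* stand-in for the real accumulate step over one 64-byte block */
static void consume_block_sketch(uint64_t acc[8], const unsigned char* block)
{
    size_t i;
    for (i = 0; i < 8; i++) {
        uint64_t v;
        memcpy(&v, block + i * 8, sizeof v);
        acc[i] += v;
    }
}

static void update_sketch(sketch_state_t* state, const unsigned char* blocks, size_t nbBlocks)
{
    uint64_t local[8];
    size_t b;
    memcpy(local, state->acc, sizeof local);   /* load state once            */
    for (b = 0; b < nbBlocks; b++)
        consume_block_sketch(local, blocks + b * 64);
    memcpy(state->acc, local, sizeof local);   /* store back once at the end */
}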
@@ -4803,7 +5071,7 @@ XXH3_digest_long (XXH64_hash_t* acc,
4803
5071
  * Digest on a local copy. This way, the state remains unaltered, and it can
4804
5072
  * continue ingesting more input afterwards.
4805
5073
  */
4806
- memcpy(acc, state->acc, sizeof(state->acc));
5074
+ XXH_memcpy(acc, state->acc, sizeof(state->acc));
4807
5075
  if (state->bufferedSize >= XXH_STRIPE_LEN) {
4808
5076
  size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
4809
5077
  size_t nbStripesSoFar = state->nbStripesSoFar;
@@ -4820,8 +5088,8 @@ XXH3_digest_long (XXH64_hash_t* acc,
4820
5088
  xxh_u8 lastStripe[XXH_STRIPE_LEN];
4821
5089
  size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
4822
5090
  XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
4823
- memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
4824
- memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
5091
+ XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
5092
+ XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
4825
5093
  XXH3_accumulate_512(acc,
4826
5094
  lastStripe,
4827
5095
  secret + state->secretLimit - XXH_SECRET_LASTACC_START);
@@ -4840,58 +5108,13 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
4840
5108
  (xxh_u64)state->totalLen * XXH_PRIME64_1);
4841
5109
  }
4842
5110
  /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
4843
- if (state->seed)
5111
+ if (state->useSeed)
4844
5112
  return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
4845
5113
  return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
4846
5114
  secret, state->secretLimit + XXH_STRIPE_LEN);
4847
5115
  }
4848
5116
 
4849
5117
 
4850
- #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
4851
-
4852
- /*! @ingroup xxh3_family */
4853
- XXH_PUBLIC_API void
4854
- XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
4855
- {
4856
- XXH_ASSERT(secretBuffer != NULL);
4857
- if (customSeedSize == 0) {
4858
- memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4859
- return;
4860
- }
4861
- XXH_ASSERT(customSeed != NULL);
4862
-
4863
- { size_t const segmentSize = sizeof(XXH128_hash_t);
4864
- size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
4865
- XXH128_canonical_t scrambler;
4866
- XXH64_hash_t seeds[12];
4867
- size_t segnb;
4868
- XXH_ASSERT(nbSegments == 12);
4869
- XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
4870
- XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
4871
-
4872
- /*
4873
- * Copy customSeed to seeds[], truncating or repeating as necessary.
4874
- */
4875
- { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
4876
- size_t filled = toFill;
4877
- memcpy(seeds, customSeed, toFill);
4878
- while (filled < sizeof(seeds)) {
4879
- toFill = XXH_MIN(filled, sizeof(seeds) - filled);
4880
- memcpy((char*)seeds + filled, seeds, toFill);
4881
- filled += toFill;
4882
- } }
4883
-
4884
- /* generate secret */
4885
- memcpy(secretBuffer, &scrambler, sizeof(scrambler));
4886
- for (segnb=1; segnb < nbSegments; segnb++) {
4887
- size_t const segmentStart = segnb * segmentSize;
4888
- XXH128_canonical_t segment;
4889
- XXH128_canonicalFromHash(&segment,
4890
- XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
4891
- memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
4892
- } }
4893
- }
4894
-
4895
5118
 
4896
5119
  /* ==========================================
4897
5120
  * XXH3 128 bits (a.k.a XXH128)
@@ -5193,9 +5416,10 @@ XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
5193
5416
  }
5194
5417
 
5195
5418
  /*
5196
- * It's important for performance that XXH3_hashLong is not inlined.
5419
+ * It's important for performance to pass @secretLen (when it's static)
5420
+ * to the compiler, so that it can properly optimize the vectorized loop.
5197
5421
  */
5198
- XXH_NO_INLINE XXH128_hash_t
5422
+ XXH_FORCE_INLINE XXH128_hash_t
5199
5423
  XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
5200
5424
  XXH64_hash_t seed64,
5201
5425
  const void* XXH_RESTRICT secret, size_t secretLen)
@@ -5288,6 +5512,15 @@ XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
5288
5512
  XXH3_hashLong_128b_withSeed);
5289
5513
  }
5290
5514
 
5515
+ /*! @ingroup xxh3_family */
5516
+ XXH_PUBLIC_API XXH128_hash_t
5517
+ XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
5518
+ {
5519
+ if (len <= XXH3_MIDSIZE_MAX)
5520
+ return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
5521
+ return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
5522
+ }
5523
+
5291
5524
  /*! @ingroup xxh3_family */
5292
5525
  XXH_PUBLIC_API XXH128_hash_t
5293
5526
  XXH128(const void* input, size_t len, XXH64_hash_t seed)
@@ -5299,7 +5532,7 @@ XXH128(const void* input, size_t len, XXH64_hash_t seed)
5299
5532
  /* === XXH3 128-bit streaming === */
5300
5533
 
5301
5534
  /*
5302
- * All the functions are actually the same as for 64-bit streaming variant.
5535
+ * All initialization and update functions are identical to the 64-bit streaming variant.
5303
5536
  * The only difference is the finalization routine.
5304
5537
  */
5305
5538
 
@@ -5307,31 +5540,28 @@ XXH128(const void* input, size_t len, XXH64_hash_t seed)
5307
5540
  XXH_PUBLIC_API XXH_errorcode
5308
5541
  XXH3_128bits_reset(XXH3_state_t* statePtr)
5309
5542
  {
5310
- if (statePtr == NULL) return XXH_ERROR;
5311
- XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
5312
- return XXH_OK;
5543
+ return XXH3_64bits_reset(statePtr);
5313
5544
  }
5314
5545
 
5315
5546
  /*! @ingroup xxh3_family */
5316
5547
  XXH_PUBLIC_API XXH_errorcode
5317
5548
  XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
5318
5549
  {
5319
- if (statePtr == NULL) return XXH_ERROR;
5320
- XXH3_reset_internal(statePtr, 0, secret, secretSize);
5321
- if (secret == NULL) return XXH_ERROR;
5322
- if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5323
- return XXH_OK;
5550
+ return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
5324
5551
  }
5325
5552
 
5326
5553
  /*! @ingroup xxh3_family */
5327
5554
  XXH_PUBLIC_API XXH_errorcode
5328
5555
  XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
5329
5556
  {
5330
- if (statePtr == NULL) return XXH_ERROR;
5331
- if (seed==0) return XXH3_128bits_reset(statePtr);
5332
- if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
5333
- XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
5334
- return XXH_OK;
5557
+ return XXH3_64bits_reset_withSeed(statePtr, seed);
5558
+ }
5559
+
5560
+ /*! @ingroup xxh3_family */
5561
+ XXH_PUBLIC_API XXH_errorcode
5562
+ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
5563
+ {
5564
+ return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
5335
5565
  }
5336
5566
 
5337
5567
  /*! @ingroup xxh3_family */
@@ -5406,8 +5636,8 @@ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
5406
5636
  hash.high64 = XXH_swap64(hash.high64);
5407
5637
  hash.low64 = XXH_swap64(hash.low64);
5408
5638
  }
5409
- memcpy(dst, &hash.high64, sizeof(hash.high64));
5410
- memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
5639
+ XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
5640
+ XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
5411
5641
  }
5412
5642
 
5413
5643
  /*! @ingroup xxh3_family */
@@ -5420,6 +5650,77 @@ XXH128_hashFromCanonical(const XXH128_canonical_t* src)
5420
5650
  return h;
5421
5651
  }
5422
5652
 
5653
+
5654
+
5655
+ /* ==========================================
5656
+ * Secret generators
5657
+ * ==========================================
5658
+ */
5659
+ #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
5660
+
5661
+ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
5662
+ {
5663
+ XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
5664
+ XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
5665
+ }
5666
+
5667
+ /*! @ingroup xxh3_family */
5668
+ XXH_PUBLIC_API XXH_errorcode
5669
+ XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
5670
+ {
5671
+ #if (XXH_DEBUGLEVEL >= 1)
5672
+ XXH_ASSERT(secretBuffer != NULL);
5673
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5674
+ #else
5675
+ /* production mode, assert() are disabled */
5676
+ if (secretBuffer == NULL) return XXH_ERROR;
5677
+ if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5678
+ #endif
5679
+
5680
+ if (customSeedSize == 0) {
5681
+ customSeed = XXH3_kSecret;
5682
+ customSeedSize = XXH_SECRET_DEFAULT_SIZE;
5683
+ }
5684
+ #if (XXH_DEBUGLEVEL >= 1)
5685
+ XXH_ASSERT(customSeed != NULL);
5686
+ #else
5687
+ if (customSeed == NULL) return XXH_ERROR;
5688
+ #endif
5689
+
5690
+ /* Fill secretBuffer with a copy of customSeed - repeat as needed */
5691
+ { size_t pos = 0;
5692
+ while (pos < secretSize) {
5693
+ size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
5694
+ memcpy((char*)secretBuffer + pos, customSeed, toCopy);
5695
+ pos += toCopy;
5696
+ } }
5697
+
5698
+ { size_t const nbSeg16 = secretSize / 16;
5699
+ size_t n;
5700
+ XXH128_canonical_t scrambler;
5701
+ XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
5702
+ for (n=0; n<nbSeg16; n++) {
5703
+ XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
5704
+ XXH3_combine16((char*)secretBuffer + n*16, h128);
5705
+ }
5706
+ /* last segment */
5707
+ XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
5708
+ }
5709
+ return XXH_OK;
5710
+ }
5711
+
5712
+ /*! @ingroup xxh3_family */
5713
+ XXH_PUBLIC_API void
5714
+ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
5715
+ {
5716
+ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
5717
+ XXH3_initCustomSecret(secret, seed);
5718
+ XXH_ASSERT(secretBuffer != NULL);
5719
+ memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
5720
+ }
5721
+
5722
+
5723
+
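To round off the secret generators, a streaming sketch combining XXH3_generateSecret_fromSeed() with the new reset_withSecretandSeed() entry point. Illustrative only: the 192-byte buffer mirrors XXH_SECRET_DEFAULT_SIZE written by the generator above, and XXH_INLINE_ALL is assumed.

#define XXH_INLINE_ALL
#include "xxhash.h"

#include <string.h>

static XXH64_hash_t hash_two_chunks(const void* a, size_t lenA,
                                    const void* b, size_t lenB,
                                    XXH64_hash_t seed)
{
    unsigned char secret[192];      /* XXH_SECRET_DEFAULT_SIZE, filled below */
    XXH64_hash_t result = 0;
    XXH3_state_t* const st = XXH3_createState();
    if (st == NULL) return 0;

    XXH3_generateSecret_fromSeed(secret, seed);
    if (XXH3_64bits_reset_withSecretandSeed(st, secret, sizeof(secret), seed) == XXH_OK) {
        XXH3_64bits_update(st, a, lenA);
        XXH3_64bits_update(st, b, lenB);
        result = XXH3_64bits_digest(st);
    }
    XXH3_freeState(st);
    return result;
}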
5423
5724
  /* Pop our optimization override from above */
5424
5725
  #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
5425
5726
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \