digest-xxhash 0.2.1 → 0.2.2

This diff shows the changes between publicly released versions of this package as they appear in its public registry, and is provided for informational purposes only.
@@ -1,7 +1,7 @@
  /*
  * xxHash - Extremely Fast Hash algorithm
  * Header File
- * Copyright (C) 2012-2020 Yann Collet
+ * Copyright (C) 2012-2021 Yann Collet
  *
  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
  *
@@ -157,6 +157,7 @@ extern "C" {
  # undef XXH3_64bits
  # undef XXH3_64bits_withSecret
  # undef XXH3_64bits_withSeed
+ # undef XXH3_64bits_withSecretandSeed
  # undef XXH3_createState
  # undef XXH3_freeState
  # undef XXH3_copyState
@@ -174,6 +175,7 @@ extern "C" {
  # undef XXH3_128bits_reset
  # undef XXH3_128bits_reset_withSeed
  # undef XXH3_128bits_reset_withSecret
+ # undef XXH3_128bits_reset_withSecretandSeed
  # undef XXH3_128bits_update
  # undef XXH3_128bits_digest
  # undef XXH128_isEqual
@@ -284,23 +286,28 @@ extern "C" {
  # define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
  # define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
  # define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+ # define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
  # define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
  # define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
  # define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
  # define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
  # define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
  # define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+ # define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
  # define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
  # define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
  # define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+ # define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
  /* XXH3_128bits */
  # define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
  # define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
  # define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
  # define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+ # define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
  # define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
  # define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
  # define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+ # define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
  # define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
  # define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
  # define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
@@ -321,16 +328,16 @@ extern "C" {
  /*!
  * @brief Obtains the xxHash version.
  *
- * This is only useful when xxHash is compiled as a shared library, as it is
- * independent of the version defined in the header.
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to header file.
  *
- * @return `XXH_VERSION_NUMBER` as of when the libray was compiled.
+ * @return `XXH_VERSION_NUMBER` of the invoked library.
  */
  XXH_PUBLIC_API unsigned XXH_versionNumber (void);


  /* ****************************
- * Definitions
+ * Common basic types
  ******************************/
  #include <stddef.h> /* size_t */
  typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
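
Not part of the packaged diff: a minimal sketch of calling XXH_versionNumber() against a stock xxhash.h, assuming the usual XXH_VERSION_NUMBER encoding (major*100*100 + minor*100 + release).

#include <stdio.h>
#include "xxhash.h"

int main(void)
{
    unsigned const v = XXH_versionNumber();  /* value reported by the compiled library, not the header */
    printf("xxHash %u.%u.%u\n", v / 10000, (v / 100) % 100, v % 100);
    return 0;
}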
@@ -374,10 +381,9 @@ typedef uint32_t XXH32_hash_t;
  * Contains functions used in the classic 32-bit xxHash algorithm.
  *
  * @note
- * XXH32 is considered rather weak by today's standards.
- * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
- * systems, and offers true 64/128 bit hash results. It provides a superior
- * level of dispersion, and greatly reduces the risks of collisions.
+ * XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ * Note that @ref xxh3_family provides competitive speed
+ * for both 32-bit and 64-bit systems, and offers true 64/128 bit hash results.
  *
  * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
  * @see @ref xxh32_impl for implementation details
@@ -594,36 +600,39 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t
  XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);


+ #ifdef __has_attribute
+ # define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+ #else
+ # define XXH_HAS_ATTRIBUTE(x) 0
+ #endif
+
+ /* C-language Attributes are added in C23. */
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
+ # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+ #else
+ # define XXH_HAS_C_ATTRIBUTE(x) 0
+ #endif
+
+ #if defined(__cplusplus) && defined(__has_cpp_attribute)
+ # define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+ #else
+ # define XXH_HAS_CPP_ATTRIBUTE(x) 0
+ #endif
+
  /*
  Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
  introduced in CPP17 and C23.
  CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
  C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough
  */
-
- #if defined (__has_c_attribute) && defined (__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) /* C2x */
- # if __has_c_attribute(fallthrough)
- # define XXH_FALLTHROUGH [[fallthrough]]
- # endif
-
- #elif defined(__cplusplus) && defined(__has_cpp_attribute)
- # if __has_cpp_attribute(fallthrough)
- # define XXH_FALLTHROUGH [[fallthrough]]
- # endif
- #endif
-
- #ifndef XXH_FALLTHROUGH
- # if defined(__GNUC__) && __GNUC__ >= 7
- # define XXH_FALLTHROUGH __attribute__ ((fallthrough))
- # elif defined(__clang__) && (__clang_major__ >= 10) \
- && (!defined(__APPLE__) || (__clang_major__ >= 12))
- /* Apple clang 12 is effectively clang-10 ,
- * see https://en.wikipedia.org/wiki/Xcode for details
- */
- # define XXH_FALLTHROUGH __attribute__ ((fallthrough))
- # else
- # define XXH_FALLTHROUGH
- # endif
+ #if XXH_HAS_C_ATTRIBUTE(x)
+ # define XXH_FALLTHROUGH [[fallthrough]]
+ #elif XXH_HAS_CPP_ATTRIBUTE(x)
+ # define XXH_FALLTHROUGH [[fallthrough]]
+ #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+ # define XXH_FALLTHROUGH __attribute__ ((fallthrough))
+ #else
+ # define XXH_FALLTHROUGH
  #endif

  /*!
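
For reference, a hedged sketch (not taken from the diff) of how a macro like XXH_FALLTHROUGH is used: it annotates a deliberate fall-through between case labels so that -Wimplicit-fallthrough style warnings stay quiet, whichever of the branches above defined it.

#include <stddef.h>
#include "xxhash.h"   /* assumption: XXH_FALLTHROUGH is visible after including the header */

static unsigned add_tail_bytes(unsigned h, const unsigned char* p, size_t len)
{
    switch (len & 3) {
    case 3: h += (unsigned)p[2] << 16; XXH_FALLTHROUGH;   /* deliberate fall-through */
    case 2: h += (unsigned)p[1] << 8;  XXH_FALLTHROUGH;
    case 1: h += p[0];                 break;
    default:                           break;
    }
    return h;
}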
@@ -669,8 +678,8 @@ typedef uint64_t XXH64_hash_t;
669
678
  *
670
679
  * @note
671
680
  * XXH3 provides competitive speed for both 32-bit and 64-bit systems,
672
- * and offers true 64/128 bit hash results. It provides a superior level of
673
- * dispersion, and greatly reduces the risks of collisions.
681
+ * and offers true 64/128 bit hash results.
682
+ * It provides better speed for systems with vector processing capabilities.
674
683
  */
675
684
 
676
685
 
@@ -719,6 +728,8 @@ typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t
719
728
  XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
720
729
  XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
721
730
 
731
+ #ifndef XXH_NO_XXH3
732
+
722
733
  /*!
723
734
  * @}
724
735
  * ************************************************************************
@@ -796,13 +807,17 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, X
  * It's possible to provide any blob of bytes as a "secret" to generate the hash.
  * This makes it more difficult for an external actor to prepare an intentional collision.
  * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
- * However, the quality of produced hash values depends on secret's entropy.
- * Technically, the secret must look like a bunch of random bytes.
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
  * Avoid "trivial" or structured data such as repeated sequences or a text document.
- * Whenever unsure about the "randomness" of the blob of bytes,
- * consider relabelling it as a "custom seed" instead,
- * and employ "XXH3_generateSecret()" (see below)
- * to generate a high entropy secret derived from the custom seed.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing "XXH3_generateSecret()" instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
  */
  XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);

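A hedged usage sketch of the contract described above (not part of the diff): derive a well-conditioned secret from an arbitrary blob with XXH3_generateSecret(), then hash with XXH3_64bits_withSecret(). It assumes the signatures shown in this file and that XXH_STATIC_LINKING_ONLY is defined, since XXH3_generateSecret() sits in the experimental section.

#include <stdio.h>
#define XXH_STATIC_LINKING_ONLY   /* XXH3_generateSecret() is experimental API */
#include "xxhash.h"

int main(void)
{
    /* Any blob may feed the generator, even a "poor entropy" one. */
    static const char blob[] = "application-specific, not very random, identifier";
    unsigned char secret[XXH3_SECRET_SIZE_MIN];   /* secretSize must be >= XXH3_SECRET_SIZE_MIN */

    if (XXH3_generateSecret(secret, sizeof(secret), blob, sizeof(blob)-1) != XXH_OK)
        return 1;

    {   static const char msg[] = "message to hash";
        XXH64_hash_t const h = XXH3_64bits_withSecret(msg, sizeof(msg)-1, secret, sizeof(secret));
        printf("%016llx\n", (unsigned long long)h);
    }
    return 0;
}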
@@ -922,6 +937,7 @@ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_has
922
937
  XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
923
938
 
924
939
 
940
+ #endif /* !XXH_NO_XXH3 */
925
941
  #endif /* XXH_NO_LONG_LONG */
926
942
 
927
943
  /*!
@@ -962,13 +978,10 @@ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t*
962
978
  struct XXH32_state_s {
963
979
  XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
964
980
  XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
965
- XXH32_hash_t v1; /*!< First accumulator lane */
966
- XXH32_hash_t v2; /*!< Second accumulator lane */
967
- XXH32_hash_t v3; /*!< Third accumulator lane */
968
- XXH32_hash_t v4; /*!< Fourth accumulator lane */
981
+ XXH32_hash_t v[4]; /*!< Accumulator lanes */
969
982
  XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
970
983
  XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */
971
- XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may be removed. */
984
+ XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */
972
985
  }; /* typedef'd to XXH32_state_t */
973
986
 
974
987
 
@@ -988,16 +1001,15 @@ struct XXH32_state_s {
988
1001
  */
989
1002
  struct XXH64_state_s {
990
1003
  XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */
991
- XXH64_hash_t v1; /*!< First accumulator lane */
992
- XXH64_hash_t v2; /*!< Second accumulator lane */
993
- XXH64_hash_t v3; /*!< Third accumulator lane */
994
- XXH64_hash_t v4; /*!< Fourth accumulator lane */
1004
+ XXH64_hash_t v[4]; /*!< Accumulator lanes */
995
1005
  XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
996
1006
  XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */
997
1007
  XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/
998
- XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it may be removed. */
1008
+ XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */
999
1009
  }; /* typedef'd to XXH64_state_t */
1000
1010
 
1011
+ #ifndef XXH_NO_XXH3
1012
+
1001
1013
  #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
1002
1014
  # include <stdalign.h>
1003
1015
  # define XXH_ALIGN(n) alignas(n)
@@ -1070,7 +1082,7 @@ struct XXH3_state_s {
1070
1082
  /*!< The internal buffer. @see XXH32_state_s::mem32 */
1071
1083
  XXH32_hash_t bufferedSize;
1072
1084
  /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
1073
- XXH32_hash_t reserved32;
1085
+ XXH32_hash_t useSeed;
1074
1086
  /*!< Reserved field. Needed for padding on 64-bit. */
1075
1087
  size_t nbStripesSoFar;
1076
1088
  /*!< Number or stripes processed. */
@@ -1106,6 +1118,12 @@ struct XXH3_state_s {
1106
1118
  #define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
1107
1119
 
1108
1120
 
1121
+ /* XXH128() :
1122
+ * simple alias to pre-selected XXH3_128bits variant
1123
+ */
1124
+ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
1125
+
1126
+
1109
1127
  /* === Experimental API === */
1110
1128
  /* Symbols defined below must be considered tied to a specific library version. */
1111
1129
 
@@ -1118,33 +1136,92 @@ struct XXH3_state_s {
1118
1136
  * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
1119
1137
  *
1120
1138
  * The function accepts as input a custom seed of any length and any content,
1121
- * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
1122
- * into an already allocated buffer secretBuffer.
1123
- * The generated secret is _always_ XXH_SECRET_DEFAULT_SIZE bytes long.
1139
+ * and derives from it a high-entropy secret of length @secretSize
1140
+ * into an already allocated buffer @secretBuffer.
1141
+ * @secretSize must be >= XXH3_SECRET_SIZE_MIN
1124
1142
  *
1125
1143
  * The generated secret can then be used with any `*_withSecret()` variant.
1126
1144
  * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
1127
1145
  * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
1128
1146
  * are part of this list. They all accept a `secret` parameter
1129
- * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
1147
+ * which must be large enough for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
1130
1148
  * _and_ feature very high entropy (consist of random-looking bytes).
1131
1149
  * These conditions can be a high bar to meet, so
1132
- * this function can be used to generate a secret of proper quality.
1150
+ * XXH3_generateSecret() can be employed to ensure proper quality.
1133
1151
  *
1134
1152
  * customSeed can be anything. It can have any size, even small ones,
1135
- * and its content can be anything, even stupidly "low entropy" source such as a bunch of zeroes.
1136
- * The resulting `secret` will nonetheless provide all expected qualities.
1153
+ * and its content can be anything, even "poor entropy" sources such as a bunch of zeroes.
1154
+ * The resulting `secret` will nonetheless provide all required qualities.
1137
1155
  *
1138
- * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
1139
1156
  * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
1140
1157
  */
1141
- XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
1158
+ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize);
1142
1159
 
1143
1160
 
1144
- /* simple short-cut to pre-selected XXH3_128bits variant */
1145
- XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
1161
+ /*
1162
+ * XXH3_generateSecret_fromSeed():
1163
+ *
1164
+ * Generate the same secret as the _withSeed() variants.
1165
+ *
1166
+ * The resulting secret has a length of XXH3_SECRET_DEFAULT_SIZE (necessarily).
1167
+ * @secretBuffer must be already allocated, of size at least XXH3_SECRET_DEFAULT_SIZE bytes.
1168
+ *
1169
+ * The generated secret can be used in combination with
1170
+ *`*_withSecret()` and `_withSecretandSeed()` variants.
1171
+ * This generator is notably useful in combination with `_withSecretandSeed()`,
1172
+ * as a way to emulate a faster `_withSeed()` variant.
1173
+ */
1174
+ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed);
1175
+
1176
+ /*
1177
+ * *_withSecretandSeed() :
1178
+ * These variants generate hash values using either
1179
+ * @seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
1180
+ * or @secret for "large" keys (>= XXH3_MIDSIZE_MAX).
1181
+ *
1182
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
1183
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
1184
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
1185
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
1186
+ * which requires more instructions than _withSeed() variants.
1187
+ * Therefore, _withSecretandSeed variant combines the best of both worlds.
1188
+ *
1189
+ * When @secret has been generated by XXH3_generateSecret_fromSeed(),
1190
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
1191
+ * hence offering only a pure speed benefit on "large" input,
1192
+ * by skipping the need to regenerate the secret for every large input.
1193
+ *
1194
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
1195
+ * for example with XXH3_64bits(), which then becomes the seed,
1196
+ * and then employ both the seed and the secret in _withSecretandSeed().
1197
+ * On top of speed, an added benefit is that each bit in the secret
1198
+ * has a 50% chance to swap each bit in the output,
1199
+ * via its impact to the seed.
1200
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
1201
+ * because only portions of the secret are employed for small data.
1202
+ */
1203
+ XXH_PUBLIC_API XXH64_hash_t
1204
+ XXH3_64bits_withSecretandSeed(const void* data, size_t len,
1205
+ const void* secret, size_t secretSize,
1206
+ XXH64_hash_t seed);
1207
+
1208
+ XXH_PUBLIC_API XXH128_hash_t
1209
+ XXH3_128bits_withSecretandSeed(const void* data, size_t len,
1210
+ const void* secret, size_t secretSize,
1211
+ XXH64_hash_t seed64);
1212
+
1213
+ XXH_PUBLIC_API XXH_errorcode
1214
+ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1215
+ const void* secret, size_t secretSize,
1216
+ XXH64_hash_t seed64);
1217
+
1218
+ XXH_PUBLIC_API XXH_errorcode
1219
+ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr,
1220
+ const void* secret, size_t secretSize,
1221
+ XXH64_hash_t seed64);
1146
1222
 
1147
1223
 
1224
+ #endif /* !XXH_NO_XXH3 */
1148
1225
  #endif /* XXH_NO_LONG_LONG */
1149
1226
  #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
1150
1227
  # define XXH_IMPLEMENTATION
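
A worked sketch of the pattern described in the comment block above (not part of the diff): build the seed-derived secret once with XXH3_generateSecret_fromSeed(), then hash with XXH3_64bits_withSecretandSeed(). Signatures are the ones declared in this hunk; the XXH3_SECRET_DEFAULT_SIZE buffer size and the XXH_STATIC_LINKING_ONLY guard are assumptions about the surrounding header.

#include <stdio.h>
#define XXH_STATIC_LINKING_ONLY   /* the _withSecretandSeed() entry points are experimental */
#include "xxhash.h"

int main(void)
{
    XXH64_hash_t const seed = 0x9E3779B185EBCA87ULL;
    unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];

    /* Same secret the _withSeed() variants would otherwise rebuild for every large input. */
    XXH3_generateSecret_fromSeed(secret, seed);

    {   static const char msg[] = "a large-ish input would normally go here";
        XXH64_hash_t const h = XXH3_64bits_withSecretandSeed(msg, sizeof(msg)-1,
                                                             secret, sizeof(secret), seed);
        printf("%016llx\n", (unsigned long long)h);
    }
    return 0;
}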
@@ -1221,7 +1298,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1221
1298
  * Use `memcpy()`. Safe and portable. Note that most modern compilers will
1222
1299
  * eliminate the function call and treat it as an unaligned access.
1223
1300
  *
1224
- * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
1301
+ * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
1225
1302
  * @par
1226
1303
  * Depends on compiler extensions and is therefore not portable.
1227
1304
  * This method is safe _if_ your compiler supports it,
@@ -1248,22 +1325,12 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1248
1325
  * care, as what works on one compiler/platform/optimization level may cause
1249
1326
  * another to read garbage data or even crash.
1250
1327
  *
1251
- * See https://stackoverflow.com/a/32095106/646947 for details.
1328
+ * See http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
1252
1329
  *
1253
1330
  * Prefer these methods in priority order (0 > 3 > 1 > 2)
1254
1331
  */
1255
1332
  # define XXH_FORCE_MEMORY_ACCESS 0
1256
- /*!
1257
- * @def XXH_ACCEPT_NULL_INPUT_POINTER
1258
- * @brief Whether to add explicit `NULL` checks.
1259
- *
1260
- * If the input pointer is `NULL` and the length is non-zero, xxHash's default
1261
- * behavior is to dereference it, triggering a segfault.
1262
- *
1263
- * When this macro is enabled, xxHash actively checks the input for a null pointer.
1264
- * If it is, the result for null input pointers is the same as a zero-length input.
1265
- */
1266
- # define XXH_ACCEPT_NULL_INPUT_POINTER 0
1333
+
1267
1334
  /*!
1268
1335
  * @def XXH_FORCE_ALIGN_CHECK
1269
1336
  * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
@@ -1315,18 +1382,16 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1315
1382
  # define XXH_NO_INLINE_HINTS 0
1316
1383
 
1317
1384
  /*!
1318
- * @def XXH_REROLL
1319
- * @brief Whether to reroll `XXH32_finalize`.
1320
- *
1321
- * For performance, `XXH32_finalize` uses an unrolled loop
1322
- * in the form of a switch statement.
1385
+ * @def XXH32_ENDJMP
1386
+ * @brief Whether to use a jump for `XXH32_finalize`.
1323
1387
  *
1324
- * This is not always desirable, as it generates larger code,
1325
- * and depending on the architecture, may even be slower
1388
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
1389
+ * This is generally preferable for performance,
1390
+ * but depending on exact architecture, a jmp may be preferable.
1326
1391
  *
1327
- * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
1392
+ * This setting is only possibly making a difference for very small inputs.
1328
1393
  */
1329
- # define XXH_REROLL 0
1394
+ # define XXH32_ENDJMP 0
1330
1395
 
1331
1396
  /*!
1332
1397
  * @internal
@@ -1343,32 +1408,18 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1343
1408
  */
1344
1409
 
1345
1410
  #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
1346
- /* prefer __packed__ structures (method 1) for gcc on armv7+ and mips */
1347
- # if !defined(__clang__) && \
1348
- ( \
1349
- (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
1350
- ( \
1351
- defined(__GNUC__) && ( \
1352
- (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || \
1353
- ( \
1354
- defined(__mips__) && \
1355
- (__mips <= 5 || __mips_isa_rev < 6) && \
1356
- (!defined(__mips16) || defined(__mips_mips16e2)) \
1357
- ) \
1358
- ) \
1359
- ) \
1360
- )
1411
+ /* prefer __packed__ structures (method 1) for GCC
1412
+ * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
1413
+ * which for some reason does unaligned loads. */
1414
+ # if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
1361
1415
  # define XXH_FORCE_MEMORY_ACCESS 1
1362
1416
  # endif
1363
1417
  #endif
1364
1418
 
1365
- #ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
1366
- # define XXH_ACCEPT_NULL_INPUT_POINTER 0
1367
- #endif
1368
-
1369
1419
  #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
1370
- # if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \
1371
- || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
1420
+ /* don't check on x86, aarch64, or arm when unaligned access is available */
1421
+ # if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
1422
+ || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */
1372
1423
  # define XXH_FORCE_ALIGN_CHECK 0
1373
1424
  # else
1374
1425
  # define XXH_FORCE_ALIGN_CHECK 1
@@ -1384,14 +1435,9 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1384
1435
  # endif
1385
1436
  #endif
1386
1437
 
1387
- #ifndef XXH_REROLL
1388
- # if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ || \
1389
- (defined(__GNUC__) && !defined(__clang__))
1390
- /* The if/then loop is preferable to switch/case on gcc (on x64) */
1391
- # define XXH_REROLL 1
1392
- # else
1393
- # define XXH_REROLL 0
1394
- # endif
1438
+ #ifndef XXH32_ENDJMP
1439
+ /* generally preferable for performance */
1440
+ # define XXH32_ENDJMP 0
1395
1441
  #endif
1396
1442
 
1397
1443
  /*!
@@ -1413,13 +1459,13 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1413
1459
  * @internal
1414
1460
  * @brief Modify this function to use a different routine than malloc().
1415
1461
  */
1416
- static void* XXH_malloc(size_t s) { return malloc(s); }
1462
+ static void* XXH_malloc(size_t s) { return ruby_xmalloc(s); }
1417
1463
 
1418
1464
  /*!
1419
1465
  * @internal
1420
1466
  * @brief Modify this function to use a different routine than free().
1421
1467
  */
1422
- static void XXH_free(void* p) { free(p); }
1468
+ static void XXH_free(void* p) { ruby_xfree(p); }
1423
1469
 
1424
1470
  #include <string.h>
1425
1471
 
@@ -1443,19 +1489,19 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1443
1489
  #endif
1444
1490
 
1445
1491
  #if XXH_NO_INLINE_HINTS /* disable inlining hints */
1446
- # if defined(__GNUC__)
1492
+ # if defined(__GNUC__) || defined(__clang__)
1447
1493
  # define XXH_FORCE_INLINE static __attribute__((unused))
1448
1494
  # else
1449
1495
  # define XXH_FORCE_INLINE static
1450
1496
  # endif
1451
1497
  # define XXH_NO_INLINE static
1452
1498
  /* enable inlining hints */
1499
+ #elif defined(__GNUC__) || defined(__clang__)
1500
+ # define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
1501
+ # define XXH_NO_INLINE static __attribute__((noinline))
1453
1502
  #elif defined(_MSC_VER) /* Visual Studio */
1454
1503
  # define XXH_FORCE_INLINE static __forceinline
1455
1504
  # define XXH_NO_INLINE static __declspec(noinline)
1456
- #elif defined(__GNUC__)
1457
- # define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
1458
- # define XXH_NO_INLINE static __attribute__((noinline))
1459
1505
  #elif defined (__cplusplus) \
1460
1506
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
1461
1507
  # define XXH_FORCE_INLINE static inline
@@ -1522,7 +1568,7 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1522
1568
  * We also use it to prevent unwanted constant folding for AArch64 in
1523
1569
  * XXH3_initCustomSecret_scalar().
1524
1570
  */
1525
- #ifdef __GNUC__
1571
+ #if defined(__GNUC__) || defined(__clang__)
1526
1572
  # define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
1527
1573
  #else
1528
1574
  # define XXH_COMPILER_GUARD(var) ((void)0)
@@ -1615,30 +1661,31 @@ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr;
1615
1661
  #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
1616
1662
 
1617
1663
  /*
1618
- * __pack instructions are safer but compiler specific, hence potentially
1619
- * problematic for some compilers.
1620
- *
1621
- * Currently only defined for GCC and ICC.
1664
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
1665
+ * documentation claimed that it only increased the alignment, but actually it
1666
+ * can decrease it on gcc, clang, and icc:
1667
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
1668
+ * https://gcc.godbolt.org/z/xYez1j67Y.
1622
1669
  */
1623
1670
  #ifdef XXH_OLD_NAMES
1624
1671
  typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
1625
1672
  #endif
1626
1673
  static xxh_u32 XXH_read32(const void* ptr)
1627
1674
  {
1628
- typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
1629
- return ((const xxh_unalign*)ptr)->u32;
1675
+ typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;
1676
+ return *((const xxh_unalign32*)ptr);
1630
1677
  }
1631
1678
 
1632
1679
  #else
1633
1680
 
1634
1681
  /*
1635
1682
  * Portable and safe solution. Generally efficient.
1636
- * see: https://stackoverflow.com/a/32095106/646947
1683
+ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
1637
1684
  */
1638
1685
  static xxh_u32 XXH_read32(const void* memPtr)
1639
1686
  {
1640
1687
  xxh_u32 val;
1641
- memcpy(&val, memPtr, sizeof(val));
1688
+ XXH_memcpy(&val, memPtr, sizeof(val));
1642
1689
  return val;
1643
1690
  }
1644
1691
 
@@ -1955,8 +2002,10 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
1955
2002
  h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
1956
2003
  } while (0)
1957
2004
 
1958
- /* Compact rerolled version */
1959
- if (XXH_REROLL) {
2005
+ if (ptr==NULL) XXH_ASSERT(len == 0);
2006
+
2007
+ /* Compact rerolled version; generally faster */
2008
+ if (!XXH32_ENDJMP) {
1960
2009
  len &= 15;
1961
2010
  while (len >= 4) {
1962
2011
  XXH_PROCESS4;
@@ -2024,24 +2073,19 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
2024
2073
  * @internal
2025
2074
  * @brief The implementation for @ref XXH32().
2026
2075
  *
2027
- * @param input, len, seed Directly passed from @ref XXH32().
2076
+ * @param input , len , seed Directly passed from @ref XXH32().
2028
2077
  * @param align Whether @p input is aligned.
2029
2078
  * @return The calculated hash.
2030
2079
  */
2031
2080
  XXH_FORCE_INLINE xxh_u32
2032
2081
  XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
2033
2082
  {
2034
- const xxh_u8* bEnd = input ? input + len : NULL;
2035
2083
  xxh_u32 h32;
2036
2084
 
2037
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2038
- if (input==NULL) {
2039
- len=0;
2040
- bEnd=input=(const xxh_u8*)(size_t)16;
2041
- }
2042
- #endif
2085
+ if (input==NULL) XXH_ASSERT(len == 0);
2043
2086
 
2044
2087
  if (len>=16) {
2088
+ const xxh_u8* const bEnd = input + len;
2045
2089
  const xxh_u8* const limit = bEnd - 15;
2046
2090
  xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2047
2091
  xxh_u32 v2 = seed + XXH_PRIME32_2;
@@ -2105,20 +2149,18 @@ XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
  /*! @ingroup xxh32_family */
  XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
  {
- memcpy(dstState, srcState, sizeof(*dstState));
+ XXH_memcpy(dstState, srcState, sizeof(*dstState));
  }

  /*! @ingroup xxh32_family */
  XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
  {
- XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
- memset(&state, 0, sizeof(state));
- state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
- state.v2 = seed + XXH_PRIME32_2;
- state.v3 = seed + 0;
- state.v4 = seed - XXH_PRIME32_1;
- /* do not write into reserved, planned to be removed in a future version */
- memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
+ XXH_ASSERT(statePtr != NULL);
+ memset(statePtr, 0, sizeof(*statePtr));
+ statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+ statePtr->v[1] = seed + XXH_PRIME32_2;
+ statePtr->v[2] = seed + 0;
+ statePtr->v[3] = seed - XXH_PRIME32_1;
  return XXH_OK;
  }

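Since this hunk reworks the streaming reset path, a brief reminder (not from the diff, and compiled against a stock xxhash.h rather than the gem's Ruby-allocator build) of how the XXH32 streaming entry points fit together; error handling is trimmed.

#include <stdio.h>
#include "xxhash.h"

int main(void)
{
    XXH32_state_t* const state = XXH32_createState();
    if (state == NULL) return 1;

    XXH32_reset(state, 0 /* seed */);
    XXH32_update(state, "hello ", 6);
    XXH32_update(state, "world", 5);
    printf("%08x\n", (unsigned)XXH32_digest(state));

    XXH32_freeState(state);
    return 0;
}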
@@ -2127,12 +2169,10 @@ XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t s
2127
2169
  XXH_PUBLIC_API XXH_errorcode
2128
2170
  XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2129
2171
  {
2130
- if (input==NULL)
2131
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2172
+ if (input==NULL) {
2173
+ XXH_ASSERT(len == 0);
2132
2174
  return XXH_OK;
2133
- #else
2134
- return XXH_ERROR;
2135
- #endif
2175
+ }
2136
2176
 
2137
2177
  { const xxh_u8* p = (const xxh_u8*)input;
2138
2178
  const xxh_u8* const bEnd = p + len;
@@ -2149,10 +2189,10 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2149
2189
  if (state->memsize) { /* some data left from previous update */
2150
2190
  XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
2151
2191
  { const xxh_u32* p32 = state->mem32;
2152
- state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
2153
- state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
2154
- state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
2155
- state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
2192
+ state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
2193
+ state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
2194
+ state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
2195
+ state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
2156
2196
  }
2157
2197
  p += 16-state->memsize;
2158
2198
  state->memsize = 0;
@@ -2160,22 +2200,14 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2160
2200
 
2161
2201
  if (p <= bEnd-16) {
2162
2202
  const xxh_u8* const limit = bEnd - 16;
2163
- xxh_u32 v1 = state->v1;
2164
- xxh_u32 v2 = state->v2;
2165
- xxh_u32 v3 = state->v3;
2166
- xxh_u32 v4 = state->v4;
2167
2203
 
2168
2204
  do {
2169
- v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
2170
- v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
2171
- v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
2172
- v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
2205
+ state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
2206
+ state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
2207
+ state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
2208
+ state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
2173
2209
  } while (p<=limit);
2174
2210
 
2175
- state->v1 = v1;
2176
- state->v2 = v2;
2177
- state->v3 = v3;
2178
- state->v4 = v4;
2179
2211
  }
2180
2212
 
2181
2213
  if (p < bEnd) {
@@ -2194,12 +2226,12 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
2194
2226
  xxh_u32 h32;
2195
2227
 
2196
2228
  if (state->large_len) {
2197
- h32 = XXH_rotl32(state->v1, 1)
2198
- + XXH_rotl32(state->v2, 7)
2199
- + XXH_rotl32(state->v3, 12)
2200
- + XXH_rotl32(state->v4, 18);
2229
+ h32 = XXH_rotl32(state->v[0], 1)
2230
+ + XXH_rotl32(state->v[1], 7)
2231
+ + XXH_rotl32(state->v[2], 12)
2232
+ + XXH_rotl32(state->v[3], 18);
2201
2233
  } else {
2202
- h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
2234
+ h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
2203
2235
  }
2204
2236
 
2205
2237
  h32 += state->total_len_32;
@@ -2228,7 +2260,7 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t
2228
2260
  {
2229
2261
  XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
2230
2262
  if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
2231
- memcpy(dst, &hash, sizeof(*dst));
2263
+ XXH_memcpy(dst, &hash, sizeof(*dst));
2232
2264
  }
2233
2265
  /*! @ingroup xxh32_family */
2234
2266
  XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
@@ -2271,30 +2303,31 @@ static xxh_u64 XXH_read64(const void* memPtr)
2271
2303
  #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
2272
2304
 
2273
2305
  /*
2274
- * __pack instructions are safer, but compiler specific, hence potentially
2275
- * problematic for some compilers.
2276
- *
2277
- * Currently only defined for GCC and ICC.
2306
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
2307
+ * documentation claimed that it only increased the alignment, but actually it
2308
+ * can decrease it on gcc, clang, and icc:
2309
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
2310
+ * https://gcc.godbolt.org/z/xYez1j67Y.
2278
2311
  */
2279
2312
  #ifdef XXH_OLD_NAMES
2280
2313
  typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
2281
2314
  #endif
2282
2315
  static xxh_u64 XXH_read64(const void* ptr)
2283
2316
  {
2284
- typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
2285
- return ((const xxh_unalign64*)ptr)->u64;
2317
+ typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;
2318
+ return *((const xxh_unalign64*)ptr);
2286
2319
  }
2287
2320
 
2288
2321
  #else
2289
2322
 
2290
2323
  /*
2291
2324
  * Portable and safe solution. Generally efficient.
2292
- * see: https://stackoverflow.com/a/32095106/646947
2325
+ * see: http://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
2293
2326
  */
2294
2327
  static xxh_u64 XXH_read64(const void* memPtr)
2295
2328
  {
2296
2329
  xxh_u64 val;
2297
- memcpy(&val, memPtr, sizeof(val));
2330
+ XXH_memcpy(&val, memPtr, sizeof(val));
2298
2331
  return val;
2299
2332
  }
2300
2333
 
@@ -2424,6 +2457,7 @@ static xxh_u64 XXH64_avalanche(xxh_u64 h64)
2424
2457
  static xxh_u64
2425
2458
  XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
2426
2459
  {
2460
+ if (ptr==NULL) XXH_ASSERT(len == 0);
2427
2461
  len &= 31;
2428
2462
  while (len >= 8) {
2429
2463
  xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
@@ -2459,18 +2493,12 @@ XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
2459
2493
  XXH_FORCE_INLINE xxh_u64
2460
2494
  XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
2461
2495
  {
2462
- const xxh_u8* bEnd = input ? input + len : NULL;
2463
2496
  xxh_u64 h64;
2464
-
2465
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2466
- if (input==NULL) {
2467
- len=0;
2468
- bEnd=input=(const xxh_u8*)(size_t)32;
2469
- }
2470
- #endif
2497
+ if (input==NULL) XXH_ASSERT(len == 0);
2471
2498
 
2472
2499
  if (len>=32) {
2473
- const xxh_u8* const limit = bEnd - 32;
2500
+ const xxh_u8* const bEnd = input + len;
2501
+ const xxh_u8* const limit = bEnd - 31;
2474
2502
  xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2475
2503
  xxh_u64 v2 = seed + XXH_PRIME64_2;
2476
2504
  xxh_u64 v3 = seed + 0;
@@ -2481,7 +2509,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
2481
2509
  v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
2482
2510
  v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
2483
2511
  v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
2484
- } while (input<=limit);
2512
+ } while (input<limit);
2485
2513
 
2486
2514
  h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
2487
2515
  h64 = XXH64_mergeRound(h64, v1);
@@ -2536,20 +2564,18 @@ XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
2536
2564
  /*! @ingroup xxh64_family */
2537
2565
  XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
2538
2566
  {
2539
- memcpy(dstState, srcState, sizeof(*dstState));
2567
+ XXH_memcpy(dstState, srcState, sizeof(*dstState));
2540
2568
  }
2541
2569
 
2542
2570
  /*! @ingroup xxh64_family */
2543
2571
  XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
2544
2572
  {
2545
- XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
2546
- memset(&state, 0, sizeof(state));
2547
- state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2548
- state.v2 = seed + XXH_PRIME64_2;
2549
- state.v3 = seed + 0;
2550
- state.v4 = seed - XXH_PRIME64_1;
2551
- /* do not write into reserved64, might be removed in a future version */
2552
- memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
2573
+ XXH_ASSERT(statePtr != NULL);
2574
+ memset(statePtr, 0, sizeof(*statePtr));
2575
+ statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2576
+ statePtr->v[1] = seed + XXH_PRIME64_2;
2577
+ statePtr->v[2] = seed + 0;
2578
+ statePtr->v[3] = seed - XXH_PRIME64_1;
2553
2579
  return XXH_OK;
2554
2580
  }
2555
2581
 
@@ -2557,12 +2583,10 @@ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t s
2557
2583
  XXH_PUBLIC_API XXH_errorcode
2558
2584
  XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2559
2585
  {
2560
- if (input==NULL)
2561
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2586
+ if (input==NULL) {
2587
+ XXH_ASSERT(len == 0);
2562
2588
  return XXH_OK;
2563
- #else
2564
- return XXH_ERROR;
2565
- #endif
2589
+ }
2566
2590
 
2567
2591
  { const xxh_u8* p = (const xxh_u8*)input;
2568
2592
  const xxh_u8* const bEnd = p + len;
@@ -2577,32 +2601,24 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2577
2601
 
2578
2602
  if (state->memsize) { /* tmp buffer is full */
2579
2603
  XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
2580
- state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
2581
- state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
2582
- state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
2583
- state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
2604
+ state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
2605
+ state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
2606
+ state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
2607
+ state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
2584
2608
  p += 32 - state->memsize;
2585
2609
  state->memsize = 0;
2586
2610
  }
2587
2611
 
2588
2612
  if (p+32 <= bEnd) {
2589
2613
  const xxh_u8* const limit = bEnd - 32;
2590
- xxh_u64 v1 = state->v1;
2591
- xxh_u64 v2 = state->v2;
2592
- xxh_u64 v3 = state->v3;
2593
- xxh_u64 v4 = state->v4;
2594
2614
 
2595
2615
  do {
2596
- v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
2597
- v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
2598
- v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
2599
- v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
2616
+ state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
2617
+ state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
2618
+ state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
2619
+ state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
2600
2620
  } while (p<=limit);
2601
2621
 
2602
- state->v1 = v1;
2603
- state->v2 = v2;
2604
- state->v3 = v3;
2605
- state->v4 = v4;
2606
2622
  }
2607
2623
 
2608
2624
  if (p < bEnd) {
@@ -2621,18 +2637,13 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
2621
2637
  xxh_u64 h64;
2622
2638
 
2623
2639
  if (state->total_len >= 32) {
2624
- xxh_u64 const v1 = state->v1;
2625
- xxh_u64 const v2 = state->v2;
2626
- xxh_u64 const v3 = state->v3;
2627
- xxh_u64 const v4 = state->v4;
2628
-
2629
- h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
2630
- h64 = XXH64_mergeRound(h64, v1);
2631
- h64 = XXH64_mergeRound(h64, v2);
2632
- h64 = XXH64_mergeRound(h64, v3);
2633
- h64 = XXH64_mergeRound(h64, v4);
2640
+ h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
2641
+ h64 = XXH64_mergeRound(h64, state->v[0]);
2642
+ h64 = XXH64_mergeRound(h64, state->v[1]);
2643
+ h64 = XXH64_mergeRound(h64, state->v[2]);
2644
+ h64 = XXH64_mergeRound(h64, state->v[3]);
2634
2645
  } else {
2635
- h64 = state->v3 /*seed*/ + XXH_PRIME64_5;
2646
+ h64 = state->v[2] /*seed*/ + XXH_PRIME64_5;
2636
2647
  }
2637
2648
 
2638
2649
  h64 += (xxh_u64) state->total_len;
@@ -2648,7 +2659,7 @@ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t
2648
2659
  {
2649
2660
  XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2650
2661
  if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
2651
- memcpy(dst, &hash, sizeof(*dst));
2662
+ XXH_memcpy(dst, &hash, sizeof(*dst));
2652
2663
  }
2653
2664
 
2654
2665
  /*! @ingroup xxh64_family */
@@ -2691,17 +2702,21 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
2691
2702
  # define XXH_unlikely(x) (x)
2692
2703
  #endif
2693
2704
 
2694
- #if defined(__GNUC__)
2695
- # if defined(__AVX2__)
2696
- # include <immintrin.h>
2697
- # elif defined(__SSE2__)
2698
- # include <emmintrin.h>
2699
- # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
2705
+ #if defined(__GNUC__) || defined(__clang__)
2706
+ # if defined(__ARM_NEON__) || defined(__ARM_NEON) \
2707
+ || defined(__aarch64__) || defined(_M_ARM) \
2708
+ || defined(_M_ARM64) || defined(_M_ARM64EC)
2700
2709
  # define inline __inline__ /* circumvent a clang bug */
2701
2710
  # include <arm_neon.h>
2702
2711
  # undef inline
2712
+ # elif defined(__AVX2__)
2713
+ # include <immintrin.h>
2714
+ # elif defined(__SSE2__)
2715
+ # include <emmintrin.h>
2703
2716
  # endif
2704
- #elif defined(_MSC_VER)
2717
+ #endif
2718
+
2719
+ #if defined(_MSC_VER)
2705
2720
  # include <intrin.h>
2706
2721
  #endif
2707
2722
 
@@ -2839,17 +2854,20 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2839
2854
  #endif
2840
2855
 
2841
2856
  #ifndef XXH_VECTOR /* can be defined on command line */
2842
- # if defined(__AVX512F__)
2857
+ # if ( \
2858
+ defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
2859
+ || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
2860
+ ) && ( \
2861
+ defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
2862
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
2863
+ )
2864
+ # define XXH_VECTOR XXH_NEON
2865
+ # elif defined(__AVX512F__)
2843
2866
  # define XXH_VECTOR XXH_AVX512
2844
2867
  # elif defined(__AVX2__)
2845
2868
  # define XXH_VECTOR XXH_AVX2
2846
2869
  # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
2847
2870
  # define XXH_VECTOR XXH_SSE2
2848
- # elif defined(__GNUC__) /* msvc support maybe later */ \
2849
- && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
2850
- && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
2851
- || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
2852
- # define XXH_VECTOR XXH_NEON
2853
2871
  # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
2854
2872
  || (defined(__s390x__) && defined(__VEC__)) \
2855
2873
  && defined(__GNUC__) /* TODO: IBM XL */
@@ -2999,8 +3017,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2999
3017
  * }
3000
3018
  */
3001
3019
  # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
3002
- && defined(__GNUC__) \
3003
- && !defined(__aarch64__) && !defined(__arm64__)
3020
+ && (defined(__GNUC__) || defined(__clang__)) \
3021
+ && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
3004
3022
  # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
3005
3023
  do { \
3006
3024
  /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
@@ -3017,6 +3035,76 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
3017
3035
  (outHi) = vshrn_n_u64 ((in), 32); \
3018
3036
  } while (0)
3019
3037
  # endif
3038
+
3039
+ /*!
3040
+ * @internal
3041
+ * @brief `vld1q_u64` but faster and alignment-safe.
3042
+ *
3043
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
3044
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
3045
+ *
3046
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
3047
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
3048
+ *
3049
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
3050
+ * unaligned load.
3051
+ */
3052
+ #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
3053
+ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
3054
+ {
3055
+ return *(uint64x2_t const*)ptr;
3056
+ }
3057
+ #else
3058
+ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
3059
+ {
3060
+ return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
3061
+ }
3062
+ #endif
3063
+ /*!
3064
+ * @ingroup tuning
3065
+ * @brief Controls the NEON to scalar ratio for XXH3
3066
+ *
3067
+ * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
3068
+ * 2 lanes on scalar by default.
3069
+ *
3070
+ * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
3071
+ * emulated 64-bit arithmetic is too slow.
3072
+ *
3073
+ * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
3074
+ *
3075
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
3076
+ * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
3077
+ * you are only using 2/3 of the CPU bandwidth.
3078
+ *
3079
+ * This is even more noticable on the more advanced cores like the A76 which
3080
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
3081
+ *
3082
+ * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
3083
+ * remaining lanes will use scalar instructions. This improves the bandwidth
3084
+ * and also gives the integer pipelines something to do besides twiddling loop
3085
+ * counters and pointers.
3086
+ *
3087
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
3088
+ * other CPUs:
3089
+ *
3090
+ * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |
3091
+ * |:----------------------|:--------------------|----------:|-----------:|------:|
3092
+ * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |
3093
+ * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |
3094
+ * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |
3095
+ *
3096
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
3097
+ *
3098
+ * @see XXH3_accumulate_512_neon()
3099
+ */
3100
+ # ifndef XXH3_NEON_LANES
3101
+ # if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
3102
+ && !defined(__OPTIMIZE_SIZE__)
3103
+ # define XXH3_NEON_LANES 6
3104
+ # else
3105
+ # define XXH3_NEON_LANES XXH_ACC_NB
3106
+ # endif
3107
+ # endif
3020
3108
  #endif /* XXH_VECTOR == XXH_NEON */
3021
3109
 
3022
3110
  /*
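
The tuning macro documented above sits behind #ifndef, so it can be overridden before the implementation is compiled. A hedged single-translation-unit sketch against a stock xxhash.h (the value 8 is only an illustration and only matters when the NEON code path is selected):

/* Compile xxHash in this translation unit with all 8 accumulator lanes on NEON.
 * Roughly equivalent to passing -DXXH3_NEON_LANES=8 when building the library. */
#define XXH3_NEON_LANES 8
#define XXH_IMPLEMENTATION
#include "xxhash.h"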
@@ -3083,7 +3171,7 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
3083
3171
  XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
3084
3172
  {
3085
3173
  xxh_u64x2 ret;
3086
- memcpy(&ret, ptr, sizeof(xxh_u64x2));
3174
+ XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
3087
3175
  # if XXH_VSX_BE
3088
3176
  ret = XXH_vec_revb(ret);
3089
3177
  # endif
@@ -3193,7 +3281,6 @@ XXH_mult32to64(xxh_u64 x, xxh_u64 y)
3193
3281
  return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
3194
3282
  }
3195
3283
  #elif defined(_MSC_VER) && defined(_M_IX86)
3196
- # include <intrin.h>
3197
3284
  # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
3198
3285
  #else
3199
3286
  /*
@@ -3212,7 +3299,7 @@ XXH_mult32to64(xxh_u64 x, xxh_u64 y)
3212
3299
  * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
3213
3300
  * version.
3214
3301
  *
3215
- * @param lhs, rhs The 64-bit integers to be multiplied
3302
+ * @param lhs , rhs The 64-bit integers to be multiplied
3216
3303
  * @return The 128-bit result represented in an @ref XXH128_hash_t.
3217
3304
  */
3218
3305
  static XXH128_hash_t
@@ -3233,7 +3320,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3233
3320
  * In that case it is best to use the portable one.
3234
3321
  * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
3235
3322
  */
3236
- #if defined(__GNUC__) && !defined(__wasm__) \
3323
+ #if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
3237
3324
  && defined(__SIZEOF_INT128__) \
3238
3325
  || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
3239
3326
 
@@ -3250,7 +3337,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3250
3337
  *
3251
3338
  * This compiles to single operand MUL on x64.
3252
3339
  */
3253
- #elif defined(_M_X64) || defined(_M_IA64)
3340
+ #elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
3254
3341
 
3255
3342
  #ifndef _MSC_VER
3256
3343
  # pragma intrinsic(_umul128)
@@ -3262,6 +3349,21 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3262
3349
  r128.high64 = product_high;
3263
3350
  return r128;
3264
3351
 
3352
+ /*
3353
+ * MSVC for ARM64's __umulh method.
3354
+ *
3355
+ * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
3356
+ */
3357
+ #elif defined(_M_ARM64) || defined(_M_ARM64EC)
3358
+
3359
+ #ifndef _MSC_VER
3360
+ # pragma intrinsic(__umulh)
3361
+ #endif
3362
+ XXH128_hash_t r128;
3363
+ r128.low64 = lhs * rhs;
3364
+ r128.high64 = __umulh(lhs, rhs);
3365
+ return r128;
3366
+
3265
3367
  #else
3266
3368
  /*
3267
3369
  * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
@@ -3330,7 +3432,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3330
3432
  * The reason for the separate function is to prevent passing too many structs
3331
3433
  * around by value. This will hopefully inline the multiply, but we don't force it.
3332
3434
  *
3333
- * @param lhs, rhs The 64-bit integers to multiply
3435
+ * @param lhs , rhs The 64-bit integers to multiply
3334
3436
  * @return The low 64 bits of the product XOR'd by the high 64 bits.
3335
3437
  * @see XXH_mult64to128()
3336
3438
  */
@@ -3632,7 +3734,7 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3632
3734
  XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
3633
3735
  {
3634
3736
  if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
3635
- memcpy(dst, &v64, sizeof(v64));
3737
+ XXH_memcpy(dst, &v64, sizeof(v64));
3636
3738
  }
3637
3739
 
3638
3740
  /* Several intrinsic functions below are supposed to accept __int64 as argument,
@@ -3649,6 +3751,7 @@ XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
3649
3751
  typedef long long xxh_i64;
3650
3752
  #endif
3651
3753
 
3754
+
3652
3755
  /*
3653
3756
  * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
3654
3757
  *
@@ -3684,7 +3787,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
3684
3787
  const void* XXH_RESTRICT input,
3685
3788
  const void* XXH_RESTRICT secret)
3686
3789
  {
3687
- XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
3790
+ __m512i* const xacc = (__m512i *) acc;
3688
3791
  XXH_ASSERT((((size_t)acc) & 63) == 0);
3689
3792
  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3690
3793
 
@@ -3733,7 +3836,7 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3733
3836
  {
3734
3837
  XXH_ASSERT((((size_t)acc) & 63) == 0);
3735
3838
  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3736
- { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
3839
+ { __m512i* const xacc = (__m512i*) acc;
3737
3840
  const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
3738
3841
 
3739
3842
  /* xacc[0] ^= (xacc[0] >> 47) */
@@ -3794,7 +3897,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
3794
3897
  const void* XXH_RESTRICT secret)
3795
3898
  {
3796
3899
  XXH_ASSERT((((size_t)acc) & 31) == 0);
3797
- { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
3900
+ { __m256i* const xacc = (__m256i *) acc;
3798
3901
  /* Unaligned. This is mainly for pointer arithmetic, and because
3799
3902
  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3800
3903
  const __m256i* const xinput = (const __m256i *) input;
@@ -3826,7 +3929,7 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void
3826
3929
  XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3827
3930
  {
3828
3931
  XXH_ASSERT((((size_t)acc) & 31) == 0);
3829
- { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
3932
+ { __m256i* const xacc = (__m256i*) acc;
3830
3933
  /* Unaligned. This is mainly for pointer arithmetic, and because
3831
3934
  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3832
3935
  const __m256i* const xsecret = (const __m256i *) secret;
@@ -3900,7 +4003,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
3900
4003
  {
3901
4004
  /* SSE2 is just a half-scale version of the AVX2 version. */
3902
4005
  XXH_ASSERT((((size_t)acc) & 15) == 0);
3903
- { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
4006
+ { __m128i* const xacc = (__m128i *) acc;
3904
4007
  /* Unaligned. This is mainly for pointer arithmetic, and because
3905
4008
  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3906
4009
  const __m128i* const xinput = (const __m128i *) input;
@@ -3932,7 +4035,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void
3932
4035
  XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3933
4036
  {
3934
4037
  XXH_ASSERT((((size_t)acc) & 15) == 0);
3935
- { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
4038
+ { __m128i* const xacc = (__m128i*) acc;
3936
4039
  /* Unaligned. This is mainly for pointer arithmetic, and because
3937
4040
  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3938
4041
  const __m128i* const xsecret = (const __m128i *) secret;
@@ -3994,40 +4097,66 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR
3994
4097
 
3995
4098
  #if (XXH_VECTOR == XXH_NEON)
3996
4099
 
4100
+ /* forward declarations for the scalar routines */
4101
+ XXH_FORCE_INLINE void
4102
+ XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
4103
+ void const* XXH_RESTRICT secret, size_t lane);
4104
+
4105
+ XXH_FORCE_INLINE void
4106
+ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
4107
+ void const* XXH_RESTRICT secret, size_t lane);
4108
+
4109
+ /*!
4110
+ * @internal
4111
+ * @brief The bulk processing loop for NEON.
4112
+ *
4113
+ * The NEON code path is actually partially scalar when running on AArch64. This
4114
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
4115
+ * CPU, and it also mitigates some GCC codegen issues.
4116
+ *
4117
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
4118
+ */
3997
4119
  XXH_FORCE_INLINE void
3998
4120
  XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
3999
4121
  const void* XXH_RESTRICT input,
4000
4122
  const void* XXH_RESTRICT secret)
4001
4123
  {
4002
4124
  XXH_ASSERT((((size_t)acc) & 15) == 0);
4125
+ XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
4003
4126
  {
4004
- XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
4127
+ uint64x2_t* const xacc = (uint64x2_t *) acc;
4005
4128
  /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
4006
4129
  uint8_t const* const xinput = (const uint8_t *) input;
4007
4130
  uint8_t const* const xsecret = (const uint8_t *) secret;
4008
4131
 
4009
4132
  size_t i;
4010
- for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
4133
+ /* AArch64 uses both scalar and neon at the same time */
4134
+ for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
4135
+ XXH3_scalarRound(acc, input, secret, i);
4136
+ }
4137
+ for (i=0; i < XXH3_NEON_LANES / 2; i++) {
4138
+ uint64x2_t acc_vec = xacc[i];
4011
4139
  /* data_vec = xinput[i]; */
4012
- uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
4140
+ uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16));
4013
4141
  /* key_vec = xsecret[i]; */
4014
- uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4142
+ uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
4015
4143
  uint64x2_t data_key;
4016
4144
  uint32x2_t data_key_lo, data_key_hi;
4017
- /* xacc[i] += swap(data_vec); */
4018
- uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
4019
- uint64x2_t const swapped = vextq_u64(data64, data64, 1);
4020
- xacc[i] = vaddq_u64 (xacc[i], swapped);
4145
+ /* acc_vec_2 = swap(data_vec) */
4146
+ uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1);
4021
4147
  /* data_key = data_vec ^ key_vec; */
4022
- data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
4148
+ data_key = veorq_u64(data_vec, key_vec);
4023
4149
  /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
4024
4150
  * data_key_hi = (uint32x2_t) (data_key >> 32);
4025
4151
  * data_key = UNDEFINED; */
4026
4152
  XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
4027
- /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4028
- xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
4029
-
4153
+ /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4154
+ acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi);
4155
+ /* xacc[i] += acc_vec_2; */
4156
+ acc_vec = vaddq_u64 (acc_vec, acc_vec_2);
4157
+ xacc[i] = acc_vec;
4030
4158
  }
4159
+
4031
4160
  }
4032
4161
  }
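The scalar/NEON split above is governed by XXH3_NEON_LANES, which the static assert keeps positive, even, and no larger than the 8 accumulator lanes. A minimal configuration sketch, assuming the macro is only defined under an #ifndef guard so a build-time override takes effect:

    /* Hypothetical build-time override; 6 is only an example value.
     * It keeps 6 of the 8 accumulator lanes on the NEON path and sends
     * the remaining 2 through XXH3_scalarRound()/XXH3_scalarScrambleRound(). */
    #define XXH3_NEON_LANES 6
    #include "xxhash.h"   /* or pass -DXXH3_NEON_LANES=6 when compiling the C file */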
4033
4162
 
@@ -4041,15 +4170,19 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4041
4170
  uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);
4042
4171
 
4043
4172
  size_t i;
4044
- for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
4173
+ /* AArch64 uses both scalar and neon at the same time */
4174
+ for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
4175
+ XXH3_scalarScrambleRound(acc, secret, i);
4176
+ }
4177
+ for (i=0; i < XXH3_NEON_LANES / 2; i++) {
4045
4178
  /* xacc[i] ^= (xacc[i] >> 47); */
4046
4179
  uint64x2_t acc_vec = xacc[i];
4047
- uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
4048
- uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
4180
+ uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
4181
+ uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
4049
4182
 
4050
4183
  /* xacc[i] ^= xsecret[i]; */
4051
- uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4052
- uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
4184
+ uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16));
4185
+ uint64x2_t data_key = veorq_u64 (data_vec, key_vec);
4053
4186
 
4054
4187
  /* xacc[i] *= XXH_PRIME32_1 */
4055
4188
  uint32x2_t data_key_lo, data_key_hi;
@@ -4077,11 +4210,12 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4077
4210
  */
4078
4211
  uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
4079
4212
  /* xacc[i] = prod_hi << 32; */
4080
- xacc[i] = vshlq_n_u64(prod_hi, 32);
4213
+ prod_hi = vshlq_n_u64(prod_hi, 32);
4081
4214
  /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
4082
- xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
4215
+ xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
4083
4216
  }
4084
- } }
4217
+ }
4218
+ }
4085
4219
  }
4086
4220
 
4087
4221
  #endif
@@ -4093,7 +4227,8 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
4093
4227
  const void* XXH_RESTRICT input,
4094
4228
  const void* XXH_RESTRICT secret)
4095
4229
  {
4096
- xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */
4230
+ /* presumed aligned */
4231
+ unsigned int* const xacc = (unsigned int*) acc;
4097
4232
  xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
4098
4233
  xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
4099
4234
  xxh_u64x2 const v32 = { 32, 32 };
@@ -4108,14 +4243,18 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
4108
4243
  xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
4109
4244
  /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
4110
4245
  xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
4111
- xacc[i] += product;
4246
+ /* acc_vec = xacc[i]; */
4247
+ xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
4248
+ acc_vec += product;
4112
4249
 
4113
4250
  /* swap high and low halves */
4114
4251
  #ifdef __s390x__
4115
- xacc[i] += vec_permi(data_vec, data_vec, 2);
4252
+ acc_vec += vec_permi(data_vec, data_vec, 2);
4116
4253
  #else
4117
- xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
4254
+ acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
4118
4255
  #endif
4256
+ /* xacc[i] = acc_vec; */
4257
+ vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
4119
4258
  }
4120
4259
  }
4121
4260
 
@@ -4153,38 +4292,90 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4153
4292
 
4154
4293
  /* scalar variants - universal */
4155
4294
 
4295
+ /*!
4296
+ * @internal
4297
+ * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
4298
+ *
4299
+ * This is extracted to its own function because the NEON path uses a combination
4300
+ * of NEON and scalar.
4301
+ */
4302
+ XXH_FORCE_INLINE void
4303
+ XXH3_scalarRound(void* XXH_RESTRICT acc,
4304
+ void const* XXH_RESTRICT input,
4305
+ void const* XXH_RESTRICT secret,
4306
+ size_t lane)
4307
+ {
4308
+ xxh_u64* xacc = (xxh_u64*) acc;
4309
+ xxh_u8 const* xinput = (xxh_u8 const*) input;
4310
+ xxh_u8 const* xsecret = (xxh_u8 const*) secret;
4311
+ XXH_ASSERT(lane < XXH_ACC_NB);
4312
+ XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
4313
+ {
4314
+ xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
4315
+ xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
4316
+ xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
4317
+ xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4318
+ }
4319
+ }
4320
+
4321
+ /*!
4322
+ * @internal
4323
+ * @brief Processes a 64 byte block of data using the scalar path.
4324
+ */
4156
4325
  XXH_FORCE_INLINE void
4157
4326
  XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
4158
4327
  const void* XXH_RESTRICT input,
4159
4328
  const void* XXH_RESTRICT secret)
4160
4329
  {
4161
- XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4162
- const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
4163
- const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
4164
4330
  size_t i;
4165
- XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
4331
+ /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
4332
+ #if defined(__GNUC__) && !defined(__clang__) \
4333
+ && (defined(__arm__) || defined(__thumb2__)) \
4334
+ && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
4335
+ && !defined(__OPTIMIZE_SIZE__)
4336
+ # pragma GCC unroll 8
4337
+ #endif
4166
4338
  for (i=0; i < XXH_ACC_NB; i++) {
4167
- xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
4168
- xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
4169
- xacc[i ^ 1] += data_val; /* swap adjacent lanes */
4170
- xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4339
+ XXH3_scalarRound(acc, input, secret, i);
4171
4340
  }
4172
4341
  }
4173
4342
 
4343
+ /*!
4344
+ * @internal
4345
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
4346
+ *
4347
+ * This is extracted to its own function because the NEON path uses a combination
4348
+ * of NEON and scalar.
4349
+ */
4174
4350
  XXH_FORCE_INLINE void
4175
- XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4351
+ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
4352
+ void const* XXH_RESTRICT secret,
4353
+ size_t lane)
4176
4354
  {
4177
- XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4355
+ xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4178
4356
  const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
4179
- size_t i;
4180
4357
  XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
4181
- for (i=0; i < XXH_ACC_NB; i++) {
4182
- xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
4183
- xxh_u64 acc64 = xacc[i];
4358
+ XXH_ASSERT(lane < XXH_ACC_NB);
4359
+ {
4360
+ xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
4361
+ xxh_u64 acc64 = xacc[lane];
4184
4362
  acc64 = XXH_xorshift64(acc64, 47);
4185
4363
  acc64 ^= key64;
4186
4364
  acc64 *= XXH_PRIME32_1;
4187
- xacc[i] = acc64;
4365
+ xacc[lane] = acc64;
4366
+ }
4367
+ }
4368
+
4369
+ /*!
4370
+ * @internal
4371
+ * @brief Scrambles the accumulators after a large chunk has been read
4372
+ */
4373
+ XXH_FORCE_INLINE void
4374
+ XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4375
+ {
4376
+ size_t i;
4377
+ for (i=0; i < XXH_ACC_NB; i++) {
4378
+ XXH3_scalarScrambleRound(acc, secret, i);
4188
4379
  }
4189
4380
  }
4190
4381
 
@@ -4206,8 +4397,9 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4206
4397
  * placed sequentially, in order, at the top of the unrolled loop.
4207
4398
  *
4208
4399
  * While MOVK is great for generating constants (2 cycles for a 64-bit
4209
- * constant compared to 4 cycles for LDR), long MOVK chains stall the
4210
- * integer pipelines:
4400
+ * constant compared to 4 cycles for LDR), it fights for bandwidth with
4401
+ * the arithmetic instructions.
4402
+ *
4211
4403
  * I L S
4212
4404
  * MOVK
4213
4405
  * MOVK
@@ -4224,6 +4416,9 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4224
4416
  * ADD LDR
4225
4417
  * SUB STR
4226
4418
  * STR
4419
+ *
4420
+ * See XXH3_NEON_LANES for details on the pipeline.
4421
+ *
4227
4422
  * XXH3_64bits_withSeed, len == 256, Snapdragon 835
4228
4423
  * without hack: 2654.4 MB/s
4229
4424
  * with hack: 3202.9 MB/s
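The "hack" benchmarked above is not shown in this hunk; it amounts to hiding the secret pointer from the optimizer so the constants are loaded from memory instead of being rebuilt through long MOVK chains. A hedged sketch of that kind of barrier, with purely illustrative names:

    /* Illustrative only: an empty inline-asm "+r" constraint makes the
     * compiler treat *pp as an opaque runtime value, so it stops
     * materializing the pointed-to constants with MOVK sequences. */
    static void sketch_hide_pointer(const void** pp)
    {
    #if defined(__GNUC__) || defined(__clang__)
        __asm__ __volatile__("" : "+r" (*pp));
    #else
        (void)pp;   /* no-op elsewhere */
    #endif
    }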
@@ -4422,9 +4617,11 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
4422
4617
  }
4423
4618
 
4424
4619
  /*
4425
- * It's important for performance that XXH3_hashLong is not inlined.
4620
+ * It's important for performance to transmit the secret's size (when it's static)
4621
+ * so that the compiler can properly optimize the vectorized loop.
4622
+ * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
4426
4623
  */
4427
- XXH_NO_INLINE XXH64_hash_t
4624
+ XXH_FORCE_INLINE XXH64_hash_t
4428
4625
  XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
4429
4626
  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4430
4627
  {
@@ -4433,11 +4630,10 @@ XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
4433
4630
  }
4434
4631
 
4435
4632
  /*
4436
- * It's important for performance that XXH3_hashLong is not inlined.
4437
- * Since the function is not inlined, the compiler may not be able to understand that,
4438
- * in some scenarios, its `secret` argument is actually a compile time constant.
4439
- * This variant enforces that the compiler can detect that,
4440
- * and uses this opportunity to streamline the generated code for better performance.
4633
+ * It's preferable for performance that XXH3_hashLong is not inlined,
4634
+ * as it results in a smaller function for small data, which is easier on the instruction cache.
4635
+ * Note that inside this no_inline function, we do inline the internal loop,
4636
+ * and provide a statically defined secret size to allow optimization of the vector loop.
4441
4637
  */
4442
4638
  XXH_NO_INLINE XXH64_hash_t
4443
4639
  XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
@@ -4537,6 +4733,14 @@ XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
4537
4733
  return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
4538
4734
  }
4539
4735
 
4736
+ XXH_PUBLIC_API XXH64_hash_t
4737
+ XXH3_64bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
4738
+ {
4739
+ if (len <= XXH3_MIDSIZE_MAX)
4740
+ return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
4741
+ return XXH3_hashLong_64b_withSecret(input, len, seed, (const xxh_u8*)secret, secretSize);
4742
+ }
4743
+
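A hedged usage sketch for the new entry point, pairing XXH3_64bits_withSecretandSeed() with XXH3_generateSecret_fromSeed() (also added in this release). The helper name, the `#include "xxhash.h"` line, and the 192-byte buffer (assumed to match XXH_SECRET_DEFAULT_SIZE) are illustrative assumptions:

    #include <stddef.h>
    #include "xxhash.h"

    /* Inputs up to XXH3_MIDSIZE_MAX are hashed with the seed and the default
     * secret; longer inputs go through the custom-secret long-hash path. */
    static XXH64_hash_t hash_with_secret_and_seed(const void* data, size_t len,
                                                  XXH64_hash_t seed)
    {
        unsigned char secret[192];                  /* assumed XXH_SECRET_DEFAULT_SIZE */
        XXH3_generateSecret_fromSeed(secret, seed); /* fills the whole buffer */
        return XXH3_64bits_withSecretandSeed(data, len, secret, sizeof(secret), seed);
    }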
4540
4744
 
4541
4745
  /* === XXH3 streaming === */
4542
4746
 
@@ -4625,13 +4829,13 @@ XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
4625
4829
  XXH_PUBLIC_API void
4626
4830
  XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
4627
4831
  {
4628
- memcpy(dst_state, src_state, sizeof(*dst_state));
4832
+ XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
4629
4833
  }
4630
4834
 
4631
4835
  static void
4632
4836
  XXH3_reset_internal(XXH3_state_t* statePtr,
4633
- XXH64_hash_t seed,
4634
- const void* secret, size_t secretSize)
4837
+ XXH64_hash_t seed,
4838
+ const void* secret, size_t secretSize)
4635
4839
  {
4636
4840
  size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
4637
4841
  size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
@@ -4648,6 +4852,7 @@ XXH3_reset_internal(XXH3_state_t* statePtr,
4648
4852
  statePtr->acc[6] = XXH_PRIME64_5;
4649
4853
  statePtr->acc[7] = XXH_PRIME32_1;
4650
4854
  statePtr->seed = seed;
4855
+ statePtr->useSeed = (seed != 0);
4651
4856
  statePtr->extSecret = (const unsigned char*)secret;
4652
4857
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4653
4858
  statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
@@ -4680,11 +4885,24 @@ XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
4680
4885
  {
4681
4886
  if (statePtr == NULL) return XXH_ERROR;
4682
4887
  if (seed==0) return XXH3_64bits_reset(statePtr);
4683
- if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
4888
+ if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
4889
+ XXH3_initCustomSecret(statePtr->customSecret, seed);
4684
4890
  XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
4685
4891
  return XXH_OK;
4686
4892
  }
4687
4893
 
4894
+ /*! @ingroup xxh3_family */
4895
+ XXH_PUBLIC_API XXH_errorcode
4896
+ XXH3_64bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed64)
4897
+ {
4898
+ if (statePtr == NULL) return XXH_ERROR;
4899
+ if (secret == NULL) return XXH_ERROR;
4900
+ if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
4901
+ XXH3_reset_internal(statePtr, seed64, secret, secretSize);
4902
+ statePtr->useSeed = 1; /* always, even if seed64==0 */
4903
+ return XXH_OK;
4904
+ }
4905
+
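A hedged streaming sketch for the matching reset variant; the file handling and 4 KiB chunk size are illustrative, and the secret is assumed to be at least XXH3_SECRET_SIZE_MIN bytes:

    #include <stdio.h>
    #include "xxhash.h"

    static XXH64_hash_t hash_stream_with_secret_and_seed(FILE* f,
                                                         const void* secret, size_t secretSize,
                                                         XXH64_hash_t seed)
    {
        XXH3_state_t* const state = XXH3_createState();
        XXH64_hash_t h = 0;
        char chunk[4096];
        size_t n;
        if (state == NULL) return 0;
        if (XXH3_64bits_reset_withSecretandSeed(state, secret, secretSize, seed) == XXH_OK) {
            while ((n = fread(chunk, 1, sizeof(chunk), f)) > 0)
                XXH3_64bits_update(state, chunk, n);
            h = XXH3_64bits_digest(state);
        }
        XXH3_freeState(state);
        return h;
    }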
4688
4906
  /* Note : when XXH3_consumeStripes() is invoked,
4689
4907
  * there must be a guarantee that at least one more byte must be consumed from input
4690
4908
  * so that the function can blindly consume all stripes using the "normal" secret segment */
@@ -4712,35 +4930,48 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
4712
4930
  }
4713
4931
  }
4714
4932
 
4933
+ #ifndef XXH3_STREAM_USE_STACK
4934
+ # ifndef __clang__ /* clang doesn't need additional stack space */
4935
+ # define XXH3_STREAM_USE_STACK 1
4936
+ # endif
4937
+ #endif
4715
4938
  /*
4716
4939
  * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
4717
4940
  */
4718
4941
  XXH_FORCE_INLINE XXH_errorcode
4719
- XXH3_update(XXH3_state_t* state,
4720
- const xxh_u8* input, size_t len,
4942
+ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
4943
+ const xxh_u8* XXH_RESTRICT input, size_t len,
4721
4944
  XXH3_f_accumulate_512 f_acc512,
4722
4945
  XXH3_f_scrambleAcc f_scramble)
4723
4946
  {
4724
- if (input==NULL)
4725
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
4947
+ if (input==NULL) {
4948
+ XXH_ASSERT(len == 0);
4726
4949
  return XXH_OK;
4727
- #else
4728
- return XXH_ERROR;
4729
- #endif
4950
+ }
4730
4951
 
4952
+ XXH_ASSERT(state != NULL);
4731
4953
  { const xxh_u8* const bEnd = input + len;
4732
4954
  const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4733
-
4955
+ #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
4956
+ /* For some reason, gcc and MSVC seem to suffer greatly
4957
+ * when operating accumulators directly into state.
4958
+ * Operating into stack space seems to enable proper optimization.
4959
+ * clang, on the other hand, doesn't seem to need this trick */
4960
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
4961
+ #else
4962
+ xxh_u64* XXH_RESTRICT const acc = state->acc;
4963
+ #endif
4734
4964
  state->totalLen += len;
4735
4965
  XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
4736
4966
 
4737
- if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
4967
+ /* small input : just fill in tmp buffer */
4968
+ if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
4738
4969
  XXH_memcpy(state->buffer + state->bufferedSize, input, len);
4739
4970
  state->bufferedSize += (XXH32_hash_t)len;
4740
4971
  return XXH_OK;
4741
4972
  }
4742
- /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4743
4973
 
4974
+ /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4744
4975
  #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
4745
4976
  XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */
4746
4977
 
@@ -4752,7 +4983,7 @@ XXH3_update(XXH3_state_t* state,
4752
4983
  size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
4753
4984
  XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
4754
4985
  input += loadSize;
4755
- XXH3_consumeStripes(state->acc,
4986
+ XXH3_consumeStripes(acc,
4756
4987
  &state->nbStripesSoFar, state->nbStripesPerBlock,
4757
4988
  state->buffer, XXH3_INTERNALBUFFER_STRIPES,
4758
4989
  secret, state->secretLimit,
@@ -4761,25 +4992,62 @@ XXH3_update(XXH3_state_t* state,
4761
4992
  }
4762
4993
  XXH_ASSERT(input < bEnd);
4763
4994
 
4764
- /* Consume input by a multiple of internal buffer size */
4765
- if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
4766
- const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
4767
- do {
4768
- XXH3_consumeStripes(state->acc,
4769
- &state->nbStripesSoFar, state->nbStripesPerBlock,
4770
- input, XXH3_INTERNALBUFFER_STRIPES,
4771
- secret, state->secretLimit,
4772
- f_acc512, f_scramble);
4773
- input += XXH3_INTERNALBUFFER_SIZE;
4774
- } while (input<limit);
4775
- /* for last partial stripe */
4776
- memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
4995
+ /* large input to consume : ingest per full block */
4996
+ if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
4997
+ size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
4998
+ XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
4999
+ /* join to current block's end */
5000
+ { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
5001
+ XXH_ASSERT(nbStripesToEnd <= nbStripes);
5002
+ XXH3_accumulate(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd, f_acc512);
5003
+ f_scramble(acc, secret + state->secretLimit);
5004
+ state->nbStripesSoFar = 0;
5005
+ input += nbStripesToEnd * XXH_STRIPE_LEN;
5006
+ nbStripes -= nbStripesToEnd;
5007
+ }
5008
+ /* consume per entire blocks */
5009
+ while(nbStripes >= state->nbStripesPerBlock) {
5010
+ XXH3_accumulate(acc, input, secret, state->nbStripesPerBlock, f_acc512);
5011
+ f_scramble(acc, secret + state->secretLimit);
5012
+ input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
5013
+ nbStripes -= state->nbStripesPerBlock;
5014
+ }
5015
+ /* consume last partial block */
5016
+ XXH3_accumulate(acc, input, secret, nbStripes, f_acc512);
5017
+ input += nbStripes * XXH_STRIPE_LEN;
5018
+ XXH_ASSERT(input < bEnd); /* at least some bytes left */
5019
+ state->nbStripesSoFar = nbStripes;
5020
+ /* buffer predecessor of last partial stripe */
5021
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5022
+ XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
5023
+ } else {
5024
+ /* content to consume <= block size */
5025
+ /* Consume input by a multiple of internal buffer size */
5026
+ if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
5027
+ const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
5028
+ do {
5029
+ XXH3_consumeStripes(acc,
5030
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
5031
+ input, XXH3_INTERNALBUFFER_STRIPES,
5032
+ secret, state->secretLimit,
5033
+ f_acc512, f_scramble);
5034
+ input += XXH3_INTERNALBUFFER_SIZE;
5035
+ } while (input<limit);
5036
+ /* buffer predecessor of last partial stripe */
5037
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5038
+ }
4777
5039
  }
4778
- XXH_ASSERT(input < bEnd);
4779
5040
 
4780
5041
  /* Some remaining input (always) : buffer it */
5042
+ XXH_ASSERT(input < bEnd);
5043
+ XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
5044
+ XXH_ASSERT(state->bufferedSize == 0);
4781
5045
  XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
4782
5046
  state->bufferedSize = (XXH32_hash_t)(bEnd-input);
5047
+ #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
5048
+ /* save stack accumulators into state */
5049
+ memcpy(state->acc, acc, sizeof(acc));
5050
+ #endif
4783
5051
  }
4784
5052
 
4785
5053
  return XXH_OK;
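To make the new block-at-a-time ingestion path concrete, here is a worked example under the usual assumptions (64-byte stripes, 8 secret bytes consumed per stripe, 192-byte default secret, hence secretLimit = 128 and nbStripesPerBlock = 16, i.e. 1024 bytes per block). Starting from an empty buffer with nbStripesSoFar = 0, an update of 10,000 bytes computes nbStripes = (10000 - 1) / 64 = 156. The "join" step consumes 16 stripes to finish the current block and scrambles the accumulators; the while loop then ingests 8 entire blocks (128 stripes), scrambling after each; the last partial block accumulates the remaining 12 stripes without a scramble and sets nbStripesSoFar = 12. At that point 156 * 64 = 9,984 bytes have been ingested, the 64 bytes preceding the tail are copied to the end of the internal buffer, and the final 16 bytes are buffered for the next call.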
@@ -4803,7 +5071,7 @@ XXH3_digest_long (XXH64_hash_t* acc,
4803
5071
  * Digest on a local copy. This way, the state remains unaltered, and it can
4804
5072
  * continue ingesting more input afterwards.
4805
5073
  */
4806
- memcpy(acc, state->acc, sizeof(state->acc));
5074
+ XXH_memcpy(acc, state->acc, sizeof(state->acc));
4807
5075
  if (state->bufferedSize >= XXH_STRIPE_LEN) {
4808
5076
  size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
4809
5077
  size_t nbStripesSoFar = state->nbStripesSoFar;
@@ -4820,8 +5088,8 @@ XXH3_digest_long (XXH64_hash_t* acc,
4820
5088
  xxh_u8 lastStripe[XXH_STRIPE_LEN];
4821
5089
  size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
4822
5090
  XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
4823
- memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
4824
- memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
5091
+ XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
5092
+ XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
4825
5093
  XXH3_accumulate_512(acc,
4826
5094
  lastStripe,
4827
5095
  secret + state->secretLimit - XXH_SECRET_LASTACC_START);
@@ -4840,58 +5108,13 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
4840
5108
  (xxh_u64)state->totalLen * XXH_PRIME64_1);
4841
5109
  }
4842
5110
  /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
4843
- if (state->seed)
5111
+ if (state->useSeed)
4844
5112
  return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
4845
5113
  return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
4846
5114
  secret, state->secretLimit + XXH_STRIPE_LEN);
4847
5115
  }
4848
5116
 
4849
5117
 
4850
- #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
4851
-
4852
- /*! @ingroup xxh3_family */
4853
- XXH_PUBLIC_API void
4854
- XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
4855
- {
4856
- XXH_ASSERT(secretBuffer != NULL);
4857
- if (customSeedSize == 0) {
4858
- memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4859
- return;
4860
- }
4861
- XXH_ASSERT(customSeed != NULL);
4862
-
4863
- { size_t const segmentSize = sizeof(XXH128_hash_t);
4864
- size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
4865
- XXH128_canonical_t scrambler;
4866
- XXH64_hash_t seeds[12];
4867
- size_t segnb;
4868
- XXH_ASSERT(nbSegments == 12);
4869
- XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
4870
- XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
4871
-
4872
- /*
4873
- * Copy customSeed to seeds[], truncating or repeating as necessary.
4874
- */
4875
- { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
4876
- size_t filled = toFill;
4877
- memcpy(seeds, customSeed, toFill);
4878
- while (filled < sizeof(seeds)) {
4879
- toFill = XXH_MIN(filled, sizeof(seeds) - filled);
4880
- memcpy((char*)seeds + filled, seeds, toFill);
4881
- filled += toFill;
4882
- } }
4883
-
4884
- /* generate secret */
4885
- memcpy(secretBuffer, &scrambler, sizeof(scrambler));
4886
- for (segnb=1; segnb < nbSegments; segnb++) {
4887
- size_t const segmentStart = segnb * segmentSize;
4888
- XXH128_canonical_t segment;
4889
- XXH128_canonicalFromHash(&segment,
4890
- XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
4891
- memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
4892
- } }
4893
- }
4894
-
4895
5118
 
4896
5119
  /* ==========================================
4897
5120
  * XXH3 128 bits (a.k.a XXH128)
@@ -5193,9 +5416,10 @@ XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
5193
5416
  }
5194
5417
 
5195
5418
  /*
5196
- * It's important for performance that XXH3_hashLong is not inlined.
5419
+ * It's important for performance to pass @secretLen (when it's static)
5420
+ * to the compiler, so that it can properly optimize the vectorized loop.
5197
5421
  */
5198
- XXH_NO_INLINE XXH128_hash_t
5422
+ XXH_FORCE_INLINE XXH128_hash_t
5199
5423
  XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
5200
5424
  XXH64_hash_t seed64,
5201
5425
  const void* XXH_RESTRICT secret, size_t secretLen)
@@ -5288,6 +5512,15 @@ XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
5288
5512
  XXH3_hashLong_128b_withSeed);
5289
5513
  }
5290
5514
 
5515
+ /*! @ingroup xxh3_family */
5516
+ XXH_PUBLIC_API XXH128_hash_t
5517
+ XXH3_128bits_withSecretandSeed(const void* input, size_t len, const void* secret, size_t secretSize, XXH64_hash_t seed)
5518
+ {
5519
+ if (len <= XXH3_MIDSIZE_MAX)
5520
+ return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
5521
+ return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
5522
+ }
5523
+
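A hedged sketch of the 128-bit counterpart, comparing two digests produced with the same secret and seed; the buffer names are illustrative:

    #include <stddef.h>
    #include "xxhash.h"

    /* Returns 1 when both buffers produce the same 128-bit digest. */
    static int same_xxh128_digest(const void* a, size_t lenA,
                                  const void* b, size_t lenB,
                                  const void* secret, size_t secretSize,
                                  XXH64_hash_t seed)
    {
        XXH128_hash_t const hA = XXH3_128bits_withSecretandSeed(a, lenA, secret, secretSize, seed);
        XXH128_hash_t const hB = XXH3_128bits_withSecretandSeed(b, lenB, secret, secretSize, seed);
        return XXH128_isEqual(hA, hB);
    }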
5291
5524
  /*! @ingroup xxh3_family */
5292
5525
  XXH_PUBLIC_API XXH128_hash_t
5293
5526
  XXH128(const void* input, size_t len, XXH64_hash_t seed)
@@ -5299,7 +5532,7 @@ XXH128(const void* input, size_t len, XXH64_hash_t seed)
5299
5532
  /* === XXH3 128-bit streaming === */
5300
5533
 
5301
5534
  /*
5302
- * All the functions are actually the same as for 64-bit streaming variant.
5535
+ * All initialization and update functions are identical to the 64-bit streaming variant.
5303
5536
  * The only difference is the finalization routine.
5304
5537
  */
5305
5538
 
@@ -5307,31 +5540,28 @@ XXH128(const void* input, size_t len, XXH64_hash_t seed)
5307
5540
  XXH_PUBLIC_API XXH_errorcode
5308
5541
  XXH3_128bits_reset(XXH3_state_t* statePtr)
5309
5542
  {
5310
- if (statePtr == NULL) return XXH_ERROR;
5311
- XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
5312
- return XXH_OK;
5543
+ return XXH3_64bits_reset(statePtr);
5313
5544
  }
5314
5545
 
5315
5546
  /*! @ingroup xxh3_family */
5316
5547
  XXH_PUBLIC_API XXH_errorcode
5317
5548
  XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
5318
5549
  {
5319
- if (statePtr == NULL) return XXH_ERROR;
5320
- XXH3_reset_internal(statePtr, 0, secret, secretSize);
5321
- if (secret == NULL) return XXH_ERROR;
5322
- if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5323
- return XXH_OK;
5550
+ return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
5324
5551
  }
5325
5552
 
5326
5553
  /*! @ingroup xxh3_family */
5327
5554
  XXH_PUBLIC_API XXH_errorcode
5328
5555
  XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
5329
5556
  {
5330
- if (statePtr == NULL) return XXH_ERROR;
5331
- if (seed==0) return XXH3_128bits_reset(statePtr);
5332
- if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
5333
- XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
5334
- return XXH_OK;
5557
+ return XXH3_64bits_reset_withSeed(statePtr, seed);
5558
+ }
5559
+
5560
+ /*! @ingroup xxh3_family */
5561
+ XXH_PUBLIC_API XXH_errorcode
5562
+ XXH3_128bits_reset_withSecretandSeed(XXH3_state_t* statePtr, const void* secret, size_t secretSize, XXH64_hash_t seed)
5563
+ {
5564
+ return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
5335
5565
  }
5336
5566
 
5337
5567
  /*! @ingroup xxh3_family */
@@ -5406,8 +5636,8 @@ XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
5406
5636
  hash.high64 = XXH_swap64(hash.high64);
5407
5637
  hash.low64 = XXH_swap64(hash.low64);
5408
5638
  }
5409
- memcpy(dst, &hash.high64, sizeof(hash.high64));
5410
- memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
5639
+ XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
5640
+ XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
5411
5641
  }
5412
5642
 
5413
5643
  /*! @ingroup xxh3_family */
@@ -5420,6 +5650,77 @@ XXH128_hashFromCanonical(const XXH128_canonical_t* src)
5420
5650
  return h;
5421
5651
  }
5422
5652
 
5653
+
5654
+
5655
+ /* ==========================================
5656
+ * Secret generators
5657
+ * ==========================================
5658
+ */
5659
+ #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
5660
+
5661
+ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
5662
+ {
5663
+ XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
5664
+ XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
5665
+ }
5666
+
5667
+ /*! @ingroup xxh3_family */
5668
+ XXH_PUBLIC_API XXH_errorcode
5669
+ XXH3_generateSecret(void* secretBuffer, size_t secretSize, const void* customSeed, size_t customSeedSize)
5670
+ {
5671
+ #if (XXH_DEBUGLEVEL >= 1)
5672
+ XXH_ASSERT(secretBuffer != NULL);
5673
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
5674
+ #else
5675
+ /* production mode, assert() are disabled */
5676
+ if (secretBuffer == NULL) return XXH_ERROR;
5677
+ if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5678
+ #endif
5679
+
5680
+ if (customSeedSize == 0) {
5681
+ customSeed = XXH3_kSecret;
5682
+ customSeedSize = XXH_SECRET_DEFAULT_SIZE;
5683
+ }
5684
+ #if (XXH_DEBUGLEVEL >= 1)
5685
+ XXH_ASSERT(customSeed != NULL);
5686
+ #else
5687
+ if (customSeed == NULL) return XXH_ERROR;
5688
+ #endif
5689
+
5690
+ /* Fill secretBuffer with a copy of customSeed - repeat as needed */
5691
+ { size_t pos = 0;
5692
+ while (pos < secretSize) {
5693
+ size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
5694
+ memcpy((char*)secretBuffer + pos, customSeed, toCopy);
5695
+ pos += toCopy;
5696
+ } }
5697
+
5698
+ { size_t const nbSeg16 = secretSize / 16;
5699
+ size_t n;
5700
+ XXH128_canonical_t scrambler;
5701
+ XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
5702
+ for (n=0; n<nbSeg16; n++) {
5703
+ XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
5704
+ XXH3_combine16((char*)secretBuffer + n*16, h128);
5705
+ }
5706
+ /* last segment */
5707
+ XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
5708
+ }
5709
+ return XXH_OK;
5710
+ }
5711
+
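A hedged usage sketch for the reworked generator, which now takes an explicit secretSize and returns an XXH_errorcode. The passphrase and buffer size below are illustrative; any size of at least XXH3_SECRET_SIZE_MIN is accepted:

    #include <string.h>
    #include "xxhash.h"

    static XXH64_hash_t hash_with_derived_secret(const void* data, size_t len)
    {
        static const char passphrase[] = "illustrative passphrase";
        unsigned char secret[192];   /* any size >= XXH3_SECRET_SIZE_MIN */
        if (XXH3_generateSecret(secret, sizeof(secret),
                                passphrase, strlen(passphrase)) != XXH_OK)
            return 0;
        return XXH3_64bits_withSecret(data, len, secret, sizeof(secret));
    }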
5712
+ /*! @ingroup xxh3_family */
5713
+ XXH_PUBLIC_API void
5714
+ XXH3_generateSecret_fromSeed(void* secretBuffer, XXH64_hash_t seed)
5715
+ {
5716
+ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
5717
+ XXH3_initCustomSecret(secret, seed);
5718
+ XXH_ASSERT(secretBuffer != NULL);
5719
+ memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
5720
+ }
5721
+
5722
+
5723
+
5423
5724
  /* Pop our optimization override from above */
5424
5725
  #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
5425
5726
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \