@nxtedition/rocksdb 8.1.3 → 8.1.5

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (146)
  1. package/deps/rocksdb/rocksdb/CMakeLists.txt +13 -1
  2. package/deps/rocksdb/rocksdb/Makefile +2 -2
  3. package/deps/rocksdb/rocksdb/TARGETS +4 -2
  4. package/deps/rocksdb/rocksdb/cache/cache_bench_tool.cc +32 -35
  5. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.cc +0 -30
  6. package/deps/rocksdb/rocksdb/cache/cache_entry_roles.h +0 -83
  7. package/deps/rocksdb/rocksdb/cache/cache_entry_stats.h +13 -14
  8. package/deps/rocksdb/rocksdb/cache/cache_helpers.cc +40 -0
  9. package/deps/rocksdb/rocksdb/cache/cache_helpers.h +14 -20
  10. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.cc +8 -9
  11. package/deps/rocksdb/rocksdb/cache/cache_reservation_manager.h +5 -4
  12. package/deps/rocksdb/rocksdb/cache/cache_test.cc +124 -156
  13. package/deps/rocksdb/rocksdb/cache/charged_cache.cc +10 -26
  14. package/deps/rocksdb/rocksdb/cache/charged_cache.h +11 -16
  15. package/deps/rocksdb/rocksdb/cache/clock_cache.cc +35 -32
  16. package/deps/rocksdb/rocksdb/cache/clock_cache.h +19 -21
  17. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.cc +42 -30
  18. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache.h +9 -8
  19. package/deps/rocksdb/rocksdb/cache/compressed_secondary_cache_test.cc +91 -143
  20. package/deps/rocksdb/rocksdb/cache/lru_cache.cc +54 -60
  21. package/deps/rocksdb/rocksdb/cache/lru_cache.h +37 -63
  22. package/deps/rocksdb/rocksdb/cache/lru_cache_test.cc +120 -106
  23. package/deps/rocksdb/rocksdb/cache/secondary_cache.cc +14 -5
  24. package/deps/rocksdb/rocksdb/cache/sharded_cache.h +16 -31
  25. package/deps/rocksdb/rocksdb/cache/typed_cache.h +339 -0
  26. package/deps/rocksdb/rocksdb/db/blob/blob_contents.cc +0 -48
  27. package/deps/rocksdb/rocksdb/db/blob/blob_contents.h +18 -15
  28. package/deps/rocksdb/rocksdb/db/blob/blob_counting_iterator.h +0 -11
  29. package/deps/rocksdb/rocksdb/db/blob/blob_file_builder.cc +5 -26
  30. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.cc +7 -8
  31. package/deps/rocksdb/rocksdb/db/blob/blob_file_cache.h +6 -3
  32. package/deps/rocksdb/rocksdb/db/blob/blob_file_reader.cc +2 -7
  33. package/deps/rocksdb/rocksdb/db/blob/blob_source.cc +19 -47
  34. package/deps/rocksdb/rocksdb/db/blob/blob_source.h +13 -5
  35. package/deps/rocksdb/rocksdb/db/blob/blob_source_test.cc +15 -22
  36. package/deps/rocksdb/rocksdb/db/builder.cc +17 -12
  37. package/deps/rocksdb/rocksdb/db/column_family.cc +0 -1
  38. package/deps/rocksdb/rocksdb/db/column_family.h +0 -6
  39. package/deps/rocksdb/rocksdb/db/compaction/clipping_iterator.h +0 -5
  40. package/deps/rocksdb/rocksdb/db/compaction/compaction.cc +3 -0
  41. package/deps/rocksdb/rocksdb/db/compaction/compaction.h +0 -2
  42. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.cc +28 -27
  43. package/deps/rocksdb/rocksdb/db/compaction/compaction_iterator.h +2 -17
  44. package/deps/rocksdb/rocksdb/db/compaction/compaction_job.cc +1 -0
  45. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.cc +254 -139
  46. package/deps/rocksdb/rocksdb/db/compaction/compaction_outputs.h +7 -5
  47. package/deps/rocksdb/rocksdb/db/compaction/subcompaction_state.h +0 -5
  48. package/deps/rocksdb/rocksdb/db/db_basic_test.cc +98 -9
  49. package/deps/rocksdb/rocksdb/db/db_block_cache_test.cc +28 -28
  50. package/deps/rocksdb/rocksdb/db/db_compaction_test.cc +125 -0
  51. package/deps/rocksdb/rocksdb/db/db_flush_test.cc +65 -4
  52. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.cc +1 -1
  53. package/deps/rocksdb/rocksdb/db/db_impl/db_impl.h +27 -15
  54. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_compaction_flush.cc +78 -49
  55. package/deps/rocksdb/rocksdb/db/db_impl/db_impl_write.cc +34 -24
  56. package/deps/rocksdb/rocksdb/db/db_iter.cc +8 -2
  57. package/deps/rocksdb/rocksdb/db/db_merge_operand_test.cc +42 -0
  58. package/deps/rocksdb/rocksdb/db/db_merge_operator_test.cc +155 -0
  59. package/deps/rocksdb/rocksdb/db/db_properties_test.cc +12 -12
  60. package/deps/rocksdb/rocksdb/db/db_range_del_test.cc +117 -210
  61. package/deps/rocksdb/rocksdb/db/db_test_util.cc +11 -10
  62. package/deps/rocksdb/rocksdb/db/db_test_util.h +36 -24
  63. package/deps/rocksdb/rocksdb/db/db_with_timestamp_basic_test.cc +28 -0
  64. package/deps/rocksdb/rocksdb/db/flush_job.cc +6 -6
  65. package/deps/rocksdb/rocksdb/db/flush_job.h +3 -2
  66. package/deps/rocksdb/rocksdb/db/flush_job_test.cc +29 -29
  67. package/deps/rocksdb/rocksdb/db/history_trimming_iterator.h +0 -4
  68. package/deps/rocksdb/rocksdb/db/internal_stats.cc +11 -11
  69. package/deps/rocksdb/rocksdb/db/internal_stats.h +2 -2
  70. package/deps/rocksdb/rocksdb/db/log_reader.cc +8 -6
  71. package/deps/rocksdb/rocksdb/db/log_test.cc +35 -2
  72. package/deps/rocksdb/rocksdb/db/memtable.cc +30 -5
  73. package/deps/rocksdb/rocksdb/db/merge_helper.cc +47 -33
  74. package/deps/rocksdb/rocksdb/db/merge_helper.h +14 -6
  75. package/deps/rocksdb/rocksdb/db/table_cache.cc +41 -91
  76. package/deps/rocksdb/rocksdb/db/table_cache.h +17 -19
  77. package/deps/rocksdb/rocksdb/db/table_cache_sync_and_async.h +7 -9
  78. package/deps/rocksdb/rocksdb/db/version_builder.cc +12 -9
  79. package/deps/rocksdb/rocksdb/db/version_edit.h +1 -0
  80. package/deps/rocksdb/rocksdb/db/version_set.cc +20 -28
  81. package/deps/rocksdb/rocksdb/db/version_set.h +2 -2
  82. package/deps/rocksdb/rocksdb/db/version_set_sync_and_async.h +1 -1
  83. package/deps/rocksdb/rocksdb/db/write_batch.cc +4 -1
  84. package/deps/rocksdb/rocksdb/file/file_prefetch_buffer.cc +1 -0
  85. package/deps/rocksdb/rocksdb/file/prefetch_test.cc +358 -214
  86. package/deps/rocksdb/rocksdb/include/rocksdb/cache.h +137 -135
  87. package/deps/rocksdb/rocksdb/include/rocksdb/merge_operator.h +21 -0
  88. package/deps/rocksdb/rocksdb/include/rocksdb/secondary_cache.h +8 -6
  89. package/deps/rocksdb/rocksdb/include/rocksdb/version.h +1 -1
  90. package/deps/rocksdb/rocksdb/memory/memory_allocator.h +9 -0
  91. package/deps/rocksdb/rocksdb/options/customizable_test.cc +4 -3
  92. package/deps/rocksdb/rocksdb/port/port_posix.h +2 -0
  93. package/{prebuilds → deps/rocksdb/rocksdb/prebuilds}/linux-x64/node.napi.node +0 -0
  94. package/deps/rocksdb/rocksdb/src.mk +2 -1
  95. package/deps/rocksdb/rocksdb/table/block_based/block.h +3 -0
  96. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.cc +25 -67
  97. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_builder.h +3 -3
  98. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_factory.cc +18 -13
  99. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.cc +156 -223
  100. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader.h +31 -50
  101. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_impl.h +46 -18
  102. package/deps/rocksdb/rocksdb/table/block_based/block_based_table_reader_sync_and_async.h +3 -3
  103. package/deps/rocksdb/rocksdb/table/block_based/block_cache.cc +96 -0
  104. package/deps/rocksdb/rocksdb/table/block_based/block_cache.h +132 -0
  105. package/deps/rocksdb/rocksdb/table/block_based/cachable_entry.h +28 -0
  106. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.cc +6 -5
  107. package/deps/rocksdb/rocksdb/table/block_based/filter_block_reader_common.h +1 -4
  108. package/deps/rocksdb/rocksdb/table/block_based/full_filter_block.cc +6 -7
  109. package/deps/rocksdb/rocksdb/table/block_based/index_reader_common.cc +3 -1
  110. package/deps/rocksdb/rocksdb/table/block_based/parsed_full_filter_block.h +6 -1
  111. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.cc +19 -18
  112. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block.h +9 -5
  113. package/deps/rocksdb/rocksdb/table/block_based/partitioned_filter_block_test.cc +3 -1
  114. package/deps/rocksdb/rocksdb/table/block_based/partitioned_index_reader.cc +2 -1
  115. package/deps/rocksdb/rocksdb/table/block_based/uncompression_dict_reader.cc +2 -2
  116. package/deps/rocksdb/rocksdb/table/format.h +1 -1
  117. package/deps/rocksdb/rocksdb/table/get_context.cc +12 -3
  118. package/deps/rocksdb/rocksdb/table/internal_iterator.h +0 -2
  119. package/deps/rocksdb/rocksdb/table/merging_iterator.cc +92 -7
  120. package/deps/rocksdb/rocksdb/table/merging_iterator.h +0 -80
  121. package/deps/rocksdb/rocksdb/tools/block_cache_analyzer/block_cache_trace_analyzer_test.cc +66 -1
  122. package/deps/rocksdb/rocksdb/tools/db_bench_tool.cc +9 -2
  123. package/deps/rocksdb/rocksdb/trace_replay/block_cache_tracer.cc +5 -0
  124. package/deps/rocksdb/rocksdb/trace_replay/trace_replay.cc +1 -1
  125. package/deps/rocksdb/rocksdb/util/async_file_reader.cc +20 -12
  126. package/deps/rocksdb/rocksdb/util/compression.cc +2 -2
  127. package/deps/rocksdb/rocksdb/util/compression.h +11 -2
  128. package/deps/rocksdb/rocksdb/util/xxhash.h +1901 -887
  129. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.cc +35 -57
  130. package/deps/rocksdb/rocksdb/utilities/cache_dump_load_impl.h +4 -5
  131. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.cc +11 -6
  132. package/deps/rocksdb/rocksdb/utilities/fault_injection_secondary_cache.h +6 -5
  133. package/deps/rocksdb/rocksdb/utilities/memory_allocators.h +0 -1
  134. package/deps/rocksdb/rocksdb/utilities/simulator_cache/cache_simulator.cc +10 -11
  135. package/deps/rocksdb/rocksdb/utilities/simulator_cache/sim_cache.cc +31 -31
  136. package/deps/rocksdb/rocksdb/utilities/transactions/lock/range/range_tree/lib/portability/toku_time.h +4 -0
  137. package/deps/rocksdb/rocksdb/utilities/transactions/transaction_test.cc +52 -0
  138. package/deps/rocksdb/rocksdb/utilities/ttl/db_ttl_impl.cc +1 -0
  139. package/deps/rocksdb/rocksdb/utilities/write_batch_with_index/write_batch_with_index_internal.cc +12 -3
  140. package/deps/rocksdb/rocksdb.gyp +0 -3
  141. package/index.js +2 -2
  142. package/package.json +1 -1
  143. package/prebuilds/darwin-arm64/node.napi.node +0 -0
  144. package/deps/rocksdb/rocksdb/table/block_based/block_like_traits.h +0 -182
  145. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.cc +0 -142
  146. package/deps/rocksdb/rocksdb/table/compaction_merging_iterator.h +0 -241
@@ -20,7 +20,7 @@
20
20
  /*
21
21
  * xxHash - Extremely Fast Hash algorithm
22
22
  * Header File
23
- * Copyright (C) 2012-2020 Yann Collet
23
+ * Copyright (C) 2012-2021 Yann Collet
24
24
  *
25
25
  * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
26
26
  *
@@ -51,49 +51,142 @@
51
51
  * - xxHash homepage: https://www.xxhash.com
52
52
  * - xxHash source repository: https://github.com/Cyan4973/xxHash
53
53
  */
54
+
54
55
  /*!
55
56
  * @mainpage xxHash
56
57
  *
58
+ * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
59
+ * limits.
60
+ *
61
+ * It is proposed in four flavors, in three families:
62
+ * 1. @ref XXH32_family
63
+ * - Classic 32-bit hash function. Simple, compact, and runs on almost all
64
+ * 32-bit and 64-bit systems.
65
+ * 2. @ref XXH64_family
66
+ * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
67
+ * 64-bit systems (but _not_ 32-bit systems).
68
+ * 3. @ref XXH3_family
69
+ * - Modern 64-bit and 128-bit hash function family which features improved
70
+ * strength and performance across the board, especially on smaller data.
71
+ * It benefits greatly from SIMD and 64-bit without requiring it.
72
+ *
73
+ * Benchmarks
74
+ * ---
75
+ * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
76
+ * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
77
+ *
78
+ * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity |
79
+ * | -------------------- | ------- | ----: | ---------------: | ------------------: |
80
+ * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 |
81
+ * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 |
82
+ * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 |
83
+ * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 |
84
+ * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 |
85
+ * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 |
86
+ * | RAM sequential read | | N/A | 28.0 GB/s | N/A |
87
+ * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 |
88
+ * | City64 | | 64 | 22.0 GB/s | 76.6 |
89
+ * | T1ha2 | | 64 | 22.0 GB/s | 99.0 |
90
+ * | City128 | | 128 | 21.7 GB/s | 57.7 |
91
+ * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 |
92
+ * | XXH64() | | 64 | 19.4 GB/s | 71.0 |
93
+ * | SpookyHash | | 64 | 19.3 GB/s | 53.2 |
94
+ * | Mum | | 64 | 18.0 GB/s | 67.0 |
95
+ * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 |
96
+ * | XXH32() | | 32 | 9.7 GB/s | 71.9 |
97
+ * | City32 | | 32 | 9.1 GB/s | 66.0 |
98
+ * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 |
99
+ * | Murmur3 | | 32 | 3.9 GB/s | 56.1 |
100
+ * | SipHash* | | 64 | 3.0 GB/s | 43.2 |
101
+ * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 |
102
+ * | HighwayHash | | 64 | 1.4 GB/s | 6.0 |
103
+ * | FNV64 | | 64 | 1.2 GB/s | 62.7 |
104
+ * | Blake2* | | 256 | 1.1 GB/s | 5.1 |
105
+ * | SHA1* | | 160 | 0.8 GB/s | 5.6 |
106
+ * | MD5* | | 128 | 0.6 GB/s | 7.8 |
107
+ * @note
108
+ * - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
109
+ * even though it is mandatory on x64.
110
+ * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
111
+ * by modern standards.
112
+ * - Small data velocity is a rough average of algorithm's efficiency for small
113
+ * data. For more accurate information, see the wiki.
114
+ * - More benchmarks and strength tests are found on the wiki:
115
+ * https://github.com/Cyan4973/xxHash/wiki
116
+ *
117
+ * Usage
118
+ * ------
119
+ * All xxHash variants use a similar API. Changing the algorithm is a trivial
120
+ * substitution.
121
+ *
122
+ * @pre
123
+ * For functions which take an input and length parameter, the following
124
+ * requirements are assumed:
125
+ * - The range from [`input`, `input + length`) is valid, readable memory.
126
+ * - The only exception is if the `length` is `0`, `input` may be `NULL`.
127
+ * - For C++, the objects must have the *TriviallyCopyable* property, as the
128
+ * functions access bytes directly as if it was an array of `unsigned char`.
129
+ *
130
+ * @anchor single_shot_example
131
+ * **Single Shot**
132
+ *
133
+ * These functions are stateless functions which hash a contiguous block of memory,
134
+ * immediately returning the result. They are the easiest and usually the fastest
135
+ * option.
136
+ *
137
+ * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
138
+ *
139
+ * @code{.c}
140
+ * #include <string.h>
141
+ * #include "xxhash.h"
142
+ *
143
+ * // Example for a function which hashes a null terminated string with XXH32().
144
+ * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
145
+ * {
146
+ * // NULL pointers are only valid if the length is zero
147
+ * size_t length = (string == NULL) ? 0 : strlen(string);
148
+ * return XXH32(string, length, seed);
149
+ * }
150
+ * @endcode
151
+ *
152
+ * @anchor streaming_example
153
+ * **Streaming**
154
+ *
155
+ * These groups of functions allow incremental hashing of unknown size, even
156
+ * more than what would fit in a size_t.
157
+ *
158
+ * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
159
+ *
160
+ * @code{.c}
161
+ * #include <stdio.h>
162
+ * #include <assert.h>
163
+ * #include "xxhash.h"
164
+ * // Example for a function which hashes a FILE incrementally with XXH3_64bits().
165
+ * XXH64_hash_t hashFile(FILE* f)
166
+ * {
167
+ * // Allocate a state struct. Do not just use malloc() or new.
168
+ * XXH3_state_t* state = XXH3_createState();
169
+ * assert(state != NULL && "Out of memory!");
170
+ * // Reset the state to start a new hashing session.
171
+ * XXH3_64bits_reset(state);
172
+ * char buffer[4096];
173
+ * size_t count;
174
+ * // Read the file in chunks
175
+ * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
176
+ * // Run update() as many times as necessary to process the data
177
+ * XXH3_64bits_update(state, buffer, count);
178
+ * }
179
+ * // Retrieve the finalized hash. This will not change the state.
180
+ * XXH64_hash_t result = XXH3_64bits_digest(state);
181
+ * // Free the state. Do not use free().
182
+ * XXH3_freeState(state);
183
+ * return result;
184
+ * }
185
+ * @endcode
186
+ *
57
187
  * @file xxhash.h
58
188
  * xxHash prototypes and implementation
59
189
  */
60
- /* TODO: update */
61
- /* Notice extracted from xxHash homepage:
62
-
63
- xxHash is an extremely fast hash algorithm, running at RAM speed limits.
64
- It also successfully passes all tests from the SMHasher suite.
65
-
66
- Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
67
-
68
- Name Speed Q.Score Author
69
- xxHash 5.4 GB/s 10
70
- CrapWow 3.2 GB/s 2 Andrew
71
- MurmurHash 3a 2.7 GB/s 10 Austin Appleby
72
- SpookyHash 2.0 GB/s 10 Bob Jenkins
73
- SBox 1.4 GB/s 9 Bret Mulvey
74
- Lookup3 1.2 GB/s 9 Bob Jenkins
75
- SuperFastHash 1.2 GB/s 1 Paul Hsieh
76
- CityHash64 1.05 GB/s 10 Pike & Alakuijala
77
- FNV 0.55 GB/s 5 Fowler, Noll, Vo
78
- CRC32 0.43 GB/s 9
79
- MD5-32 0.33 GB/s 10 Ronald L. Rivest
80
- SHA1-32 0.28 GB/s 10
81
-
82
- Q.Score is a measure of quality of the hash function.
83
- It depends on successfully passing SMHasher test set.
84
- 10 is a perfect score.
85
-
86
- Note: SMHasher's CRC32 implementation is not the fastest one.
87
- Other speed-oriented implementations can be faster,
88
- especially in combination with PCLMUL instruction:
89
- https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html?showComment=1552696407071#c3490092340461170735
90
-
91
- A 64-bit version, named XXH64, is available since r35.
92
- It offers much better speed, but for 64-bit applications only.
93
- Name Speed on 64 bits Speed on 32 bits
94
- XXH64 13.8 GB/s 1.9 GB/s
95
- XXH32 6.8 GB/s 6.0 GB/s
96
- */
97
190
 
98
191
  #if defined (__cplusplus)
99
192
  extern "C" {
@@ -103,21 +196,53 @@ extern "C" {
103
196
  * INLINE mode
104
197
  ******************************/
105
198
  /*!
106
- * XXH_INLINE_ALL (and XXH_PRIVATE_API)
199
+ * @defgroup public Public API
200
+ * Contains details on the public xxHash functions.
201
+ * @{
202
+ */
203
+ #ifdef XXH_DOXYGEN
204
+ /*!
205
+ * @brief Exposes the implementation and marks all functions as `inline`.
206
+ *
107
207
  * Use these build macros to inline xxhash into the target unit.
108
208
  * Inlining improves performance on small inputs, especially when the length is
109
209
  * expressed as a compile-time constant:
110
210
  *
111
- * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
211
+ * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
112
212
  *
113
213
  * It also keeps xxHash symbols private to the unit, so they are not exported.
114
214
  *
115
215
  * Usage:
216
+ * @code{.c}
116
217
  * #define XXH_INLINE_ALL
117
218
  * #include "xxhash.h"
118
- *
219
+ * @endcode
119
220
  * Do not compile and link xxhash.o as a separate object, as it is not useful.
120
221
  */
222
+ # define XXH_INLINE_ALL
223
+ # undef XXH_INLINE_ALL
224
+ /*!
225
+ * @brief Exposes the implementation without marking functions as inline.
226
+ */
227
+ # define XXH_PRIVATE_API
228
+ # undef XXH_PRIVATE_API
229
+ /*!
230
+ * @brief Emulate a namespace by transparently prefixing all symbols.
231
+ *
232
+ * If you want to include _and expose_ xxHash functions from within your own
233
+ * library, but also want to avoid symbol collisions with other libraries which
234
+ * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
235
+ * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
236
+ * (therefore, avoid empty or numeric values).
237
+ *
238
+ * Note that no change is required within the calling program as long as it
239
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
240
+ * by this header.
241
+ */
242
+ # define XXH_NAMESPACE /* YOUR NAME HERE */
243
+ # undef XXH_NAMESPACE
244
+ #endif
245
+
121
246
  #if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
122
247
  && !defined(XXH_INLINE_ALL_31684351384)
123
248
  /* this section should be traversed only once */
@@ -140,29 +265,80 @@ extern "C" {
140
265
 
141
266
  /*
142
267
  * This part deals with the special case where a unit wants to inline xxHash,
143
- * but "xxhash.h" has previously been included without XXH_INLINE_ALL, such
144
- * as part of some previously included *.h header file.
268
+ * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
269
+ * such as part of some previously included *.h header file.
145
270
  * Without further action, the new include would just be ignored,
146
271
  * and functions would effectively _not_ be inlined (silent failure).
147
272
  * The following macros solve this situation by prefixing all inlined names,
148
273
  * avoiding naming collision with previous inclusions.
149
274
  */
150
- # ifdef XXH_NAMESPACE
151
- # error "XXH_INLINE_ALL with XXH_NAMESPACE is not supported"
152
- /*
153
- * Note: Alternative: #undef all symbols (it's a pretty large list).
154
- * Without #error: it compiles, but functions are actually not inlined.
155
- */
156
- # endif
275
+ /* Before that, we unconditionally #undef all symbols,
276
+ * in case they were already defined with XXH_NAMESPACE.
277
+ * They will then be redefined for XXH_INLINE_ALL
278
+ */
279
+ # undef XXH_versionNumber
280
+ /* XXH32 */
281
+ # undef XXH32
282
+ # undef XXH32_createState
283
+ # undef XXH32_freeState
284
+ # undef XXH32_reset
285
+ # undef XXH32_update
286
+ # undef XXH32_digest
287
+ # undef XXH32_copyState
288
+ # undef XXH32_canonicalFromHash
289
+ # undef XXH32_hashFromCanonical
290
+ /* XXH64 */
291
+ # undef XXH64
292
+ # undef XXH64_createState
293
+ # undef XXH64_freeState
294
+ # undef XXH64_reset
295
+ # undef XXH64_update
296
+ # undef XXH64_digest
297
+ # undef XXH64_copyState
298
+ # undef XXH64_canonicalFromHash
299
+ # undef XXH64_hashFromCanonical
300
+ /* XXH3_64bits */
301
+ # undef XXH3_64bits
302
+ # undef XXH3_64bits_withSecret
303
+ # undef XXH3_64bits_withSeed
304
+ # undef XXH3_64bits_withSecretandSeed
305
+ # undef XXH3_createState
306
+ # undef XXH3_freeState
307
+ # undef XXH3_copyState
308
+ # undef XXH3_64bits_reset
309
+ # undef XXH3_64bits_reset_withSeed
310
+ # undef XXH3_64bits_reset_withSecret
311
+ # undef XXH3_64bits_update
312
+ # undef XXH3_64bits_digest
313
+ # undef XXH3_generateSecret
314
+ /* XXH3_128bits */
315
+ # undef XXH128
316
+ # undef XXH3_128bits
317
+ # undef XXH3_128bits_withSeed
318
+ # undef XXH3_128bits_withSecret
319
+ # undef XXH3_128bits_reset
320
+ # undef XXH3_128bits_reset_withSeed
321
+ # undef XXH3_128bits_reset_withSecret
322
+ # undef XXH3_128bits_reset_withSecretandSeed
323
+ # undef XXH3_128bits_update
324
+ # undef XXH3_128bits_digest
325
+ # undef XXH128_isEqual
326
+ # undef XXH128_cmp
327
+ # undef XXH128_canonicalFromHash
328
+ # undef XXH128_hashFromCanonical
329
+ /* Finally, free the namespace itself */
330
+ # undef XXH_NAMESPACE
331
+
332
+ /* employ the namespace for XXH_INLINE_ALL */
157
333
  # define XXH_NAMESPACE XXH_INLINE_
158
334
  /*
159
- * Some identifiers (enums, type names) are not symbols, but they must
160
- * still be renamed to avoid redeclaration.
335
+ * Some identifiers (enums, type names) are not symbols,
336
+ * but they must nonetheless be renamed to avoid redeclaration.
161
337
  * Alternative solution: do not redeclare them.
162
- * However, this requires some #ifdefs, and is a more dispersed action.
163
- * Meanwhile, renaming can be achieved in a single block
338
+ * However, this requires some #ifdefs, and has a more dispersed impact.
339
+ * Meanwhile, renaming can be achieved in a single place.
164
340
  */
165
- # define XXH_IPREF(Id) XXH_INLINE_ ## Id
341
+ # define XXH_IPREF(Id) XXH_NAMESPACE ## Id
166
342
  # define XXH_OK XXH_IPREF(XXH_OK)
167
343
  # define XXH_ERROR XXH_IPREF(XXH_ERROR)
168
344
  # define XXH_errorcode XXH_IPREF(XXH_errorcode)
@@ -181,21 +357,13 @@ extern "C" {
181
357
  # undef XXHASH_H_STATIC_13879238742
182
358
  #endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
183
359
 
184
-
185
-
186
360
  /* ****************************************************************
187
361
  * Stable API
188
362
  *****************************************************************/
189
363
  #ifndef XXHASH_H_5627135585666179
190
364
  #define XXHASH_H_5627135585666179 1
191
365
 
192
-
193
- /*!
194
- * @defgroup public Public API
195
- * Contains details on the public xxHash functions.
196
- * @{
197
- */
198
- /* specific declaration modes for Windows */
366
+ /*! @brief Marks a global symbol. */
199
367
  #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
200
368
  # if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
201
369
  # ifdef XXH_EXPORT
@@ -208,24 +376,6 @@ extern "C" {
208
376
  # endif
209
377
  #endif
210
378
 
211
- #ifdef XXH_DOXYGEN
212
- /*!
213
- * @brief Emulate a namespace by transparently prefixing all symbols.
214
- *
215
- * If you want to include _and expose_ xxHash functions from within your own
216
- * library, but also want to avoid symbol collisions with other libraries which
217
- * may also include xxHash, you can use XXH_NAMESPACE to automatically prefix
218
- * any public symbol from xxhash library with the value of XXH_NAMESPACE
219
- * (therefore, avoid empty or numeric values).
220
- *
221
- * Note that no change is required within the calling program as long as it
222
- * includes `xxhash.h`: Regular symbol names will be automatically translated
223
- * by this header.
224
- */
225
- # define XXH_NAMESPACE /* YOUR NAME HERE */
226
- # undef XXH_NAMESPACE
227
- #endif
228
-
229
379
  #ifdef XXH_NAMESPACE
230
380
  # define XXH_CAT(A,B) A##B
231
381
  # define XXH_NAME2(A,B) XXH_CAT(A,B)
@@ -254,23 +404,28 @@ extern "C" {
254
404
  # define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
255
405
  # define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
256
406
  # define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
407
+ # define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
257
408
  # define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
258
409
  # define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
259
410
  # define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
260
411
  # define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
261
412
  # define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
262
413
  # define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
414
+ # define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
263
415
  # define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
264
416
  # define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
265
417
  # define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
418
+ # define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
266
419
  /* XXH3_128bits */
267
420
  # define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
268
421
  # define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
269
422
  # define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
270
423
  # define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
424
+ # define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
271
425
  # define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
272
426
  # define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
273
427
  # define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
428
+ # define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
274
429
  # define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
275
430
  # define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
276
431
  # define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
@@ -280,30 +435,64 @@ extern "C" {
280
435
  #endif
281
436
 
282
437
 
438
+ /* *************************************
439
+ * Compiler specifics
440
+ ***************************************/
441
+
442
+ /* specific declaration modes for Windows */
443
+ #if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
444
+ # if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
445
+ # ifdef XXH_EXPORT
446
+ # define XXH_PUBLIC_API __declspec(dllexport)
447
+ # elif XXH_IMPORT
448
+ # define XXH_PUBLIC_API __declspec(dllimport)
449
+ # endif
450
+ # else
451
+ # define XXH_PUBLIC_API /* do nothing */
452
+ # endif
453
+ #endif
454
+
455
+ #if defined (__GNUC__)
456
+ # define XXH_CONSTF __attribute__((const))
457
+ # define XXH_PUREF __attribute__((pure))
458
+ # define XXH_MALLOCF __attribute__((malloc))
459
+ #else
460
+ # define XXH_CONSTF /* disable */
461
+ # define XXH_PUREF
462
+ # define XXH_MALLOCF
463
+ #endif
464
+
283
465
  /* *************************************
284
466
  * Version
285
467
  ***************************************/
286
468
  #define XXH_VERSION_MAJOR 0
287
469
  #define XXH_VERSION_MINOR 8
288
470
  #define XXH_VERSION_RELEASE 1
471
+ /*! @brief Version number, encoded as two digits each */
289
472
  #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
290
473
 
291
474
  /*!
292
475
  * @brief Obtains the xxHash version.
293
476
  *
294
- * This is only useful when xxHash is compiled as a shared library, as it is
295
- * independent of the version defined in the header.
477
+ * This is mostly useful when xxHash is compiled as a shared library,
478
+ * since the returned value comes from the library, as opposed to header file.
296
479
  *
297
- * @return `XXH_VERSION_NUMBER` as of when the libray was compiled.
480
+ * @return @ref XXH_VERSION_NUMBER of the invoked library.
298
481
  */
299
- XXH_PUBLIC_API unsigned XXH_versionNumber (void);
482
+ XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
300
483
 
301
484
 
302
485
  /* ****************************
303
- * Definitions
486
+ * Common basic types
304
487
  ******************************/
305
488
  #include <stddef.h> /* size_t */
306
- typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
489
+ /*!
490
+ * @brief Exit code for the streaming API.
491
+ */
492
+ typedef enum {
493
+ XXH_OK = 0, /*!< OK */
494
+ XXH_ERROR /*!< Error */
495
+ } XXH_errorcode;
307
496
 
308
497
 
309
498
  /*-**********************************************************************
@@ -316,39 +505,38 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
316
505
  * Not necessarily defined to `uint32_t` but functionally equivalent.
317
506
  */
318
507
  typedef uint32_t XXH32_hash_t;
508
+
319
509
  #elif !defined (__VMS) \
320
510
  && (defined (__cplusplus) \
321
511
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
322
512
  # include <stdint.h>
323
513
  typedef uint32_t XXH32_hash_t;
514
+
324
515
  #else
325
516
  # include <limits.h>
326
517
  # if UINT_MAX == 0xFFFFFFFFUL
327
518
  typedef unsigned int XXH32_hash_t;
519
+ # elif ULONG_MAX == 0xFFFFFFFFUL
520
+ typedef unsigned long XXH32_hash_t;
328
521
  # else
329
- # if ULONG_MAX == 0xFFFFFFFFUL
330
- typedef unsigned long XXH32_hash_t;
331
- # else
332
- # error "unsupported platform: need a 32-bit type"
333
- # endif
522
+ # error "unsupported platform: need a 32-bit type"
334
523
  # endif
335
524
  #endif
336
525
 
337
526
  /*!
338
527
  * @}
339
528
  *
340
- * @defgroup xxh32_family XXH32 family
529
+ * @defgroup XXH32_family XXH32 family
341
530
  * @ingroup public
342
531
  * Contains functions used in the classic 32-bit xxHash algorithm.
343
532
  *
344
533
  * @note
345
- * XXH32 is considered rather weak by today's standards.
346
- * The @ref xxh3_family provides competitive speed for both 32-bit and 64-bit
347
- * systems, and offers true 64/128 bit hash results. It provides a superior
348
- * level of dispersion, and greatly reduces the risks of collisions.
534
+ * XXH32 is useful for older platforms, with no or poor 64-bit performance.
535
+ * Note that the @ref XXH3_family provides competitive speed for both 32-bit
536
+ * and 64-bit systems, and offers true 64/128 bit hash results.
349
537
  *
350
- * @see @ref xxh64_family, @ref xxh3_family : Other xxHash families
351
- * @see @ref xxh32_impl for implementation details
538
+ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
539
+ * @see @ref XXH32_impl for implementation details
352
540
  * @{
353
541
  */
354
542
 
@@ -357,6 +545,8 @@ typedef uint32_t XXH32_hash_t;
357
545
  *
358
546
  * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s
359
547
  *
548
+ * See @ref single_shot_example "Single Shot Example" for an example.
549
+ *
360
550
  * @param input The block of data to be hashed, at least @p length bytes in size.
361
551
  * @param length The length of @p input, in bytes.
362
552
  * @param seed The 32-bit seed to alter the hash's output predictably.
@@ -374,8 +564,9 @@ typedef uint32_t XXH32_hash_t;
374
564
  * @see
375
565
  * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version.
376
566
  */
377
- XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
567
+ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
378
568
 
569
+ #ifndef XXH_NO_STREAM
379
570
  /*!
380
571
  * Streaming functions generate the xxHash value from an incremental input.
381
572
  * This method is slower than single-call functions, due to state management.
@@ -398,32 +589,7 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_
398
589
  *
399
590
  * When done, release the state using `XXH*_freeState()`.
400
591
  *
401
- * Example code for incrementally hashing a file:
402
- * @code{.c}
403
- * #include <stdio.h>
404
- * #include <xxhash.h>
405
- * #define BUFFER_SIZE 256
406
- *
407
- * // Note: XXH64 and XXH3 use the same interface.
408
- * XXH32_hash_t
409
- * hashFile(FILE* stream)
410
- * {
411
- * XXH32_state_t* state;
412
- * unsigned char buf[BUFFER_SIZE];
413
- * size_t amt;
414
- * XXH32_hash_t hash;
415
- *
416
- * state = XXH32_createState(); // Create a state
417
- * assert(state != NULL); // Error check here
418
- * XXH32_reset(state, 0xbaad5eed); // Reset state with our seed
419
- * while ((amt = fread(buf, 1, sizeof(buf), stream)) != 0) {
420
- * XXH32_update(state, buf, amt); // Hash the file in chunks
421
- * }
422
- * hash = XXH32_digest(state); // Finalize the hash
423
- * XXH32_freeState(state); // Clean up
424
- * return hash;
425
- * }
426
- * @endcode
592
+ * @see streaming_example at the top of @ref xxhash.h for an example.
427
593
  */
428
594
 
429
595
  /*!
@@ -440,7 +606,7 @@ typedef struct XXH32_state_s XXH32_state_t;
440
606
  * Must be freed with XXH32_freeState().
441
607
  * @return An allocated XXH32_state_t on success, `NULL` on failure.
442
608
  */
443
- XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
609
+ XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
444
610
  /*!
445
611
  * @brief Frees an @ref XXH32_state_t.
446
612
  *
@@ -508,7 +674,8 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void*
508
674
  *
509
675
  * @return The calculated xxHash32 value from that state.
510
676
  */
511
- XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
677
+ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
678
+ #endif /* !XXH_NO_STREAM */
512
679
 
513
680
  /******* Canonical representation *******/
514
681
 
@@ -559,7 +726,52 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t
559
726
  *
560
727
  * @return The converted hash.
561
728
  */
562
- XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
729
+ XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
730
+
731
+
732
+ #ifdef __has_attribute
733
+ # define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
734
+ #else
735
+ # define XXH_HAS_ATTRIBUTE(x) 0
736
+ #endif
737
+
738
+ /* C-language Attributes are added in C23. */
739
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(__has_c_attribute)
740
+ # define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
741
+ #else
742
+ # define XXH_HAS_C_ATTRIBUTE(x) 0
743
+ #endif
744
+
745
+ #if defined(__cplusplus) && defined(__has_cpp_attribute)
746
+ # define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
747
+ #else
748
+ # define XXH_HAS_CPP_ATTRIBUTE(x) 0
749
+ #endif
750
+
751
+ /*
752
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
753
+ * introduced in CPP17 and C23.
754
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
755
+ * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough
756
+ */
757
+ #if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
758
+ # define XXH_FALLTHROUGH [[fallthrough]]
759
+ #elif XXH_HAS_ATTRIBUTE(__fallthrough__)
760
+ # define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
761
+ #else
762
+ # define XXH_FALLTHROUGH /* fallthrough */
763
+ #endif
764
+
765
+ /*
766
+ * Define XXH_NOESCAPE for annotated pointers in public API.
767
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
768
+ * As of writing this, only supported by clang.
769
+ */
770
+ #if XXH_HAS_ATTRIBUTE(noescape)
771
+ # define XXH_NOESCAPE __attribute__((noescape))
772
+ #else
773
+ # define XXH_NOESCAPE
774
+ #endif
563
775
 
564
776
 
565
777
  /*!
@@ -598,18 +810,17 @@ typedef uint64_t XXH64_hash_t;
598
810
  /*!
599
811
  * @}
600
812
  *
601
- * @defgroup xxh64_family XXH64 family
813
+ * @defgroup XXH64_family XXH64 family
602
814
  * @ingroup public
603
815
  * @{
604
816
  * Contains functions used in the classic 64-bit xxHash algorithm.
605
817
  *
606
818
  * @note
607
819
  * XXH3 provides competitive speed for both 32-bit and 64-bit systems,
608
- * and offers true 64/128 bit hash results. It provides a superior level of
609
- * dispersion, and greatly reduces the risks of collisions.
820
+ * and offers true 64/128 bit hash results.
821
+ * It provides better speed for systems with vector processing capabilities.
610
822
  */
611
823
 
612
-
613
824
  /*!
614
825
  * @brief Calculates the 64-bit hash of @p input using xxHash64.
615
826
  *
@@ -633,32 +844,35 @@ typedef uint64_t XXH64_hash_t;
633
844
  * @see
634
845
  * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version.
635
846
  */
636
- XXH_PUBLIC_API XXH64_hash_t XXH64(const void* input, size_t length, XXH64_hash_t seed);
847
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
637
848
 
638
849
  /******* Streaming *******/
850
+ #ifndef XXH_NO_STREAM
639
851
  /*!
640
852
  * @brief The opaque state struct for the XXH64 streaming API.
641
853
  *
642
854
  * @see XXH64_state_s for details.
643
855
  */
644
856
  typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
645
- XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
857
+ XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
646
858
  XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
647
- XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
648
-
649
- XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, XXH64_hash_t seed);
650
- XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
651
- XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
859
+ XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
652
860
 
861
+ XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
862
+ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
863
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
864
+ #endif /* !XXH_NO_STREAM */
653
865
  /******* Canonical representation *******/
654
866
  typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
655
- XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
656
- XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
867
+ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
868
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
869
+
870
+ #ifndef XXH_NO_XXH3
657
871
 
658
872
  /*!
659
873
  * @}
660
874
  * ************************************************************************
661
- * @defgroup xxh3_family XXH3 family
875
+ * @defgroup XXH3_family XXH3 family
662
876
  * @ingroup public
663
877
  * @{
664
878
  *
@@ -678,12 +892,14 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
678
892
  *
679
893
  * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
680
894
  * but does not require it.
681
- * Any 32-bit and 64-bit targets that can run XXH32 smoothly
682
- * can run XXH3 at competitive speeds, even without vector support.
683
- * Further details are explained in the implementation.
895
+ * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
896
+ * at competitive speeds, even without vector support. Further details are
897
+ * explained in the implementation.
684
898
  *
685
899
  * Optimized implementations are provided for AVX512, AVX2, SSE2, NEON, POWER8,
686
- * ZVector and scalar targets. This can be controlled via the XXH_VECTOR macro.
900
+ * ZVector and scalar targets. This can be controlled via the @ref XXH_VECTOR
901
+ * macro. For the x86 family, an automatic dispatcher is included separately
902
+ * in @ref xxh_x86dispatch.c.
687
903
  *
688
904
  * XXH3 implementation is portable:
689
905
  * it has a generic C90 formulation that can be compiled on any platform,
@@ -699,24 +915,42 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
699
915
  *
700
916
  * The API supports one-shot hashing, streaming mode, and custom secrets.
701
917
  */
702
-
703
918
  /*-**********************************************************************
704
919
  * XXH3 64-bit variant
705
920
  ************************************************************************/
706
921
 
707
- /* XXH3_64bits():
708
- * default 64-bit variant, using default secret and default seed of 0.
709
- * It's the fastest variant. */
710
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* data, size_t len);
922
+ /*!
923
+ * @brief 64-bit unseeded variant of XXH3.
924
+ *
925
+ * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however
926
+ * it may have slightly better performance due to constant propagation of the
927
+ * defaults.
928
+ *
929
+ * @see
930
+ * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms
931
+ * @see
932
+ * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
933
+ * @see
934
+ * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version.
935
+ */
936
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
711
937
 
712
- /*
713
- * XXH3_64bits_withSeed():
714
- * This variant generates a custom secret on the fly
715
- * based on default secret altered using the `seed` value.
938
+ /*!
939
+ * @brief 64-bit seeded variant of XXH3
940
+ *
941
+ * This variant generates a custom secret on the fly based on default secret
942
+ * altered using the `seed` value.
943
+ *
716
944
  * While this operation is decently fast, note that it's not completely free.
717
- * Note: seed==0 produces the same results as XXH3_64bits().
945
+ *
946
+ * @note
947
+ * seed == 0 produces the same results as @ref XXH3_64bits().
948
+ *
949
+ * @param input The data to hash
950
+ * @param length The length
951
+ * @param seed The 64-bit seed to alter the state.
718
952
  */
719
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
953
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
720
954
 
721
955
  /*!
722
956
  * The bare minimum size for a custom secret.
@@ -727,23 +961,29 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSeed(const void* data, size_t len, X
727
961
  */
728
962
  #define XXH3_SECRET_SIZE_MIN 136
729
963
 
730
- /*
731
- * XXH3_64bits_withSecret():
964
+ /*!
965
+ * @brief 64-bit variant of XXH3 with a custom "secret".
966
+ *
732
967
  * It's possible to provide any blob of bytes as a "secret" to generate the hash.
733
968
  * This makes it more difficult for an external actor to prepare an intentional collision.
734
969
  * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN).
735
- * However, the quality of produced hash values depends on secret's entropy.
736
- * Technically, the secret must look like a bunch of random bytes.
970
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
971
+ * Therefore, the secret _must_ look like a bunch of random bytes.
737
972
  * Avoid "trivial" or structured data such as repeated sequences or a text document.
738
- * Whenever unsure about the "randomness" of the blob of bytes,
739
- * consider relabelling it as a "custom seed" instead,
740
- * and employ "XXH3_generateSecret()" (see below)
741
- * to generate a high entropy secret derived from the custom seed.
973
+ * Whenever in doubt about the "randomness" of the blob of bytes,
974
+ * consider employing "XXH3_generateSecret()" instead (see below).
975
+ * It will generate a proper high entropy secret derived from the blob of bytes.
976
+ * Another advantage of using XXH3_generateSecret() is that
977
+ * it guarantees that all bits within the initial blob of bytes
978
+ * will impact every bit of the output.
979
+ * This is not necessarily the case when using the blob of bytes directly
980
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
742
981
  */
743
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
982
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
744
983
 
745
984
 
746
985
  /******* Streaming *******/
986
+ #ifndef XXH_NO_STREAM
747
987
  /*
748
988
  * Streaming requires state maintenance.
749
989
  * This operation costs memory and CPU.
@@ -757,23 +997,23 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_withSecret(const void* data, size_t len,
757
997
  * @see XXH3_state_s for details.
758
998
  */
759
999
  typedef struct XXH3_state_s XXH3_state_t;
760
- XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void);
1000
+ XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
761
1001
  XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
762
- XXH_PUBLIC_API void XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state);
1002
+ XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
763
1003
 
764
1004
  /*
765
1005
  * XXH3_64bits_reset():
766
1006
  * Initialize with default parameters.
767
1007
  * digest will be equivalent to `XXH3_64bits()`.
768
1008
  */
769
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH3_state_t* statePtr);
1009
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
770
1010
  /*
771
1011
  * XXH3_64bits_reset_withSeed():
772
1012
  * Generate a custom secret from `seed`, and store it into `statePtr`.
773
1013
  * digest will be equivalent to `XXH3_64bits_withSeed()`.
774
1014
  */
775
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
776
- /*
1015
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
1016
+ /*!
777
1017
  * XXH3_64bits_reset_withSecret():
778
1018
  * `secret` is referenced, it _must outlive_ the hash streaming session.
779
1019
  * Similar to one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
@@ -782,10 +1022,11 @@ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr,
782
1022
  * When in doubt about the randomness of a candidate `secret`,
783
1023
  * consider employing `XXH3_generateSecret()` instead (see below).
784
1024
  */
785
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
1025
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
786
1026
 
787
- XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
788
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* statePtr);
1027
+ XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
1028
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
1029
+ #endif /* !XXH_NO_STREAM */
789
1030
 
790
1031
  /* note : canonical representation of XXH3 is the same as XXH64
791
1032
  * since they both produce XXH64_hash_t values */
@@ -806,11 +1047,31 @@ typedef struct {
806
1047
  XXH64_hash_t high64; /*!< `value >> 64` */
807
1048
  } XXH128_hash_t;
808
1049
 
809
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* data, size_t len);
810
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSeed(const void* data, size_t len, XXH64_hash_t seed);
811
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t len, const void* secret, size_t secretSize);
1050
+ /*!
1051
+ * @brief Unseeded 128-bit variant of XXH3
1052
+ *
1053
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
1054
+ * for shorter inputs.
1055
+ *
1056
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however
1057
+ * it may have slightly better performance due to constant propagation of the
1058
+ * defaults.
1059
+ *
1060
+ * @see
1061
+ * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms
1062
+ * @see
1063
+ * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
1064
+ * @see
1065
+ * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
1066
+ */
1067
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
1068
+ /*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
1069
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
1070
+ /*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
1071
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
812
1072
 
813
1073
  /******* Streaming *******/
1074
+ #ifndef XXH_NO_STREAM
814
1075
  /*
815
1076
  * Streaming requires state maintenance.
816
1077
  * This operation costs memory and CPU.
@@ -823,12 +1084,13 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_withSecret(const void* data, size_t le
823
1084
  * All reset and streaming functions have same meaning as their 64-bit counterpart.
824
1085
  */
825
1086
 
826
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH3_state_t* statePtr);
827
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed);
828
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize);
1087
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
1088
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
1089
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
829
1090
 
830
- XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH3_state_t* statePtr, const void* input, size_t length);
831
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
1091
+ XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
1092
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
1093
+ #endif /* !XXH_NO_STREAM */
832
1094
 
833
1095
  /* Following helper functions make it possible to compare XXH128_hast_t values.
834
1096
  * Since XXH128_hash_t is a structure, this capability is not offered by the language.
@@ -838,26 +1100,26 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* statePtr);
838
1100
  * XXH128_isEqual():
839
1101
  * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
840
1102
  */
841
- XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
1103
+ XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
842
1104
 
843
1105
  /*!
844
- * XXH128_cmp():
845
- *
1106
+ * @brief Compares two @ref XXH128_hash_t
846
1107
  * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
847
1108
  *
848
- * return: >0 if *h128_1 > *h128_2
849
- * =0 if *h128_1 == *h128_2
850
- * <0 if *h128_1 < *h128_2
1109
+ * @return: >0 if *h128_1 > *h128_2
1110
+ * =0 if *h128_1 == *h128_2
1111
+ * <0 if *h128_1 < *h128_2
851
1112
  */
852
- XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2);
1113
+ XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
853
1114
 
854
1115
 
855
1116
  /******* Canonical representation *******/
856
1117
  typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
857
- XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash);
858
- XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t* src);
1118
+ XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
1119
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
859
1120
 
860
1121
 
1122
+ #endif /* !XXH_NO_XXH3 */
861
1123
  #endif /* XXH_NO_LONG_LONG */
862
1124
 
863
1125
  /*!
@@ -898,13 +1160,10 @@ XXH_PUBLIC_API XXH128_hash_t XXH128_hashFromCanonical(const XXH128_canonical_t*
898
1160
  struct XXH32_state_s {
899
1161
  XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
900
1162
  XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
901
- XXH32_hash_t v1; /*!< First accumulator lane */
902
- XXH32_hash_t v2; /*!< Second accumulator lane */
903
- XXH32_hash_t v3; /*!< Third accumulator lane */
904
- XXH32_hash_t v4; /*!< Fourth accumulator lane */
1163
+ XXH32_hash_t v[4]; /*!< Accumulator lanes */
905
1164
  XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */
906
1165
  XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */
907
- XXH32_hash_t reserved; /*!< Reserved field. Do not read or write to it, it may be removed. */
1166
+ XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */
908
1167
  }; /* typedef'd to XXH32_state_t */
909
1168
 
910
1169
 
@@ -924,19 +1183,21 @@ struct XXH32_state_s {
924
1183
  */
925
1184
  struct XXH64_state_s {
926
1185
  XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */
927
- XXH64_hash_t v1; /*!< First accumulator lane */
928
- XXH64_hash_t v2; /*!< Second accumulator lane */
929
- XXH64_hash_t v3; /*!< Third accumulator lane */
930
- XXH64_hash_t v4; /*!< Fourth accumulator lane */
1186
+ XXH64_hash_t v[4]; /*!< Accumulator lanes */
931
1187
  XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */
932
1188
  XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */
933
1189
  XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/
934
- XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it, it may be removed. */
1190
+ XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */
935
1191
  }; /* typedef'd to XXH64_state_t */
936
1192
 
937
- #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */
1193
+ #ifndef XXH_NO_XXH3
1194
+
1195
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
938
1196
  # include <stdalign.h>
939
1197
  # define XXH_ALIGN(n) alignas(n)
1198
+ #elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
1199
+ /* In C++ alignas() is a keyword */
1200
+ # define XXH_ALIGN(n) alignas(n)
940
1201
  #elif defined(__GNUC__)
941
1202
  # define XXH_ALIGN(n) __attribute__ ((aligned(n)))
942
1203
  #elif defined(_MSC_VER)
@@ -947,6 +1208,7 @@ struct XXH64_state_s {
947
1208
 
948
1209
  /* Old GCC versions only accept the attribute after the type in structures. */
949
1210
  #if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \
1211
+ && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
950
1212
  && defined(__GNUC__)
951
1213
  # define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
952
1214
  #else
@@ -976,16 +1238,18 @@ struct XXH64_state_s {
976
1238
  * @brief Structure for XXH3 streaming API.
977
1239
  *
978
1240
  * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
979
- * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
980
- * an opaque type. This allows fields to safely be changed.
1241
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
1242
+ * Otherwise it is an opaque type.
1243
+ * Never use this definition in combination with a dynamic library.
1244
+ * This allows fields to safely be changed in the future.
981
1245
  *
982
- * @note **This structure has a strict alignment requirement of 64 bytes.** Do
983
- * not allocate this with `malloc()` or `new`, it will not be sufficiently
984
- * aligned. Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack
985
- * allocation.
1246
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
1247
+ * Do not allocate this with `malloc()` or `new`,
1248
+ * it will not be sufficiently aligned.
1249
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
986
1250
  *
987
1251
  * Typedef'd to @ref XXH3_state_t.
988
- * Do not access the members of this struct directly.
1252
+ * Never access the members of this struct directly.
989
1253
  *
990
1254
  * @see XXH3_INITSTATE() for stack initialization.
991
1255
  * @see XXH3_createState(), XXH3_freeState().
@@ -993,14 +1257,14 @@ struct XXH64_state_s {
993
1257
  */
994
1258
  struct XXH3_state_s {
995
1259
  XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
996
- /*!< The 8 accumulators. Similar to `vN` in @ref XXH32_state_s::v1 and @ref XXH64_state_s */
1260
+ /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
997
1261
  XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
998
1262
  /*!< Used to store a custom secret generated from a seed. */
999
1263
  XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
1000
1264
  /*!< The internal buffer. @see XXH32_state_s::mem32 */
1001
1265
  XXH32_hash_t bufferedSize;
1002
1266
  /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
1003
- XXH32_hash_t reserved32;
1267
+ XXH32_hash_t useSeed;
1004
1268
  /*!< Reserved field. Needed for padding on 64-bit. */
1005
1269
  size_t nbStripesSoFar;
1006
1270
  /*!< Number of stripes processed. */
@@ -1036,45 +1300,156 @@ struct XXH3_state_s {
1036
1300
  #define XXH3_INITSTATE(XXH3_state_ptr) { (XXH3_state_ptr)->seed = 0; }
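A short sketch of the documented stack-allocation pattern (assumes XXH_STATIC_LINKING_ONLY or XXH_INLINE_ALL so the full struct definition and its alignment are visible; the helper name is illustrative):

XXH64_hash_t hash_on_stack(const void* data, size_t len)
{
    XXH3_state_t state;              /* 64-byte aligned by its declaration */
    XXH3_INITSTATE(&state);          /* required before the first reset */
    XXH3_64bits_reset(&state);
    XXH3_64bits_update(&state, data, len);
    return XXH3_64bits_digest(&state);   /* no freeState() needed for stack states */
}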
1037
1301
 
1038
1302
 
1303
+ /*!
1304
+ * simple alias to pre-selected XXH3_128bits variant
1305
+ */
1306
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
1307
+
1308
+
1039
1309
  /* === Experimental API === */
1040
1310
  /* Symbols defined below must be considered tied to a specific library version. */
1041
1311
 
1042
- /*
1312
+ /*!
1043
1313
  * XXH3_generateSecret():
1044
1314
  *
1045
1315
  * Derive a high-entropy secret from any user-defined content, named customSeed.
1046
1316
  * The generated secret can be used in combination with `*_withSecret()` functions.
1047
- * The `_withSecret()` variants are useful to provide a higher level of protection than 64-bit seed,
1048
- * as it becomes much more difficult for an external actor to guess how to impact the calculation logic.
1317
+ * The `_withSecret()` variants are useful to provide a higher level of protection
1318
+ * than 64-bit seed, as it becomes much more difficult for an external actor to
1319
+ * guess how to impact the calculation logic.
1049
1320
  *
1050
1321
  * The function accepts as input a custom seed of any length and any content,
1051
- * and derives from it a high-entropy secret of length XXH3_SECRET_DEFAULT_SIZE
1052
- * into an already allocated buffer secretBuffer.
1053
- * The generated secret is _always_ XXH_SECRET_DEFAULT_SIZE bytes long.
1322
+ * and derives from it a high-entropy secret of length @p secretSize into an
1323
+ * already allocated buffer @p secretBuffer.
1054
1324
  *
1055
1325
  * The generated secret can then be used with any `*_withSecret()` variant.
1056
- * Functions `XXH3_128bits_withSecret()`, `XXH3_64bits_withSecret()`,
1057
- * `XXH3_128bits_reset_withSecret()` and `XXH3_64bits_reset_withSecret()`
1326
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
1327
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
1058
1328
  * are part of this list. They all accept a `secret` parameter
1059
- * which must be very long for implementation reasons (>= XXH3_SECRET_SIZE_MIN)
1329
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
1060
1330
  * _and_ feature very high entropy (consist of random-looking bytes).
1061
- * These conditions can be a high bar to meet, so
1062
- * this function can be used to generate a secret of proper quality.
1331
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
1332
+ * be employed to ensure proper quality.
1063
1333
  *
1064
- * customSeed can be anything. It can have any size, even small ones,
1065
- * and its content can be anything, even stupidly "low entropy" source such as a bunch of zeroes.
1066
- * The resulting `secret` will nonetheless provide all expected qualities.
1334
+ * @p customSeed can be anything. It can have any size, even small ones,
1335
+ * and its content can be anything, even "poor entropy" sources such as a bunch
1336
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
1337
+ *
1338
+ * @pre
1339
+ * - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
1340
+ * - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
1067
1341
  *
1068
- * Supplying NULL as the customSeed copies the default secret into `secretBuffer`.
1069
- * When customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
1342
+ * Example code:
1343
+ * @code{.c}
1344
+ * #include <stdio.h>
1345
+ * #include <stdlib.h>
1346
+ * #include <string.h>
1347
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
1348
+ * #include "xxhash.h"
1349
+ * // Hashes argv[2] using the entropy from argv[1].
1350
+ * int main(int argc, char* argv[])
1351
+ * {
1352
+ * char secret[XXH3_SECRET_SIZE_MIN];
1353
+ * if (argc != 3) { return 1; }
1354
+ * XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
1355
+ * XXH64_hash_t h = XXH3_64bits_withSecret(
1356
+ * argv[2], strlen(argv[2]),
1357
+ * secret, sizeof(secret)
1358
+ * );
1359
+ * printf("%016llx\n", (unsigned long long) h);
1360
+ * }
1361
+ * @endcode
1070
1362
  */
1071
- XXH_PUBLIC_API void XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize);
1072
-
1363
+ XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
1073
1364
 
1074
- /* simple short-cut to pre-selected XXH3_128bits variant */
1075
- XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t seed);
1365
+ /*!
1366
+ * @brief Generate the same secret as the _withSeed() variants.
1367
+ *
1368
+ * The generated secret can be used in combination with
1369
+ * `*_withSecret()` and `_withSecretandSeed()` variants.
1370
+ *
1371
+ * Example C++ `std::string` hash class:
1372
+ * @code{.cpp}
1373
+ * #include <string>
1374
+ * #define XXH_STATIC_LINKING_ONLY // expose unstable API
1375
+ * #include "xxhash.h"
1376
+ * // Slow, seeds each time
1377
+ * class HashSlow {
1378
+ * XXH64_hash_t seed;
1379
+ * public:
1380
+ * HashSlow(XXH64_hash_t s) : seed{s} {}
1381
+ * size_t operator()(const std::string& x) const {
1382
+ * return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
1383
+ * }
1384
+ * };
1385
+ * // Fast, caches the seeded secret for future uses.
1386
+ * class HashFast {
1387
+ * unsigned char secret[XXH3_SECRET_SIZE_MIN];
1388
+ * public:
1389
+ * HashFast(XXH64_hash_t s) {
1390
+ * XXH3_generateSecret_fromSeed(secret, s);
1391
+ * }
1392
+ * size_t operator()(const std::string& x) const {
1393
+ * return size_t{
1394
+ * XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
1395
+ * };
1396
+ * }
1397
+ * };
1398
+ * @endcode
1399
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
1400
+ * @param seed The seed to seed the state.
1401
+ */
1402
+ XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
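A plain-C counterpart of the class above, kept as a hedged sketch: the buffer is sized XXH3_SECRET_DEFAULT_SIZE so it is comfortably large for the generated secret, and the names are illustrative.

#define XXH_STATIC_LINKING_ONLY   /* expose the size macros and unstable API */
#include "xxhash.h"

static unsigned char g_secret[XXH3_SECRET_DEFAULT_SIZE];

void cache_secret(XXH64_hash_t seed)
{
    XXH3_generateSecret_fromSeed(g_secret, seed);   /* derive once, reuse many times */
}

XXH64_hash_t hash_with_cached_secret(const void* data, size_t len)
{
    return XXH3_64bits_withSecret(data, len, g_secret, sizeof(g_secret));
}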
1076
1403
 
1404
+ /*!
1405
+ * These variants generate hash values using either
1406
+ * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
1407
+ * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX).
1408
+ *
1409
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
1410
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
1411
+ * It's fast, but the cost can be perceptible for "not so large" keys (< 1 KB).
1412
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
1413
+ * which requires more instructions than _withSeed() variants.
1414
+ * Therefore, the _withSecretandSeed() variant combines the best of both worlds.
1415
+ *
1416
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
1417
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
1418
+ * hence offering only a pure speed benefit on "large" input,
1419
+ * by skipping the need to regenerate the secret for every large input.
1420
+ *
1421
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
1422
+ * for example with XXH3_64bits(), which then becomes the seed,
1423
+ * and then employ both the seed and the secret in _withSecretandSeed().
1424
+ * On top of speed, an added benefit is that each bit in the secret
1425
+ * has a 50% chance to flip each bit in the output, via its impact on the seed.
1426
+ *
1427
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
1428
+ * because only portions of the secret are employed for small data.
1429
+ */
1430
+ XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
1431
+ XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
1432
+ XXH_NOESCAPE const void* secret, size_t secretSize,
1433
+ XXH64_hash_t seed);
1434
+ /*! @copydoc XXH3_64bits_withSecretandSeed() */
1435
+ XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
1436
+ XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
1437
+ XXH_NOESCAPE const void* secret, size_t secretSize,
1438
+ XXH64_hash_t seed64);
1439
+ #ifndef XXH_NO_STREAM
1440
+ /*! @copydoc XXH3_64bits_withSecretandSeed() */
1441
+ XXH_PUBLIC_API XXH_errorcode
1442
+ XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
1443
+ XXH_NOESCAPE const void* secret, size_t secretSize,
1444
+ XXH64_hash_t seed64);
1445
+ /*! @copydoc XXH3_64bits_withSecretandSeed() */
1446
+ XXH_PUBLIC_API XXH_errorcode
1447
+ XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
1448
+ XXH_NOESCAPE const void* secret, size_t secretSize,
1449
+ XXH64_hash_t seed64);
1450
+ #endif /* !XXH_NO_STREAM */
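A small sketch of the scenario described above: derive the seeded secret once, then feed both the secret and the seed back in, so large inputs skip secret regeneration while the result stays identical to the plain _withSeed() call (assumes XXH_STATIC_LINKING_ONLY for the size macro; the function name is illustrative):

#include <assert.h>

void demo_secret_and_seed(const void* data, size_t len, XXH64_hash_t seed)
{
    unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
    XXH3_generateSecret_fromSeed(secret, seed);

    XXH64_hash_t a = XXH3_64bits_withSeed(data, len, seed);
    XXH64_hash_t b = XXH3_64bits_withSecretandSeed(data, len,
                                                   secret, sizeof(secret), seed);
    assert(a == b);   /* documented above to match exactly */
    (void)a; (void)b;
}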
1077
1451
 
1452
+ #endif /* !XXH_NO_XXH3 */
1078
1453
  #endif /* XXH_NO_LONG_LONG */
1079
1454
  #if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
1080
1455
  # define XXH_IMPLEMENTATION
@@ -1128,7 +1503,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1128
1503
  /*!
1129
1504
  * @brief Define this to disable 64-bit code.
1130
1505
  *
1131
- * Useful if only using the @ref xxh32_family and you have a strict C90 compiler.
1506
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
1132
1507
  */
1133
1508
  # define XXH_NO_LONG_LONG
1134
1509
  # undef XXH_NO_LONG_LONG /* don't actually */
@@ -1151,7 +1526,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1151
1526
  * Use `memcpy()`. Safe and portable. Note that most modern compilers will
1152
1527
  * eliminate the function call and treat it as an unaligned access.
1153
1528
  *
1154
- * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((packed))`
1529
+ * - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
1155
1530
  * @par
1156
1531
  * Depends on compiler extensions and is therefore not portable.
1157
1532
  * This method is safe _if_ your compiler supports it,
@@ -1178,22 +1553,40 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1178
1553
  * care, as what works on one compiler/platform/optimization level may cause
1179
1554
  * another to read garbage data or even crash.
1180
1555
  *
1181
- * See https://stackoverflow.com/a/32095106/646947 for details.
1556
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
1182
1557
  *
1183
1558
  * Prefer these methods in priority order (0 > 3 > 1 > 2)
1184
1559
  */
1185
1560
  # define XXH_FORCE_MEMORY_ACCESS 0
1561
+
1186
1562
  /*!
1187
- * @def XXH_ACCEPT_NULL_INPUT_POINTER
1188
- * @brief Whether to add explicit `NULL` checks.
1563
+ * @def XXH_SIZE_OPT
1564
+ * @brief Controls how much xxHash optimizes for size.
1189
1565
  *
1190
- * If the input pointer is `NULL` and the length is non-zero, xxHash's default
1191
- * behavior is to dereference it, triggering a segfault.
1566
+ * xxHash, when compiled, tends to result in a rather large binary size. This
1567
+ * is mostly due to heavy usage of forced inlining and constant folding of the
1568
+ * @ref XXH3_family to increase performance.
1192
1569
  *
1193
- * When this macro is enabled, xxHash actively checks the input for a null pointer.
1194
- * If it is, the result for null input pointers is the same as a zero-length input.
1570
+ * However, some developers prefer size over speed. This option can
1571
+ * significantly reduce the size of the generated code. When using the `-Os`
1572
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
1573
+ * otherwise it is defined to 0.
1574
+ *
1575
+ * Most of these size optimizations can be controlled manually.
1576
+ *
1577
+ * This is a number from 0-2.
1578
+ * - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
1579
+ * comes first.
1580
+ * - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
1581
+ * conservative and disables hacks that increase code size. It implies the
1582
+ * options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
1583
+ * and @ref XXH3_NEON_LANES == 8 if they are not already defined.
1584
+ * - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
1585
+ * Performance may cry. For example, the single shot functions just use the
1586
+ * streaming API.
1195
1587
  */
1196
- # define XXH_ACCEPT_NULL_INPUT_POINTER 0
1588
+ # define XXH_SIZE_OPT 0
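An illustrative build-time configuration sketch, grounded in the documentation above (the macros can equally be passed on the compiler command line, e.g. -DXXH_SIZE_OPT=2):

/* Favor code size over speed for an embedded build. */
#define XXH_SIZE_OPT 2     /* also disables inline hints and alignment checks */
#define XXH_NO_STREAM      /* optional: drop the streaming entry points as well */
#include "xxhash.h"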
1589
+
1197
1590
  /*!
1198
1591
  * @def XXH_FORCE_ALIGN_CHECK
1199
1592
  * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
@@ -1215,9 +1608,11 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1215
1608
  *
1216
1609
  * In these cases, the alignment check can be removed by setting this macro to 0.
1217
1610
  * Then the code will always use unaligned memory access.
1218
- * Align check is automatically disabled on x86, x64 & arm64,
1611
+ * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
1219
1612
  * which are platforms known to offer good unaligned memory accesses performance.
1220
1613
  *
1614
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
1615
+ *
1221
1616
  * This option does not affect XXH3 (only XXH32 and XXH64).
1222
1617
  */
1223
1618
  # define XXH_FORCE_ALIGN_CHECK 0
@@ -1239,24 +1634,22 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1239
1634
  * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
1240
1635
  * compiler full control on whether to inline or not.
1241
1636
  *
1242
- * When not optimizing (-O0), optimizing for size (-Os, -Oz), or using
1243
- * -fno-inline with GCC or Clang, this will automatically be defined.
1637
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
1638
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
1244
1639
  */
1245
1640
  # define XXH_NO_INLINE_HINTS 0
1246
1641
 
1247
1642
  /*!
1248
- * @def XXH_REROLL
1249
- * @brief Whether to reroll `XXH32_finalize` and `XXH64_finalize`.
1643
+ * @def XXH32_ENDJMP
1644
+ * @brief Whether to use a jump for `XXH32_finalize`.
1250
1645
  *
1251
- * For performance, `XXH32_finalize` and `XXH64_finalize` use an unrolled loop
1252
- * in the form of a switch statement.
1646
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
1647
+ * This is generally preferable for performance,
1648
+ * but depending on exact architecture, a jmp may be preferable.
1253
1649
  *
1254
- * This is not always desirable, as it generates larger code, and depending on
1255
- * the architecture, may even be slower
1256
- *
1257
- * This is automatically defined with `-Os`/`-Oz` on GCC and Clang.
1650
+ * This setting is only possibly making a difference for very small inputs.
1258
1651
  */
1259
- # define XXH_REROLL 0
1652
+ # define XXH32_ENDJMP 0
1260
1653
 
1261
1654
  /*!
1262
1655
  * @internal
@@ -1267,27 +1660,46 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1267
1660
  */
1268
1661
  # define XXH_OLD_NAMES
1269
1662
  # undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
1663
+
1664
+ /*!
1665
+ * @def XXH_NO_STREAM
1666
+ * @brief Disables the streaming API.
1667
+ *
1668
+ * When xxHash is not inlined and the streaming functions are not used, disabling
1669
+ * the streaming functions can improve code size significantly, especially with
1670
+ * the @ref XXH3_family which tends to make constant folded copies of itself.
1671
+ */
1672
+ # define XXH_NO_STREAM
1673
+ # undef XXH_NO_STREAM /* don't actually */
1270
1674
  #endif /* XXH_DOXYGEN */
1271
1675
  /*!
1272
1676
  * @}
1273
1677
  */
1274
1678
 
1275
1679
  #ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
1276
- /* prefer __packed__ structures (method 1) for gcc on armv7 and armv8 */
1277
- # if !defined(__clang__) && ( \
1278
- (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
1279
- (defined(__GNUC__) && (defined(__ARM_ARCH) && __ARM_ARCH >= 7)) )
1680
+ /* prefer __packed__ structures (method 1) for GCC
1681
+ * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
1682
+ * which for some reason does unaligned loads. */
1683
+ # if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
1280
1684
  # define XXH_FORCE_MEMORY_ACCESS 1
1281
1685
  # endif
1282
1686
  #endif
1283
1687
 
1284
- #ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
1285
- # define XXH_ACCEPT_NULL_INPUT_POINTER 0
1688
+ #ifndef XXH_SIZE_OPT
1689
+ /* default to 1 for -Os or -Oz */
1690
+ # if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
1691
+ # define XXH_SIZE_OPT 1
1692
+ # else
1693
+ # define XXH_SIZE_OPT 0
1694
+ # endif
1286
1695
  #endif
1287
1696
 
1288
1697
  #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
1289
- # if defined(__i386) || defined(__x86_64__) || defined(__aarch64__) \
1290
- || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) /* visual */
1698
+ /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
1699
+ # if XXH_SIZE_OPT >= 1 || \
1700
+ defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
1701
+ || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) \
1702
+ || defined(__loongarch64) /* visual */
1291
1703
  # define XXH_FORCE_ALIGN_CHECK 0
1292
1704
  # else
1293
1705
  # define XXH_FORCE_ALIGN_CHECK 1
@@ -1295,20 +1707,16 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1295
1707
  #endif
1296
1708
 
1297
1709
  #ifndef XXH_NO_INLINE_HINTS
1298
- # if defined(__OPTIMIZE_SIZE__) /* -Os, -Oz */ \
1299
- || defined(__NO_INLINE__) /* -O0, -fno-inline */
1710
+ # if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__) /* -O0, -fno-inline */
1300
1711
  # define XXH_NO_INLINE_HINTS 1
1301
1712
  # else
1302
1713
  # define XXH_NO_INLINE_HINTS 0
1303
1714
  # endif
1304
1715
  #endif
1305
1716
 
1306
- #ifndef XXH_REROLL
1307
- # if defined(__OPTIMIZE_SIZE__)
1308
- # define XXH_REROLL 1
1309
- # else
1310
- # define XXH_REROLL 0
1311
- # endif
1717
+ #ifndef XXH32_ENDJMP
1718
+ /* generally preferable for performance */
1719
+ # define XXH32_ENDJMP 0
1312
1720
  #endif
1313
1721
 
1314
1722
  /*!
@@ -1320,6 +1728,24 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1320
1728
  /* *************************************
1321
1729
  * Includes & Memory related functions
1322
1730
  ***************************************/
1731
+ #if defined(XXH_NO_STREAM)
1732
+ /* nothing */
1733
+ #elif defined(XXH_NO_STDLIB)
1734
+
1735
+ /* When requesting to disable any mention of stdlib,
1736
+ * the library loses the ability to invoke malloc() / free().
1737
+ * In practice, it means that functions like `XXH*_createState()`
1738
+ * will always fail, and return NULL.
1739
+ * This flag is useful in situations where
1740
+ * xxhash.h is integrated into some kernel, embedded or limited environment
1741
+ * without access to dynamic allocation.
1742
+ */
1743
+
1744
+ static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
1745
+ static void XXH_free(void* p) { (void)p; }
1746
+
1747
+ #else
1748
+
1323
1749
  /*
1324
1750
  * Modify the local functions below should you wish to use
1325
1751
  * different memory routines for malloc() and free()
@@ -1330,7 +1756,7 @@ XXH_PUBLIC_API XXH128_hash_t XXH128(const void* data, size_t len, XXH64_hash_t s
1330
1756
  * @internal
1331
1757
  * @brief Modify this function to use a different routine than malloc().
1332
1758
  */
1333
- static void* XXH_malloc(size_t s) { return malloc(s); }
1759
+ static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
1334
1760
 
1335
1761
  /*!
1336
1762
  * @internal
@@ -1338,6 +1764,8 @@ static void* XXH_malloc(size_t s) { return malloc(s); }
1338
1764
  */
1339
1765
  static void XXH_free(void* p) { free(p); }
1340
1766
 
1767
+ #endif /* XXH_NO_STDLIB */
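The caller-visible consequence, sketched below with an illustrative helper name: under XXH_NO_STDLIB the stubs above make every heap-based state creation fail, so code that must keep working should check for NULL (or use stack states via XXH3_INITSTATE()).

XXH_errorcode hash_stream_checked(const void* data, size_t len, XXH64_hash_t* out)
{
    XXH64_state_t* st = XXH64_createState();   /* NULL whenever XXH_NO_STDLIB is set */
    if (st == NULL) return XXH_ERROR;
    XXH64_reset(st, 0);
    XXH64_update(st, data, len);
    *out = XXH64_digest(st);
    XXH64_freeState(st);
    return XXH_OK;
}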
1768
+
1341
1769
  #include <string.h>
1342
1770
 
1343
1771
  /*!
@@ -1360,19 +1788,19 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1360
1788
  #endif
1361
1789
 
1362
1790
  #if XXH_NO_INLINE_HINTS /* disable inlining hints */
1363
- # if defined(__GNUC__)
1791
+ # if defined(__GNUC__) || defined(__clang__)
1364
1792
  # define XXH_FORCE_INLINE static __attribute__((unused))
1365
1793
  # else
1366
1794
  # define XXH_FORCE_INLINE static
1367
1795
  # endif
1368
1796
  # define XXH_NO_INLINE static
1369
1797
  /* enable inlining hints */
1798
+ #elif defined(__GNUC__) || defined(__clang__)
1799
+ # define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
1800
+ # define XXH_NO_INLINE static __attribute__((noinline))
1370
1801
  #elif defined(_MSC_VER) /* Visual Studio */
1371
1802
  # define XXH_FORCE_INLINE static __forceinline
1372
1803
  # define XXH_NO_INLINE static __declspec(noinline)
1373
- #elif defined(__GNUC__)
1374
- # define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused))
1375
- # define XXH_NO_INLINE static __attribute__((noinline))
1376
1804
  #elif defined (__cplusplus) \
1377
1805
  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */
1378
1806
  # define XXH_FORCE_INLINE static inline
@@ -1407,11 +1835,20 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1407
1835
  # include <assert.h> /* note: can still be disabled with NDEBUG */
1408
1836
  # define XXH_ASSERT(c) assert(c)
1409
1837
  #else
1410
- # define XXH_ASSERT(c) ((void)0)
1838
+ # define XXH_ASSERT(c) XXH_ASSUME(c)
1411
1839
  #endif
1412
1840
 
1413
1841
  /* note: use after variable declarations */
1414
- #define XXH_STATIC_ASSERT(c) do { enum { XXH_sa = 1/(int)(!!(c)) }; } while (0)
1842
+ #ifndef XXH_STATIC_ASSERT
1843
+ # if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11 */
1844
+ # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
1845
+ # elif defined(__cplusplus) && (__cplusplus >= 201103L) /* C++11 */
1846
+ # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
1847
+ # else
1848
+ # define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
1849
+ # endif
1850
+ # define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
1851
+ #endif
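Usage is identical across all three branches; a tiny illustration of the pre-C11 fallback's failure mode (the demo function is not part of the library):

static void static_assert_demo(void)
{
    int unused = 0;                                  /* "use after variable declarations" */
    XXH_STATIC_ASSERT(sizeof(XXH32_hash_t) == 4);    /* true: compiles in every branch */
    /* XXH_STATIC_ASSERT(sizeof(XXH32_hash_t) == 8);    false: negative array size (or a
       _Static_assert/static_assert failure) stops the build */
    (void)unused;
}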
1415
1852
 
1416
1853
  /*!
1417
1854
  * @internal
@@ -1429,12 +1866,18 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size)
1429
1866
  * We also use it to prevent unwanted constant folding for AArch64 in
1430
1867
  * XXH3_initCustomSecret_scalar().
1431
1868
  */
1432
- #ifdef __GNUC__
1869
+ #if defined(__GNUC__) || defined(__clang__)
1433
1870
  # define XXH_COMPILER_GUARD(var) __asm__ __volatile__("" : "+r" (var))
1434
1871
  #else
1435
1872
  # define XXH_COMPILER_GUARD(var) ((void)0)
1436
1873
  #endif
1437
1874
 
1875
+ #if defined(__GNUC__) || defined(__clang__)
1876
+ # define XXH_COMPILER_GUARD_W(var) __asm__ __volatile__("" : "+w" (var))
1877
+ #else
1878
+ # define XXH_COMPILER_GUARD_W(var) ((void)0)
1879
+ #endif
1880
+
1438
1881
  /* *************************************
1439
1882
  * Basic Types
1440
1883
  ***************************************/
@@ -1522,30 +1965,31 @@ static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr;
1522
1965
  #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
1523
1966
 
1524
1967
  /*
1525
- * __pack instructions are safer but compiler specific, hence potentially
1526
- * problematic for some compilers.
1527
- *
1528
- * Currently only defined for GCC and ICC.
1968
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
1969
+ * documentation claimed that it only increased the alignment, but actually it
1970
+ * can decrease it on gcc, clang, and icc:
1971
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
1972
+ * https://gcc.godbolt.org/z/xYez1j67Y.
1529
1973
  */
1530
1974
  #ifdef XXH_OLD_NAMES
1531
1975
  typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
1532
1976
  #endif
1533
1977
  static xxh_u32 XXH_read32(const void* ptr)
1534
1978
  {
1535
- typedef union { xxh_u32 u32; } __attribute__((packed)) xxh_unalign;
1536
- return ((const xxh_unalign*)ptr)->u32;
1979
+ typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;
1980
+ return *((const xxh_unalign32*)ptr);
1537
1981
  }
1538
1982
 
1539
1983
  #else
1540
1984
 
1541
1985
  /*
1542
1986
  * Portable and safe solution. Generally efficient.
1543
- * see: https://stackoverflow.com/a/32095106/646947
1987
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
1544
1988
  */
1545
1989
  static xxh_u32 XXH_read32(const void* memPtr)
1546
1990
  {
1547
1991
  xxh_u32 val;
1548
- memcpy(&val, memPtr, sizeof(val));
1992
+ XXH_memcpy(&val, memPtr, sizeof(val));
1549
1993
  return val;
1550
1994
  }
1551
1995
 
@@ -1553,6 +1997,7 @@ static xxh_u32 XXH_read32(const void* memPtr)
1553
1997
 
1554
1998
 
1555
1999
  /* *** Endianness *** */
2000
+
1556
2001
  /*!
1557
2002
  * @ingroup tuning
1558
2003
  * @def XXH_CPU_LITTLE_ENDIAN
@@ -1561,8 +2006,8 @@ static xxh_u32 XXH_read32(const void* memPtr)
1561
2006
  * Defined to 1 if the target is little endian, or 0 if it is big endian.
1562
2007
  * It can be defined externally, for example on the compiler command line.
1563
2008
  *
1564
- * If it is not defined, a runtime check (which is usually constant folded)
1565
- * is used instead.
2009
+ * If it is not defined,
2010
+ * a runtime check (which is usually constant folded) is used instead.
1566
2011
  *
1567
2012
  * @note
1568
2013
  * This is not necessarily defined to an integer constant.
@@ -1615,6 +2060,29 @@ static int XXH_isLittleEndian(void)
1615
2060
  # define XXH_HAS_BUILTIN(x) 0
1616
2061
  #endif
1617
2062
 
2063
+
2064
+ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L)
2065
+ /* C23 and future versions have standard "unreachable()" */
2066
+ # include <stddef.h>
2067
+ # define XXH_UNREACHABLE() unreachable()
2068
+
2069
+ #elif defined(__cplusplus) && (__cplusplus > 202002L)
2070
+ /* C++23 and future versions have std::unreachable() */
2071
+ # include <utility> /* std::unreachable() */
2072
+ # define XXH_UNREACHABLE() std::unreachable()
2073
+
2074
+ #elif XXH_HAS_BUILTIN(__builtin_unreachable)
2075
+ # define XXH_UNREACHABLE() __builtin_unreachable()
2076
+
2077
+ #elif defined(_MSC_VER)
2078
+ # define XXH_UNREACHABLE() __assume(0)
2079
+
2080
+ #else
2081
+ # define XXH_UNREACHABLE()
2082
+ #endif
2083
+
2084
+ #define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
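Illustrative only (not part of the library): XXH_ASSUME() hands the optimizer a fact it may rely on, so a guarded branch can be folded away in release builds while remaining well-defined whenever the promise holds.

static int pick_lane(const int lanes[4], size_t i)
{
    XXH_ASSUME(i < 4);          /* promise: i is always a valid lane index */
    if (i >= 4) return 0;       /* dead under the assumption; compilers may drop it */
    return lanes[i];
}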
2085
+
1618
2086
  /*!
1619
2087
  * @internal
1620
2088
  * @def XXH_rotl32(x,r)
@@ -1737,8 +2205,10 @@ XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
1737
2205
  *********************************************************************/
1738
2206
  /*!
1739
2207
  * @}
1740
- * @defgroup xxh32_impl XXH32 implementation
2208
+ * @defgroup XXH32_impl XXH32 implementation
1741
2209
  * @ingroup impl
2210
+ *
2211
+ * Details on the XXH32 implementation.
1742
2212
  * @{
1743
2213
  */
1744
2214
  /* #define instead of static const, to be used as initializers */
@@ -1818,17 +2288,17 @@ static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
1818
2288
  * The final mix ensures that all input bits have a chance to impact any bit in
1819
2289
  * the output digest, resulting in an unbiased distribution.
1820
2290
  *
1821
- * @param h32 The hash to avalanche.
2291
+ * @param hash The hash to avalanche.
1822
2292
  * @return The avalanched hash.
1823
2293
  */
1824
- static xxh_u32 XXH32_avalanche(xxh_u32 h32)
2294
+ static xxh_u32 XXH32_avalanche(xxh_u32 hash)
1825
2295
  {
1826
- h32 ^= h32 >> 15;
1827
- h32 *= XXH_PRIME32_2;
1828
- h32 ^= h32 >> 13;
1829
- h32 *= XXH_PRIME32_3;
1830
- h32 ^= h32 >> 16;
1831
- return(h32);
2296
+ hash ^= hash >> 15;
2297
+ hash *= XXH_PRIME32_2;
2298
+ hash ^= hash >> 13;
2299
+ hash *= XXH_PRIME32_3;
2300
+ hash ^= hash >> 16;
2301
+ return hash;
1832
2302
  }
1833
2303
 
1834
2304
  #define XXH_get32bits(p) XXH_readLE32_align(p, align)
@@ -1841,28 +2311,31 @@ static xxh_u32 XXH32_avalanche(xxh_u32 h32)
1841
2311
  * This final stage will digest them to ensure that all input bytes are present
1842
2312
  * in the final mix.
1843
2313
  *
1844
- * @param h32 The hash to finalize.
2314
+ * @param hash The hash to finalize.
1845
2315
  * @param ptr The pointer to the remaining input.
1846
2316
  * @param len The remaining length, modulo 16.
1847
2317
  * @param align Whether @p ptr is aligned.
1848
2318
  * @return The finalized hash.
2319
+ * @see XXH64_finalize().
1849
2320
  */
1850
- static xxh_u32
1851
- XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
2321
+ static XXH_PUREF xxh_u32
2322
+ XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
1852
2323
  {
1853
- #define XXH_PROCESS1 do { \
1854
- h32 += (*ptr++) * XXH_PRIME32_5; \
1855
- h32 = XXH_rotl32(h32, 11) * XXH_PRIME32_1; \
2324
+ #define XXH_PROCESS1 do { \
2325
+ hash += (*ptr++) * XXH_PRIME32_5; \
2326
+ hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \
1856
2327
  } while (0)
1857
2328
 
1858
- #define XXH_PROCESS4 do { \
1859
- h32 += XXH_get32bits(ptr) * XXH_PRIME32_3; \
1860
- ptr += 4; \
1861
- h32 = XXH_rotl32(h32, 17) * XXH_PRIME32_4; \
2329
+ #define XXH_PROCESS4 do { \
2330
+ hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \
2331
+ ptr += 4; \
2332
+ hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \
1862
2333
  } while (0)
1863
2334
 
1864
- /* Compact rerolled version */
1865
- if (XXH_REROLL) {
2335
+ if (ptr==NULL) XXH_ASSERT(len == 0);
2336
+
2337
+ /* Compact rerolled version; generally faster */
2338
+ if (!XXH32_ENDJMP) {
1866
2339
  len &= 15;
1867
2340
  while (len >= 4) {
1868
2341
  XXH_PROCESS4;
@@ -1872,49 +2345,49 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
1872
2345
  XXH_PROCESS1;
1873
2346
  --len;
1874
2347
  }
1875
- return XXH32_avalanche(h32);
2348
+ return XXH32_avalanche(hash);
1876
2349
  } else {
1877
2350
  switch(len&15) /* or switch(bEnd - p) */ {
1878
2351
  case 12: XXH_PROCESS4;
1879
- FALLTHROUGH_INTENDED;
2352
+ XXH_FALLTHROUGH; /* fallthrough */
1880
2353
  case 8: XXH_PROCESS4;
1881
- FALLTHROUGH_INTENDED;
2354
+ XXH_FALLTHROUGH; /* fallthrough */
1882
2355
  case 4: XXH_PROCESS4;
1883
- return XXH32_avalanche(h32);
2356
+ return XXH32_avalanche(hash);
1884
2357
 
1885
2358
  case 13: XXH_PROCESS4;
1886
- FALLTHROUGH_INTENDED;
2359
+ XXH_FALLTHROUGH; /* fallthrough */
1887
2360
  case 9: XXH_PROCESS4;
1888
- FALLTHROUGH_INTENDED;
2361
+ XXH_FALLTHROUGH; /* fallthrough */
1889
2362
  case 5: XXH_PROCESS4;
1890
2363
  XXH_PROCESS1;
1891
- return XXH32_avalanche(h32);
2364
+ return XXH32_avalanche(hash);
1892
2365
 
1893
2366
  case 14: XXH_PROCESS4;
1894
- FALLTHROUGH_INTENDED;
2367
+ XXH_FALLTHROUGH; /* fallthrough */
1895
2368
  case 10: XXH_PROCESS4;
1896
- FALLTHROUGH_INTENDED;
2369
+ XXH_FALLTHROUGH; /* fallthrough */
1897
2370
  case 6: XXH_PROCESS4;
1898
2371
  XXH_PROCESS1;
1899
2372
  XXH_PROCESS1;
1900
- return XXH32_avalanche(h32);
2373
+ return XXH32_avalanche(hash);
1901
2374
 
1902
2375
  case 15: XXH_PROCESS4;
1903
- FALLTHROUGH_INTENDED;
2376
+ XXH_FALLTHROUGH; /* fallthrough */
1904
2377
  case 11: XXH_PROCESS4;
1905
- FALLTHROUGH_INTENDED;
2378
+ XXH_FALLTHROUGH; /* fallthrough */
1906
2379
  case 7: XXH_PROCESS4;
1907
- FALLTHROUGH_INTENDED;
2380
+ XXH_FALLTHROUGH; /* fallthrough */
1908
2381
  case 3: XXH_PROCESS1;
1909
- FALLTHROUGH_INTENDED;
2382
+ XXH_FALLTHROUGH; /* fallthrough */
1910
2383
  case 2: XXH_PROCESS1;
1911
- FALLTHROUGH_INTENDED;
2384
+ XXH_FALLTHROUGH; /* fallthrough */
1912
2385
  case 1: XXH_PROCESS1;
1913
- FALLTHROUGH_INTENDED;
1914
- case 0: return XXH32_avalanche(h32);
2386
+ XXH_FALLTHROUGH; /* fallthrough */
2387
+ case 0: return XXH32_avalanche(hash);
1915
2388
  }
1916
2389
  XXH_ASSERT(0);
1917
- return h32; /* reaching this point is deemed impossible */
2390
+ return hash; /* reaching this point is deemed impossible */
1918
2391
  }
1919
2392
  }
1920
2393
 
@@ -1930,24 +2403,19 @@ XXH32_finalize(xxh_u32 h32, const xxh_u8* ptr, size_t len, XXH_alignment align)
1930
2403
  * @internal
1931
2404
  * @brief The implementation for @ref XXH32().
1932
2405
  *
1933
- * @param input, len, seed Directly passed from @ref XXH32().
2406
+ * @param input , len , seed Directly passed from @ref XXH32().
1934
2407
  * @param align Whether @p input is aligned.
1935
2408
  * @return The calculated hash.
1936
2409
  */
1937
- XXH_FORCE_INLINE xxh_u32
2410
+ XXH_FORCE_INLINE XXH_PUREF xxh_u32
1938
2411
  XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
1939
2412
  {
1940
- const xxh_u8* bEnd = input ? input + len : NULL;
1941
2413
  xxh_u32 h32;
1942
2414
 
1943
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
1944
- if (input==NULL) {
1945
- len=0;
1946
- bEnd=input=(const xxh_u8*)(size_t)16;
1947
- }
1948
- #endif
2415
+ if (input==NULL) XXH_ASSERT(len == 0);
1949
2416
 
1950
2417
  if (len>=16) {
2418
+ const xxh_u8* const bEnd = input + len;
1951
2419
  const xxh_u8* const limit = bEnd - 15;
1952
2420
  xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
1953
2421
  xxh_u32 v2 = seed + XXH_PRIME32_2;
@@ -1972,10 +2440,10 @@ XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment
1972
2440
  return XXH32_finalize(h32, input, len&15, align);
1973
2441
  }
1974
2442
 
1975
- /*! @ingroup xxh32_family */
2443
+ /*! @ingroup XXH32_family */
1976
2444
  XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
1977
2445
  {
1978
- #if 0
2446
+ #if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
1979
2447
  /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
1980
2448
  XXH32_state_t state;
1981
2449
  XXH32_reset(&state, seed);
@@ -1994,51 +2462,46 @@ XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t s
1994
2462
 
1995
2463
 
1996
2464
  /******* Hash streaming *******/
1997
- /*!
1998
- * @ingroup xxh32_family
1999
- */
2465
+ #ifndef XXH_NO_STREAM
2466
+ /*! @ingroup XXH32_family */
2000
2467
  XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
2001
2468
  {
2002
2469
  return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
2003
2470
  }
2004
- /*! @ingroup xxh32_family */
2471
+ /*! @ingroup XXH32_family */
2005
2472
  XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
2006
2473
  {
2007
2474
  XXH_free(statePtr);
2008
2475
  return XXH_OK;
2009
2476
  }
2010
2477
 
2011
- /*! @ingroup xxh32_family */
2478
+ /*! @ingroup XXH32_family */
2012
2479
  XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
2013
2480
  {
2014
- memcpy(dstState, srcState, sizeof(*dstState));
2481
+ XXH_memcpy(dstState, srcState, sizeof(*dstState));
2015
2482
  }
2016
2483
 
2017
- /*! @ingroup xxh32_family */
2484
+ /*! @ingroup XXH32_family */
2018
2485
  XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
2019
2486
  {
2020
- XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
2021
- memset(&state, 0, sizeof(state));
2022
- state.v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2023
- state.v2 = seed + XXH_PRIME32_2;
2024
- state.v3 = seed + 0;
2025
- state.v4 = seed - XXH_PRIME32_1;
2026
- /* do not write into reserved, planned to be removed in a future version */
2027
- memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
2487
+ XXH_ASSERT(statePtr != NULL);
2488
+ memset(statePtr, 0, sizeof(*statePtr));
2489
+ statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
2490
+ statePtr->v[1] = seed + XXH_PRIME32_2;
2491
+ statePtr->v[2] = seed + 0;
2492
+ statePtr->v[3] = seed - XXH_PRIME32_1;
2028
2493
  return XXH_OK;
2029
2494
  }
2030
2495
 
2031
2496
 
2032
- /*! @ingroup xxh32_family */
2497
+ /*! @ingroup XXH32_family */
2033
2498
  XXH_PUBLIC_API XXH_errorcode
2034
2499
  XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2035
2500
  {
2036
- if (input==NULL)
2037
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2501
+ if (input==NULL) {
2502
+ XXH_ASSERT(len == 0);
2038
2503
  return XXH_OK;
2039
- #else
2040
- return XXH_ERROR;
2041
- #endif
2504
+ }
2042
2505
 
2043
2506
  { const xxh_u8* p = (const xxh_u8*)input;
2044
2507
  const xxh_u8* const bEnd = p + len;
@@ -2055,35 +2518,25 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2055
2518
  if (state->memsize) { /* some data left from previous update */
2056
2519
  XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize);
2057
2520
  { const xxh_u32* p32 = state->mem32;
2058
- state->v1 = XXH32_round(state->v1, XXH_readLE32(p32)); p32++;
2059
- state->v2 = XXH32_round(state->v2, XXH_readLE32(p32)); p32++;
2060
- state->v3 = XXH32_round(state->v3, XXH_readLE32(p32)); p32++;
2061
- state->v4 = XXH32_round(state->v4, XXH_readLE32(p32));
2521
+ state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++;
2522
+ state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++;
2523
+ state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++;
2524
+ state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32));
2062
2525
  }
2063
2526
  p += 16-state->memsize;
2064
2527
  state->memsize = 0;
2065
2528
  }
2066
2529
 
2067
- /* uintptr_t casts avoid UB or compiler warning on out-of-bounds
2068
- * pointer arithmetic */
2069
- if ((uintptr_t)p <= (uintptr_t)bEnd - 16) {
2070
- const uintptr_t limit = (uintptr_t)bEnd - 16;
2071
- xxh_u32 v1 = state->v1;
2072
- xxh_u32 v2 = state->v2;
2073
- xxh_u32 v3 = state->v3;
2074
- xxh_u32 v4 = state->v4;
2530
+ if (p <= bEnd-16) {
2531
+ const xxh_u8* const limit = bEnd - 16;
2075
2532
 
2076
2533
  do {
2077
- v1 = XXH32_round(v1, XXH_readLE32(p)); p+=4;
2078
- v2 = XXH32_round(v2, XXH_readLE32(p)); p+=4;
2079
- v3 = XXH32_round(v3, XXH_readLE32(p)); p+=4;
2080
- v4 = XXH32_round(v4, XXH_readLE32(p)); p+=4;
2081
- } while ((uintptr_t)p<=limit);
2082
-
2083
- state->v1 = v1;
2084
- state->v2 = v2;
2085
- state->v3 = v3;
2086
- state->v4 = v4;
2534
+ state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4;
2535
+ state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4;
2536
+ state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4;
2537
+ state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4;
2538
+ } while (p<=limit);
2539
+
2087
2540
  }
2088
2541
 
2089
2542
  if (p < bEnd) {
@@ -2096,30 +2549,30 @@ XXH32_update(XXH32_state_t* state, const void* input, size_t len)
2096
2549
  }
2097
2550
 
2098
2551
 
2099
- /*! @ingroup xxh32_family */
2552
+ /*! @ingroup XXH32_family */
2100
2553
  XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
2101
2554
  {
2102
2555
  xxh_u32 h32;
2103
2556
 
2104
2557
  if (state->large_len) {
2105
- h32 = XXH_rotl32(state->v1, 1)
2106
- + XXH_rotl32(state->v2, 7)
2107
- + XXH_rotl32(state->v3, 12)
2108
- + XXH_rotl32(state->v4, 18);
2558
+ h32 = XXH_rotl32(state->v[0], 1)
2559
+ + XXH_rotl32(state->v[1], 7)
2560
+ + XXH_rotl32(state->v[2], 12)
2561
+ + XXH_rotl32(state->v[3], 18);
2109
2562
  } else {
2110
- h32 = state->v3 /* == seed */ + XXH_PRIME32_5;
2563
+ h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
2111
2564
  }
2112
2565
 
2113
2566
  h32 += state->total_len_32;
2114
2567
 
2115
2568
  return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
2116
2569
  }
2117
-
2570
+ #endif /* !XXH_NO_STREAM */
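A minimal sketch of the streaming lifecycle implemented above, with an illustrative helper name; it yields the same value as a single XXH32() call over the concatenated input:

XXH32_hash_t xxh32_in_chunks(const void* p1, size_t n1,
                             const void* p2, size_t n2,
                             XXH32_hash_t seed)
{
    XXH32_hash_t h = 0;
    XXH32_state_t* st = XXH32_createState();
    if (st == NULL) return h;                 /* allocation failed (or XXH_NO_STDLIB) */
    XXH32_reset(st, seed);
    XXH32_update(st, p1, n1);
    XXH32_update(st, p2, n2);
    h = XXH32_digest(st);
    XXH32_freeState(st);
    return h;
}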
2118
2571
 
2119
2572
  /******* Canonical representation *******/
2120
2573
 
2121
2574
  /*!
2122
- * @ingroup xxh32_family
2575
+ * @ingroup XXH32_family
2123
2576
  * The default return values from XXH functions are unsigned 32 and 64 bit
2124
2577
  * integers.
2125
2578
  *
@@ -2136,9 +2589,9 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t
2136
2589
  {
2137
2590
  XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
2138
2591
  if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
2139
- memcpy(dst, &hash, sizeof(*dst));
2592
+ XXH_memcpy(dst, &hash, sizeof(*dst));
2140
2593
  }
2141
- /*! @ingroup xxh32_family */
2594
+ /*! @ingroup XXH32_family */
2142
2595
  XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
2143
2596
  {
2144
2597
  return XXH_readBE32(src);
@@ -2179,30 +2632,31 @@ static xxh_u64 XXH_read64(const void* memPtr)
2179
2632
  #elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
2180
2633
 
2181
2634
  /*
2182
- * __pack instructions are safer, but compiler specific, hence potentially
2183
- * problematic for some compilers.
2184
- *
2185
- * Currently only defined for GCC and ICC.
2635
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
2636
+ * documentation claimed that it only increased the alignment, but actually it
2637
+ * can decrease it on gcc, clang, and icc:
2638
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
2639
+ * https://gcc.godbolt.org/z/xYez1j67Y.
2186
2640
  */
2187
2641
  #ifdef XXH_OLD_NAMES
2188
2642
  typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
2189
2643
  #endif
2190
2644
  static xxh_u64 XXH_read64(const void* ptr)
2191
2645
  {
2192
- typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) xxh_unalign64;
2193
- return ((const xxh_unalign64*)ptr)->u64;
2646
+ typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;
2647
+ return *((const xxh_unalign64*)ptr);
2194
2648
  }
2195
2649
 
2196
2650
  #else
2197
2651
 
2198
2652
  /*
2199
2653
  * Portable and safe solution. Generally efficient.
2200
- * see: https://stackoverflow.com/a/32095106/646947
2654
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
2201
2655
  */
2202
2656
  static xxh_u64 XXH_read64(const void* memPtr)
2203
2657
  {
2204
2658
  xxh_u64 val;
2205
- memcpy(&val, memPtr, sizeof(val));
2659
+ XXH_memcpy(&val, memPtr, sizeof(val));
2206
2660
  return val;
2207
2661
  }
2208
2662
 
@@ -2281,8 +2735,10 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align)
2281
2735
  /******* xxh64 *******/
2282
2736
  /*!
2283
2737
  * @}
2284
- * @defgroup xxh64_impl XXH64 implementation
2738
+ * @defgroup XXH64_impl XXH64 implementation
2285
2739
  * @ingroup impl
2740
+ *
2741
+ * Details on the XXH64 implementation.
2286
2742
  * @{
2287
2743
  */
2288
2744
  /* #define rather that static const, to be used as initializers */
@@ -2300,6 +2756,7 @@ XXH_readLE64_align(const void* ptr, XXH_alignment align)
2300
2756
  # define PRIME64_5 XXH_PRIME64_5
2301
2757
  #endif
2302
2758
 
2759
+ /*! @copydoc XXH32_round */
2303
2760
  static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
2304
2761
  {
2305
2762
  acc += input * XXH_PRIME64_2;
@@ -2316,42 +2773,59 @@ static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
2316
2773
  return acc;
2317
2774
  }
2318
2775
 
2319
- static xxh_u64 XXH64_avalanche(xxh_u64 h64)
2776
+ /*! @copydoc XXH32_avalanche */
2777
+ static xxh_u64 XXH64_avalanche(xxh_u64 hash)
2320
2778
  {
2321
- h64 ^= h64 >> 33;
2322
- h64 *= XXH_PRIME64_2;
2323
- h64 ^= h64 >> 29;
2324
- h64 *= XXH_PRIME64_3;
2325
- h64 ^= h64 >> 32;
2326
- return h64;
2779
+ hash ^= hash >> 33;
2780
+ hash *= XXH_PRIME64_2;
2781
+ hash ^= hash >> 29;
2782
+ hash *= XXH_PRIME64_3;
2783
+ hash ^= hash >> 32;
2784
+ return hash;
2327
2785
  }
2328
2786
 
2329
2787
 
2330
2788
  #define XXH_get64bits(p) XXH_readLE64_align(p, align)
2331
2789
 
2332
- static xxh_u64
2333
- XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
2334
- {
2335
- len &= 31;
2336
- while (len >= 8) {
2337
- xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
2338
- ptr += 8;
2339
- h64 ^= k1;
2340
- h64 = XXH_rotl64(h64,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
2790
+ /*!
2791
+ * @internal
2792
+ * @brief Processes the last 0-31 bytes of @p ptr.
2793
+ *
2794
+ * There may be up to 31 bytes remaining to consume from the input.
2795
+ * This final stage will digest them to ensure that all input bytes are present
2796
+ * in the final mix.
2797
+ *
2798
+ * @param hash The hash to finalize.
2799
+ * @param ptr The pointer to the remaining input.
2800
+ * @param len The remaining length, modulo 32.
2801
+ * @param align Whether @p ptr is aligned.
2802
+ * @return The finalized hash
2803
+ * @see XXH32_finalize().
2804
+ */
2805
+ static XXH_PUREF xxh_u64
2806
+ XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
2807
+ {
2808
+ if (ptr==NULL) XXH_ASSERT(len == 0);
2809
+ len &= 31;
2810
+ while (len >= 8) {
2811
+ xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
2812
+ ptr += 8;
2813
+ hash ^= k1;
2814
+ hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
2341
2815
  len -= 8;
2342
2816
  }
2343
2817
  if (len >= 4) {
2344
- h64 ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
2818
+ hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
2345
2819
  ptr += 4;
2346
- h64 = XXH_rotl64(h64, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
2820
+ hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
2347
2821
  len -= 4;
2348
2822
  }
2349
2823
  while (len > 0) {
2350
- h64 ^= (*ptr++) * XXH_PRIME64_5;
2351
- h64 = XXH_rotl64(h64, 11) * XXH_PRIME64_1;
2824
+ hash ^= (*ptr++) * XXH_PRIME64_5;
2825
+ hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
2352
2826
  --len;
2353
2827
  }
2354
- return XXH64_avalanche(h64);
2828
+ return XXH64_avalanche(hash);
2355
2829
  }
2356
2830
 
2357
2831
  #ifdef XXH_OLD_NAMES
@@ -2364,21 +2838,23 @@ XXH64_finalize(xxh_u64 h64, const xxh_u8* ptr, size_t len, XXH_alignment align)
2364
2838
  # undef XXH_PROCESS8_64
2365
2839
  #endif
2366
2840
 
2367
- XXH_FORCE_INLINE xxh_u64
2841
+ /*!
2842
+ * @internal
2843
+ * @brief The implementation for @ref XXH64().
2844
+ *
2845
+ * @param input , len , seed Directly passed from @ref XXH64().
2846
+ * @param align Whether @p input is aligned.
2847
+ * @return The calculated hash.
2848
+ */
2849
+ XXH_FORCE_INLINE XXH_PUREF xxh_u64
2368
2850
  XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
2369
2851
  {
2370
- const xxh_u8* bEnd = input ? input + len : NULL;
2371
2852
  xxh_u64 h64;
2372
-
2373
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2374
- if (input==NULL) {
2375
- len=0;
2376
- bEnd=input=(const xxh_u8*)(size_t)32;
2377
- }
2378
- #endif
2853
+ if (input==NULL) XXH_ASSERT(len == 0);
2379
2854
 
2380
2855
  if (len>=32) {
2381
- const xxh_u8* const limit = bEnd - 32;
2856
+ const xxh_u8* const bEnd = input + len;
2857
+ const xxh_u8* const limit = bEnd - 31;
2382
2858
  xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2383
2859
  xxh_u64 v2 = seed + XXH_PRIME64_2;
2384
2860
  xxh_u64 v3 = seed + 0;
@@ -2389,7 +2865,7 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
2389
2865
  v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8;
2390
2866
  v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8;
2391
2867
  v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8;
2392
- } while (input<=limit);
2868
+ } while (input<limit);
2393
2869
 
2394
2870
  h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
2395
2871
  h64 = XXH64_mergeRound(h64, v1);
@@ -2407,10 +2883,10 @@ XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment
2407
2883
  }
2408
2884
 
2409
2885
 
2410
- /*! @ingroup xxh64_family */
2411
- XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t seed)
2886
+ /*! @ingroup XXH64_family */
2887
+ XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
2412
2888
  {
2413
- #if 0
2889
+ #if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
2414
2890
  /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
2415
2891
  XXH64_state_t state;
2416
2892
  XXH64_reset(&state, seed);
@@ -2428,49 +2904,45 @@ XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t len, XXH64_hash_t s
2428
2904
  }
2429
2905
 
2430
2906
  /******* Hash Streaming *******/
2431
-
2432
- /*! @ingroup xxh64_family*/
2907
+ #ifndef XXH_NO_STREAM
2908
+ /*! @ingroup XXH64_family*/
2433
2909
  XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
2434
2910
  {
2435
2911
  return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
2436
2912
  }
2437
- /*! @ingroup xxh64_family */
2913
+ /*! @ingroup XXH64_family */
2438
2914
  XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
2439
2915
  {
2440
2916
  XXH_free(statePtr);
2441
2917
  return XXH_OK;
2442
2918
  }
2443
2919
 
2444
- /*! @ingroup xxh64_family */
2445
- XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState)
2920
+ /*! @ingroup XXH64_family */
2921
+ XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
2446
2922
  {
2447
- memcpy(dstState, srcState, sizeof(*dstState));
2923
+ XXH_memcpy(dstState, srcState, sizeof(*dstState));
2448
2924
  }
2449
2925
 
2450
- /*! @ingroup xxh64_family */
2451
- XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, XXH64_hash_t seed)
2926
+ /*! @ingroup XXH64_family */
2927
+ XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
2452
2928
  {
2453
- XXH64_state_t state; /* use a local state to memcpy() in order to avoid strict-aliasing warnings */
2454
- memset(&state, 0, sizeof(state));
2455
- state.v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2456
- state.v2 = seed + XXH_PRIME64_2;
2457
- state.v3 = seed + 0;
2458
- state.v4 = seed - XXH_PRIME64_1;
2459
- /* do not write into reserved64, might be removed in a future version */
2460
- memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved64));
2929
+ XXH_ASSERT(statePtr != NULL);
2930
+ memset(statePtr, 0, sizeof(*statePtr));
2931
+ statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
2932
+ statePtr->v[1] = seed + XXH_PRIME64_2;
2933
+ statePtr->v[2] = seed + 0;
2934
+ statePtr->v[3] = seed - XXH_PRIME64_1;
2461
2935
  return XXH_OK;
2462
2936
  }
2463
2937
 
2464
- /*! @ingroup xxh64_family */
2938
+ /*! @ingroup XXH64_family */
2465
2939
  XXH_PUBLIC_API XXH_errorcode
2466
- XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2940
+ XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
2467
2941
  {
2468
- if (input==NULL)
2469
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
2942
+ if (input==NULL) {
2943
+ XXH_ASSERT(len == 0);
2470
2944
  return XXH_OK;
2471
- #else
2472
- return XXH_ERROR;
2473
- #endif
2945
+ }
2474
2946
 
2475
2947
  { const xxh_u8* p = (const xxh_u8*)input;
2476
2948
  const xxh_u8* const bEnd = p + len;
@@ -2485,34 +2957,24 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2485
2957
 
2486
2958
  if (state->memsize) { /* tmp buffer is full */
2487
2959
  XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize);
2488
- state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0));
2489
- state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1));
2490
- state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2));
2491
- state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3));
2960
+ state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0));
2961
+ state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1));
2962
+ state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2));
2963
+ state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3));
2492
2964
  p += 32 - state->memsize;
2493
2965
  state->memsize = 0;
2494
2966
  }
2495
2967
 
2496
- /* uintptr_t casts avoid UB or compiler warning on out-of-bounds
2497
- * pointer arithmetic */
2498
- if ((uintptr_t)p + 32 <= (uintptr_t)bEnd) {
2499
- const uintptr_t limit = (uintptr_t)bEnd - 32;
2500
- xxh_u64 v1 = state->v1;
2501
- xxh_u64 v2 = state->v2;
2502
- xxh_u64 v3 = state->v3;
2503
- xxh_u64 v4 = state->v4;
2968
+ if (p+32 <= bEnd) {
2969
+ const xxh_u8* const limit = bEnd - 32;
2504
2970
 
2505
2971
  do {
2506
- v1 = XXH64_round(v1, XXH_readLE64(p)); p+=8;
2507
- v2 = XXH64_round(v2, XXH_readLE64(p)); p+=8;
2508
- v3 = XXH64_round(v3, XXH_readLE64(p)); p+=8;
2509
- v4 = XXH64_round(v4, XXH_readLE64(p)); p+=8;
2510
- } while ((uintptr_t)p<=limit);
2511
-
2512
- state->v1 = v1;
2513
- state->v2 = v2;
2514
- state->v3 = v3;
2515
- state->v4 = v4;
2972
+ state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8;
2973
+ state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8;
2974
+ state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8;
2975
+ state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8;
2976
+ } while (p<=limit);
2977
+
2516
2978
  }
2517
2979
 
2518
2980
  if (p < bEnd) {
@@ -2525,44 +2987,39 @@ XXH64_update (XXH64_state_t* state, const void* input, size_t len)
2525
2987
  }
2526
2988
 
2527
2989
 
2528
- /*! @ingroup xxh64_family */
2529
- XXH_PUBLIC_API XXH64_hash_t XXH64_digest(const XXH64_state_t* state)
2990
+ /*! @ingroup XXH64_family */
2991
+ XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
2530
2992
  {
2531
2993
  xxh_u64 h64;
2532
2994
 
2533
2995
  if (state->total_len >= 32) {
2534
- xxh_u64 const v1 = state->v1;
2535
- xxh_u64 const v2 = state->v2;
2536
- xxh_u64 const v3 = state->v3;
2537
- xxh_u64 const v4 = state->v4;
2538
-
2539
- h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
2540
- h64 = XXH64_mergeRound(h64, v1);
2541
- h64 = XXH64_mergeRound(h64, v2);
2542
- h64 = XXH64_mergeRound(h64, v3);
2543
- h64 = XXH64_mergeRound(h64, v4);
2996
+ h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18);
2997
+ h64 = XXH64_mergeRound(h64, state->v[0]);
2998
+ h64 = XXH64_mergeRound(h64, state->v[1]);
2999
+ h64 = XXH64_mergeRound(h64, state->v[2]);
3000
+ h64 = XXH64_mergeRound(h64, state->v[3]);
2544
3001
  } else {
2545
- h64 = state->v3 /*seed*/ + XXH_PRIME64_5;
3002
+ h64 = state->v[2] /*seed*/ + XXH_PRIME64_5;
2546
3003
  }
2547
3004
 
2548
3005
  h64 += (xxh_u64) state->total_len;
2549
3006
 
2550
3007
  return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned);
2551
3008
  }
2552
-
3009
+ #endif /* !XXH_NO_STREAM */
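The streaming entry points updated above (XXH64_createState, XXH64_reset, XXH64_update, XXH64_digest, XXH64_freeState) are typically driven as in the minimal sketch below; the helper name, include path and chunk size are illustrative and not taken from this diff.

    /* Sketch only: hashing a buffer in chunks with the streaming API shown above. */
    #include <stddef.h>
    #include "xxhash.h"   /* illustrative include path */

    static XXH64_hash_t hash_in_chunks(const unsigned char* data, size_t len, XXH64_hash_t seed)
    {
        XXH64_state_t* const state = XXH64_createState();
        XXH64_hash_t hash = 0;
        if (state == NULL) return 0;
        if (XXH64_reset(state, seed) == XXH_OK) {
            size_t pos = 0;
            while (pos < len) {
                size_t const chunk = (len - pos > 4096) ? 4096 : (len - pos);
                XXH64_update(state, data + pos, chunk);  /* feed one chunk at a time */
                pos += chunk;
            }
            hash = XXH64_digest(state);                  /* digest() does not consume the state */
        }
        XXH64_freeState(state);
        return hash;
    }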
2553
3010
 
2554
3011
  /******* Canonical representation *******/
2555
3012
 
2556
- /*! @ingroup xxh64_family */
2557
- XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
3013
+ /*! @ingroup XXH64_family */
3014
+ XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
2558
3015
  {
2559
3016
  XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
2560
3017
  if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
2561
- memcpy(dst, &hash, sizeof(*dst));
3018
+ XXH_memcpy(dst, &hash, sizeof(*dst));
2562
3019
  }
2563
3020
 
2564
- /*! @ingroup xxh64_family */
2565
- XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
3021
+ /*! @ingroup XXH64_family */
3022
+ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
2566
3023
  {
2567
3024
  return XXH_readBE64(src);
2568
3025
  }
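The two canonical-representation helpers above serialize a hash into a fixed big-endian byte layout and read it back; a short sketch of the round trip (variable names illustrative):

    /* Sketch: round-tripping a hash through its canonical (big-endian) form. */
    XXH64_hash_t const h = XXH64("abc", 3, 0);
    XXH64_canonical_t canonical;
    XXH64_canonicalFromHash(&canonical, h);    /* same byte sequence on any endianness */
    XXH64_hash_t const restored = XXH64_hashFromCanonical(&canonical);
    /* restored == h */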
@@ -2575,7 +3032,7 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
2575
3032
  ************************************************************************ */
2576
3033
  /*!
2577
3034
  * @}
2578
- * @defgroup xxh3_impl XXH3 implementation
3035
+ * @defgroup XXH3_impl XXH3 implementation
2579
3036
  * @ingroup impl
2580
3037
  * @{
2581
3038
  */
@@ -2601,17 +3058,23 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
2601
3058
  # define XXH_unlikely(x) (x)
2602
3059
  #endif
2603
3060
 
2604
- #if defined(__GNUC__)
2605
- # if defined(__AVX2__)
2606
- # include <immintrin.h>
2607
- # elif defined(__SSE2__)
2608
- # include <emmintrin.h>
2609
- # elif defined(__ARM_NEON__) || defined(__ARM_NEON)
3061
+ #if defined(__GNUC__) || defined(__clang__)
3062
+ # if defined(__ARM_FEATURE_SVE)
3063
+ # include <arm_sve.h>
3064
+ # elif defined(__ARM_NEON__) || defined(__ARM_NEON) \
3065
+ || defined(__aarch64__) || defined(_M_ARM) \
3066
+ || defined(_M_ARM64) || defined(_M_ARM64EC)
2610
3067
  # define inline __inline__ /* circumvent a clang bug */
2611
3068
  # include <arm_neon.h>
2612
3069
  # undef inline
3070
+ # elif defined(__AVX2__)
3071
+ # include <immintrin.h>
3072
+ # elif defined(__SSE2__)
3073
+ # include <emmintrin.h>
2613
3074
  # endif
2614
- #elif defined(_MSC_VER)
3075
+ #endif
3076
+
3077
+ #if defined(_MSC_VER)
2615
3078
  # include <intrin.h>
2616
3079
  #endif
2617
3080
 
@@ -2725,12 +3188,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2725
3188
  XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */
2726
3189
  XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */
2727
3190
  XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
3191
+ XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
2728
3192
  };
2729
3193
  /*!
2730
3194
  * @ingroup tuning
2731
3195
  * @brief Selects the minimum alignment for XXH3's accumulators.
2732
3196
  *
2733
- * When using SIMD, this should match the alignment reqired for said vector
3197
+ * When using SIMD, this should match the alignment required for said vector
2734
3198
  * type, so, for example, 32 for AVX2.
2735
3199
  *
2736
3200
  * Default: Auto detected.
@@ -2746,20 +3210,26 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2746
3210
  # define XXH_AVX512 3
2747
3211
  # define XXH_NEON 4
2748
3212
  # define XXH_VSX 5
3213
+ # define XXH_SVE 6
2749
3214
  #endif
2750
3215
 
2751
3216
  #ifndef XXH_VECTOR /* can be defined on command line */
2752
- # if defined(__AVX512F__)
3217
+ # if defined(__ARM_FEATURE_SVE)
3218
+ # define XXH_VECTOR XXH_SVE
3219
+ # elif ( \
3220
+ defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
3221
+ || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
3222
+ ) && ( \
3223
+ defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
3224
+ || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
3225
+ )
3226
+ # define XXH_VECTOR XXH_NEON
3227
+ # elif defined(__AVX512F__)
2753
3228
  # define XXH_VECTOR XXH_AVX512
2754
3229
  # elif defined(__AVX2__)
2755
3230
  # define XXH_VECTOR XXH_AVX2
2756
3231
  # elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
2757
3232
  # define XXH_VECTOR XXH_SSE2
2758
- # elif defined(__GNUC__) /* msvc support maybe later */ \
2759
- && (defined(__ARM_NEON__) || defined(__ARM_NEON)) \
2760
- && (defined(__LITTLE_ENDIAN__) /* We only support little endian NEON */ \
2761
- || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
2762
- # define XXH_VECTOR XXH_NEON
2763
3233
  # elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
2764
3234
  || (defined(__s390x__) && defined(__VEC__)) \
2765
3235
  && defined(__GNUC__) /* TODO: IBM XL */
@@ -2769,6 +3239,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2769
3239
  # endif
2770
3240
  #endif
2771
3241
 
3242
+ /* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
3243
+ #if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
3244
+ # ifdef _MSC_VER
3245
+ # pragma warning(once : 4606)
3246
+ # else
3247
+ # warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
3248
+ # endif
3249
+ # undef XXH_VECTOR
3250
+ # define XXH_VECTOR XXH_SCALAR
3251
+ #endif
3252
+
2772
3253
  /*
2773
3254
  * Controls the alignment of the accumulator,
2774
3255
  * for compatibility with aligned vector loads, which are usually faster.
@@ -2788,12 +3269,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2788
3269
  # define XXH_ACC_ALIGN 16
2789
3270
  # elif XXH_VECTOR == XXH_AVX512 /* avx512 */
2790
3271
  # define XXH_ACC_ALIGN 64
3272
+ # elif XXH_VECTOR == XXH_SVE /* sve */
3273
+ # define XXH_ACC_ALIGN 64
2791
3274
  # endif
2792
3275
  #endif
2793
3276
 
2794
3277
  #if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
2795
3278
  || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
2796
3279
  # define XXH_SEC_ALIGN XXH_ACC_ALIGN
3280
+ #elif XXH_VECTOR == XXH_SVE
3281
+ # define XXH_SEC_ALIGN XXH_ACC_ALIGN
2797
3282
  #else
2798
3283
  # define XXH_SEC_ALIGN 8
2799
3284
  #endif
@@ -2821,7 +3306,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2821
3306
  */
2822
3307
  #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
2823
3308
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
2824
- && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
3309
+ && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
2825
3310
  # pragma GCC push_options
2826
3311
  # pragma GCC optimize("-O2")
2827
3312
  #endif
@@ -2909,8 +3394,8 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2909
3394
  * }
2910
3395
  */
2911
3396
  # if !defined(XXH_NO_VZIP_HACK) /* define to disable */ \
2912
- && defined(__GNUC__) \
2913
- && !defined(__aarch64__) && !defined(__arm64__)
3397
+ && (defined(__GNUC__) || defined(__clang__)) \
3398
+ && (defined(__arm__) || defined(__thumb__) || defined(_M_ARM))
2914
3399
  # define XXH_SPLIT_IN_PLACE(in, outLo, outHi) \
2915
3400
  do { \
2916
3401
  /* Undocumented GCC/Clang operand modifier: %e0 = lower D half, %f0 = upper D half */ \
@@ -2927,6 +3412,78 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2927
3412
  (outHi) = vshrn_n_u64 ((in), 32); \
2928
3413
  } while (0)
2929
3414
  # endif
3415
+
3416
+ /*!
3417
+ * @internal
3418
+ * @brief `vld1q_u64` but faster and alignment-safe.
3419
+ *
3420
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
3421
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
3422
+ *
3423
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
3424
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
3425
+ *
3426
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
3427
+ * unaligned load.
3428
+ */
3429
+ #if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
3430
+ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
3431
+ {
3432
+ return *(uint64x2_t const*)ptr;
3433
+ }
3434
+ #else
3435
+ XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
3436
+ {
3437
+ return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
3438
+ }
3439
+ #endif
3440
+ /*!
3441
+ * @ingroup tuning
3442
+ * @brief Controls the NEON to scalar ratio for XXH3
3443
+ *
3444
+ * On AArch64 when not optimizing for size, XXH3 will run 6 lanes using NEON and
3445
+ * 2 lanes on scalar by default (except on Apple platforms, as Apple CPUs benefit
3446
+ * from only using NEON).
3447
+ *
3448
+ * This can be set to 2, 4, 6, or 8. ARMv7 will default to all 8 NEON lanes, as the
3449
+ * emulated 64-bit arithmetic is too slow.
3450
+ *
3451
+ * Modern ARM CPUs are _very_ sensitive to how their pipelines are used.
3452
+ *
3453
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but it can't
3454
+ * have more than 2 NEON (F0/F1) micro-ops. If you are only using NEON instructions,
3455
+ * you are only using 2/3 of the CPU bandwidth.
3456
+ *
3457
+ * This is even more noticeable on the more advanced cores like the A76 which
3458
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
3459
+ *
3460
+ * Therefore, @ref XXH3_NEON_LANES lanes will be processed using NEON, and the
3461
+ * remaining lanes will use scalar instructions. This improves the bandwidth
3462
+ * and also gives the integer pipelines something to do besides twiddling loop
3463
+ * counters and pointers.
3464
+ *
3465
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
3466
+ * most other CPUs:
3467
+ *
3468
+ * | Chipset | Dispatch type | NEON only | 6:2 hybrid | Diff. |
3469
+ * |:----------------------|:--------------------|----------:|-----------:|------:|
3470
+ * | Snapdragon 730 (A76) | 2 NEON/8 micro-ops | 8.8 GB/s | 10.1 GB/s | ~16% |
3471
+ * | Snapdragon 835 (A73) | 2 NEON/3 micro-ops | 5.1 GB/s | 5.3 GB/s | ~5% |
3472
+ * | Marvell PXA1928 (A53) | In-order dual-issue | 1.9 GB/s | 1.9 GB/s | 0% |
3473
+ * | Apple M1 | 4 NEON/8 micro-ops | 37.3 GB/s | 36.1 GB/s | ~-3% |
3474
+ *
3475
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
3476
+ *
3477
+ * @see XXH3_accumulate_512_neon()
3478
+ */
3479
+ # ifndef XXH3_NEON_LANES
3480
+ # if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
3481
+ && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
3482
+ # define XXH3_NEON_LANES 6
3483
+ # else
3484
+ # define XXH3_NEON_LANES XXH_ACC_NB
3485
+ # endif
3486
+ # endif
2930
3487
  #endif /* XXH_VECTOR == XXH_NEON */
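A rough sketch of the lane split that XXH3_NEON_LANES controls, assuming the default of 6 on non-Apple AArch64; the real loop appears later in XXH3_accumulate_512_neon(), which packs the NEON lanes two per 128-bit register and hands the remainder to XXH3_scalarRound():

    /* Conceptual sketch only (not part of this diff): splitting one stripe. */
    static void split_one_stripe_sketch(xxh_u64* acc, const xxh_u8* input, const xxh_u8* secret)
    {
        size_t lane;
        for (lane = XXH3_NEON_LANES; lane < XXH_ACC_NB; lane++) {
            XXH3_scalarRound(acc, input, secret, lane);   /* lanes 6..7 by default: integer pipes */
        }
        for (lane = 0; lane < XXH3_NEON_LANES / 2; lane++) {
            /* lanes 0..5: one uint64x2_t load/mix/accumulate per pair (see the NEON loop below) */
        }
    }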
2931
3488
 
2932
3489
  /*
@@ -2938,23 +3495,33 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
2938
3495
  * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
2939
3496
  */
2940
3497
  #if XXH_VECTOR == XXH_VSX
3498
+ /* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
3499
+ * and `pixel`. This is a problem for obvious reasons.
3500
+ *
3501
+ * These keywords are unnecessary; the spec literally says they are
3502
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
3503
+ * after including the header.
3504
+ *
3505
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
3506
+ # pragma push_macro("bool")
3507
+ # pragma push_macro("vector")
3508
+ # pragma push_macro("pixel")
3509
+ /* silence potential macro redefined warnings */
3510
+ # undef bool
3511
+ # undef vector
3512
+ # undef pixel
3513
+
2941
3514
  # if defined(__s390x__)
2942
3515
  # include <s390intrin.h>
2943
3516
  # else
2944
- /* gcc's altivec.h can have the unwanted consequence to unconditionally
2945
- * #define bool, vector, and pixel keywords,
2946
- * with bad consequences for programs already using these keywords for other purposes.
2947
- * The paragraph defining these macros is skipped when __APPLE_ALTIVEC__ is defined.
2948
- * __APPLE_ALTIVEC__ is _generally_ defined automatically by the compiler,
2949
- * but it seems that, in some cases, it isn't.
2950
- * Force the build macro to be defined, so that keywords are not altered.
2951
- */
2952
- # if defined(__GNUC__) && !defined(__APPLE_ALTIVEC__)
2953
- # define __APPLE_ALTIVEC__
2954
- # endif
2955
3517
  # include <altivec.h>
2956
3518
  # endif
2957
3519
 
3520
+ /* Restore the original macro values, if applicable. */
3521
+ # pragma pop_macro("pixel")
3522
+ # pragma pop_macro("vector")
3523
+ # pragma pop_macro("bool")
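The push_macro/pop_macro bracketing added above generalizes to any header that leaks macros; a minimal sketch of the pattern, using `bool` as the example:

    /* Generic form of the pattern (sketch, not part of this diff): */
    #pragma push_macro("bool")     /* remember whatever the user had defined   */
    #undef bool                    /* let the header install (or skip) its own */
    /* #include <header-that-may-define-bool.h> */
    #pragma pop_macro("bool")      /* restore the user's definition afterwards */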
3524
+
2958
3525
  typedef __vector unsigned long long xxh_u64x2;
2959
3526
  typedef __vector unsigned char xxh_u8x16;
2960
3527
  typedef __vector unsigned xxh_u32x4;
@@ -2993,7 +3560,7 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
2993
3560
  XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
2994
3561
  {
2995
3562
  xxh_u64x2 ret;
2996
- memcpy(&ret, ptr, sizeof(xxh_u64x2));
3563
+ XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
2997
3564
  # if XXH_VSX_BE
2998
3565
  ret = XXH_vec_revb(ret);
2999
3566
  # endif
@@ -3010,8 +3577,9 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
3010
3577
  /* s390x is always big endian, no issue on this platform */
3011
3578
  # define XXH_vec_mulo vec_mulo
3012
3579
  # define XXH_vec_mule vec_mule
3013
- # elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw)
3580
+ # elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
3014
3581
  /* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
3582
+ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
3015
3583
  # define XXH_vec_mulo __builtin_altivec_vmulouw
3016
3584
  # define XXH_vec_mule __builtin_altivec_vmuleuw
3017
3585
  # else
@@ -3032,13 +3600,29 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
3032
3600
  # endif /* XXH_vec_mulo, XXH_vec_mule */
3033
3601
  #endif /* XXH_VECTOR == XXH_VSX */
3034
3602
 
3603
+ #if XXH_VECTOR == XXH_SVE
3604
+ #define ACCRND(acc, offset) \
3605
+ do { \
3606
+ svuint64_t input_vec = svld1_u64(mask, xinput + offset); \
3607
+ svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \
3608
+ svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \
3609
+ svuint64_t swapped = svtbl_u64(input_vec, kSwap); \
3610
+ svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \
3611
+ svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \
3612
+ svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
3613
+ acc = svadd_u64_x(mask, acc, mul); \
3614
+ } while (0)
3615
+ #endif /* XXH_VECTOR == XXH_SVE */
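Per 64-bit lane (and ignoring byte order), each ACCRND invocation above performs roughly the scalar arithmetic below; this is a readability sketch, not a drop-in replacement for the SVE intrinsics:

    /* Sketch of one ACCRND lane in plain C (illustrative): */
    static void accrnd_lane_sketch(xxh_u64* acc, const xxh_u64* input, const xxh_u64* secret, size_t i)
    {
        xxh_u64 const mixed   = secret[i] ^ input[i];       /* sveor_u64_x          */
        xxh_u64 const swapped = input[i ^ 1];                /* svtbl_u64 with kSwap */
        /* svmad_u64_x: (mixed low 32) * (mixed high 32) + swapped, then svadd into acc */
        acc[i] += (mixed & 0xFFFFFFFFULL) * (mixed >> 32) + swapped;
    }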
3616
+
3035
3617
 
3036
3618
  /* prefetch
3037
3619
  * can be disabled, by declaring XXH_NO_PREFETCH build macro */
3038
3620
  #if defined(XXH_NO_PREFETCH)
3039
3621
  # define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */
3040
3622
  #else
3041
- # if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
3623
+ # if XXH_SIZE_OPT >= 1
3624
+ # define XXH_PREFETCH(ptr) (void)(ptr)
3625
+ # elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */
3042
3626
  # include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
3043
3627
  # define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
3044
3628
  # elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
@@ -3103,7 +3687,6 @@ XXH_mult32to64(xxh_u64 x, xxh_u64 y)
3103
3687
  return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
3104
3688
  }
3105
3689
  #elif defined(_MSC_VER) && defined(_M_IX86)
3106
- # include <intrin.h>
3107
3690
  # define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
3108
3691
  #else
3109
3692
  /*
@@ -3122,7 +3705,7 @@ XXH_mult32to64(xxh_u64 x, xxh_u64 y)
3122
3705
  * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
3123
3706
  * version.
3124
3707
  *
3125
- * @param lhs, rhs The 64-bit integers to be multiplied
3708
+ * @param lhs , rhs The 64-bit integers to be multiplied
3126
3709
  * @return The 128-bit result represented in an @ref XXH128_hash_t.
3127
3710
  */
3128
3711
  static XXH128_hash_t
@@ -3143,7 +3726,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3143
3726
  * In that case it is best to use the portable one.
3144
3727
  * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
3145
3728
  */
3146
- #if defined(__GNUC__) && !defined(__wasm__) \
3729
+ #if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
3147
3730
  && defined(__SIZEOF_INT128__) \
3148
3731
  || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
3149
3732
 
@@ -3160,7 +3743,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3160
3743
  *
3161
3744
  * This compiles to single operand MUL on x64.
3162
3745
  */
3163
- #elif defined(_M_X64) || defined(_M_IA64)
3746
+ #elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
3164
3747
 
3165
3748
  #ifndef _MSC_VER
3166
3749
  # pragma intrinsic(_umul128)
@@ -3172,6 +3755,21 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3172
3755
  r128.high64 = product_high;
3173
3756
  return r128;
3174
3757
 
3758
+ /*
3759
+ * MSVC for ARM64's __umulh method.
3760
+ *
3761
+ * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
3762
+ */
3763
+ #elif defined(_M_ARM64) || defined(_M_ARM64EC)
3764
+
3765
+ #ifndef _MSC_VER
3766
+ # pragma intrinsic(__umulh)
3767
+ #endif
3768
+ XXH128_hash_t r128;
3769
+ r128.low64 = lhs * rhs;
3770
+ r128.high64 = __umulh(lhs, rhs);
3771
+ return r128;
3772
+
3175
3773
  #else
3176
3774
  /*
3177
3775
  * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
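The portable fallback referenced here splits each operand into 32-bit halves and recombines four partial products; a minimal sketch of that long-multiplication idea, stated as an assumption about the general technique rather than a copy of the upstream fallback:

    /* Sketch: 64x64 -> 128 multiply built from 32x32 -> 64 partial products. */
    static XXH128_hash_t mult64to128_sketch(xxh_u64 lhs, xxh_u64 rhs)
    {
        xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
        xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
        xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
        xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
        /* fold the two middle terms, with carry, into the upper half */
        xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
        xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
        xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
        XXH128_hash_t r128;
        r128.low64  = lower;
        r128.high64 = upper;
        return r128;
    }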
@@ -3240,7 +3838,7 @@ XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
3240
3838
  * The reason for the separate function is to prevent passing too many structs
3241
3839
  * around by value. This will hopefully inline the multiply, but we don't force it.
3242
3840
  *
3243
- * @param lhs, rhs The 64-bit integers to multiply
3841
+ * @param lhs , rhs The 64-bit integers to multiply
3244
3842
  * @return The low 64 bits of the product XOR'd by the high 64 bits.
3245
3843
  * @see XXH_mult64to128()
3246
3844
  */
@@ -3252,7 +3850,7 @@ XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
3252
3850
  }
3253
3851
 
3254
3852
  /*! Seems to produce slightly better code on GCC for some reason. */
3255
- XXH_FORCE_INLINE xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
3853
+ XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
3256
3854
  {
3257
3855
  XXH_ASSERT(0 <= shift && shift < 64);
3258
3856
  return v64 ^ (v64 >> shift);
@@ -3319,7 +3917,7 @@ static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
3319
3917
  *
3320
3918
  * This adds an extra layer of strength for custom secrets.
3321
3919
  */
3322
- XXH_FORCE_INLINE XXH64_hash_t
3920
+ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
3323
3921
  XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
3324
3922
  {
3325
3923
  XXH_ASSERT(input != NULL);
@@ -3341,7 +3939,7 @@ XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h
3341
3939
  }
3342
3940
  }
3343
3941
 
3344
- XXH_FORCE_INLINE XXH64_hash_t
3942
+ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
3345
3943
  XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
3346
3944
  {
3347
3945
  XXH_ASSERT(input != NULL);
@@ -3357,7 +3955,7 @@ XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_h
3357
3955
  }
3358
3956
  }
3359
3957
 
3360
- XXH_FORCE_INLINE XXH64_hash_t
3958
+ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
3361
3959
  XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
3362
3960
  {
3363
3961
  XXH_ASSERT(input != NULL);
@@ -3374,7 +3972,7 @@ XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_
3374
3972
  }
3375
3973
  }
3376
3974
 
3377
- XXH_FORCE_INLINE XXH64_hash_t
3975
+ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
3378
3976
  XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
3379
3977
  {
3380
3978
  XXH_ASSERT(len <= 16);
@@ -3444,7 +4042,7 @@ XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
3444
4042
  }
3445
4043
 
3446
4044
  /* For mid range keys, XXH3 uses a Mum-hash variant. */
3447
- XXH_FORCE_INLINE XXH64_hash_t
4045
+ XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
3448
4046
  XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3449
4047
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
3450
4048
  XXH64_hash_t seed)
@@ -3452,29 +4050,39 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3452
4050
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
3453
4051
  XXH_ASSERT(16 < len && len <= 128);
3454
4052
 
3455
- { xxh_u64 acc = len * XXH_PRIME64_1;
4053
+ { xxh_u64 acc = len * XXH_PRIME64_1, acc_end;
4054
+ #if XXH_SIZE_OPT >= 1
4055
+ /* Smaller and cleaner, but slightly slower. */
4056
+ unsigned int i = (unsigned int)(len - 1) / 32;
4057
+ do {
4058
+ acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
4059
+ acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
4060
+ } while (i-- != 0);
4061
+ acc_end = 0;
4062
+ #else
4063
+ acc += XXH3_mix16B(input+0, secret+0, seed);
4064
+ acc_end = XXH3_mix16B(input+len-16, secret+16, seed);
3456
4065
  if (len > 32) {
4066
+ acc += XXH3_mix16B(input+16, secret+32, seed);
4067
+ acc_end += XXH3_mix16B(input+len-32, secret+48, seed);
3457
4068
  if (len > 64) {
4069
+ acc += XXH3_mix16B(input+32, secret+64, seed);
4070
+ acc_end += XXH3_mix16B(input+len-48, secret+80, seed);
4071
+
3458
4072
  if (len > 96) {
3459
4073
  acc += XXH3_mix16B(input+48, secret+96, seed);
3460
- acc += XXH3_mix16B(input+len-64, secret+112, seed);
4074
+ acc_end += XXH3_mix16B(input+len-64, secret+112, seed);
3461
4075
  }
3462
- acc += XXH3_mix16B(input+32, secret+64, seed);
3463
- acc += XXH3_mix16B(input+len-48, secret+80, seed);
3464
4076
  }
3465
- acc += XXH3_mix16B(input+16, secret+32, seed);
3466
- acc += XXH3_mix16B(input+len-32, secret+48, seed);
3467
4077
  }
3468
- acc += XXH3_mix16B(input+0, secret+0, seed);
3469
- acc += XXH3_mix16B(input+len-16, secret+16, seed);
3470
-
3471
- return XXH3_avalanche(acc);
4078
+ #endif
4079
+ return XXH3_avalanche(acc + acc_end);
3472
4080
  }
3473
4081
  }
3474
4082
 
3475
4083
  #define XXH3_MIDSIZE_MAX 240
3476
4084
 
3477
- XXH_NO_INLINE XXH64_hash_t
4085
+ XXH_NO_INLINE XXH_PUREF XXH64_hash_t
3478
4086
  XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3479
4087
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
3480
4088
  XXH64_hash_t seed)
@@ -3486,13 +4094,17 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3486
4094
  #define XXH3_MIDSIZE_LASTOFFSET 17
3487
4095
 
3488
4096
  { xxh_u64 acc = len * XXH_PRIME64_1;
3489
- int const nbRounds = (int)len / 16;
3490
- int i;
4097
+ xxh_u64 acc_end;
4098
+ unsigned int const nbRounds = (unsigned int)len / 16;
4099
+ unsigned int i;
4100
+ XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
3491
4101
  for (i=0; i<8; i++) {
3492
4102
  acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
3493
4103
  }
3494
- acc = XXH3_avalanche(acc);
4104
+ /* last bytes */
4105
+ acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
3495
4106
  XXH_ASSERT(nbRounds >= 8);
4107
+ acc = XXH3_avalanche(acc);
3496
4108
  #if defined(__clang__) /* Clang */ \
3497
4109
  && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
3498
4110
  && !defined(XXH_ENABLE_AUTOVECTORIZE) /* Define to disable */
@@ -3519,11 +4131,13 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3519
4131
  #pragma clang loop vectorize(disable)
3520
4132
  #endif
3521
4133
  for (i=8 ; i < nbRounds; i++) {
3522
- acc += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
4134
+ /*
4135
+ * Prevents clang from unrolling the acc loop and interleaving with this one.
4136
+ */
4137
+ XXH_COMPILER_GUARD(acc);
4138
+ acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
3523
4139
  }
3524
- /* last bytes */
3525
- acc += XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
3526
- return XXH3_avalanche(acc);
4140
+ return XXH3_avalanche(acc + acc_end);
3527
4141
  }
3528
4142
  }
3529
4143
 
@@ -3539,10 +4153,51 @@ XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
3539
4153
  # define ACC_NB XXH_ACC_NB
3540
4154
  #endif
3541
4155
 
4156
+ #ifndef XXH_PREFETCH_DIST
4157
+ # ifdef __clang__
4158
+ # define XXH_PREFETCH_DIST 320
4159
+ # else
4160
+ # if (XXH_VECTOR == XXH_AVX512)
4161
+ # define XXH_PREFETCH_DIST 512
4162
+ # else
4163
+ # define XXH_PREFETCH_DIST 384
4164
+ # endif
4165
+ # endif /* __clang__ */
4166
+ #endif /* XXH_PREFETCH_DIST */
4167
+
4168
+ /*
4169
+ * These macros are to generate an XXH3_accumulate() function.
4170
+ * The two arguments select the name suffix and target attribute.
4171
+ *
4172
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
4173
+ * XXH3_accumulate_512_<name>().
4174
+ *
4175
+ * It may be useful to hand implement this function if the compiler fails to
4176
+ * optimize the inline function.
4177
+ */
4178
+ #define XXH3_ACCUMULATE_TEMPLATE(name) \
4179
+ void \
4180
+ XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc, \
4181
+ const xxh_u8* XXH_RESTRICT input, \
4182
+ const xxh_u8* XXH_RESTRICT secret, \
4183
+ size_t nbStripes) \
4184
+ { \
4185
+ size_t n; \
4186
+ for (n = 0; n < nbStripes; n++ ) { \
4187
+ const xxh_u8* const in = input + n*XXH_STRIPE_LEN; \
4188
+ XXH_PREFETCH(in + XXH_PREFETCH_DIST); \
4189
+ XXH3_accumulate_512_##name( \
4190
+ acc, \
4191
+ in, \
4192
+ secret + n*XXH_SECRET_CONSUME_RATE); \
4193
+ } \
4194
+ }
4195
+
4196
+
3542
4197
  XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
3543
4198
  {
3544
4199
  if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
3545
- memcpy(dst, &v64, sizeof(v64));
4200
+ XXH_memcpy(dst, &v64, sizeof(v64));
3546
4201
  }
3547
4202
 
3548
4203
  /* Several intrinsic functions below are supposed to accept __int64 as argument,
@@ -3559,6 +4214,7 @@ XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
3559
4214
  typedef long long xxh_i64;
3560
4215
  #endif
3561
4216
 
4217
+
3562
4218
  /*
3563
4219
  * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
3564
4220
  *
@@ -3594,7 +4250,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
3594
4250
  const void* XXH_RESTRICT input,
3595
4251
  const void* XXH_RESTRICT secret)
3596
4252
  {
3597
- XXH_ALIGN(64) __m512i* const xacc = (__m512i *) acc;
4253
+ __m512i* const xacc = (__m512i *) acc;
3598
4254
  XXH_ASSERT((((size_t)acc) & 63) == 0);
3599
4255
  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3600
4256
 
@@ -3606,7 +4262,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
3606
4262
  /* data_key = data_vec ^ key_vec; */
3607
4263
  __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
3608
4264
  /* data_key_lo = data_key >> 32; */
3609
- __m512i const data_key_lo = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
4265
+ __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
3610
4266
  /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3611
4267
  __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo);
3612
4268
  /* xacc[0] += swap(data_vec); */
@@ -3616,6 +4272,7 @@ XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
3616
4272
  *xacc = _mm512_add_epi64(product, sum);
3617
4273
  }
3618
4274
  }
4275
+ XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
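For reference, the instantiation above expands (after macro substitution) to roughly the loop driver below; this is a sketch derived from XXH3_ACCUMULATE_TEMPLATE as defined earlier in this file, not new code in the package:

    /* Approximate expansion of XXH3_ACCUMULATE_TEMPLATE(avx512): */
    XXH_FORCE_INLINE XXH_TARGET_AVX512 void
    XXH3_accumulate_avx512(xxh_u64* XXH_RESTRICT acc,
                           const xxh_u8* XXH_RESTRICT input,
                           const xxh_u8* XXH_RESTRICT secret,
                           size_t nbStripes)
    {
        size_t n;
        for (n = 0; n < nbStripes; n++) {
            const xxh_u8* const in = input + n * XXH_STRIPE_LEN;
            XXH_PREFETCH(in + XXH_PREFETCH_DIST);   /* stay ahead of the stripe loads */
            XXH3_accumulate_512_avx512(acc, in, secret + n * XXH_SECRET_CONSUME_RATE);
        }
    }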
3619
4276
 
3620
4277
  /*
3621
4278
  * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
@@ -3643,19 +4300,18 @@ XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3643
4300
  {
3644
4301
  XXH_ASSERT((((size_t)acc) & 63) == 0);
3645
4302
  XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
3646
- { XXH_ALIGN(64) __m512i* const xacc = (__m512i*) acc;
4303
+ { __m512i* const xacc = (__m512i*) acc;
3647
4304
  const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
3648
4305
 
3649
4306
  /* xacc[0] ^= (xacc[0] >> 47) */
3650
4307
  __m512i const acc_vec = *xacc;
3651
4308
  __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47);
3652
- __m512i const data_vec = _mm512_xor_si512 (acc_vec, shifted);
3653
4309
  /* xacc[0] ^= secret; */
3654
4310
  __m512i const key_vec = _mm512_loadu_si512 (secret);
3655
- __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec);
4311
+ __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
3656
4312
 
3657
4313
  /* xacc[0] *= XXH_PRIME32_1; */
3658
- __m512i const data_key_hi = _mm512_shuffle_epi32 (data_key, (_MM_PERM_ENUM)_MM_SHUFFLE(0, 3, 0, 1));
4314
+ __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
3659
4315
  __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32);
3660
4316
  __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32);
3661
4317
  *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
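The 0x96 immediate used in the scramble step above is the truth table of a three-way XOR, which is what lets one ternary-logic instruction replace the two separate XORs that were removed:

    /* 0x96 == 0b10010110: for every bit position, out = A ^ B ^ C, so
     * _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96)
     * evaluates key_vec ^ acc_vec ^ shifted in a single instruction. */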
@@ -3670,20 +4326,16 @@ XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
3670
4326
  XXH_ASSERT(((size_t)customSecret & 63) == 0);
3671
4327
  (void)(&XXH_writeLE64);
3672
4328
  { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
3673
- __m512i const seed = _mm512_mask_set1_epi64(_mm512_set1_epi64((xxh_i64)seed64), 0xAA, (xxh_i64)(0U - seed64));
4329
+ __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
4330
+ __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
3674
4331
 
3675
- XXH_ALIGN(64) const __m512i* const src = (const __m512i*) XXH3_kSecret;
3676
- XXH_ALIGN(64) __m512i* const dest = ( __m512i*) customSecret;
4332
+ const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret);
4333
+ __m512i* const dest = ( __m512i*) customSecret;
3677
4334
  int i;
4335
+ XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
4336
+ XXH_ASSERT(((size_t)dest & 63) == 0);
3678
4337
  for (i=0; i < nbRounds; ++i) {
3679
- /* GCC has a bug, _mm512_stream_load_si512 accepts 'void*', not 'void const*',
3680
- * this will warn "discards 'const' qualifier". */
3681
- union {
3682
- XXH_ALIGN(64) const __m512i* cp;
3683
- XXH_ALIGN(64) void* p;
3684
- } remote_const_void;
3685
- remote_const_void.cp = src + i;
3686
- dest[i] = _mm512_add_epi64(_mm512_stream_load_si512(remote_const_void.p), seed);
4338
+ dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
3687
4339
  } }
3688
4340
  }
3689
4341
 
@@ -3702,7 +4354,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
3702
4354
  const void* XXH_RESTRICT secret)
3703
4355
  {
3704
4356
  XXH_ASSERT((((size_t)acc) & 31) == 0);
3705
- { XXH_ALIGN(32) __m256i* const xacc = (__m256i *) acc;
4357
+ { __m256i* const xacc = (__m256i *) acc;
3706
4358
  /* Unaligned. This is mainly for pointer arithmetic, and because
3707
4359
  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3708
4360
  const __m256i* const xinput = (const __m256i *) input;
@@ -3719,7 +4371,7 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
3719
4371
  /* data_key = data_vec ^ key_vec; */
3720
4372
  __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
3721
4373
  /* data_key_lo = data_key >> 32; */
3722
- __m256i const data_key_lo = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
4374
+ __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
3723
4375
  /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
3724
4376
  __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo);
3725
4377
  /* xacc[i] += swap(data_vec); */
@@ -3729,12 +4381,13 @@ XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
3729
4381
  xacc[i] = _mm256_add_epi64(product, sum);
3730
4382
  } }
3731
4383
  }
4384
+ XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
3732
4385
 
3733
4386
  XXH_FORCE_INLINE XXH_TARGET_AVX2 void
3734
4387
  XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3735
4388
  {
3736
4389
  XXH_ASSERT((((size_t)acc) & 31) == 0);
3737
- { XXH_ALIGN(32) __m256i* const xacc = (__m256i*) acc;
4390
+ { __m256i* const xacc = (__m256i*) acc;
3738
4391
  /* Unaligned. This is mainly for pointer arithmetic, and because
3739
4392
  * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
3740
4393
  const __m256i* const xsecret = (const __m256i *) secret;
@@ -3751,7 +4404,7 @@ XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3751
4404
  __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec);
3752
4405
 
3753
4406
  /* xacc[i] *= XXH_PRIME32_1; */
3754
- __m256i const data_key_hi = _mm256_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
4407
+ __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
3755
4408
  __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32);
3756
4409
  __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32);
3757
4410
  xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
@@ -3768,8 +4421,8 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR
3768
4421
  XXH_PREFETCH(customSecret);
3769
4422
  { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
3770
4423
 
3771
- XXH_ALIGN(64) const __m256i* const src = (const __m256i*) XXH3_kSecret;
3772
- XXH_ALIGN(64) __m256i* dest = ( __m256i*) customSecret;
4424
+ const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret);
4425
+ __m256i* dest = ( __m256i*) customSecret;
3773
4426
 
3774
4427
  # if defined(__GNUC__) || defined(__clang__)
3775
4428
  /*
@@ -3779,14 +4432,16 @@ XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTR
3779
4432
  */
3780
4433
  XXH_COMPILER_GUARD(dest);
3781
4434
  # endif
4435
+ XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
4436
+ XXH_ASSERT(((size_t)dest & 31) == 0);
3782
4437
 
3783
4438
  /* GCC -O2 need unroll loop manually */
3784
- dest[0] = _mm256_add_epi64(_mm256_stream_load_si256(src+0), seed);
3785
- dest[1] = _mm256_add_epi64(_mm256_stream_load_si256(src+1), seed);
3786
- dest[2] = _mm256_add_epi64(_mm256_stream_load_si256(src+2), seed);
3787
- dest[3] = _mm256_add_epi64(_mm256_stream_load_si256(src+3), seed);
3788
- dest[4] = _mm256_add_epi64(_mm256_stream_load_si256(src+4), seed);
3789
- dest[5] = _mm256_add_epi64(_mm256_stream_load_si256(src+5), seed);
4439
+ dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
4440
+ dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
4441
+ dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
4442
+ dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
4443
+ dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
4444
+ dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
3790
4445
  }
3791
4446
  }
3792
4447
 
@@ -3806,7 +4461,7 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
3806
4461
  {
3807
4462
  /* SSE2 is just a half-scale version of the AVX2 version. */
3808
4463
  XXH_ASSERT((((size_t)acc) & 15) == 0);
3809
- { XXH_ALIGN(16) __m128i* const xacc = (__m128i *) acc;
4464
+ { __m128i* const xacc = (__m128i *) acc;
3810
4465
  /* Unaligned. This is mainly for pointer arithmetic, and because
3811
4466
  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3812
4467
  const __m128i* const xinput = (const __m128i *) input;
@@ -3833,12 +4488,13 @@ XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
3833
4488
  xacc[i] = _mm_add_epi64(product, sum);
3834
4489
  } }
3835
4490
  }
4491
+ XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
3836
4492
 
3837
4493
  XXH_FORCE_INLINE XXH_TARGET_SSE2 void
3838
4494
  XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3839
4495
  {
3840
4496
  XXH_ASSERT((((size_t)acc) & 15) == 0);
3841
- { XXH_ALIGN(16) __m128i* const xacc = (__m128i*) acc;
4497
+ { __m128i* const xacc = (__m128i*) acc;
3842
4498
  /* Unaligned. This is mainly for pointer arithmetic, and because
3843
4499
  * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
3844
4500
  const __m128i* const xsecret = (const __m128i *) secret;
@@ -3870,7 +4526,7 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR
3870
4526
  { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
3871
4527
 
3872
4528
  # if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
3873
- // MSVC 32bit mode does not support _mm_set_epi64x before 2015
4529
+ /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */
3874
4530
  XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) };
3875
4531
  __m128i const seed = _mm_load_si128((__m128i const*)seed64x2);
3876
4532
  # else
@@ -3878,19 +4534,21 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR
3878
4534
  # endif
3879
4535
  int i;
3880
4536
 
3881
- XXH_ALIGN(64) const float* const src = (float const*) XXH3_kSecret;
3882
- XXH_ALIGN(XXH_SEC_ALIGN) __m128i* dest = (__m128i*) customSecret;
4537
+ const void* const src16 = XXH3_kSecret;
4538
+ __m128i* dst16 = (__m128i*) customSecret;
3883
4539
  # if defined(__GNUC__) || defined(__clang__)
3884
4540
  /*
3885
4541
  * On GCC & Clang, marking 'dest' as modified will cause the compiler:
3886
4542
  * - do not extract the secret from sse registers in the internal loop
3887
4543
  * - use less common registers, and avoid pushing these reg into stack
3888
4544
  */
3889
- XXH_COMPILER_GUARD(dest);
4545
+ XXH_COMPILER_GUARD(dst16);
3890
4546
  # endif
4547
+ XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
4548
+ XXH_ASSERT(((size_t)dst16 & 15) == 0);
3891
4549
 
3892
4550
  for (i=0; i < nbRounds; ++i) {
3893
- dest[i] = _mm_add_epi64(_mm_castps_si128(_mm_load_ps(src+i*4)), seed);
4551
+ dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
3894
4552
  } }
3895
4553
  }
3896
4554
 
@@ -3898,42 +4556,112 @@ XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTR
3898
4556
 
3899
4557
  #if (XXH_VECTOR == XXH_NEON)
3900
4558
 
4559
+ /* forward declarations for the scalar routines */
4560
+ XXH_FORCE_INLINE void
4561
+ XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
4562
+ void const* XXH_RESTRICT secret, size_t lane);
4563
+
4564
+ XXH_FORCE_INLINE void
4565
+ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
4566
+ void const* XXH_RESTRICT secret, size_t lane);
4567
+
4568
+ /*!
4569
+ * @internal
4570
+ * @brief The bulk processing loop for NEON.
4571
+ *
4572
+ * The NEON code path is actually partially scalar when running on AArch64. This
4573
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
4574
+ * CPU, and it also mitigates some GCC codegen issues.
4575
+ *
4576
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
4577
+ */
3901
4578
  XXH_FORCE_INLINE void
3902
4579
  XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
3903
4580
  const void* XXH_RESTRICT input,
3904
4581
  const void* XXH_RESTRICT secret)
3905
4582
  {
3906
4583
  XXH_ASSERT((((size_t)acc) & 15) == 0);
4584
+ XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
3907
4585
  {
3908
- XXH_ALIGN(16) uint64x2_t* const xacc = (uint64x2_t *) acc;
4586
+ uint64x2_t* const xacc = (uint64x2_t *) acc;
3909
4587
  /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
3910
4588
  uint8_t const* const xinput = (const uint8_t *) input;
3911
4589
  uint8_t const* const xsecret = (const uint8_t *) secret;
3912
4590
 
3913
4591
  size_t i;
3914
- for (i=0; i < XXH_STRIPE_LEN / sizeof(uint64x2_t); i++) {
4592
+ /* AArch64 uses both scalar and neon at the same time */
4593
+ for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
4594
+ XXH3_scalarRound(acc, input, secret, i);
4595
+ }
4596
+ i = 0;
4597
+ for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
4598
+ uint64x2_t acc_vec1 = xacc[i];
4599
+ /* data_vec = xinput[i]; */
4600
+ uint64x2_t data_vec1 = XXH_vld1q_u64(xinput + (i * 16));
4601
+ /* key_vec = xsecret[i]; */
4602
+ uint64x2_t key_vec1 = XXH_vld1q_u64(xsecret + (i * 16));
4603
+ /* acc_vec_2 = swap(data_vec) */
4604
+ uint64x2_t acc_vec_21 = vextq_u64(data_vec1, data_vec1, 1);
4605
+ /* data_key = data_vec ^ key_vec; */
4606
+ uint64x2_t data_key1 = veorq_u64(data_vec1, key_vec1);
4607
+
4608
+ uint64x2_t acc_vec2 = xacc[i+1];
3915
4609
  /* data_vec = xinput[i]; */
3916
- uint8x16_t data_vec = vld1q_u8(xinput + (i * 16));
4610
+ uint64x2_t data_vec2 = XXH_vld1q_u64(xinput + ((i+1) * 16));
3917
4611
  /* key_vec = xsecret[i]; */
3918
- uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
4612
+ uint64x2_t key_vec2 = XXH_vld1q_u64(xsecret + ((i+1) * 16));
4613
+ /* acc_vec_2 = swap(data_vec) */
4614
+ uint64x2_t acc_vec_22 = vextq_u64(data_vec2, data_vec2, 1);
4615
+ /* data_key = data_vec ^ key_vec; */
4616
+ uint64x2_t data_key2 = veorq_u64(data_vec2, key_vec2);
4617
+
4618
+ /* data_key_lo = {(data_key1 & 0xFFFFFFFF), (data_key2 & 0xFFFFFFFF)};
4619
+ * data_key_hi = {(data_key1 >> 32), (data_key2 >> 32)};
4620
+ */
4621
+ uint32x4x2_t zipped = vuzpq_u32(vreinterpretq_u32_u64(data_key1), vreinterpretq_u32_u64(data_key2));
4622
+ uint32x4_t data_key_lo = zipped.val[0];
4623
+ uint32x4_t data_key_hi = zipped.val[1];
4624
+
4625
+ /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4626
+ acc_vec_21 = vmlal_u32 (acc_vec_21, vget_low_u32(data_key_lo), vget_low_u32(data_key_hi));
4627
+ XXH_COMPILER_GUARD_W(acc_vec_21);
4628
+ /* xacc[i] += acc_vec_2; */
4629
+ acc_vec1 = vaddq_u64 (acc_vec1, acc_vec_21);
4630
+ xacc[i] = acc_vec1;
4631
+ /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4632
+ acc_vec_22 = vmlal_u32 (acc_vec_22, vget_high_u32(data_key_lo), vget_high_u32(data_key_hi));
4633
+ XXH_COMPILER_GUARD_W(acc_vec_22);
4634
+ /* xacc[i] += acc_vec_2; */
4635
+ acc_vec2 = vaddq_u64 (acc_vec2, acc_vec_22);
4636
+ xacc[i+1] = acc_vec2;
4637
+ }
4638
+ for (; i < XXH3_NEON_LANES / 2; i++) {
4639
+ uint64x2_t acc_vec = xacc[i];
4640
+ /* data_vec = xinput[i]; */
4641
+ uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16));
4642
+ /* key_vec = xsecret[i]; */
4643
+ uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16));
3919
4644
  uint64x2_t data_key;
3920
4645
  uint32x2_t data_key_lo, data_key_hi;
3921
- /* xacc[i] += swap(data_vec); */
3922
- uint64x2_t const data64 = vreinterpretq_u64_u8(data_vec);
3923
- uint64x2_t const swapped = vextq_u64(data64, data64, 1);
3924
- xacc[i] = vaddq_u64 (xacc[i], swapped);
4646
+ /* acc_vec_2 = swap(data_vec) */
4647
+ uint64x2_t acc_vec_2 = vextq_u64(data_vec, data_vec, 1);
3925
4648
  /* data_key = data_vec ^ key_vec; */
3926
- data_key = vreinterpretq_u64_u8(veorq_u8(data_vec, key_vec));
4649
+ data_key = veorq_u64(data_vec, key_vec);
3927
4650
  /* data_key_lo = (uint32x2_t) (data_key & 0xFFFFFFFF);
3928
4651
  * data_key_hi = (uint32x2_t) (data_key >> 32);
3929
4652
  * data_key = UNDEFINED; */
3930
4653
  XXH_SPLIT_IN_PLACE(data_key, data_key_lo, data_key_hi);
3931
- /* xacc[i] += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
3932
- xacc[i] = vmlal_u32 (xacc[i], data_key_lo, data_key_hi);
3933
-
4654
+ /* acc_vec_2 += (uint64x2_t) data_key_lo * (uint64x2_t) data_key_hi; */
4655
+ acc_vec_2 = vmlal_u32 (acc_vec_2, data_key_lo, data_key_hi);
4656
+ XXH_COMPILER_GUARD_W(acc_vec_2);
4657
+ /* xacc[i] += acc_vec_2; */
4658
+ acc_vec = vaddq_u64 (acc_vec, acc_vec_2);
4659
+ xacc[i] = acc_vec;
3934
4660
  }
4661
+
3935
4662
  }
3936
4663
  }
4664
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)
3937
4665
 
3938
4666
  XXH_FORCE_INLINE void
3939
4667
  XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -3945,15 +4673,19 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3945
4673
  uint32x2_t prime = vdup_n_u32 (XXH_PRIME32_1);
3946
4674
 
3947
4675
  size_t i;
3948
- for (i=0; i < XXH_STRIPE_LEN/sizeof(uint64x2_t); i++) {
4676
+ /* AArch64 uses both scalar and neon at the same time */
4677
+ for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
4678
+ XXH3_scalarScrambleRound(acc, secret, i);
4679
+ }
4680
+ for (i=0; i < XXH3_NEON_LANES / 2; i++) {
3949
4681
  /* xacc[i] ^= (xacc[i] >> 47); */
3950
4682
  uint64x2_t acc_vec = xacc[i];
3951
- uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
3952
- uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
4683
+ uint64x2_t shifted = vshrq_n_u64 (acc_vec, 47);
4684
+ uint64x2_t data_vec = veorq_u64 (acc_vec, shifted);
3953
4685
 
3954
4686
  /* xacc[i] ^= xsecret[i]; */
3955
- uint8x16_t key_vec = vld1q_u8(xsecret + (i * 16));
3956
- uint64x2_t data_key = veorq_u64(data_vec, vreinterpretq_u64_u8(key_vec));
4687
+ uint64x2_t key_vec = XXH_vld1q_u64 (xsecret + (i * 16));
4688
+ uint64x2_t data_key = veorq_u64 (data_vec, key_vec);
3957
4689
 
3958
4690
  /* xacc[i] *= XXH_PRIME32_1 */
3959
4691
  uint32x2_t data_key_lo, data_key_hi;
@@ -3981,11 +4713,12 @@ XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
3981
4713
  */
3982
4714
  uint64x2_t prod_hi = vmull_u32 (data_key_hi, prime);
3983
4715
  /* xacc[i] = prod_hi << 32; */
3984
- xacc[i] = vshlq_n_u64(prod_hi, 32);
4716
+ prod_hi = vshlq_n_u64(prod_hi, 32);
3985
4717
  /* xacc[i] += (prod_hi & 0xFFFFFFFF) * XXH_PRIME32_1; */
3986
- xacc[i] = vmlal_u32(xacc[i], data_key_lo, prime);
4718
+ xacc[i] = vmlal_u32(prod_hi, data_key_lo, prime);
3987
4719
  }
3988
- } }
4720
+ }
4721
+ }
3989
4722
  }
3990
4723
 
3991
4724
  #endif
@@ -3997,7 +4730,8 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
3997
4730
  const void* XXH_RESTRICT input,
3998
4731
  const void* XXH_RESTRICT secret)
3999
4732
  {
4000
- xxh_u64x2* const xacc = (xxh_u64x2*) acc; /* presumed aligned */
4733
+ /* presumed aligned */
4734
+ unsigned int* const xacc = (unsigned int*) acc;
4001
4735
  xxh_u64x2 const* const xinput = (xxh_u64x2 const*) input; /* no alignment restriction */
4002
4736
  xxh_u64x2 const* const xsecret = (xxh_u64x2 const*) secret; /* no alignment restriction */
4003
4737
  xxh_u64x2 const v32 = { 32, 32 };
@@ -4012,16 +4746,21 @@ XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc,
4012
4746
  xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
4013
4747
  /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
4014
4748
  xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
4015
- xacc[i] += product;
4749
+ /* acc_vec = xacc[i]; */
4750
+ xxh_u64x2 acc_vec = (xxh_u64x2)vec_xl(0, xacc + 4 * i);
4751
+ acc_vec += product;
4016
4752
 
4017
4753
  /* swap high and low halves */
4018
4754
  #ifdef __s390x__
4019
- xacc[i] += vec_permi(data_vec, data_vec, 2);
4755
+ acc_vec += vec_permi(data_vec, data_vec, 2);
4020
4756
  #else
4021
- xacc[i] += vec_xxpermdi(data_vec, data_vec, 2);
4757
+ acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
4022
4758
  #endif
4759
+ /* xacc[i] = acc_vec; */
4760
+ vec_xst((xxh_u32x4)acc_vec, 0, xacc + 4 * i);
4023
4761
  }
4024
4762
  }
4763
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)
4025
4764
 
4026
4765
  XXH_FORCE_INLINE void
4027
4766
  XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
@@ -4055,40 +4794,202 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4055
4794
 
4056
4795
  #endif
4057
4796
 
4797
+ #if (XXH_VECTOR == XXH_SVE)
4798
+
4799
+ XXH_FORCE_INLINE void
4800
+ XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
4801
+ const void* XXH_RESTRICT input,
4802
+ const void* XXH_RESTRICT secret)
4803
+ {
4804
+ uint64_t *xacc = (uint64_t *)acc;
4805
+ const uint64_t *xinput = (const uint64_t *)(const void *)input;
4806
+ const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
4807
+ svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
4808
+ uint64_t element_count = svcntd();
4809
+ if (element_count >= 8) {
4810
+ svbool_t mask = svptrue_pat_b64(SV_VL8);
4811
+ svuint64_t vacc = svld1_u64(mask, xacc);
4812
+ ACCRND(vacc, 0);
4813
+ svst1_u64(mask, xacc, vacc);
4814
+ } else if (element_count == 2) { /* sve128 */
4815
+ svbool_t mask = svptrue_pat_b64(SV_VL2);
4816
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
4817
+ svuint64_t acc1 = svld1_u64(mask, xacc + 2);
4818
+ svuint64_t acc2 = svld1_u64(mask, xacc + 4);
4819
+ svuint64_t acc3 = svld1_u64(mask, xacc + 6);
4820
+ ACCRND(acc0, 0);
4821
+ ACCRND(acc1, 2);
4822
+ ACCRND(acc2, 4);
4823
+ ACCRND(acc3, 6);
4824
+ svst1_u64(mask, xacc + 0, acc0);
4825
+ svst1_u64(mask, xacc + 2, acc1);
4826
+ svst1_u64(mask, xacc + 4, acc2);
4827
+ svst1_u64(mask, xacc + 6, acc3);
4828
+ } else {
4829
+ svbool_t mask = svptrue_pat_b64(SV_VL4);
4830
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
4831
+ svuint64_t acc1 = svld1_u64(mask, xacc + 4);
4832
+ ACCRND(acc0, 0);
4833
+ ACCRND(acc1, 4);
4834
+ svst1_u64(mask, xacc + 0, acc0);
4835
+ svst1_u64(mask, xacc + 4, acc1);
4836
+ }
4837
+ }
4838
+
4839
+ XXH_FORCE_INLINE void
4840
+ XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
4841
+ const xxh_u8* XXH_RESTRICT input,
4842
+ const xxh_u8* XXH_RESTRICT secret,
4843
+ size_t nbStripes)
4844
+ {
4845
+ if (nbStripes != 0) {
4846
+ uint64_t *xacc = (uint64_t *)acc;
4847
+ const uint64_t *xinput = (const uint64_t *)(const void *)input;
4848
+ const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
4849
+ svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
4850
+ uint64_t element_count = svcntd();
4851
+ if (element_count >= 8) {
4852
+ svbool_t mask = svptrue_pat_b64(SV_VL8);
4853
+ svuint64_t vacc = svld1_u64(mask, xacc + 0);
4854
+ do {
4855
+ /* svprfd(svbool_t, void *, enum svfprop); */
4856
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
4857
+ ACCRND(vacc, 0);
4858
+ xinput += 8;
4859
+ xsecret += 1;
4860
+ nbStripes--;
4861
+ } while (nbStripes != 0);
4862
+
4863
+ svst1_u64(mask, xacc + 0, vacc);
4864
+ } else if (element_count == 2) { /* sve128 */
4865
+ svbool_t mask = svptrue_pat_b64(SV_VL2);
4866
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
4867
+ svuint64_t acc1 = svld1_u64(mask, xacc + 2);
4868
+ svuint64_t acc2 = svld1_u64(mask, xacc + 4);
4869
+ svuint64_t acc3 = svld1_u64(mask, xacc + 6);
4870
+ do {
4871
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
4872
+ ACCRND(acc0, 0);
4873
+ ACCRND(acc1, 2);
4874
+ ACCRND(acc2, 4);
4875
+ ACCRND(acc3, 6);
4876
+ xinput += 8;
4877
+ xsecret += 1;
4878
+ nbStripes--;
4879
+ } while (nbStripes != 0);
4880
+
4881
+ svst1_u64(mask, xacc + 0, acc0);
4882
+ svst1_u64(mask, xacc + 2, acc1);
4883
+ svst1_u64(mask, xacc + 4, acc2);
4884
+ svst1_u64(mask, xacc + 6, acc3);
4885
+ } else {
4886
+ svbool_t mask = svptrue_pat_b64(SV_VL4);
4887
+ svuint64_t acc0 = svld1_u64(mask, xacc + 0);
4888
+ svuint64_t acc1 = svld1_u64(mask, xacc + 4);
4889
+ do {
4890
+ svprfd(mask, xinput + 128, SV_PLDL1STRM);
4891
+ ACCRND(acc0, 0);
4892
+ ACCRND(acc1, 4);
4893
+ xinput += 8;
4894
+ xsecret += 1;
4895
+ nbStripes--;
4896
+ } while (nbStripes != 0);
4897
+
4898
+ svst1_u64(mask, xacc + 0, acc0);
4899
+ svst1_u64(mask, xacc + 4, acc1);
4900
+ }
4901
+ }
4902
+ }
4903
+
4904
+ #endif
4905
+
4058
4906
  /* scalar variants - universal */
4059
4907
 
4908
+ /*!
4909
+ * @internal
4910
+ * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
4911
+ *
4912
+ * This is extracted to its own function because the NEON path uses a combination
4913
+ * of NEON and scalar.
4914
+ */
4915
+ XXH_FORCE_INLINE void
4916
+ XXH3_scalarRound(void* XXH_RESTRICT acc,
4917
+ void const* XXH_RESTRICT input,
4918
+ void const* XXH_RESTRICT secret,
4919
+ size_t lane)
4920
+ {
4921
+ xxh_u64* xacc = (xxh_u64*) acc;
4922
+ xxh_u8 const* xinput = (xxh_u8 const*) input;
4923
+ xxh_u8 const* xsecret = (xxh_u8 const*) secret;
4924
+ XXH_ASSERT(lane < XXH_ACC_NB);
4925
+ XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
4926
+ {
4927
+ xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
4928
+ xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
4929
+ xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
4930
+ xacc[lane] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4931
+ }
4932
+ }
4933
+
4934
+ /*!
4935
+ * @internal
4936
+ * @brief Processes a 64 byte block of data using the scalar path.
4937
+ */
4060
4938
  XXH_FORCE_INLINE void
4061
4939
  XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
4062
4940
  const void* XXH_RESTRICT input,
4063
4941
  const void* XXH_RESTRICT secret)
4064
4942
  {
4065
- XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4066
- const xxh_u8* const xinput = (const xxh_u8*) input; /* no alignment restriction */
4067
- const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
4068
4943
  size_t i;
4069
- XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
4944
+ /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
4945
+ #if defined(__GNUC__) && !defined(__clang__) \
4946
+ && (defined(__arm__) || defined(__thumb2__)) \
4947
+ && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
4948
+ && XXH_SIZE_OPT <= 0
4949
+ # pragma GCC unroll 8
4950
+ #endif
4070
4951
  for (i=0; i < XXH_ACC_NB; i++) {
4071
- xxh_u64 const data_val = XXH_readLE64(xinput + 8*i);
4072
- xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + i*8);
4073
- xacc[i ^ 1] += data_val; /* swap adjacent lanes */
4074
- xacc[i] += XXH_mult32to64(data_key & 0xFFFFFFFF, data_key >> 32);
4952
+ XXH3_scalarRound(acc, input, secret, i);
4075
4953
  }
4076
4954
  }
4955
+ XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)
4077
4956
 
4957
+ /*!
4958
+ * @internal
4959
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
4960
+ *
4961
+ * This is extracted to its own function because the NEON path uses a combination
4962
+ * of NEON and scalar.
4963
+ */
4078
4964
  XXH_FORCE_INLINE void
4079
- XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4965
+ XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
4966
+ void const* XXH_RESTRICT secret,
4967
+ size_t lane)
4080
4968
  {
4081
- XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4969
+ xxh_u64* const xacc = (xxh_u64*) acc; /* presumed aligned */
4082
4970
  const xxh_u8* const xsecret = (const xxh_u8*) secret; /* no alignment restriction */
4083
- size_t i;
4084
4971
  XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
4085
- for (i=0; i < XXH_ACC_NB; i++) {
4086
- xxh_u64 const key64 = XXH_readLE64(xsecret + 8*i);
4087
- xxh_u64 acc64 = xacc[i];
4972
+ XXH_ASSERT(lane < XXH_ACC_NB);
4973
+ {
4974
+ xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
4975
+ xxh_u64 acc64 = xacc[lane];
4088
4976
  acc64 = XXH_xorshift64(acc64, 47);
4089
4977
  acc64 ^= key64;
4090
4978
  acc64 *= XXH_PRIME32_1;
4091
- xacc[i] = acc64;
4979
+ xacc[lane] = acc64;
4980
+ }
4981
+ }
4982
+
4983
+ /*!
4984
+ * @internal
4985
+ * @brief Scrambles the accumulators after a large chunk has been read
4986
+ */
4987
+ XXH_FORCE_INLINE void
4988
+ XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
4989
+ {
4990
+ size_t i;
4991
+ for (i=0; i < XXH_ACC_NB; i++) {
4992
+ XXH3_scalarScrambleRound(acc, secret, i);
4092
4993
  }
4093
4994
  }
4094
4995
 
@@ -4110,8 +5011,9 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4110
5011
  * placed sequentially, in order, at the top of the unrolled loop.
4111
5012
  *
4112
5013
  * While MOVK is great for generating constants (2 cycles for a 64-bit
4113
- * constant compared to 4 cycles for LDR), long MOVK chains stall the
4114
- * integer pipelines:
5014
+ * constant compared to 4 cycles for LDR), it fights for bandwidth with
5015
+ * the arithmetic instructions.
5016
+ *
4115
5017
  * I L S
4116
5018
  * MOVK
4117
5019
  * MOVK
@@ -4128,6 +5030,9 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4128
5030
  * ADD LDR
4129
5031
  * SUB STR
4130
5032
  * STR
5033
+ *
5034
+ * See XXH3_NEON_LANES for details on the pipeline.
5035
+ *
4131
5036
  * XXH3_64bits_withSeed, len == 256, Snapdragon 835
4132
5037
  * without hack: 2654.4 MB/s
4133
5038
  * with hack: 3202.9 MB/s
@@ -4157,7 +5062,7 @@ XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
4157
5062
  }
4158
5063
 
4159
5064
 
4160
- typedef void (*XXH3_f_accumulate_512)(void* XXH_RESTRICT, const void*, const void*);
5065
+ typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
4161
5066
  typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
4162
5067
  typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
4163
5068
 
@@ -4165,82 +5070,63 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
4165
5070
  #if (XXH_VECTOR == XXH_AVX512)
4166
5071
 
4167
5072
  #define XXH3_accumulate_512 XXH3_accumulate_512_avx512
5073
+ #define XXH3_accumulate XXH3_accumulate_avx512
4168
5074
  #define XXH3_scrambleAcc XXH3_scrambleAcc_avx512
4169
5075
  #define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
4170
5076
 
4171
5077
  #elif (XXH_VECTOR == XXH_AVX2)
4172
5078
 
4173
5079
  #define XXH3_accumulate_512 XXH3_accumulate_512_avx2
5080
+ #define XXH3_accumulate XXH3_accumulate_avx2
4174
5081
  #define XXH3_scrambleAcc XXH3_scrambleAcc_avx2
4175
5082
  #define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
4176
5083
 
4177
5084
  #elif (XXH_VECTOR == XXH_SSE2)
4178
5085
 
4179
5086
  #define XXH3_accumulate_512 XXH3_accumulate_512_sse2
5087
+ #define XXH3_accumulate XXH3_accumulate_sse2
4180
5088
  #define XXH3_scrambleAcc XXH3_scrambleAcc_sse2
4181
5089
  #define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
4182
5090
 
4183
5091
  #elif (XXH_VECTOR == XXH_NEON)
4184
5092
 
4185
5093
  #define XXH3_accumulate_512 XXH3_accumulate_512_neon
5094
+ #define XXH3_accumulate XXH3_accumulate_neon
4186
5095
  #define XXH3_scrambleAcc XXH3_scrambleAcc_neon
4187
5096
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4188
5097
 
4189
5098
  #elif (XXH_VECTOR == XXH_VSX)
4190
5099
 
4191
5100
  #define XXH3_accumulate_512 XXH3_accumulate_512_vsx
5101
+ #define XXH3_accumulate XXH3_accumulate_vsx
4192
5102
  #define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
4193
5103
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4194
5104
 
5105
+ #elif (XXH_VECTOR == XXH_SVE)
5106
+ #define XXH3_accumulate_512 XXH3_accumulate_512_sve
5107
+ #define XXH3_accumulate XXH3_accumulate_sve
5108
+ #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
5109
+ #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
5110
+
4195
5111
  #else /* scalar */
4196
5112
 
4197
5113
  #define XXH3_accumulate_512 XXH3_accumulate_512_scalar
5114
+ #define XXH3_accumulate XXH3_accumulate_scalar
4198
5115
  #define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
4199
5116
  #define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
4200
5117
 
4201
5118
  #endif
4202
5119
 
4203
-
4204
-
4205
- #ifndef XXH_PREFETCH_DIST
4206
- # ifdef __clang__
4207
- # define XXH_PREFETCH_DIST 320
4208
- # else
4209
- # if (XXH_VECTOR == XXH_AVX512)
4210
- # define XXH_PREFETCH_DIST 512
4211
- # else
4212
- # define XXH_PREFETCH_DIST 384
4213
- # endif
4214
- # endif /* __clang__ */
4215
- #endif /* XXH_PREFETCH_DIST */
4216
-
4217
- /*
4218
- * XXH3_accumulate()
4219
- * Loops over XXH3_accumulate_512().
4220
- * Assumption: nbStripes will not overflow the secret size
4221
- */
4222
- XXH_FORCE_INLINE void
4223
- XXH3_accumulate( xxh_u64* XXH_RESTRICT acc,
4224
- const xxh_u8* XXH_RESTRICT input,
4225
- const xxh_u8* XXH_RESTRICT secret,
4226
- size_t nbStripes,
4227
- XXH3_f_accumulate_512 f_acc512)
4228
- {
4229
- size_t n;
4230
- for (n = 0; n < nbStripes; n++ ) {
4231
- const xxh_u8* const in = input + n*XXH_STRIPE_LEN;
4232
- XXH_PREFETCH(in + XXH_PREFETCH_DIST);
4233
- f_acc512(acc,
4234
- in,
4235
- secret + n*XXH_SECRET_CONSUME_RATE);
4236
- }
4237
- }
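The generic XXH3_accumulate() driver and its XXH_PREFETCH_DIST tuning removed above are superseded by per-backend accumulators generated with the XXH3_ACCUMULATE_TEMPLATE macro (applied to the scalar kernel a few hunks earlier). A minimal sketch of the shape such a generated driver is assumed to take, reconstructed from the removed loop (the name XXH3_accumulate_name is illustrative):

    /* Assumed expansion shape of XXH3_ACCUMULATE_TEMPLATE(name): loop the
     * per-stripe 512-bit kernel over nbStripes, prefetching ahead, exactly
     * as the removed generic driver did. */
    XXH_FORCE_INLINE void
    XXH3_accumulate_name(xxh_u64* XXH_RESTRICT acc,
                         const xxh_u8* XXH_RESTRICT input,
                         const xxh_u8* XXH_RESTRICT secret,
                         size_t nbStripes)
    {
        size_t n;
        for (n = 0; n < nbStripes; n++) {
            const xxh_u8* const in = input + n * XXH_STRIPE_LEN;
            XXH_PREFETCH(in + XXH_PREFETCH_DIST);
            XXH3_accumulate_512_name(acc, in, secret + n * XXH_SECRET_CONSUME_RATE);
        }
    }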
5120
+ #if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
5121
+ # undef XXH3_initCustomSecret
5122
+ # define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
5123
+ #endif
4238
5124
 
4239
5125
  XXH_FORCE_INLINE void
4240
5126
  XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4241
5127
  const xxh_u8* XXH_RESTRICT input, size_t len,
4242
5128
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4243
- XXH3_f_accumulate_512 f_acc512,
5129
+ XXH3_f_accumulate f_acc,
4244
5130
  XXH3_f_scrambleAcc f_scramble)
4245
5131
  {
4246
5132
  size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
@@ -4252,7 +5138,7 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4252
5138
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4253
5139
 
4254
5140
  for (n = 0; n < nb_blocks; n++) {
4255
- XXH3_accumulate(acc, input + n*block_len, secret, nbStripesPerBlock, f_acc512);
5141
+ f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
4256
5142
  f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
4257
5143
  }
4258
5144
 
@@ -4260,12 +5146,12 @@ XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
4260
5146
  XXH_ASSERT(len > XXH_STRIPE_LEN);
4261
5147
  { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
4262
5148
  XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
4263
- XXH3_accumulate(acc, input + nb_blocks*block_len, secret, nbStripes, f_acc512);
5149
+ f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
4264
5150
 
4265
5151
  /* last stripe */
4266
5152
  { const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
4267
5153
  #define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */
4268
- f_acc512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
5154
+ XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
4269
5155
  } }
4270
5156
  }
4271
5157
 
@@ -4310,12 +5196,12 @@ XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secre
4310
5196
  XXH_FORCE_INLINE XXH64_hash_t
4311
5197
  XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
4312
5198
  const void* XXH_RESTRICT secret, size_t secretSize,
4313
- XXH3_f_accumulate_512 f_acc512,
5199
+ XXH3_f_accumulate f_acc,
4314
5200
  XXH3_f_scrambleAcc f_scramble)
4315
5201
  {
4316
5202
  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
4317
5203
 
4318
- XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc512, f_scramble);
5204
+ XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
4319
5205
 
4320
5206
  /* converge into final hash */
4321
5207
  XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -4326,29 +5212,30 @@ XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
4326
5212
  }
4327
5213
 
4328
5214
  /*
4329
- * It's important for performance that XXH3_hashLong is not inlined.
5215
+ * It's important for performance to transmit secret's size (when it's static)
5216
+ * so that the compiler can properly optimize the vectorized loop.
5217
+ * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
4330
5218
  */
4331
- XXH_NO_INLINE XXH64_hash_t
5219
+ XXH_FORCE_INLINE XXH64_hash_t
4332
5220
  XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
4333
5221
  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4334
5222
  {
4335
5223
  (void)seed64;
4336
- return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate_512, XXH3_scrambleAcc);
5224
+ return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
4337
5225
  }
4338
5226
 
4339
5227
  /*
4340
- * It's important for performance that XXH3_hashLong is not inlined.
4341
- * Since the function is not inlined, the compiler may not be able to understand that,
4342
- * in some scenarios, its `secret` argument is actually a compile time constant.
4343
- * This variant enforces that the compiler can detect that,
4344
- * and uses this opportunity to streamline the generated code for better performance.
5228
+ * It's preferable for performance that XXH3_hashLong is not inlined,
5229
+ * as it results in a smaller function for small data, easier on the instruction cache.
5230
+ * Note that inside this no_inline function, we do inline the internal loop,
5231
+ * and provide a statically defined secret size to allow optimization of the vector loop.
4345
5232
  */
4346
- XXH_NO_INLINE XXH64_hash_t
5233
+ XXH_NO_INLINE XXH_PUREF XXH64_hash_t
4347
5234
  XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
4348
5235
  XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4349
5236
  {
4350
5237
  (void)seed64; (void)secret; (void)secretLen;
4351
- return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate_512, XXH3_scrambleAcc);
5238
+ return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
4352
5239
  }
4353
5240
 
4354
5241
  /*
@@ -4365,18 +5252,20 @@ XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
4365
5252
  XXH_FORCE_INLINE XXH64_hash_t
4366
5253
  XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
4367
5254
  XXH64_hash_t seed,
4368
- XXH3_f_accumulate_512 f_acc512,
5255
+ XXH3_f_accumulate f_acc,
4369
5256
  XXH3_f_scrambleAcc f_scramble,
4370
5257
  XXH3_f_initCustomSecret f_initSec)
4371
5258
  {
5259
+ #if XXH_SIZE_OPT <= 0
4372
5260
  if (seed == 0)
4373
5261
  return XXH3_hashLong_64b_internal(input, len,
4374
5262
  XXH3_kSecret, sizeof(XXH3_kSecret),
4375
- f_acc512, f_scramble);
5263
+ f_acc, f_scramble);
5264
+ #endif
4376
5265
  { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
4377
5266
  f_initSec(secret, seed);
4378
5267
  return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
4379
- f_acc512, f_scramble);
5268
+ f_acc, f_scramble);
4380
5269
  }
4381
5270
  }
4382
5271
 
@@ -4384,12 +5273,12 @@ XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
4384
5273
  * It's important for performance that XXH3_hashLong is not inlined.
4385
5274
  */
4386
5275
  XXH_NO_INLINE XXH64_hash_t
4387
- XXH3_hashLong_64b_withSeed(const void* input, size_t len,
4388
- XXH64_hash_t seed, const xxh_u8* secret, size_t secretLen)
5276
+ XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
5277
+ XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
4389
5278
  {
4390
5279
  (void)secret; (void)secretLen;
4391
5280
  return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
4392
- XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
5281
+ XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
4393
5282
  }
4394
5283
 
4395
5284
 
@@ -4421,29 +5310,37 @@ XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
4421
5310
 
4422
5311
  /* === Public entry point === */
4423
5312
 
4424
- /*! @ingroup xxh3_family */
4425
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(const void* input, size_t len)
5313
+ /*! @ingroup XXH3_family */
5314
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
4426
5315
  {
4427
- return XXH3_64bits_internal(input, len, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
5316
+ return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
4428
5317
  }
4429
5318
 
4430
- /*! @ingroup xxh3_family */
5319
+ /*! @ingroup XXH3_family */
4431
5320
  XXH_PUBLIC_API XXH64_hash_t
4432
- XXH3_64bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
5321
+ XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
4433
5322
  {
4434
- return XXH3_64bits_internal(input, len, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
5323
+ return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
4435
5324
  }
4436
5325
 
4437
- /*! @ingroup xxh3_family */
5326
+ /*! @ingroup XXH3_family */
4438
5327
  XXH_PUBLIC_API XXH64_hash_t
4439
- XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
5328
+ XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
4440
5329
  {
4441
- return XXH3_64bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
5330
+ return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
4442
5331
  }
4443
5332
 
5333
+ XXH_PUBLIC_API XXH64_hash_t
5334
+ XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
5335
+ {
5336
+ if (length <= XXH3_MIDSIZE_MAX)
5337
+ return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
5338
+ return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
5339
+ }
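A minimal usage sketch of this new one-shot entry point (illustrative names; assumes the bundled xxhash.h declares XXH3_64bits_withSecretandSeed). Note the behavior visible above: inputs up to XXH3_MIDSIZE_MAX take the seeded default-secret path, and only longer inputs actually consume the custom secret.

    #include "xxhash.h"   /* assumed to declare XXH3_64bits_withSecretandSeed */

    static XXH64_hash_t hash_with_secret_and_seed(const void* data, size_t len)
    {
        /* Illustrative secret: any buffer of at least XXH3_SECRET_SIZE_MIN bytes;
         * in practice it would come from XXH3_generateSecret() (see a later hunk). */
        static const unsigned char secret[192] = { 0x42 };
        XXH64_hash_t const seed = 0x9E3779B185EBCA87ULL;   /* arbitrary example seed */
        return XXH3_64bits_withSecretandSeed(data, len, secret, sizeof(secret), seed);
    }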
4444
5340
 
4445
- /* === XXH3 streaming === */
4446
5341
 
5342
+ /* === XXH3 streaming === */
5343
+ #ifndef XXH_NO_STREAM
4447
5344
  /*
4448
5345
  * Malloc's a pointer that is always aligned to align.
4449
5346
  *
@@ -4467,7 +5364,7 @@ XXH3_64bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
4467
5364
  *
4468
5365
  * Align must be a power of 2 and 8 <= align <= 128.
4469
5366
  */
4470
- static void* XXH_alignedMalloc(size_t s, size_t align)
5367
+ static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
4471
5368
  {
4472
5369
  XXH_ASSERT(align <= 128 && align >= 8); /* range check */
4473
5370
  XXH_ASSERT((align & (align-1)) == 0); /* power of 2 */
@@ -4509,7 +5406,7 @@ static void XXH_alignedFree(void* p)
4509
5406
  XXH_free(base);
4510
5407
  }
4511
5408
  }
4512
- /*! @ingroup xxh3_family */
5409
+ /*! @ingroup XXH3_family */
4513
5410
  XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
4514
5411
  {
4515
5412
  XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
@@ -4518,24 +5415,24 @@ XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
4518
5415
  return state;
4519
5416
  }
4520
5417
 
4521
- /*! @ingroup xxh3_family */
5418
+ /*! @ingroup XXH3_family */
4522
5419
  XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
4523
5420
  {
4524
5421
  XXH_alignedFree(statePtr);
4525
5422
  return XXH_OK;
4526
5423
  }
4527
5424
 
4528
- /*! @ingroup xxh3_family */
5425
+ /*! @ingroup XXH3_family */
4529
5426
  XXH_PUBLIC_API void
4530
- XXH3_copyState(XXH3_state_t* dst_state, const XXH3_state_t* src_state)
5427
+ XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
4531
5428
  {
4532
- memcpy(dst_state, src_state, sizeof(*dst_state));
5429
+ XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
4533
5430
  }
4534
5431
 
4535
5432
  static void
4536
5433
  XXH3_reset_internal(XXH3_state_t* statePtr,
4537
- XXH64_hash_t seed,
4538
- const void* secret, size_t secretSize)
5434
+ XXH64_hash_t seed,
5435
+ const void* secret, size_t secretSize)
4539
5436
  {
4540
5437
  size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
4541
5438
  size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
@@ -4552,24 +5449,25 @@ XXH3_reset_internal(XXH3_state_t* statePtr,
4552
5449
  statePtr->acc[6] = XXH_PRIME64_5;
4553
5450
  statePtr->acc[7] = XXH_PRIME32_1;
4554
5451
  statePtr->seed = seed;
5452
+ statePtr->useSeed = (seed != 0);
4555
5453
  statePtr->extSecret = (const unsigned char*)secret;
4556
5454
  XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
4557
5455
  statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
4558
5456
  statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
4559
5457
  }
4560
5458
 
4561
- /*! @ingroup xxh3_family */
5459
+ /*! @ingroup XXH3_family */
4562
5460
  XXH_PUBLIC_API XXH_errorcode
4563
- XXH3_64bits_reset(XXH3_state_t* statePtr)
5461
+ XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
4564
5462
  {
4565
5463
  if (statePtr == NULL) return XXH_ERROR;
4566
5464
  XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4567
5465
  return XXH_OK;
4568
5466
  }
4569
5467
 
4570
- /*! @ingroup xxh3_family */
5468
+ /*! @ingroup XXH3_family */
4571
5469
  XXH_PUBLIC_API XXH_errorcode
4572
- XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
5470
+ XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
4573
5471
  {
4574
5472
  if (statePtr == NULL) return XXH_ERROR;
4575
5473
  XXH3_reset_internal(statePtr, 0, secret, secretSize);
@@ -4578,17 +5476,30 @@ XXH3_64bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t
4578
5476
  return XXH_OK;
4579
5477
  }
4580
5478
 
4581
- /*! @ingroup xxh3_family */
5479
+ /*! @ingroup XXH3_family */
4582
5480
  XXH_PUBLIC_API XXH_errorcode
4583
- XXH3_64bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
5481
+ XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
4584
5482
  {
4585
5483
  if (statePtr == NULL) return XXH_ERROR;
4586
5484
  if (seed==0) return XXH3_64bits_reset(statePtr);
4587
- if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
5485
+ if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
5486
+ XXH3_initCustomSecret(statePtr->customSecret, seed);
4588
5487
  XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
4589
5488
  return XXH_OK;
4590
5489
  }
4591
5490
 
5491
+ /*! @ingroup XXH3_family */
5492
+ XXH_PUBLIC_API XXH_errorcode
5493
+ XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
5494
+ {
5495
+ if (statePtr == NULL) return XXH_ERROR;
5496
+ if (secret == NULL) return XXH_ERROR;
5497
+ if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5498
+ XXH3_reset_internal(statePtr, seed64, secret, secretSize);
5499
+ statePtr->useSeed = 1; /* always, even if seed64==0 */
5500
+ return XXH_OK;
5501
+ }
5502
+
4592
5503
  /* Note : when XXH3_consumeStripes() is invoked,
4593
5504
  * there must be a guarantee that at least one more byte must be consumed from input
4594
5505
  * so that the function can blindly consume all stripes using the "normal" secret segment */
@@ -4597,7 +5508,7 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
4597
5508
  size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
4598
5509
  const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
4599
5510
  const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
4600
- XXH3_f_accumulate_512 f_acc512,
5511
+ XXH3_f_accumulate f_acc,
4601
5512
  XXH3_f_scrambleAcc f_scramble)
4602
5513
  {
4603
5514
  XXH_ASSERT(nbStripes <= nbStripesPerBlock); /* can handle max 1 scramble per invocation */
@@ -4606,45 +5517,58 @@ XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
4606
5517
  /* need a scrambling operation */
4607
5518
  size_t const nbStripesToEndofBlock = nbStripesPerBlock - *nbStripesSoFarPtr;
4608
5519
  size_t const nbStripesAfterBlock = nbStripes - nbStripesToEndofBlock;
4609
- XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock, f_acc512);
5520
+ f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripesToEndofBlock);
4610
5521
  f_scramble(acc, secret + secretLimit);
4611
- XXH3_accumulate(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock, f_acc512);
5522
+ f_acc(acc, input + nbStripesToEndofBlock * XXH_STRIPE_LEN, secret, nbStripesAfterBlock);
4612
5523
  *nbStripesSoFarPtr = nbStripesAfterBlock;
4613
5524
  } else {
4614
- XXH3_accumulate(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes, f_acc512);
5525
+ f_acc(acc, input, secret + nbStripesSoFarPtr[0] * XXH_SECRET_CONSUME_RATE, nbStripes);
4615
5526
  *nbStripesSoFarPtr += nbStripes;
4616
5527
  }
4617
5528
  }
4618
5529
 
5530
+ #ifndef XXH3_STREAM_USE_STACK
5531
+ # if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
5532
+ # define XXH3_STREAM_USE_STACK 1
5533
+ # endif
5534
+ #endif
4619
5535
  /*
4620
5536
  * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
4621
5537
  */
4622
5538
  XXH_FORCE_INLINE XXH_errorcode
4623
- XXH3_update(XXH3_state_t* state,
4624
- const xxh_u8* input, size_t len,
4625
- XXH3_f_accumulate_512 f_acc512,
5539
+ XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
5540
+ const xxh_u8* XXH_RESTRICT input, size_t len,
5541
+ XXH3_f_accumulate f_acc,
4626
5542
  XXH3_f_scrambleAcc f_scramble)
4627
5543
  {
4628
- if (input==NULL)
4629
- #if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
5544
+ if (input==NULL) {
5545
+ XXH_ASSERT(len == 0);
4630
5546
  return XXH_OK;
4631
- #else
4632
- return XXH_ERROR;
4633
- #endif
5547
+ }
4634
5548
 
5549
+ XXH_ASSERT(state != NULL);
4635
5550
  { const xxh_u8* const bEnd = input + len;
4636
5551
  const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4637
-
5552
+ #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
5553
+ /* For some reason, gcc and MSVC seem to suffer greatly
5554
+ * when operating on accumulators held directly in state.
5555
+ * Working in a stack-local copy seems to enable proper optimization.
5556
+ * clang, on the other hand, doesn't seem to need this trick */
5557
+ XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; memcpy(acc, state->acc, sizeof(acc));
5558
+ #else
5559
+ xxh_u64* XXH_RESTRICT const acc = state->acc;
5560
+ #endif
4638
5561
  state->totalLen += len;
4639
5562
  XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
4640
5563
 
4641
- if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) { /* fill in tmp buffer */
5564
+ /* small input : just fill in tmp buffer */
5565
+ if (state->bufferedSize + len <= XXH3_INTERNALBUFFER_SIZE) {
4642
5566
  XXH_memcpy(state->buffer + state->bufferedSize, input, len);
4643
5567
  state->bufferedSize += (XXH32_hash_t)len;
4644
5568
  return XXH_OK;
4645
5569
  }
4646
- /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4647
5570
 
5571
+ /* total input is now > XXH3_INTERNALBUFFER_SIZE */
4648
5572
  #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
4649
5573
  XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */
4650
5574
 
@@ -4656,45 +5580,82 @@ XXH3_update(XXH3_state_t* state,
4656
5580
  size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
4657
5581
  XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
4658
5582
  input += loadSize;
4659
- XXH3_consumeStripes(state->acc,
5583
+ XXH3_consumeStripes(acc,
4660
5584
  &state->nbStripesSoFar, state->nbStripesPerBlock,
4661
5585
  state->buffer, XXH3_INTERNALBUFFER_STRIPES,
4662
5586
  secret, state->secretLimit,
4663
- f_acc512, f_scramble);
5587
+ f_acc, f_scramble);
4664
5588
  state->bufferedSize = 0;
4665
5589
  }
4666
5590
  XXH_ASSERT(input < bEnd);
4667
5591
 
4668
- /* Consume input by a multiple of internal buffer size */
4669
- if (input+XXH3_INTERNALBUFFER_SIZE < bEnd) {
4670
- const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
4671
- do {
4672
- XXH3_consumeStripes(state->acc,
4673
- &state->nbStripesSoFar, state->nbStripesPerBlock,
4674
- input, XXH3_INTERNALBUFFER_STRIPES,
4675
- secret, state->secretLimit,
4676
- f_acc512, f_scramble);
4677
- input += XXH3_INTERNALBUFFER_SIZE;
4678
- } while (input<limit);
4679
- /* for last partial stripe */
4680
- memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5592
+ /* large input to consume : ingest per full block */
5593
+ if ((size_t)(bEnd - input) > state->nbStripesPerBlock * XXH_STRIPE_LEN) {
5594
+ size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
5595
+ XXH_ASSERT(state->nbStripesPerBlock >= state->nbStripesSoFar);
5596
+ /* join to current block's end */
5597
+ { size_t const nbStripesToEnd = state->nbStripesPerBlock - state->nbStripesSoFar;
5598
+ XXH_ASSERT(nbStripesToEnd <= nbStripes);
5599
+ f_acc(acc, input, secret + state->nbStripesSoFar * XXH_SECRET_CONSUME_RATE, nbStripesToEnd);
5600
+ f_scramble(acc, secret + state->secretLimit);
5601
+ state->nbStripesSoFar = 0;
5602
+ input += nbStripesToEnd * XXH_STRIPE_LEN;
5603
+ nbStripes -= nbStripesToEnd;
5604
+ }
5605
+ /* consume per entire blocks */
5606
+ while(nbStripes >= state->nbStripesPerBlock) {
5607
+ f_acc(acc, input, secret, state->nbStripesPerBlock);
5608
+ f_scramble(acc, secret + state->secretLimit);
5609
+ input += state->nbStripesPerBlock * XXH_STRIPE_LEN;
5610
+ nbStripes -= state->nbStripesPerBlock;
5611
+ }
5612
+ /* consume last partial block */
5613
+ f_acc(acc, input, secret, nbStripes);
5614
+ input += nbStripes * XXH_STRIPE_LEN;
5615
+ XXH_ASSERT(input < bEnd); /* at least some bytes left */
5616
+ state->nbStripesSoFar = nbStripes;
5617
+ /* buffer predecessor of last partial stripe */
5618
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5619
+ XXH_ASSERT(bEnd - input <= XXH_STRIPE_LEN);
5620
+ } else {
5621
+ /* content to consume <= block size */
5622
+ /* Consume input by a multiple of internal buffer size */
5623
+ if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
5624
+ const xxh_u8* const limit = bEnd - XXH3_INTERNALBUFFER_SIZE;
5625
+ do {
5626
+ XXH3_consumeStripes(acc,
5627
+ &state->nbStripesSoFar, state->nbStripesPerBlock,
5628
+ input, XXH3_INTERNALBUFFER_STRIPES,
5629
+ secret, state->secretLimit,
5630
+ f_acc, f_scramble);
5631
+ input += XXH3_INTERNALBUFFER_SIZE;
5632
+ } while (input<limit);
5633
+ /* buffer predecessor of last partial stripe */
5634
+ XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
5635
+ }
4681
5636
  }
4682
- XXH_ASSERT(input < bEnd);
4683
5637
 
4684
5638
  /* Some remaining input (always) : buffer it */
5639
+ XXH_ASSERT(input < bEnd);
5640
+ XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
5641
+ XXH_ASSERT(state->bufferedSize == 0);
4685
5642
  XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
4686
5643
  state->bufferedSize = (XXH32_hash_t)(bEnd-input);
5644
+ #if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
5645
+ /* save stack accumulators into state */
5646
+ memcpy(state->acc, acc, sizeof(acc));
5647
+ #endif
4687
5648
  }
4688
5649
 
4689
5650
  return XXH_OK;
4690
5651
  }
4691
5652
 
4692
- /*! @ingroup xxh3_family */
5653
+ /*! @ingroup XXH3_family */
4693
5654
  XXH_PUBLIC_API XXH_errorcode
4694
- XXH3_64bits_update(XXH3_state_t* state, const void* input, size_t len)
5655
+ XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
4695
5656
  {
4696
5657
  return XXH3_update(state, (const xxh_u8*)input, len,
4697
- XXH3_accumulate_512, XXH3_scrambleAcc);
5658
+ XXH3_accumulate, XXH3_scrambleAcc);
4698
5659
  }
4699
5660
 
4700
5661
 
@@ -4707,7 +5668,7 @@ XXH3_digest_long (XXH64_hash_t* acc,
4707
5668
  * Digest on a local copy. This way, the state remains unaltered, and it can
4708
5669
  * continue ingesting more input afterwards.
4709
5670
  */
4710
- memcpy(acc, state->acc, sizeof(state->acc));
5671
+ XXH_memcpy(acc, state->acc, sizeof(state->acc));
4711
5672
  if (state->bufferedSize >= XXH_STRIPE_LEN) {
4712
5673
  size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
4713
5674
  size_t nbStripesSoFar = state->nbStripesSoFar;
@@ -4715,7 +5676,7 @@ XXH3_digest_long (XXH64_hash_t* acc,
4715
5676
  &nbStripesSoFar, state->nbStripesPerBlock,
4716
5677
  state->buffer, nbStripes,
4717
5678
  secret, state->secretLimit,
4718
- XXH3_accumulate_512, XXH3_scrambleAcc);
5679
+ XXH3_accumulate, XXH3_scrambleAcc);
4719
5680
  /* last stripe */
4720
5681
  XXH3_accumulate_512(acc,
4721
5682
  state->buffer + state->bufferedSize - XXH_STRIPE_LEN,
@@ -4724,16 +5685,16 @@ XXH3_digest_long (XXH64_hash_t* acc,
4724
5685
  xxh_u8 lastStripe[XXH_STRIPE_LEN];
4725
5686
  size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
4726
5687
  XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */
4727
- memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
4728
- memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
5688
+ XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
5689
+ XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
4729
5690
  XXH3_accumulate_512(acc,
4730
5691
  lastStripe,
4731
5692
  secret + state->secretLimit - XXH_SECRET_LASTACC_START);
4732
5693
  }
4733
5694
  }
4734
5695
 
4735
- /*! @ingroup xxh3_family */
4736
- XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
5696
+ /*! @ingroup XXH3_family */
5697
+ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
4737
5698
  {
4738
5699
  const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
4739
5700
  if (state->totalLen > XXH3_MIDSIZE_MAX) {
@@ -4744,57 +5705,12 @@ XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (const XXH3_state_t* state)
4744
5705
  (xxh_u64)state->totalLen * XXH_PRIME64_1);
4745
5706
  }
4746
5707
  /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
4747
- if (state->seed)
5708
+ if (state->useSeed)
4748
5709
  return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
4749
5710
  return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
4750
5711
  secret, state->secretLimit + XXH_STRIPE_LEN);
4751
5712
  }
4752
-
4753
-
4754
- #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
4755
-
4756
- /*! @ingroup xxh3_family */
4757
- XXH_PUBLIC_API void
4758
- XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSeedSize)
4759
- {
4760
- XXH_ASSERT(secretBuffer != NULL);
4761
- if (customSeedSize == 0) {
4762
- memcpy(secretBuffer, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
4763
- return;
4764
- }
4765
- XXH_ASSERT(customSeed != NULL);
4766
-
4767
- { size_t const segmentSize = sizeof(XXH128_hash_t);
4768
- size_t const nbSegments = XXH_SECRET_DEFAULT_SIZE / segmentSize;
4769
- XXH128_canonical_t scrambler;
4770
- XXH64_hash_t seeds[12];
4771
- size_t segnb;
4772
- XXH_ASSERT(nbSegments == 12);
4773
- XXH_ASSERT(segmentSize * nbSegments == XXH_SECRET_DEFAULT_SIZE); /* exact multiple */
4774
- XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
4775
-
4776
- /*
4777
- * Copy customSeed to seeds[], truncating or repeating as necessary.
4778
- */
4779
- { size_t toFill = XXH_MIN(customSeedSize, sizeof(seeds));
4780
- size_t filled = toFill;
4781
- memcpy(seeds, customSeed, toFill);
4782
- while (filled < sizeof(seeds)) {
4783
- toFill = XXH_MIN(filled, sizeof(seeds) - filled);
4784
- memcpy((char*)seeds + filled, seeds, toFill);
4785
- filled += toFill;
4786
- } }
4787
-
4788
- /* generate secret */
4789
- memcpy(secretBuffer, &scrambler, sizeof(scrambler));
4790
- for (segnb=1; segnb < nbSegments; segnb++) {
4791
- size_t const segmentStart = segnb * segmentSize;
4792
- XXH128_canonical_t segment;
4793
- XXH128_canonicalFromHash(&segment,
4794
- XXH128(&scrambler, sizeof(scrambler), XXH_readLE64(seeds + segnb) + segnb) );
4795
- memcpy((char*)secretBuffer + segmentStart, &segment, sizeof(segment));
4796
- } }
4797
- }
5713
+ #endif /* !XXH_NO_STREAM */
4798
5714
 
4799
5715
 
4800
5716
  /* ==========================================
@@ -4814,7 +5730,7 @@ XXH3_generateSecret(void* secretBuffer, const void* customSeed, size_t customSee
4814
5730
  * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
4815
5731
  */
4816
5732
 
4817
- XXH_FORCE_INLINE XXH128_hash_t
5733
+ XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
4818
5734
  XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4819
5735
  {
4820
5736
  /* A doubled version of 1to3_64b with different constants. */
@@ -4843,7 +5759,7 @@ XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_
4843
5759
  }
4844
5760
  }
4845
5761
 
4846
- XXH_FORCE_INLINE XXH128_hash_t
5762
+ XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
4847
5763
  XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4848
5764
  {
4849
5765
  XXH_ASSERT(input != NULL);
@@ -4870,7 +5786,7 @@ XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_
4870
5786
  }
4871
5787
  }
4872
5788
 
4873
- XXH_FORCE_INLINE XXH128_hash_t
5789
+ XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
4874
5790
  XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4875
5791
  {
4876
5792
  XXH_ASSERT(input != NULL);
@@ -4945,7 +5861,7 @@ XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64
4945
5861
  /*
4946
5862
  * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
4947
5863
  */
4948
- XXH_FORCE_INLINE XXH128_hash_t
5864
+ XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
4949
5865
  XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
4950
5866
  {
4951
5867
  XXH_ASSERT(len <= 16);
@@ -4976,7 +5892,7 @@ XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
4976
5892
  }
4977
5893
 
4978
5894
 
4979
- XXH_FORCE_INLINE XXH128_hash_t
5895
+ XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
4980
5896
  XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
4981
5897
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
4982
5898
  XXH64_hash_t seed)
@@ -4987,6 +5903,16 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
4987
5903
  { XXH128_hash_t acc;
4988
5904
  acc.low64 = len * XXH_PRIME64_1;
4989
5905
  acc.high64 = 0;
5906
+
5907
+ #if XXH_SIZE_OPT >= 1
5908
+ {
5909
+ /* Smaller, but slightly slower. */
5910
+ unsigned int i = (unsigned int)(len - 1) / 32;
5911
+ do {
5912
+ acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
5913
+ } while (i-- != 0);
5914
+ }
5915
+ #else
4990
5916
  if (len > 32) {
4991
5917
  if (len > 64) {
4992
5918
  if (len > 96) {
@@ -4997,6 +5923,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
4997
5923
  acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
4998
5924
  }
4999
5925
  acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
5926
+ #endif
5000
5927
  { XXH128_hash_t h128;
5001
5928
  h128.low64 = acc.low64 + acc.high64;
5002
5929
  h128.high64 = (acc.low64 * XXH_PRIME64_1)
@@ -5009,7 +5936,7 @@ XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5009
5936
  }
5010
5937
  }
5011
5938
 
5012
- XXH_NO_INLINE XXH128_hash_t
5939
+ XXH_NO_INLINE XXH_PUREF XXH128_hash_t
5013
5940
  XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5014
5941
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
5015
5942
  XXH64_hash_t seed)
@@ -5018,25 +5945,34 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5018
5945
  XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
5019
5946
 
5020
5947
  { XXH128_hash_t acc;
5021
- int const nbRounds = (int)len / 32;
5022
- int i;
5948
+ unsigned i;
5023
5949
  acc.low64 = len * XXH_PRIME64_1;
5024
5950
  acc.high64 = 0;
5025
- for (i=0; i<4; i++) {
5951
+ /*
5952
+ * We set as `i` as offset + 32. We do this so that unchanged
5953
+ * `len` can be used as upper bound. This reaches a sweet spot
5954
+ * where both x86 and aarch64 get simple agen and good codegen
5955
+ * for the loop.
5956
+ */
5957
+ for (i = 32; i < 160; i += 32) {
5026
5958
  acc = XXH128_mix32B(acc,
5027
- input + (32 * i),
5028
- input + (32 * i) + 16,
5029
- secret + (32 * i),
5959
+ input + i - 32,
5960
+ input + i - 16,
5961
+ secret + i - 32,
5030
5962
  seed);
5031
5963
  }
5032
5964
  acc.low64 = XXH3_avalanche(acc.low64);
5033
5965
  acc.high64 = XXH3_avalanche(acc.high64);
5034
- XXH_ASSERT(nbRounds >= 4);
5035
- for (i=4 ; i < nbRounds; i++) {
5966
+ /*
5967
+ * NB: `i <= len` will duplicate the last 32-bytes if
5968
+ * len % 32 was zero. This is an unfortunate necessity to keep
5969
+ * the hash result stable.
5970
+ */
5971
+ for (i=160; i <= len; i += 32) {
5036
5972
  acc = XXH128_mix32B(acc,
5037
- input + (32 * i),
5038
- input + (32 * i) + 16,
5039
- secret + XXH3_MIDSIZE_STARTOFFSET + (32 * (i - 4)),
5973
+ input + i - 32,
5974
+ input + i - 16,
5975
+ secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
5040
5976
  seed);
5041
5977
  }
5042
5978
  /* last bytes */
@@ -5044,7 +5980,7 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5044
5980
  input + len - 16,
5045
5981
  input + len - 32,
5046
5982
  secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
5047
- 0ULL - seed);
5983
+ (XXH64_hash_t)0 - seed);
5048
5984
 
5049
5985
  { XXH128_hash_t h128;
5050
5986
  h128.low64 = acc.low64 + acc.high64;
@@ -5061,12 +5997,12 @@ XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
5061
5997
  XXH_FORCE_INLINE XXH128_hash_t
5062
5998
  XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
5063
5999
  const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
5064
- XXH3_f_accumulate_512 f_acc512,
6000
+ XXH3_f_accumulate f_acc,
5065
6001
  XXH3_f_scrambleAcc f_scramble)
5066
6002
  {
5067
6003
  XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
5068
6004
 
5069
- XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc512, f_scramble);
6005
+ XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
5070
6006
 
5071
6007
  /* converge into final hash */
5072
6008
  XXH_STATIC_ASSERT(sizeof(acc) == 64);
@@ -5084,46 +6020,47 @@ XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
5084
6020
  }
5085
6021
 
5086
6022
  /*
5087
- * It's important for performance that XXH3_hashLong is not inlined.
6023
+ * It's important for performance that XXH3_hashLong() is not inlined.
5088
6024
  */
5089
- XXH_NO_INLINE XXH128_hash_t
6025
+ XXH_NO_INLINE XXH_PUREF XXH128_hash_t
5090
6026
  XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
5091
6027
  XXH64_hash_t seed64,
5092
6028
  const void* XXH_RESTRICT secret, size_t secretLen)
5093
6029
  {
5094
6030
  (void)seed64; (void)secret; (void)secretLen;
5095
6031
  return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
5096
- XXH3_accumulate_512, XXH3_scrambleAcc);
6032
+ XXH3_accumulate, XXH3_scrambleAcc);
5097
6033
  }
5098
6034
 
5099
6035
  /*
5100
- * It's important for performance that XXH3_hashLong is not inlined.
6036
+ * It's important for performance to pass @p secretLen (when it's static)
6037
+ * to the compiler, so that it can properly optimize the vectorized loop.
5101
6038
  */
5102
- XXH_NO_INLINE XXH128_hash_t
6039
+ XXH_FORCE_INLINE XXH128_hash_t
5103
6040
  XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
5104
6041
  XXH64_hash_t seed64,
5105
6042
  const void* XXH_RESTRICT secret, size_t secretLen)
5106
6043
  {
5107
6044
  (void)seed64;
5108
6045
  return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
5109
- XXH3_accumulate_512, XXH3_scrambleAcc);
6046
+ XXH3_accumulate, XXH3_scrambleAcc);
5110
6047
  }
5111
6048
 
5112
6049
  XXH_FORCE_INLINE XXH128_hash_t
5113
6050
  XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
5114
6051
  XXH64_hash_t seed64,
5115
- XXH3_f_accumulate_512 f_acc512,
6052
+ XXH3_f_accumulate f_acc,
5116
6053
  XXH3_f_scrambleAcc f_scramble,
5117
6054
  XXH3_f_initCustomSecret f_initSec)
5118
6055
  {
5119
6056
  if (seed64 == 0)
5120
6057
  return XXH3_hashLong_128b_internal(input, len,
5121
6058
  XXH3_kSecret, sizeof(XXH3_kSecret),
5122
- f_acc512, f_scramble);
6059
+ f_acc, f_scramble);
5123
6060
  { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
5124
6061
  f_initSec(secret, seed64);
5125
6062
  return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
5126
- f_acc512, f_scramble);
6063
+ f_acc, f_scramble);
5127
6064
  }
5128
6065
  }
5129
6066
 
@@ -5136,7 +6073,7 @@ XXH3_hashLong_128b_withSeed(const void* input, size_t len,
5136
6073
  {
5137
6074
  (void)secret; (void)secretLen;
5138
6075
  return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
5139
- XXH3_accumulate_512, XXH3_scrambleAcc, XXH3_initCustomSecret);
6076
+ XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
5140
6077
  }
5141
6078
 
5142
6079
  typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
@@ -5166,88 +6103,94 @@ XXH3_128bits_internal(const void* input, size_t len,
5166
6103
 
5167
6104
  /* === Public XXH128 API === */
5168
6105
 
5169
- /*! @ingroup xxh3_family */
5170
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(const void* input, size_t len)
6106
+ /*! @ingroup XXH3_family */
6107
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
5171
6108
  {
5172
6109
  return XXH3_128bits_internal(input, len, 0,
5173
6110
  XXH3_kSecret, sizeof(XXH3_kSecret),
5174
6111
  XXH3_hashLong_128b_default);
5175
6112
  }
5176
6113
 
5177
- /*! @ingroup xxh3_family */
6114
+ /*! @ingroup XXH3_family */
5178
6115
  XXH_PUBLIC_API XXH128_hash_t
5179
- XXH3_128bits_withSecret(const void* input, size_t len, const void* secret, size_t secretSize)
6116
+ XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
5180
6117
  {
5181
6118
  return XXH3_128bits_internal(input, len, 0,
5182
6119
  (const xxh_u8*)secret, secretSize,
5183
6120
  XXH3_hashLong_128b_withSecret);
5184
6121
  }
5185
6122
 
5186
- /*! @ingroup xxh3_family */
6123
+ /*! @ingroup XXH3_family */
5187
6124
  XXH_PUBLIC_API XXH128_hash_t
5188
- XXH3_128bits_withSeed(const void* input, size_t len, XXH64_hash_t seed)
6125
+ XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
5189
6126
  {
5190
6127
  return XXH3_128bits_internal(input, len, seed,
5191
6128
  XXH3_kSecret, sizeof(XXH3_kSecret),
5192
6129
  XXH3_hashLong_128b_withSeed);
5193
6130
  }
5194
6131
 
5195
- /*! @ingroup xxh3_family */
6132
+ /*! @ingroup XXH3_family */
6133
+ XXH_PUBLIC_API XXH128_hash_t
6134
+ XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
6135
+ {
6136
+ if (len <= XXH3_MIDSIZE_MAX)
6137
+ return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
6138
+ return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
6139
+ }
6140
+
6141
+ /*! @ingroup XXH3_family */
5196
6142
  XXH_PUBLIC_API XXH128_hash_t
5197
- XXH128(const void* input, size_t len, XXH64_hash_t seed)
6143
+ XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
5198
6144
  {
5199
6145
  return XXH3_128bits_withSeed(input, len, seed);
5200
6146
  }
5201
6147
 
5202
6148
 
5203
6149
  /* === XXH3 128-bit streaming === */
5204
-
6150
+ #ifndef XXH_NO_STREAM
5205
6151
  /*
5206
- * All the functions are actually the same as for 64-bit streaming variant.
6152
+ * All initialization and update functions are identical to 64-bit streaming variant.
5207
6153
  * The only difference is the finalization routine.
5208
6154
  */
5209
6155
 
5210
- /*! @ingroup xxh3_family */
6156
+ /*! @ingroup XXH3_family */
5211
6157
  XXH_PUBLIC_API XXH_errorcode
5212
- XXH3_128bits_reset(XXH3_state_t* statePtr)
6158
+ XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
5213
6159
  {
5214
- if (statePtr == NULL) return XXH_ERROR;
5215
- XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
5216
- return XXH_OK;
6160
+ return XXH3_64bits_reset(statePtr);
5217
6161
  }
5218
6162
 
5219
- /*! @ingroup xxh3_family */
6163
+ /*! @ingroup XXH3_family */
5220
6164
  XXH_PUBLIC_API XXH_errorcode
5221
- XXH3_128bits_reset_withSecret(XXH3_state_t* statePtr, const void* secret, size_t secretSize)
6165
+ XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
5222
6166
  {
5223
- if (statePtr == NULL) return XXH_ERROR;
5224
- XXH3_reset_internal(statePtr, 0, secret, secretSize);
5225
- if (secret == NULL) return XXH_ERROR;
5226
- if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
5227
- return XXH_OK;
6167
+ return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
5228
6168
  }
5229
6169
 
5230
- /*! @ingroup xxh3_family */
6170
+ /*! @ingroup XXH3_family */
5231
6171
  XXH_PUBLIC_API XXH_errorcode
5232
- XXH3_128bits_reset_withSeed(XXH3_state_t* statePtr, XXH64_hash_t seed)
6172
+ XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
5233
6173
  {
5234
- if (statePtr == NULL) return XXH_ERROR;
5235
- if (seed==0) return XXH3_128bits_reset(statePtr);
5236
- if (seed != statePtr->seed) XXH3_initCustomSecret(statePtr->customSecret, seed);
5237
- XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
5238
- return XXH_OK;
6174
+ return XXH3_64bits_reset_withSeed(statePtr, seed);
6175
+ }
6176
+
6177
+ /*! @ingroup XXH3_family */
6178
+ XXH_PUBLIC_API XXH_errorcode
6179
+ XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
6180
+ {
6181
+ return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
5239
6182
  }
5240
6183
 
5241
- /*! @ingroup xxh3_family */
6184
+ /*! @ingroup XXH3_family */
5242
6185
  XXH_PUBLIC_API XXH_errorcode
5243
- XXH3_128bits_update(XXH3_state_t* state, const void* input, size_t len)
6186
+ XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
5244
6187
  {
5245
6188
  return XXH3_update(state, (const xxh_u8*)input, len,
5246
- XXH3_accumulate_512, XXH3_scrambleAcc);
6189
+ XXH3_accumulate, XXH3_scrambleAcc);
5247
6190
  }
5248
6191
 
5249
- /*! @ingroup xxh3_family */
5250
- XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
6192
+ /*! @ingroup XXH3_family */
6193
+ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
5251
6194
  {
5252
6195
  const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
5253
6196
  if (state->totalLen > XXH3_MIDSIZE_MAX) {
@@ -5271,13 +6214,13 @@ XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (const XXH3_state_t* state)
5271
6214
  return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
5272
6215
  secret, state->secretLimit + XXH_STRIPE_LEN);
5273
6216
  }
5274
-
6217
+ #endif /* !XXH_NO_STREAM */
5275
6218
  /* 128-bit utility functions */
5276
6219
 
5277
6220
  #include <string.h> /* memcmp, memcpy */
5278
6221
 
5279
6222
  /* return : 1 is equal, 0 if different */
5280
- /*! @ingroup xxh3_family */
6223
+ /*! @ingroup XXH3_family */
5281
6224
  XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
5282
6225
  {
5283
6226
  /* note : XXH128_hash_t is compact, it has no padding byte */
@@ -5285,11 +6228,11 @@ XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
5285
6228
  }
5286
6229
 
5287
6230
  /* This prototype is compatible with stdlib's qsort().
5288
- * return : >0 if *h128_1 > *h128_2
5289
- * <0 if *h128_1 < *h128_2
5290
- * =0 if *h128_1 == *h128_2 */
5291
- /*! @ingroup xxh3_family */
5292
- XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
6231
+ * @return : >0 if *h128_1 > *h128_2
6232
+ * <0 if *h128_1 < *h128_2
6233
+ * =0 if *h128_1 == *h128_2 */
6234
+ /*! @ingroup XXH3_family */
6235
+ XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
5293
6236
  {
5294
6237
  XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
5295
6238
  XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
@@ -5301,22 +6244,22 @@ XXH_PUBLIC_API int XXH128_cmp(const void* h128_1, const void* h128_2)
5301
6244
 
5302
6245
 
5303
6246
  /*====== Canonical representation ======*/
5304
- /*! @ingroup xxh3_family */
6247
+ /*! @ingroup XXH3_family */
5305
6248
  XXH_PUBLIC_API void
5306
- XXH128_canonicalFromHash(XXH128_canonical_t* dst, XXH128_hash_t hash)
6249
+ XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
5307
6250
  {
5308
6251
  XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
5309
6252
  if (XXH_CPU_LITTLE_ENDIAN) {
5310
6253
  hash.high64 = XXH_swap64(hash.high64);
5311
6254
  hash.low64 = XXH_swap64(hash.low64);
5312
6255
  }
5313
- memcpy(dst, &hash.high64, sizeof(hash.high64));
5314
- memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
6256
+ XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
6257
+ XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
5315
6258
  }
5316
6259
 
5317
- /*! @ingroup xxh3_family */
6260
+ /*! @ingroup XXH3_family */
5318
6261
  XXH_PUBLIC_API XXH128_hash_t
5319
- XXH128_hashFromCanonical(const XXH128_canonical_t* src)
6262
+ XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
5320
6263
  {
5321
6264
  XXH128_hash_t h;
5322
6265
  h.high64 = XXH_readBE64(src);
@@ -5324,10 +6267,81 @@ XXH128_hashFromCanonical(const XXH128_canonical_t* src)
5324
6267
  return h;
5325
6268
  }
5326
6269
 
6270
+
6271
+
6272
+ /* ==========================================
6273
+ * Secret generators
6274
+ * ==========================================
6275
+ */
6276
+ #define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
6277
+
6278
+ XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
6279
+ {
6280
+ XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
6281
+ XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
6282
+ }
6283
+
6284
+ /*! @ingroup XXH3_family */
6285
+ XXH_PUBLIC_API XXH_errorcode
6286
+ XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
6287
+ {
6288
+ #if (XXH_DEBUGLEVEL >= 1)
6289
+ XXH_ASSERT(secretBuffer != NULL);
6290
+ XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
6291
+ #else
6292
+ /* production mode, assert() are disabled */
6293
+ if (secretBuffer == NULL) return XXH_ERROR;
6294
+ if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
6295
+ #endif
6296
+
6297
+ if (customSeedSize == 0) {
6298
+ customSeed = XXH3_kSecret;
6299
+ customSeedSize = XXH_SECRET_DEFAULT_SIZE;
6300
+ }
6301
+ #if (XXH_DEBUGLEVEL >= 1)
6302
+ XXH_ASSERT(customSeed != NULL);
6303
+ #else
6304
+ if (customSeed == NULL) return XXH_ERROR;
6305
+ #endif
6306
+
6307
+ /* Fill secretBuffer with a copy of customSeed - repeat as needed */
6308
+ { size_t pos = 0;
6309
+ while (pos < secretSize) {
6310
+ size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
6311
+ memcpy((char*)secretBuffer + pos, customSeed, toCopy);
6312
+ pos += toCopy;
6313
+ } }
6314
+
6315
+ { size_t const nbSeg16 = secretSize / 16;
6316
+ size_t n;
6317
+ XXH128_canonical_t scrambler;
6318
+ XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
6319
+ for (n=0; n<nbSeg16; n++) {
6320
+ XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
6321
+ XXH3_combine16((char*)secretBuffer + n*16, h128);
6322
+ }
6323
+ /* last segment */
6324
+ XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
6325
+ }
6326
+ return XXH_OK;
6327
+ }
6328
+
6329
+ /*! @ingroup XXH3_family */
6330
+ XXH_PUBLIC_API void
6331
+ XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
6332
+ {
6333
+ XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
6334
+ XXH3_initCustomSecret(secret, seed);
6335
+ XXH_ASSERT(secretBuffer != NULL);
6336
+ memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
6337
+ }
6338
+
6339
+
6340
+
5327
6341
  /* Pop our optimization override from above */
5328
6342
  #if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
5329
6343
  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
5330
- && defined(__OPTIMIZE__) && !defined(__OPTIMIZE_SIZE__) /* respect -O0 and -Os */
6344
+ && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
5331
6345
  # pragma GCC pop_options
5332
6346
  #endif
5333
6347