demake 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/bin/demake +503 -579
  3. data/lib/apps/example/Makefile +374 -0
  4. data/lib/apps/example/demake/applications +3 -0
  5. data/lib/apps/example/demake/brief_description +1 -0
  6. data/lib/apps/example/demake/license +19 -0
  7. data/lib/apps/example/demake/settings.rb +62 -0
  8. data/lib/apps/example/demake/suggestion +1 -0
  9. data/lib/apps/example/demake/test-target.rb +8 -0
  10. data/lib/apps/example/src/goodbye.c +12 -0
  11. data/lib/apps/example/src/hello.c +12 -0
  12. data/lib/apps/example/src/string/string.c +12 -0
  13. data/lib/apps/example/src/string/string.h +7 -0
  14. data/lib/apps/oreo/Makefile +260 -0
  15. data/lib/apps/oreo/demake/applications +1 -0
  16. data/lib/apps/oreo/demake/brief_description +1 -0
  17. data/lib/apps/oreo/demake/license +19 -0
  18. data/lib/apps/oreo/demake/settings.rb +62 -0
  19. data/lib/apps/oreo/demake/suggestion +1 -0
  20. data/lib/apps/oreo/demake/test-target.rb +9 -0
  21. data/lib/apps/oreo/oreo_test.txt +1 -0
  22. data/lib/apps/oreo/src/defines.h +29 -0
  23. data/lib/apps/oreo/src/fast_read_file.h +259 -0
  24. data/lib/apps/oreo/src/oreo.c +102 -0
  25. data/lib/apps/oreo/src/typedefs.h +61 -0
  26. data/lib/data/libsrc/auto_bits.h +33 -0
  27. data/lib/data/libsrc/base.h +10 -0
  28. data/lib/data/libsrc/cpu_mark_check.h +267 -0
  29. data/lib/data/libsrc/defines.h +29 -0
  30. data/lib/data/libsrc/fast_read_file.h +259 -0
  31. data/lib/data/libsrc/fast_sha2.h +1840 -0
  32. data/lib/data/libsrc/parse_arguments.h +0 -0
  33. data/lib/data/libsrc/rb_library.c +140 -0
  34. data/lib/data/libsrc/typedefs.h +61 -0
  35. data/lib/template/build_target.rb +6 -0
  36. data/lib/template/clean_target.rb +6 -0
  37. data/lib/template/debug_executable_target.rb +14 -0
  38. data/lib/template/debug_library_target.rb +14 -0
  39. data/lib/template/debug_target.rb +4 -0
  40. data/lib/template/dependency_targets.rb +22 -0
  41. data/lib/template/executable_debug_target.rb +6 -0
  42. data/lib/template/executable_target.rb +14 -0
  43. data/lib/template/generic_dependency_targets.rb +26 -0
  44. data/lib/template/library_debug_target.rb +6 -0
  45. data/lib/template/library_target.rb +6 -0
  46. data/lib/template/license_target.rb +10 -0
  47. data/lib/template/link_library_target.rb +14 -0
  48. data/lib/template/strip_build.rb +8 -0
  49. metadata +54 -7
@@ -0,0 +1,1840 @@
+ /*
+
+ fast_sha2.h -- *Full Library* Header File
+
+ Uses version 2 of the Secure Hashing Algorithm to produce a message
+ digest as output.
+
+ Based on:
+ https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf
+
+ With help from:
+ https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
+ https://developer.arm.com/architectures/instruction-sets/intrinsics/
+
+ Validated using sha-validation.rb along with:
+ https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA256.pdf
+ https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/SHA512.pdf
+
+ 1 2 3 4 5 6 7 8
+ 12345678901234567890123456789012345678901234567890123456789012345678901234567890
+ ────────────────────────────────────────────────────────────────────────────────
+
+ */
+ #ifndef FAST_SHA2_H_Minaswan /* Prevents multiple inclusions */
+ #define FAST_SHA2_H_Minaswan
+ #define FAST_SHA2_VERSION "0.1.0"
+ #ifndef TYPEDEFS_H_Minaswan /* Header guard for typedefs.h */
+ #define TYPEDEFS_H_Minaswan /* The SAME as typedefs.h */
+ #include <stdint.h>
+
+ typedef uint8_t b8; /* Booleans */
+ typedef uint16_t b16;
+ typedef uint32_t b32;
+ typedef uint64_t b64;
+
+ typedef char c8; /* Characters */
+ typedef unsigned char uc8;
+
+ typedef uint8_t u8; /* Unsigned Numbers */
+ typedef uint16_t u16;
+ typedef uint32_t u32;
+ typedef uint64_t u64;
+
+ typedef int8_t i8; /* Signed Numbers */
+ typedef int16_t i16;
+ typedef int32_t i32;
+ typedef int64_t i64;
+
+ typedef float f32; /* Floating Point Numbers */
+ typedef double f64;
+ #endif /* TYPEDEFS_H_Minaswan */
+
+ extern const c8 *fast_sha2_cpu_instructions[];
+
+ /* Public Functions */
+ b8 fast_sha2_list_cpu_instructions(c8 *output, b32 hardware_support);
+ b32 fast_sha2_supported_by_cpu(void);
+ c8 *fast_sha2_256_digest(uc8 *hash);
+ c8 *fast_sha2_512_digest(uc8 *hash);
+ uc8 *fast_sha2_256(uc8 *string, b8 software_only);
+ uc8 *fast_sha2_512(uc8 *string, b8 software_only);
+ uc8 *fast_sha2_256_hash(uc8 *message, u64 length, b8 software_only);
+ uc8 *fast_sha2_512_hash(uc8 *message, u64 length, b8 software_only);
+ void fast_sha2_256_show_hash(uc8 *hash);
+ void fast_sha2_512_show_hash(uc8 *hash);
+
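A minimal consumer of this API might look like the sketch below (illustrative only, not shipped with the package; it assumes the single-header convention, where exactly one translation unit defines FAST_SHA2_IMPLEMENTATION before including the header, and that the returned buffers are heap-allocated and owned by the caller):

    /* demo.c -- hypothetical example */
    #define FAST_SHA2_IMPLEMENTATION /* pulls in stdio.h and stdlib.h below */
    #include "fast_sha2.h"

    int main(void)
    {
        uc8 *hash = fast_sha2_256((uc8 *)"abc", 0); /* 0 = allow hardware path */
        c8 *hex = fast_sha2_256_digest(hash);       /* 64-character hex string */

        printf("%s\n", hex); /* ba7816bf... for "abc", per the NIST examples */
        free(hex);
        free(hash);
        return(0);
    }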
+ #ifdef FAST_SHA2_IMPLEMENTATION /* Effectively fast_sha2.c */
+ #include <string.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+
+ #ifndef FALSE
+ #define FALSE 0
+ #endif
+
+ #ifndef TRUE
+ #define TRUE 1
+ #endif
+
+ #ifndef NULL
+ #define NULL ((void *) 0)
+ #endif
+
+ #ifndef ALIGN64
+ #define ALIGN64 __attribute__((aligned(64)))
+ #endif
+
+ #if defined(__x86_64__) || defined(_M_X64)
+ /* Used for Intel/AMD hardware acceleration */
+ /* Modern compiler -- supports __has_attribute */
+ #if defined(__has_attribute)
+ #if __has_attribute(target)
+ #define FAST_SHA2_ATTRIBUTES
+ #define FAST_SHA2_ATTRIBUTES_256 __attribute__((target("ssse3,avx2,sha,sse4.1")))
+ #if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 14) /* Needed for target sha512 */
+ #define FAST_SHA2_ATTRIBUTES_512 __attribute__((target("ssse3,avx2,sha512,avx512f,avx512bw,avx512vl,avx512vbmi2,avx512vnni")))
+ #else
+ #define FAST_SHA2_ATTRIBUTES_512 __attribute__((target("ssse3,avx2,avx512f,avx512bw,avx512vl,avx512vbmi2,avx512vnni")))
+ #endif
+ #endif
+ #elif defined(__GNUC__) && (__GNUC__ >= 8) /* Older compiler */
+ #define FAST_SHA2_ATTRIBUTES
+ #define FAST_SHA2_ATTRIBUTES_256 __attribute__((target("ssse3,avx2,sha,sse4.1")))
+ #if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 14) /* Needed for target sha512 */
+ #define FAST_SHA2_ATTRIBUTES_512 __attribute__((target("ssse3,avx2,sha512,avx512f,avx512bw,avx512vl,avx512vbmi2,avx512vnni")))
+ #else
+ #define FAST_SHA2_ATTRIBUTES_512 __attribute__((target("ssse3,avx2,avx512f,avx512bw,avx512vl,avx512vbmi2,avx512vnni")))
+ #endif
+ #endif
+ #if defined(FAST_SHA2_ATTRIBUTES)
+ #define FAST_SHA2_NI TRUE
+ #include <immintrin.h>
+ typedef __m128i v128;
+ typedef __m256i v256;
+ #endif
+ /* Used to test if the CPU supports the instructions */
+ #ifndef cpuid
+ #define cpuid(ax, bx, cx, dx) __asm__ __volatile__ ("cpuid" : \
+ "+a"(ax), "=b"(bx), "+c"(cx), "=d"(dx))
+ #endif
+ #ifndef xgetbv
+ #define xgetbv(idx, lo, hi) __asm__ __volatile__ ("xgetbv" : \
+ "=a"(lo), "=d"(hi) : "c"(idx));
+ #endif
+ #elif defined(__ARM_NEON) || defined(_AARCH64_NEON_H_)
+ /* Used for ARM hardware acceleration */
+ #if defined(__has_attribute)
+ #if __has_attribute(target)
+ #define FAST_SHA2_ATTRIBUTES
+ #define FAST_SHA2_ATTRIBUTES_256 __attribute__((target("+crypto,+sha3")))
+ #define FAST_SHA2_ATTRIBUTES_512 __attribute__((target("+crypto,+sha3")))
+ #endif
+ #elif defined(__GNUC__) && (__GNUC__ >= 8)
+ #define FAST_SHA2_ATTRIBUTES
+ #define FAST_SHA2_ATTRIBUTES_256 __attribute__((target("+crypto,+sha3")))
+ #define FAST_SHA2_ATTRIBUTES_512 __attribute__((target("+crypto,+sha3")))
+ #endif
+ #if defined(FAST_SHA2_ATTRIBUTES)
+ #define FAST_SHA2_ARM TRUE
+ #include <arm_neon.h>
+ #include <arm_acle.h>
+ #include <sys/auxv.h>
+ #include <asm/hwcap.h>
+ #if defined(__ARM_FEATURE_SVE)
+ #include <arm_sve.h>
+ typedef svuint32_t v32; /* Scalable 32-bit */
+ typedef svuint64_t v64; /* Scalable 64-bit */
+ #else
+ typedef uint32x4_t v128_32; /* 32-bit for SHA-256 */
+ typedef uint64x2_t v128_64; /* 64-bit for SHA-512 */
+ #endif
+ #endif
+ #elif defined(__riscv)
+ /* #if defined(__riscv_vector) || defined(HAVE_RISCV_V) */
+ #if defined(__has_attribute)
+ #if __has_attribute(target)
+ #define FAST_SHA2_ATTRIBUTES
+ #define FAST_SHA2_ATTRIBUTES_256
+ #define FAST_SHA2_ATTRIBUTES_512
+ /*
+ #define FAST_SHA2_ATTRIBUTES_V __attribute__((target("arch=+v")))
+ #define FAST_SHA2_ATTRIBUTES_256 __attribute__((target("arch=+v,+zvknha")))
+ #define FAST_SHA2_ATTRIBUTES_512 __attribute__((target("arch=+v,+zvknhb")))
+ */
+ #endif
+ #elif defined(__GNUC__) && (__GNUC__ >= 8)
+ #define FAST_SHA2_ATTRIBUTES
+ #define FAST_SHA2_ATTRIBUTES_256
+ #define FAST_SHA2_ATTRIBUTES_512
+ /*
+ #define FAST_SHA2_ATTRIBUTES_V __attribute__((target("arch=+v")))
+ #define FAST_SHA2_ATTRIBUTES_256 __attribute__((target("arch=+v,+zvknha")))
+ #define FAST_SHA2_ATTRIBUTES_512 __attribute__((target("arch=+v,+zvknhb")))
+ */
+ #endif
+ #if defined(FAST_SHA2_ATTRIBUTES)
+ #define FAST_SHA2_RISC_V TRUE
+ #include <riscv_vector.h>
+ typedef vuint32m1_t v32;
+ typedef vuint64m1_t v64;
+ /*
+ #include <asm/hwprobe.h>
+ #include <sys/hwprobe.h>
+ #include <sys/syscall.h>
+
+ typedef u32 v32 __attribute__((vector_size(32)));
+ typedef u64 v64 __attribute__((vector_size(32)));
+ */
+ #ifndef __riscv_hwprobe
+ #define FAST_SHA2_RISC_V_PROBE
+ struct riscv_hwprobe {
+ i64 key;
+ u64 value;
+ };
+ #define RISCV_HWPROBE_KEY_IMA_EXT_0 4
+
+ #define RISCV_HWPROBE_EXT_V (1ULL << 0)
+ #define RISCV_HWPROBE_EXT_ZBA (1ULL << 3)
+ #define RISCV_HWPROBE_EXT_ZBB (1ULL << 4)
+ #define RISCV_HWPROBE_EXT_ZBKB (1ULL << 8)
+ #define RISCV_HWPROBE_EXT_ZKNH (1ULL << 13)
+ #define RISCV_HWPROBE_EXT_ZVKNHA (1ULL << 34)
+ #define RISCV_HWPROBE_EXT_ZVKNHB (1ULL << 35)
+ #define __riscv_hwprobe(pairs, pair_count, cpu_count, cpus, flags) \
+ ({ long __ret; \
+ register void * __a0 __asm__("a0") = (void*)(pairs); \
+ register size_t __a1 __asm__("a1") = (size_t)(pair_count); \
+ register size_t __a2 __asm__("a2") = (size_t)(cpu_count); \
+ register void * __a3 __asm__("a3") = (void*)(cpus); \
+ register unsigned int __a4 __asm__("a4") = (unsigned int)(flags); \
+ register long __a7 __asm__("a7") = 258; \
+ __asm__ __volatile__ ("ecall" : "=r" (__ret) : "r" (__a0), \
+ "r" (__a1), "r" (__a2), "r" (__a3), "r" (__a4), "r" (__a7) \
+ : "memory"); __ret; })
+
+ /* long result = __riscv_hwprobe(kv_pair, 1, 0, NULL, 0); */
+
+ #endif
+ #endif
+ #endif
+
+ /* Main hash internal structures */
+ typedef struct fast_sha2_hash_context256 {
+ u64 length;
+ u32 hash[8];
+ u64 total;
+ uc8 buf[128];
+ } fast_sha2_hash_context256;
+
+ typedef struct fast_sha2_hash_context512 {
+ u64 length;
+ u64 hash[8];
+ u64 total;
+ uc8 buf[256];
+ } fast_sha2_hash_context512;
+
+ /* Padding Demarcation */
+ #define FAST_SHA2_PAD_MARK 0x80
+
+ /* Shift Right */
+ #define FAST_SHA2_SHR(x, n) ((x) >> (n))
+
+ /* Rotate Right */
+ #define FAST_SHA2_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+ #define FAST_SHA2_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
+
+ /* SHA2 256+512 Functions Choice & Majority */
+ #define FAST_SHA2_CH(x, y, z) (((x) & (y)) ^ ((~(x)) & (z)))
+ #define FAST_SHA2_MAJ(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
+
+ /* Sigma0 & 1 and Gamma0 & 1 for SHA-256 */
+ #define FAST_SHA2_S0_32(x) (FAST_SHA2_ROTR32((x), 2) ^ FAST_SHA2_ROTR32((x), 13) ^ FAST_SHA2_ROTR32((x), 22))
+ #define FAST_SHA2_S1_32(x) (FAST_SHA2_ROTR32((x), 6) ^ FAST_SHA2_ROTR32((x), 11) ^ FAST_SHA2_ROTR32((x), 25))
+ #define FAST_SHA2_G0_32(x) (FAST_SHA2_ROTR32((x), 7) ^ FAST_SHA2_ROTR32((x), 18) ^ FAST_SHA2_SHR((x), 3))
+ #define FAST_SHA2_G1_32(x) (FAST_SHA2_ROTR32((x), 17) ^ FAST_SHA2_ROTR32((x), 19) ^ FAST_SHA2_SHR((x), 10))
+
+ /* Sigma0 & 1 and Gamma0 & 1 for SHA-512 */
+ #define FAST_SHA2_S0_64(x) (FAST_SHA2_ROTR64((x), 28) ^ FAST_SHA2_ROTR64((x), 34) ^ FAST_SHA2_ROTR64((x), 39))
+ #define FAST_SHA2_S1_64(x) (FAST_SHA2_ROTR64((x), 14) ^ FAST_SHA2_ROTR64((x), 18) ^ FAST_SHA2_ROTR64((x), 41))
+ #define FAST_SHA2_G0_64(x) (FAST_SHA2_ROTR64((x), 1) ^ FAST_SHA2_ROTR64((x), 8) ^ FAST_SHA2_SHR((x), 7))
+ #define FAST_SHA2_G1_64(x) (FAST_SHA2_ROTR64((x), 19) ^ FAST_SHA2_ROTR64((x), 61) ^ FAST_SHA2_SHR((x), 6))
+
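The rotate macros implement the FIPS 180-4 Sigma (S) and Gamma (G) functions defined above them. Note that FAST_SHA2_ROTR32(x, n) is only well defined for 1 <= n <= 31 (a shift by the full word width is undefined behavior in C), which every use here respects. A hypothetical stand-alone sanity check (not part of the library):

    #include <assert.h>

    static void rotr_sanity(void)
    {
        /* the low bit wraps around to the top bit */
        assert(FAST_SHA2_ROTR32(0x00000001u, 1) == 0x80000000u);
        assert(FAST_SHA2_ROTR64(0x00000001ULL, 1) == 0x8000000000000000ULL);
    }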
+ /* Conversion */
+ #define FAST_SHA2_UNPACK32(x, s) \
+ { \
+ *((s) + 3) = (uc8) ((x)); \
+ *((s) + 2) = (uc8) ((x) >> 8); \
+ *((s) + 1) = (uc8) ((x) >> 16); \
+ *((s) + 0) = (uc8) ((x) >> 24); \
+ }
+
+ #define FAST_SHA2_PACK32(x, s) \
+ { \
+ *(x) = ((u32) *((s) + 3)) | \
+ ((u32) *((s) + 2) << 8) | \
+ ((u32) *((s) + 1) << 16) | \
+ ((u32) *((s) + 0) << 24); \
+ }
+
+ #define FAST_SHA2_UNPACK64(x, s) \
+ { \
+ *((s) + 7) = (uc8) ((x)); \
+ *((s) + 6) = (uc8) ((x) >> 8); \
+ *((s) + 5) = (uc8) ((x) >> 16); \
+ *((s) + 4) = (uc8) ((x) >> 24); \
+ *((s) + 3) = (uc8) ((x) >> 32); \
+ *((s) + 2) = (uc8) ((x) >> 40); \
+ *((s) + 1) = (uc8) ((x) >> 48); \
+ *((s) + 0) = (uc8) ((x) >> 56); \
+ }
+
+ #define FAST_SHA2_PACK64(x, s) \
+ { \
+ *(x) = ((u64) *((s) + 7)) | \
+ ((u64) *((s) + 6) << 8) | \
+ ((u64) *((s) + 5) << 16) | \
+ ((u64) *((s) + 4) << 24) | \
+ ((u64) *((s) + 3) << 32) | \
+ ((u64) *((s) + 2) << 40) | \
+ ((u64) *((s) + 1) << 48) | \
+ ((u64) *((s) + 0) << 56); \
+ }
+
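These macros implement the big-endian byte order that FIPS 180-4 prescribes for message words and for the final digest, independent of host endianness. A worked round-trip (illustrative only):

    uc8 bytes[4] = { 0x01, 0x02, 0x03, 0x04 };
    u32 word = 0;

    FAST_SHA2_PACK32(&word, bytes);  /* word == 0x01020304 on any host */
    FAST_SHA2_UNPACK32(word, bytes); /* writes back 01 02 03 04 */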
+ ALIGN64
+ static const u32 fast_sha2_256_k_table[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ };
+
+ ALIGN64
+ static const u32 fast_sha2_256_initial_states[8] = {
+ 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+ };
+
+ ALIGN64
+ static const u64 fast_sha2_512_k_table[80] = {
+ 0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f,
+ 0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019,
+ 0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242,
+ 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
+ 0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
+ 0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
+ 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275,
+ 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
+ 0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f,
+ 0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
+ 0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc,
+ 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
+ 0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6,
+ 0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
+ 0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
+ 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
+ 0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99,
+ 0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
+ 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc,
+ 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
+ 0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915,
+ 0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207,
+ 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba,
+ 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
+ 0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
+ 0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
+ 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+ };
+
+ ALIGN64
+ static const u64 fast_sha2_512_initial_states[8] = {
+ 0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b,
+ 0xa54ff53a5f1d36f1, 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
+ 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
+ };
+
+ #define FAST_SHA2_BIT_IS_SET(field, bit) ((field) & (b32)(bit))
+ #define FAST_SHA2_CPU_SUPPORTS_SSSE3 (1 << 0)
+ #define FAST_SHA2_CPU_SUPPORTS_AVX2 (1 << 1)
+ #define FAST_SHA2_CPU_SUPPORTS_SHA256 (1 << 2)
+ #define FAST_SHA2_CPU_SUPPORTS_SHA512 (1 << 3)
+ #define FAST_SHA2_CPU_SUPPORTS_AVX512 (1 << 4)
+ #define FAST_SHA2_CPU_SUPPORTS_RVV1 (1 << 5)
+ #define FAST_SHA2_CPU_SUPPORTS_ZBA (1 << 6)
+ #define FAST_SHA2_CPU_SUPPORTS_ZBB (1 << 7)
+ #define FAST_SHA2_CPU_SUPPORTS_ZBKB (1 << 8)
+ #define FAST_SHA2_CPU_SUPPORTS_ZKNH (1 << 9)
+ #define FAST_SHA2_CPU_SUPPORTS_ZVKNHA (1 << 10)
+ #define FAST_SHA2_CPU_SUPPORTS_ZVKNHB (1 << 11)
+
+ const c8 *fast_sha2_cpu_instructions[] = {
+ "SSSE3",
+ "AVX2",
+ "SHA256",
+ "SHA512",
+ "AVX512",
+ "RVV1",
+ "ZBA",
+ "ZBB",
+ "ZBKB",
+ "ZKNH",
+ "ZVKNHA",
+ "ZVKNHB",
+ NULL
+ };
+
+ b8 fast_sha2_list_cpu_instructions(c8 *output, b32 hardware_support)
+ {
+ u32 i = 0;
+ b8 found = FALSE;
+
+ *output = '\0';
+ while(fast_sha2_cpu_instructions[i] != NULL) {
+ if(FAST_SHA2_BIT_IS_SET(hardware_support, (1 << i))) {
+ if(found) {
+ strcat(output, ", ");
+ if(fast_sha2_cpu_instructions[i])
+ strcat(output, fast_sha2_cpu_instructions[i]);
+ else
+ strcat(output, "UNDEFINED");
+ } else {
+ found = TRUE;
+ if(fast_sha2_cpu_instructions[i])
+ strcpy(output, fast_sha2_cpu_instructions[i]);
+ else
+ strcpy(output, "UNDEFINED");
+ }
+ }
+ i++;
+ }
+ return(found);
+ }
+
+ b32 fast_sha2_supported_by_cpu(void)
+ {
+ b32 hardware_support = 0;
+
+ /* Check for x86 SHA Instructions */
+ #if defined(FAST_SHA2_NI)
+ u32 a = 0, b = 0, c = 0, d = 0;
+
+ /* SSSE3: CPUID.1.ECX[9] */
+ a = 1; c = 0;
+ cpuid(a, b, c, d);
+ if((c >> 9) & 1)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_SSSE3;
+ a = c = 0;
+ cpuid(a, b, c, d);
+ if(a < 7) /* Bail if leaf 7 isn't supported */
+ return(hardware_support);
+ /* SHA256 CPUID.7.0.EBX[29] / EAX = 7, ECX = 0 */
+ a = 7, c = 0;
+ cpuid(a, b, c, d);
+ if((b >> 29) & 1)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_SHA256;
+
+ /* SHA512 CPUID.7.1.EAX[0] / EAX = 7, ECX = 1 */
+ if(a > 0) {
+ a = 7, c = 1;
+ cpuid(a, b, c, d);
+ if(a & 1)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_SHA512;
+ }
+ /* AVX2: EBX[5] + XCR0[2:1] */
+ a = 7, c = 0;
+ cpuid(a, b, c, d);
+ if((b >> 5) & 1) {
+ a = 1; b = 0; c = 0; d = 0;
+ cpuid(a, b, c, d);
+ if((c >> 27) & 1) {
+ xgetbv(0, a, d); /* Needs OS Support too */
+ if((((u64)d << 32) | (a & 0x6)) == 0x6)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_AVX2;
+ }
+ }
+ /* AVX-512: EBX[16] + XCR0[7:5] */
+ a = 7, c = 0;
+ cpuid(a, b, c, d);
+ if((b >> 16) & 1) {
+ a = 1; b = 0; c = 0; d = 0;
+ cpuid(a, b, c, d);
+ if((c >> 27) & 1) {
+ xgetbv(0, a, d); /* Needs OS Support too */
+ if((((u64)d << 32) | (a & 0xE6)) == 0xE6)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_AVX512;
+ }
+ }
+ /* Check for ARM SHA Instructions */
+ #elif defined(FAST_SHA2_ARM)
+ #if defined(HWCAP_SHA2)
+ u64 hwcaps = getauxval(AT_HWCAP);
+ if(hwcaps & HWCAP_SHA2)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_SHA256;
+ #if defined(HWCAP_SHA512)
+ if(hwcaps & HWCAP_SHA512)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_SHA512;
+ #endif
+ #endif
+ #elif defined(FAST_SHA2_RISC_V)
+ struct riscv_hwprobe kv_pair[] = {
+ { RISCV_HWPROBE_KEY_IMA_EXT_0, 0 }
+ };
+ /* printf("rv = %ld\n", __riscv_hwprobe(kv_pair, 1, 0, NULL, 0)); */
+ /* if(__riscv_hwprobe(kv_pair, 1, 0, NULL, 0) == 0) { */
+
+ long result = __riscv_hwprobe(kv_pair, 1, 0, NULL, 0);
+ printf("hardware probe = %ld %ld %lu\n", result, kv_pair[0].key, kv_pair[0].value);
+ if(kv_pair[0].value != 0) {
+ if(kv_pair[0].value & RISCV_HWPROBE_EXT_V)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_RVV1;
+ if(kv_pair[0].value & RISCV_HWPROBE_EXT_ZBA)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_ZBA;
+ if(kv_pair[0].value & RISCV_HWPROBE_EXT_ZBB)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_ZBB;
+ if(kv_pair[0].value & RISCV_HWPROBE_EXT_ZBKB)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_ZBKB;
+ if(kv_pair[0].value & RISCV_HWPROBE_EXT_ZKNH)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_ZKNH;
+ if(kv_pair[0].value & RISCV_HWPROBE_EXT_ZVKNHA)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_ZVKNHA;
+ if(kv_pair[0].value & RISCV_HWPROBE_EXT_ZVKNHB)
+ hardware_support |= FAST_SHA2_CPU_SUPPORTS_ZVKNHB;
+ }
+ #endif
+
+ return(hardware_support);
+ }
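The returned bit field pairs naturally with fast_sha2_list_cpu_instructions for reporting; a sketch (the 256-byte buffer is an assumption, comfortably larger than the twelve short names above):

    b32 support = fast_sha2_supported_by_cpu();
    c8 names[256];

    if(fast_sha2_list_cpu_instructions(names, support))
        printf("CPU acceleration: %s\n", names);
    else
        printf("no SHA2 CPU extensions; the software path will be used\n");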
+
+ /*
+ Warning: unused function -- if neither FAST_SHA2_NI nor FAST_SHA2_ARM hardware
+ support is available, this function is empty. In that case,
+ fast_sha2_supported_by_cpu will return false, so the empty function should
+ never be called.
+
+ However, this *may* cause compiler warnings.
+ */
+ #if defined(FAST_SHA2_ATTRIBUTES)
+ FAST_SHA2_ATTRIBUTES_256
+ #endif
+ void fast_sha2_256_process_hardware_hash(fast_sha2_hash_context256 *context,
+ uc8 *message, u64 n)
+ {
+ #if defined(FAST_SHA2_ARM)
+ v128_32 state0, state1, save0, save1, keys;
+ v128_32 msg, tmp, tmsg0, tmsg1, tmsg2, tmsg3;
+ #elif defined(FAST_SHA2_NI)
+ v128 state0, state1, save0, save1, keys;
+ v128 msg, tmp, tmsg0, tmsg1, tmsg2, tmsg3;
+ const v128 shuffle_mask = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10,
+ 11, 4, 5, 6, 7, 0, 1, 2, 3);
+ #elif defined(FAST_SHA2_RISC_V)
+ v32 state0, state1, save0, save1, keys;
+ v32 msg, tmp, tmsg0, tmsg1, tmsg2, tmsg3;
+ u64 vlmax = __riscv_vsetvl_e32m1(0); /* number of 32-bit elements */
+ #endif
+ #if defined(FAST_SHA2_NI) || defined(FAST_SHA2_ARM) || defined(FAST_SHA2_RISC_V)
+ u32 s0[16] = {0}, s1[16] = {0}, i = 0;
+ uc8 *p = NULL;
+ #endif
+ /* Initialize state from context */
+ #if defined(FAST_SHA2_NI)
+ state0 = _mm_set_epi32((i32)context->hash[0], (i32)context->hash[1],
+ (i32)context->hash[4], (i32)context->hash[5]);
+ state1 = _mm_set_epi32((i32)context->hash[2], (i32)context->hash[3],
+ (i32)context->hash[6], (i32)context->hash[7]);
+ #elif defined(FAST_SHA2_ARM)
+ state0 = vld1q_u32((const u32 *)&context->hash[0]);
+ state1 = vld1q_u32((const u32 *)&context->hash[4]);
+ #elif defined(FAST_SHA2_RISC_V)
+
+ printf("Things are getting RISC-VY!\n");
+
+ #endif
+
+
+ /*
+
+ // We want to process 'n' message blocks in parallel
+ void sha256_vector_parallel(uint32_t *data, size_t n) {
+ for (size_t vl; n > 0; n -= vl, data += (vl * 16)) {
+ // 1. Ask the hardware: "How many 32-bit elements can you handle at once?"
+ // This sets 'vl' to the hardware's max capacity (VLMAX) or 'n', whichever is smaller.
+ vl = __riscv_vsetvl_e32m1(n);
+
+ // 2. Load your data into the vector registers
+ // If VLEN=128, vl is 4. If VLEN=256, vl is 8.
+ vuint32m1_t v_data = __riscv_vle32_v_u32m1(data, vl);
+
+ // ... Perform SHA-256 rounds here ...
+ }
+ }
+
+ */
+
+ #if defined(FAST_SHA2_NI) || defined(FAST_SHA2_ARM) || defined(FAST_SHA2_RISC_V)
+ for(i = 0; i < n; i++) {
+ p = message + (i << 6);
+
+ save0 = state0; /* Save for next round */
+ save1 = state1;
+
+
+
+
+ #endif
+
+ /* key # corresponds to round number */
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[3], (i32)fast_sha2_256_k_table[2],
+ (i32)fast_sha2_256_k_table[1], (i32)fast_sha2_256_k_table[0]);
+ msg = _mm_loadu_si128((v128 *)p);
+ tmsg0 = _mm_shuffle_epi8(msg, shuffle_mask);
+ msg = _mm_add_epi32(tmsg0, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[0]);
+ msg = vld1q_u32((const u32 *)p);
+ tmsg0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg)));
+ msg = vaddq_u32(tmsg0, keys);
+ tmp = state1;
+ /* efgh = vsha256h2q_u32(efgh, abcd, wk); */
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ /* abcd = vsha256hq_u32(abcd, efgh, wk); */
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ #elif defined(FAST_SHA2_RISC_V)
+
+ /*
+ // Assume 'state' vectors (vuint32m1_t) hold SHA-256 variables {a,b,c,d} and {e,f,g,h}
+ // 'W_K' is a vector containing (Message Word W[i] + Round Constant K[i])
+ for (int i = 0; i < 64; i += 4) {
+ // 1. Prepare message schedule expansion for future rounds if needed
+ // msg0 = __riscv_vsha2ms_vv_u32m1(msg0, msg1, msg2, vl);
+
+ // 2. Perform compression rounds (2 rounds per instruction pair)
+ // Round i and i+1
+ state0 = __riscv_vsha2cl_vv_u32m1(state0, state1, W_K[i], vl);
+ state1 = __riscv_vsha2ch_vv_u32m1(state1, state0, W_K[i], vl);
+
+ // Round i+2 and i+3
+ state0 = __riscv_vsha2cl_vv_u32m1(state0, state1, W_K[i+2], vl);
+ state1 = __riscv_vsha2ch_vv_u32m1(state1, state0, W_K[i+2], vl);
+ }
+ */
+
+
+
+ #endif
+
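Each of the nearly identical blocks that follow computes four of the 64 SHA-256 rounds: _mm_sha256rnds2_epu32 performs two rounds using the two W+K words in the low 64 bits of its third operand, and the 0x0E shuffle moves the upper two words down for the next pair. The message-schedule updates (_mm_sha256msg1_epu32, _mm_sha256msg2_epu32 and the alignr/add steps) are interleaved to prepare W for later rounds. A hypothetical macro condensing just the four-round core of the x86 pattern (an illustration, not part of the library):

    /* hypothetical: 4 rounds starting at round r, with W[r..r+3] in cur */
    #define SHA256_4ROUNDS_NI(cur, r)                                     \
        keys = _mm_set_epi32(                                             \
            (i32)fast_sha2_256_k_table[(r) + 3],                          \
            (i32)fast_sha2_256_k_table[(r) + 2],                          \
            (i32)fast_sha2_256_k_table[(r) + 1],                          \
            (i32)fast_sha2_256_k_table[(r)]);                             \
        msg = _mm_add_epi32((cur), keys);      /* W + K for 4 rounds */   \
        state1 = _mm_sha256rnds2_epu32(state1, state0, msg); /* r, r+1 */ \
        msg = _mm_shuffle_epi32(msg, 0x0E);    /* upper 2 words down */   \
        state0 = _mm_sha256rnds2_epu32(state0, state1, msg) /* r+2, r+3 */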
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[7], (i32)fast_sha2_256_k_table[6],
+ (i32)fast_sha2_256_k_table[5], (i32)fast_sha2_256_k_table[4]);
+ msg = _mm_loadu_si128((v128 *)(p + 16));
+ tmsg1 = _mm_shuffle_epi8(msg, shuffle_mask);
+ msg = _mm_add_epi32(tmsg1, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg0 = _mm_sha256msg1_epu32(tmsg0, tmsg1);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[4]);
+ msg = vld1q_u32((const u32 *)(p + 16));
+ tmsg1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg)));
+ msg = vaddq_u32(tmsg1, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg0 = vsha256su0q_u32(tmsg0, tmsg1);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[11], (i32)fast_sha2_256_k_table[10],
+ (i32)fast_sha2_256_k_table[9], (i32)fast_sha2_256_k_table[8]);
+ msg = _mm_loadu_si128((v128 *)(p + 32));
+ tmsg2 = _mm_shuffle_epi8(msg, shuffle_mask);
+ msg = _mm_add_epi32(tmsg2, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg1 = _mm_sha256msg1_epu32(tmsg1, tmsg2);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[8]);
+ msg = vld1q_u32((const u32 *)(p + 32));
+ tmsg2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg)));
+ msg = vaddq_u32(tmsg2, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg1 = vsha256su0q_u32(tmsg1, tmsg2);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[15], (i32)fast_sha2_256_k_table[14],
+ (i32)fast_sha2_256_k_table[13], (i32)fast_sha2_256_k_table[12]);
+ msg = _mm_loadu_si128((v128 *)(p + 48));
+ tmsg3 = _mm_shuffle_epi8(msg, shuffle_mask);
+ msg = _mm_add_epi32(tmsg3, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg3, tmsg2, 4);
+ tmsg0 = _mm_add_epi32(tmsg0, tmp);
+ tmsg0 = _mm_sha256msg2_epu32(tmsg0, tmsg3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg2 = _mm_sha256msg1_epu32(tmsg2, tmsg3);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[12]);
+ msg = vld1q_u32((const u32 *)(p + 48));
+ tmsg3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(msg)));
+ msg = vaddq_u32(tmsg3, keys);
+ tmsg0 = vsha256su1q_u32(tmsg0, tmsg2, tmsg3);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg2 = vsha256su0q_u32(tmsg2, tmsg3);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[19], (i32)fast_sha2_256_k_table[18],
+ (i32)fast_sha2_256_k_table[17], (i32)fast_sha2_256_k_table[16]);
+ msg = _mm_add_epi32(tmsg0, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg0, tmsg3, 4);
+ tmsg1 = _mm_add_epi32(tmsg1, tmp);
+ tmsg1 = _mm_sha256msg2_epu32(tmsg1, tmsg0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg3 = _mm_sha256msg1_epu32(tmsg3, tmsg0);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[16]);
+ msg = vaddq_u32(tmsg0, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg1 = vsha256su1q_u32(tmsg1, tmsg3, tmsg0);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg3 = vsha256su0q_u32(tmsg3, tmsg0);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[23], (i32)fast_sha2_256_k_table[22],
+ (i32)fast_sha2_256_k_table[21], (i32)fast_sha2_256_k_table[20]);
+ msg = _mm_add_epi32(tmsg1, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg1, tmsg0, 4);
+ tmsg2 = _mm_add_epi32(tmsg2, tmp);
+ tmsg2 = _mm_sha256msg2_epu32(tmsg2, tmsg1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg0 = _mm_sha256msg1_epu32(tmsg0, tmsg1);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[20]);
+ msg = vaddq_u32(tmsg1, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg2 = vsha256su1q_u32(tmsg2, tmsg0, tmsg1);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg0 = vsha256su0q_u32(tmsg0, tmsg1);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[27], (i32)fast_sha2_256_k_table[26],
+ (i32)fast_sha2_256_k_table[25], (i32)fast_sha2_256_k_table[24]);
+ msg = _mm_add_epi32(tmsg2, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg2, tmsg1, 4);
+ tmsg3 = _mm_add_epi32(tmsg3, tmp);
+ tmsg3 = _mm_sha256msg2_epu32(tmsg3, tmsg2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg1 = _mm_sha256msg1_epu32(tmsg1, tmsg2);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[24]);
+ msg = vaddq_u32(tmsg2, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg3 = vsha256su1q_u32(tmsg3, tmsg1, tmsg2);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg1 = vsha256su0q_u32(tmsg1, tmsg2);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[31], (i32)fast_sha2_256_k_table[30],
+ (i32)fast_sha2_256_k_table[29], (i32)fast_sha2_256_k_table[28]);
+ msg = _mm_add_epi32(tmsg3, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg3, tmsg2, 4);
+ tmsg0 = _mm_add_epi32(tmsg0, tmp);
+ tmsg0 = _mm_sha256msg2_epu32(tmsg0, tmsg3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg2 = _mm_sha256msg1_epu32(tmsg2, tmsg3);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[28]);
+ msg = vaddq_u32(tmsg3, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg0 = vsha256su1q_u32(tmsg0, tmsg2, tmsg3);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg2 = vsha256su0q_u32(tmsg2, tmsg3);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[35], (i32)fast_sha2_256_k_table[34],
+ (i32)fast_sha2_256_k_table[33], (i32)fast_sha2_256_k_table[32]);
+ msg = _mm_add_epi32(tmsg0, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg0, tmsg3, 4);
+ tmsg1 = _mm_add_epi32(tmsg1, tmp);
+ tmsg1 = _mm_sha256msg2_epu32(tmsg1, tmsg0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg3 = _mm_sha256msg1_epu32(tmsg3, tmsg0);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[32]);
+ msg = vaddq_u32(tmsg0, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg1 = vsha256su1q_u32(tmsg1, tmsg3, tmsg0);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg3 = vsha256su0q_u32(tmsg3, tmsg0);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[39], (i32)fast_sha2_256_k_table[38],
+ (i32)fast_sha2_256_k_table[37], (i32)fast_sha2_256_k_table[36]);
+ msg = _mm_add_epi32(tmsg1, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg1, tmsg0, 4);
+ tmsg2 = _mm_add_epi32(tmsg2, tmp);
+ tmsg2 = _mm_sha256msg2_epu32(tmsg2, tmsg1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg0 = _mm_sha256msg1_epu32(tmsg0, tmsg1);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[36]);
+ msg = vaddq_u32(tmsg1, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg2 = vsha256su1q_u32(tmsg2, tmsg0, tmsg1);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg0 = vsha256su0q_u32(tmsg0, tmsg1);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[43], (i32)fast_sha2_256_k_table[42],
+ (i32)fast_sha2_256_k_table[41], (i32)fast_sha2_256_k_table[40]);
+ msg = _mm_add_epi32(tmsg2, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg2, tmsg1, 4);
+ tmsg3 = _mm_add_epi32(tmsg3, tmp);
+ tmsg3 = _mm_sha256msg2_epu32(tmsg3, tmsg2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg1 = _mm_sha256msg1_epu32(tmsg1, tmsg2);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[40]);
+ msg = vaddq_u32(tmsg2, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg3 = vsha256su1q_u32(tmsg3, tmsg1, tmsg2);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg1 = vsha256su0q_u32(tmsg1, tmsg2);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[47], (i32)fast_sha2_256_k_table[46],
+ (i32)fast_sha2_256_k_table[45], (i32)fast_sha2_256_k_table[44]);
+ msg = _mm_add_epi32(tmsg3, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg3, tmsg2, 4);
+ tmsg0 = _mm_add_epi32(tmsg0, tmp);
+ tmsg0 = _mm_sha256msg2_epu32(tmsg0, tmsg3);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg2 = _mm_sha256msg1_epu32(tmsg2, tmsg3);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[44]);
+ msg = vaddq_u32(tmsg3, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg0 = vsha256su1q_u32(tmsg0, tmsg2, tmsg3);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg2 = vsha256su0q_u32(tmsg2, tmsg3);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[51], (i32)fast_sha2_256_k_table[50],
+ (i32)fast_sha2_256_k_table[49], (i32)fast_sha2_256_k_table[48]);
+ msg = _mm_add_epi32(tmsg0, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg0, tmsg3, 4);
+ tmsg1 = _mm_add_epi32(tmsg1, tmp);
+ tmsg1 = _mm_sha256msg2_epu32(tmsg1, tmsg0);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ tmsg3 = _mm_sha256msg1_epu32(tmsg3, tmsg0);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[48]);
+ msg = vaddq_u32(tmsg0, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg1 = vsha256su1q_u32(tmsg1, tmsg3, tmsg0);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg3 = vsha256su0q_u32(tmsg3, tmsg0);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[55], (i32)fast_sha2_256_k_table[54],
+ (i32)fast_sha2_256_k_table[53], (i32)fast_sha2_256_k_table[52]);
+ msg = _mm_add_epi32(tmsg1, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg1, tmsg0, 4);
+ tmsg2 = _mm_add_epi32(tmsg2, tmp);
+ tmsg2 = _mm_sha256msg2_epu32(tmsg2, tmsg1);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[52]);
+ msg = vaddq_u32(tmsg1, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg2 = vsha256su1q_u32(tmsg2, tmsg0, tmsg1);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ tmsg0 = vsha256su0q_u32(tmsg0, tmsg1);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[59], (i32)fast_sha2_256_k_table[58],
+ (i32)fast_sha2_256_k_table[57], (i32)fast_sha2_256_k_table[56]);
+ msg = _mm_add_epi32(tmsg2, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ tmp = _mm_alignr_epi8(tmsg2, tmsg1, 4);
+ tmsg3 = _mm_add_epi32(tmsg3, tmp);
+ tmsg3 = _mm_sha256msg2_epu32(tmsg3, tmsg2);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[56]);
+ msg = vaddq_u32(tmsg2, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ tmsg3 = vsha256su1q_u32(tmsg3, tmsg1, tmsg2);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ keys = _mm_set_epi32((i32)fast_sha2_256_k_table[63], (i32)fast_sha2_256_k_table[62],
+ (i32)fast_sha2_256_k_table[61], (i32)fast_sha2_256_k_table[60]);
+ msg = _mm_add_epi32(tmsg3, keys);
+ state1 = _mm_sha256rnds2_epu32(state1, state0, msg);
+ msg = _mm_shuffle_epi32(msg, 0x0E);
+ state0 = _mm_sha256rnds2_epu32(state0, state1, msg);
+ #elif defined(FAST_SHA2_ARM)
+ keys = vld1q_u32((const u32 *)&fast_sha2_256_k_table[60]);
+ msg = vaddq_u32(tmsg3, keys);
+ tmp = state1;
+ state1 = vsha256h2q_u32(state1, state0, msg);
+ state0 = vsha256hq_u32(state0, tmp, msg);
+ #endif
+
+ #ifdef FAST_SHA2_DEBUG
+ #if defined(FAST_SHA2_NI)
+ _mm_storeu_si128((v128 *) &s0, state0);
+ _mm_storeu_si128((v128 *) &s1, state1);
+ #elif defined(FAST_SHA2_ARM)
+ vst1q_u32((u32 *)&s0, state0);
+ vst1q_u32((u32 *)&s1, state1);
+ #endif
+
+ #if defined(FAST_SHA2_NI) || defined(FAST_SHA2_ARM)
+ context->hash[0] = s0[0];
+ context->hash[1] = s0[1];
+ context->hash[2] = s0[2];
+ context->hash[3] = s0[3];
+ context->hash[4] = s1[0];
+ context->hash[5] = s1[1];
+ context->hash[6] = s1[2];
+ context->hash[7] = s1[3];
+ printf("t60-63 = %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ context->hash[0], context->hash[1], context->hash[2],
+ context->hash[3], context->hash[4], context->hash[5],
+ context->hash[6], context->hash[7]);
+ #endif
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ state0 = _mm_add_epi32(state0, save0);
+ state1 = _mm_add_epi32(state1, save1);
+ #elif defined(FAST_SHA2_ARM)
+ state0 = vaddq_u32(state0, save0);
+ state1 = vaddq_u32(state1, save1);
+ #endif
+
+ #ifdef FAST_SHA2_DEBUG
+ #if defined(FAST_SHA2_NI)
+ printf("s0 = %016llx %016llx\n", state0[0], state0[1]);
+ printf("s1 = %016llx %016llx\n", state1[0], state1[1]);
+ #elif defined(FAST_SHA2_ARM)
+ printf("s0 = %08x %08x %08x %08x\n",
+ state0[0], state0[1], state0[2], state0[3]);
+ printf("s1 = %08x %08x %08x %08x\n",
+ state1[0], state1[1], state1[2], state1[3]);
+ #endif
+ #endif
+
+ #if defined(FAST_SHA2_NI) || defined(FAST_SHA2_ARM) || defined(FAST_SHA2_RISC_V)
+ }
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+ _mm_storeu_si128((v128 *) &s0, state0);
+ _mm_storeu_si128((v128 *) &s1, state1);
+ context->hash[0] = s0[3];
+ context->hash[1] = s0[2];
+ context->hash[2] = s1[3];
+ context->hash[3] = s1[2];
+ context->hash[4] = s0[1];
+ context->hash[5] = s0[0];
+ context->hash[6] = s1[1];
+ context->hash[7] = s1[0];
+ #elif defined(FAST_SHA2_ARM)
+ vst1q_u32((u32 *)&s0, state0);
+ vst1q_u32((u32 *)&s1, state1);
+ context->hash[0] = s0[0];
+ context->hash[1] = s0[1];
+ context->hash[2] = s0[2];
+ context->hash[3] = s0[3];
+ context->hash[4] = s1[0];
+ context->hash[5] = s1[1];
+ context->hash[6] = s1[2];
+ context->hash[7] = s1[3];
+ #endif
+ return;
+ }
+
+ void fast_sha2_256_process_software_hash(fast_sha2_hash_context256 *context,
+ uc8 *message, u64 n)
+ {
+ u64 i = 0;
+ u32 a = 0, b = 0, c = 0, d = 0, e = 0, f = 0, g = 0, h = 0, j = 0;
+ u32 t1 = 0, t2 = 0, w[64];
+ uc8 *p = NULL;
+
+ for(i = 0; i < n; i++) {
+ p = message + (i << 6);
+ for(j = 0; j < 16; j++) {
+ FAST_SHA2_PACK32(&w[j], &p[j << 2]);
+ }
+ a = (u32)context->hash[0];
+ b = (u32)context->hash[1];
+ c = (u32)context->hash[2];
+ d = (u32)context->hash[3];
+ e = (u32)context->hash[4];
+ f = (u32)context->hash[5];
+ g = (u32)context->hash[6];
+ h = (u32)context->hash[7];
+ for(j = 0; j < 64; j++) {
+ if(j >= 16)
+ w[j] = FAST_SHA2_G1_32(w[j - 2]) + w[j - 7] + FAST_SHA2_G0_32(w[j - 15]) + w[j - 16];
+ t1 = h + FAST_SHA2_S1_32(e) + FAST_SHA2_CH(e, f, g) + fast_sha2_256_k_table[j] + w[j];
+ t2 = FAST_SHA2_S0_32(a) + FAST_SHA2_MAJ(a, b, c);
+ h = g;
+ g = f;
+ f = e;
+ e = d + t1;
+ d = c;
+ c = b;
+ b = a;
+ a = t1 + t2;
+ #ifdef FAST_SHA2_DEBUG
+ printf("t = %2d %08x %08x %08x %08x %08x %08x %08x %08x, "
+ "k = %08x, w = %08x\n", j, a, b, c, d, e, f, g, h,
+ fast_sha2_256_k_table[j], w[j]);
+ #endif
+ }
+ context->hash[0] += a;
+ context->hash[1] += b;
+ context->hash[2] += c;
+ context->hash[3] += d;
+ context->hash[4] += e;
+ context->hash[5] += f;
+ context->hash[6] += g;
+ context->hash[7] += h;
+ }
+ }
+
+ uc8 *fast_sha2_256_hash(uc8 *message, u64 length, b8 software_only)
+ {
+ fast_sha2_hash_context256 context;
+ u64 i = 0, l = 0, n = 0;
+ uc8 *digest = NULL, *p = NULL;
+ b32 hardware_support = fast_sha2_supported_by_cpu();
+
+ memcpy(context.hash, fast_sha2_256_initial_states, (size_t)32);
+ memset(context.buf, 0, (size_t)128);
+ context.length = 0;
+ context.total = 0;
+ digest = calloc((size_t)sizeof(c8), (size_t)33);
+ #ifdef FAST_SHA2_DEBUG
+ printf("init = %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ context.hash[0], context.hash[1], context.hash[2], context.hash[3],
+ context.hash[4], context.hash[5], context.hash[6], context.hash[7]);
+ #endif
+ if(software_only)
+ hardware_support = FALSE;
+ l = 64 - context.length;
+ if(length < l)
+ l = length;
+ memcpy(&context.buf[context.length], message, l);
+ if(context.length + length < 64)
+ context.length += length;
+ else {
+ length -= l;
+ n = length / 64;
+ p = (uc8 *)message + l;
+
+ if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_SHA256))
+ fast_sha2_256_process_hardware_hash(&context, context.buf, 1);
+ else
+ fast_sha2_256_process_software_hash(&context, context.buf, 1);
+
+ if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_SHA256))
+ fast_sha2_256_process_hardware_hash(&context, p, n);
+ else
+ fast_sha2_256_process_software_hash(&context, p, n);
+
+ length = length % 64;
+ memcpy(&context.buf, &p[n << 6], length);
+ context.length = length;
+ context.total += (n + 1) << 6;
+ }
+ n = 1 + (55 < (context.length % 64));
+ length = (context.total + context.length) << 3;
+ l = n << 6;
+ memset(context.buf + context.length, 0, l - context.length);
+ context.buf[context.length] = FAST_SHA2_PAD_MARK;
+ FAST_SHA2_UNPACK32(length, context.buf + l - 4);
+
+ if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_SHA256))
+ fast_sha2_256_process_hardware_hash(&context, context.buf, n);
+ else
+ fast_sha2_256_process_software_hash(&context, context.buf, n);
+
+ for(i = 0; i < 8; i++)
+ FAST_SHA2_UNPACK32(context.hash[i], &digest[i << 2]);
+ return(digest);
+ }
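The finalization arithmetic above implements FIPS 180-4 padding: one 0x80 marker byte plus an 8-byte big-endian bit length must fit after the residual data, so a second 64-byte block is needed exactly when more than 55 residual bytes remain. A worked illustration (not library code):

    u64 residual = 3;                 /* e.g. after hashing "abc" */
    u64 blocks = 1 + (55 < residual); /* 1: marker + length still fit */
    residual = 56;
    blocks = 1 + (55 < residual);     /* 2: the length field spills over */

Note that FAST_SHA2_UNPACK32 writes only the low 32 bits of the bit length; the four bytes before it stay zero from the memset, which is correct for any message shorter than 2^32 bits.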
+
+ uc8 *fast_sha2_256(uc8 *string, b8 software_only)
+ {
+ if(string == NULL)
+ return(fast_sha2_256_hash(string, 0, software_only));
+ else
+ return(fast_sha2_256_hash(string, (u64)strlen((c8 *)string), software_only));
+ }
+
+ c8 *fast_sha2_256_digest(uc8 *hash)
+ {
+ c8 *digest = NULL;
+
+ digest = calloc((size_t)sizeof(c8), (size_t)65);
+ /*
+ i32 i = 0;
+ for(i = 0; i < 32; i++)
+ sprintf((c8 *)(digest + (i * 2)), "%02x", hash[i]);
+ */
+ sprintf(digest, "%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x",
+ hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7],
+ hash[8], hash[9], hash[10], hash[11], hash[12], hash[13], hash[14], hash[15],
+ hash[16], hash[17], hash[18], hash[19], hash[20], hash[21], hash[22], hash[23],
+ hash[24], hash[25], hash[26], hash[27], hash[28], hash[29], hash[30], hash[31]);
+ digest[64] = '\0';
+
+ return(digest);
+ }
+
+ void fast_sha2_256_show_hash(uc8 *hash)
+ {
+ printf("%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x"
+ "%02x%02x%02x%02x%02x%02x%02x%02x\n",
+ hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7],
+ hash[8], hash[9], hash[10], hash[11], hash[12], hash[13], hash[14], hash[15],
+ hash[16], hash[17], hash[18], hash[19], hash[20], hash[21], hash[22], hash[23],
+ hash[24], hash[25], hash[26], hash[27], hash[28], hash[29], hash[30], hash[31]);
+ /*
+ i32 i = 0;
+
+ for(i = 0; i < 32; i++)
+ printf("%02x", hash[i]);
+ printf("\n");
+ */
+ }
+
+
+
+
+
+
+
+
+
+ /* 512 */
+
+
+
+
+
+
+ /*
+ Warning: unused function -- if neither FAST_SHA2_NI nor FAST_SHA2_ARM hardware
+ support is available, this function is empty. In that case,
+ fast_sha2_supported_by_cpu will return false, so the empty function should
+ never be called.
+
+ However, this *may* cause compiler warnings.
+ */
+ #if defined(FAST_SHA2_ATTRIBUTES)
+ FAST_SHA2_ATTRIBUTES_512
+ #endif
+ void fast_sha2_512_process_hardware_hash(fast_sha2_hash_context512 *context,
+ uc8 *message, u64 n, b32 hardware_support)
+ {
+ #if defined(FAST_SHA2_NI)
+ v256 state0, state1, save0, save1, keys;
+ v256 msg, tmp, tmsg0, tmsg1, tmsg2, tmsg3;
+ #elif defined(FAST_SHA2_ARM)
+ v128_64 state0, state1, state2, state3, save0, save1, save2, save3, keys;
+ v128_64 msg, tmp, tmsg0, tmsg1, tmsg2, tmsg3;
+ #endif
+ #if defined(FAST_SHA2_NI)
+ const v256 shuffle_mask = _mm256_set_epi8(8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
+ 4, 5, 6, 7);
+ #endif
+ #if defined(FAST_SHA2_NI) || defined(FAST_SHA2_ARM)
+ u64 s0[16] = {0}, s1[16] = {0}, i = 0;
+ uc8 *p = NULL;
+ #endif
+
+ /* Initialize state from context */
+ #if defined(FAST_SHA2_NI)
+ state0 = _mm256_set_epi64x((i64)context->hash[4], (i64)context->hash[5],
+ (i64)context->hash[0], (i64)context->hash[1]); /* EFAB */
+ state1 = _mm256_set_epi64x((i64)context->hash[6], (i64)context->hash[7],
+ (i64)context->hash[2], (i64)context->hash[3]); /* GHCD */
+ #elif defined(FAST_SHA2_ARM)
+ state0 = vld1q_u64((const u64 *)&context->hash[0]); /* AB */
+ state1 = vld1q_u64((const u64 *)&context->hash[2]); /* CD */
+ state2 = vld1q_u64((const u64 *)&context->hash[4]); /* EF */
+ state3 = vld1q_u64((const u64 *)&context->hash[6]); /* GH */
+ #endif
+
+ #if defined(FAST_SHA2_NI) || defined(FAST_SHA2_ARM)
+ for(i = 0; i < n; i++) {
+ p = message + (i << 7);
+
+ save0 = state0; /* Save for next round */
+ save1 = state1;
+ #if defined(FAST_SHA2_ARM)
+ save2 = state2;
+ save3 = state3;
+ #endif
+ #endif
+
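The Intel SHA-512 path below follows the same idea with 64-bit words: _mm256_sha512rnds2_epi64 performs two rounds, taking its two W+K values from a 128-bit operand. The code duplicates each key pair across both 128-bit lanes of a 256-bit vector, then extracts lane 0 for rounds r and r+1 and lane 1 for rounds r+2 and r+3. A hypothetical macro condensing that pattern (an illustration, not part of the library; tm holds the byte-swapped W[r..r+3]):

    #define SHA512_4ROUNDS_NI(tm, r)                                              \
        keys = _mm256_set_epi64x(                                                 \
            (i64)fast_sha2_512_k_table[(r) + 1], (i64)fast_sha2_512_k_table[(r)], \
            (i64)fast_sha2_512_k_table[(r) + 1], (i64)fast_sha2_512_k_table[(r)]);\
        msg = _mm256_add_epi64((tm), keys);                                       \
        state0 = _mm256_sha512rnds2_epi64(state0, state1,                         \
            _mm256_extracti128_si256(msg, 0));         /* rounds r, r+1 */        \
        keys = _mm256_set_epi64x(                                                 \
            (i64)fast_sha2_512_k_table[(r) + 3], (i64)fast_sha2_512_k_table[(r) + 2], \
            (i64)fast_sha2_512_k_table[(r) + 3], (i64)fast_sha2_512_k_table[(r) + 2]); \
        msg = _mm256_add_epi64((tm), keys);                                       \
        state1 = _mm256_sha512rnds2_epi64(state1, state0,                         \
            _mm256_extracti128_si256(msg, 1))          /* rounds r+2, r+3 */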
1264
+ #if defined(FAST_SHA2_NI)
1265
+ if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_SHA512)) {
1266
+ /* Rounds 0-3 */
1267
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[1], (i64)fast_sha2_512_k_table[0],
1268
+ (i64)fast_sha2_512_k_table[1], (i64)fast_sha2_512_k_table[0]);
1269
+ msg = _mm256_loadu_si256((v256 *)p);
1270
+ tmsg0 = _mm256_shuffle_epi8(msg, shuffle_mask);
1271
+ msg = _mm256_add_epi64(tmsg0, keys);
1272
+ state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
1273
+
1274
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[3], (i64)fast_sha2_512_k_table[2],
1275
+ (i64)fast_sha2_512_k_table[3], (i64)fast_sha2_512_k_table[2]);
1276
+ msg = _mm256_add_epi64(tmsg0, keys);
1277
+ state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
1278
+
1279
+ /* Rounds 4-7 */
1280
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[5], (i64)fast_sha2_512_k_table[4],
1281
+ (i64)fast_sha2_512_k_table[5], (i64)fast_sha2_512_k_table[4]);
1282
+ msg = _mm256_loadu_si256((v256 *)(p + 32));
1283
+ tmsg1 = _mm256_shuffle_epi8(msg, shuffle_mask);
1284
+ msg = _mm256_add_epi64(tmsg1, keys);
1285
+ state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
1286
+
1287
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[7], (i64)fast_sha2_512_k_table[6],
1288
+ (i64)fast_sha2_512_k_table[7], (i64)fast_sha2_512_k_table[6]);
1289
+ msg = _mm256_add_epi64(tmsg1, keys);
1290
+ state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
1291
+
1292
+ /* Rounds 8-11 */
1293
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[9], (i64)fast_sha2_512_k_table[8],
1294
+ (i64)fast_sha2_512_k_table[9], (i64)fast_sha2_512_k_table[8]);
1295
+ msg = _mm256_loadu_si256((v256 *)(p + 64));
1296
+ tmsg2 = _mm256_shuffle_epi8(msg, shuffle_mask);
1297
+ msg = _mm256_add_epi64(tmsg2, keys);
1298
+ state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
1299
+
1300
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[11], (i64)fast_sha2_512_k_table[10],
1301
+ (i64)fast_sha2_512_k_table[11], (i64)fast_sha2_512_k_table[10]);
1302
+ msg = _mm256_add_epi64(tmsg2, keys);
1303
+ state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
1304
+
1305
+ /* Rounds 12-15 */
1306
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[13], (i64)fast_sha2_512_k_table[12],
1307
+ (i64)fast_sha2_512_k_table[13], (i64)fast_sha2_512_k_table[12]);
1308
+ msg = _mm256_loadu_si256((v256 *)(p + 96));
1309
+ tmsg3 = _mm256_shuffle_epi8(msg, shuffle_mask);
1310
+ msg = _mm256_add_epi64(tmsg3, keys);
1311
+ state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
1312
+
1313
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[15], (i64)fast_sha2_512_k_table[14],
1314
+ (i64)fast_sha2_512_k_table[15], (i64)fast_sha2_512_k_table[14]);
1315
+ msg = _mm256_add_epi64(tmsg3, keys);
1316
+ state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
1317
+
1318
+ /* Rounds 16-19 */
1319
+ tmsg0 = _mm256_sha512msg1_epi64(tmsg0, _mm256_extracti128_si256(tmsg1, 0));
1320
+ tmsg0 = _mm256_sha512msg2_epi64(tmsg0, tmsg3);
1321
+
1322
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[17], (i64)fast_sha2_512_k_table[16],
1323
+ (i64)fast_sha2_512_k_table[17], (i64)fast_sha2_512_k_table[16]);
1324
+ msg = _mm256_add_epi64(tmsg0, keys);
1325
+ state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
1326
+
1327
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[19], (i64)fast_sha2_512_k_table[18],
1328
+ (i64)fast_sha2_512_k_table[19], (i64)fast_sha2_512_k_table[18]);
1329
+ msg = _mm256_add_epi64(tmsg0, keys);
1330
+ state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
1331
+
1332
+ /* --- ROUNDS 20-23 --- */
1333
+ tmsg1 = _mm256_sha512msg1_epi64(tmsg1, _mm256_extracti128_si256(tmsg2, 0));
1334
+ tmsg1 = _mm256_sha512msg2_epi64(tmsg1, tmsg0);
1335
+
1336
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[21], (i64)fast_sha2_512_k_table[20],
1337
+ (i64)fast_sha2_512_k_table[21], (i64)fast_sha2_512_k_table[20]);
1338
+ msg = _mm256_add_epi64(tmsg1, keys);
1339
+ state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
1340
+
1341
+ keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[23], (i64)fast_sha2_512_k_table[22],
1342
+ (i64)fast_sha2_512_k_table[23], (i64)fast_sha2_512_k_table[22]);
1343
+ msg = _mm256_add_epi64(tmsg1, keys);
1344
+ state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
1345
+
1346
+       /* Rounds 24-27 */
+       tmsg2 = _mm256_sha512msg1_epi64(tmsg2, _mm256_extracti128_si256(tmsg3, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg0, tmsg1, 0x21),
+                                tmsg0, 8); /* w[t-7] */
+       tmsg2 = _mm256_add_epi64(tmsg2, msg);
+       tmsg2 = _mm256_sha512msg2_epi64(tmsg2, tmsg1);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[25], (i64)fast_sha2_512_k_table[24],
+                                (i64)fast_sha2_512_k_table[25], (i64)fast_sha2_512_k_table[24]);
+       msg = _mm256_add_epi64(tmsg2, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[27], (i64)fast_sha2_512_k_table[26],
+                                (i64)fast_sha2_512_k_table[27], (i64)fast_sha2_512_k_table[26]);
+       msg = _mm256_add_epi64(tmsg2, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 28-31 */
+       tmsg3 = _mm256_sha512msg1_epi64(tmsg3, _mm256_extracti128_si256(tmsg0, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg1, tmsg2, 0x21),
+                                tmsg1, 8); /* w[t-7] */
+       tmsg3 = _mm256_add_epi64(tmsg3, msg);
+       tmsg3 = _mm256_sha512msg2_epi64(tmsg3, tmsg2);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[29], (i64)fast_sha2_512_k_table[28],
+                                (i64)fast_sha2_512_k_table[29], (i64)fast_sha2_512_k_table[28]);
+       msg = _mm256_add_epi64(tmsg3, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[31], (i64)fast_sha2_512_k_table[30],
+                                (i64)fast_sha2_512_k_table[31], (i64)fast_sha2_512_k_table[30]);
+       msg = _mm256_add_epi64(tmsg3, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 32-35 */
+       tmsg0 = _mm256_sha512msg1_epi64(tmsg0, _mm256_extracti128_si256(tmsg1, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg2, tmsg3, 0x21),
+                                tmsg2, 8); /* w[t-7] */
+       tmsg0 = _mm256_add_epi64(tmsg0, msg);
+       tmsg0 = _mm256_sha512msg2_epi64(tmsg0, tmsg3);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[33], (i64)fast_sha2_512_k_table[32],
+                                (i64)fast_sha2_512_k_table[33], (i64)fast_sha2_512_k_table[32]);
+       msg = _mm256_add_epi64(tmsg0, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[35], (i64)fast_sha2_512_k_table[34],
+                                (i64)fast_sha2_512_k_table[35], (i64)fast_sha2_512_k_table[34]);
+       msg = _mm256_add_epi64(tmsg0, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 36-39 */
+       tmsg1 = _mm256_sha512msg1_epi64(tmsg1, _mm256_extracti128_si256(tmsg2, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg3, tmsg0, 0x21),
+                                tmsg3, 8); /* w[t-7] */
+       tmsg1 = _mm256_add_epi64(tmsg1, msg);
+       tmsg1 = _mm256_sha512msg2_epi64(tmsg1, tmsg0);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[37], (i64)fast_sha2_512_k_table[36],
+                                (i64)fast_sha2_512_k_table[37], (i64)fast_sha2_512_k_table[36]);
+       msg = _mm256_add_epi64(tmsg1, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[39], (i64)fast_sha2_512_k_table[38],
+                                (i64)fast_sha2_512_k_table[39], (i64)fast_sha2_512_k_table[38]);
+       msg = _mm256_add_epi64(tmsg1, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 40-43 */
+       tmsg2 = _mm256_sha512msg1_epi64(tmsg2, _mm256_extracti128_si256(tmsg3, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg0, tmsg1, 0x21),
+                                tmsg0, 8); /* w[t-7] */
+       tmsg2 = _mm256_add_epi64(tmsg2, msg);
+       tmsg2 = _mm256_sha512msg2_epi64(tmsg2, tmsg1);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[41], (i64)fast_sha2_512_k_table[40],
+                                (i64)fast_sha2_512_k_table[41], (i64)fast_sha2_512_k_table[40]);
+       msg = _mm256_add_epi64(tmsg2, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[43], (i64)fast_sha2_512_k_table[42],
+                                (i64)fast_sha2_512_k_table[43], (i64)fast_sha2_512_k_table[42]);
+       msg = _mm256_add_epi64(tmsg2, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 44-47 */
+       tmsg3 = _mm256_sha512msg1_epi64(tmsg3, _mm256_extracti128_si256(tmsg0, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg1, tmsg2, 0x21),
+                                tmsg1, 8); /* w[t-7] */
+       tmsg3 = _mm256_add_epi64(tmsg3, msg);
+       tmsg3 = _mm256_sha512msg2_epi64(tmsg3, tmsg2);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[45], (i64)fast_sha2_512_k_table[44],
+                                (i64)fast_sha2_512_k_table[45], (i64)fast_sha2_512_k_table[44]);
+       msg = _mm256_add_epi64(tmsg3, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[47], (i64)fast_sha2_512_k_table[46],
+                                (i64)fast_sha2_512_k_table[47], (i64)fast_sha2_512_k_table[46]);
+       msg = _mm256_add_epi64(tmsg3, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 48-51 */
+       tmsg0 = _mm256_sha512msg1_epi64(tmsg0, _mm256_extracti128_si256(tmsg1, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg2, tmsg3, 0x21),
+                                tmsg2, 8); /* w[t-7] */
+       tmsg0 = _mm256_add_epi64(tmsg0, msg);
+       tmsg0 = _mm256_sha512msg2_epi64(tmsg0, tmsg3);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[49], (i64)fast_sha2_512_k_table[48],
+                                (i64)fast_sha2_512_k_table[49], (i64)fast_sha2_512_k_table[48]);
+       msg = _mm256_add_epi64(tmsg0, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[51], (i64)fast_sha2_512_k_table[50],
+                                (i64)fast_sha2_512_k_table[51], (i64)fast_sha2_512_k_table[50]);
+       msg = _mm256_add_epi64(tmsg0, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 52-55 */
+       tmsg1 = _mm256_sha512msg1_epi64(tmsg1, _mm256_extracti128_si256(tmsg2, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg3, tmsg0, 0x21),
+                                tmsg3, 8); /* w[t-7] */
+       tmsg1 = _mm256_add_epi64(tmsg1, msg);
+       tmsg1 = _mm256_sha512msg2_epi64(tmsg1, tmsg0);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[53], (i64)fast_sha2_512_k_table[52],
+                                (i64)fast_sha2_512_k_table[53], (i64)fast_sha2_512_k_table[52]);
+       msg = _mm256_add_epi64(tmsg1, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[55], (i64)fast_sha2_512_k_table[54],
+                                (i64)fast_sha2_512_k_table[55], (i64)fast_sha2_512_k_table[54]);
+       msg = _mm256_add_epi64(tmsg1, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 56-59 */
+       tmsg2 = _mm256_sha512msg1_epi64(tmsg2, _mm256_extracti128_si256(tmsg3, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg0, tmsg1, 0x21),
+                                tmsg0, 8); /* w[t-7] */
+       tmsg2 = _mm256_add_epi64(tmsg2, msg);
+       tmsg2 = _mm256_sha512msg2_epi64(tmsg2, tmsg1);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[57], (i64)fast_sha2_512_k_table[56],
+                                (i64)fast_sha2_512_k_table[57], (i64)fast_sha2_512_k_table[56]);
+       msg = _mm256_add_epi64(tmsg2, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[59], (i64)fast_sha2_512_k_table[58],
+                                (i64)fast_sha2_512_k_table[59], (i64)fast_sha2_512_k_table[58]);
+       msg = _mm256_add_epi64(tmsg2, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 60-63 */
+       tmsg3 = _mm256_sha512msg1_epi64(tmsg3, _mm256_extracti128_si256(tmsg0, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg1, tmsg2, 0x21),
+                                tmsg1, 8); /* w[t-7] */
+       tmsg3 = _mm256_add_epi64(tmsg3, msg);
+       tmsg3 = _mm256_sha512msg2_epi64(tmsg3, tmsg2);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[61], (i64)fast_sha2_512_k_table[60],
+                                (i64)fast_sha2_512_k_table[61], (i64)fast_sha2_512_k_table[60]);
+       msg = _mm256_add_epi64(tmsg3, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[63], (i64)fast_sha2_512_k_table[62],
+                                (i64)fast_sha2_512_k_table[63], (i64)fast_sha2_512_k_table[62]);
+       msg = _mm256_add_epi64(tmsg3, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 64-67 */
+       tmsg0 = _mm256_sha512msg1_epi64(tmsg0, _mm256_extracti128_si256(tmsg1, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg2, tmsg3, 0x21),
+                                tmsg2, 8); /* w[t-7] */
+       tmsg0 = _mm256_add_epi64(tmsg0, msg);
+       tmsg0 = _mm256_sha512msg2_epi64(tmsg0, tmsg3);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[65], (i64)fast_sha2_512_k_table[64],
+                                (i64)fast_sha2_512_k_table[65], (i64)fast_sha2_512_k_table[64]);
+       msg = _mm256_add_epi64(tmsg0, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[67], (i64)fast_sha2_512_k_table[66],
+                                (i64)fast_sha2_512_k_table[67], (i64)fast_sha2_512_k_table[66]);
+       msg = _mm256_add_epi64(tmsg0, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 68-71 */
+       tmsg1 = _mm256_sha512msg1_epi64(tmsg1, _mm256_extracti128_si256(tmsg2, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg3, tmsg0, 0x21),
+                                tmsg3, 8); /* w[t-7] */
+       tmsg1 = _mm256_add_epi64(tmsg1, msg);
+       tmsg1 = _mm256_sha512msg2_epi64(tmsg1, tmsg0);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[69], (i64)fast_sha2_512_k_table[68],
+                                (i64)fast_sha2_512_k_table[69], (i64)fast_sha2_512_k_table[68]);
+       msg = _mm256_add_epi64(tmsg1, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[71], (i64)fast_sha2_512_k_table[70],
+                                (i64)fast_sha2_512_k_table[71], (i64)fast_sha2_512_k_table[70]);
+       msg = _mm256_add_epi64(tmsg1, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 72-75 */
+       tmsg2 = _mm256_sha512msg1_epi64(tmsg2, _mm256_extracti128_si256(tmsg3, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg0, tmsg1, 0x21),
+                                tmsg0, 8); /* w[t-7] */
+       tmsg2 = _mm256_add_epi64(tmsg2, msg);
+       tmsg2 = _mm256_sha512msg2_epi64(tmsg2, tmsg1);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[73], (i64)fast_sha2_512_k_table[72],
+                                (i64)fast_sha2_512_k_table[73], (i64)fast_sha2_512_k_table[72]);
+       msg = _mm256_add_epi64(tmsg2, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[75], (i64)fast_sha2_512_k_table[74],
+                                (i64)fast_sha2_512_k_table[75], (i64)fast_sha2_512_k_table[74]);
+       msg = _mm256_add_epi64(tmsg2, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+
+       /* Rounds 76-79 */
+       tmsg3 = _mm256_sha512msg1_epi64(tmsg3, _mm256_extracti128_si256(tmsg0, 0));
+       msg = _mm256_alignr_epi8(_mm256_permute2x128_si256(tmsg1, tmsg2, 0x21),
+                                tmsg1, 8); /* w[t-7] */
+       tmsg3 = _mm256_add_epi64(tmsg3, msg);
+       tmsg3 = _mm256_sha512msg2_epi64(tmsg3, tmsg2);
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[77], (i64)fast_sha2_512_k_table[76],
+                                (i64)fast_sha2_512_k_table[77], (i64)fast_sha2_512_k_table[76]);
+       msg = _mm256_add_epi64(tmsg3, keys);
+       state0 = _mm256_sha512rnds2_epi64(state0, state1, _mm256_extracti128_si256(msg, 0));
+
+       keys = _mm256_set_epi64x((i64)fast_sha2_512_k_table[79], (i64)fast_sha2_512_k_table[78],
+                                (i64)fast_sha2_512_k_table[79], (i64)fast_sha2_512_k_table[78]);
+       msg = _mm256_add_epi64(tmsg3, keys);
+       state1 = _mm256_sha512rnds2_epi64(state1, state0, _mm256_extracti128_si256(msg, 1));
+     } else if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_AVX512)) {
+       /* AVX-512 block processing is not yet implemented. */
+     } else if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_AVX2)) {
+       /* AVX2 block processing is not yet implemented. */
+     }
+
+
+ #elif defined(FAST_SHA2_ARM)
+
+     /*
+     v128_64 ab = vld1q_u64(&fast_sha2_512_initial_states[0]); // a, b state0
+     v128_64 cd = vld1q_u64(&fast_sha2_512_initial_states[2]); // c, d state1
+     v128_64 ef = vld1q_u64(&fast_sha2_512_initial_states[4]); // e, f state2
+     v128_64 gh = vld1q_u64(&fast_sha2_512_initial_states[6]); // g, h state3
+     */
+
+     /* The ARM path is a work in progress: only rounds 0-3 are computed. */
+
+     /* Rounds 0 & 1 */
+     keys = vld1q_u64((const u64 *)&fast_sha2_512_k_table[0]);
+     msg = vld1q_u64((const u64 *)p);
+     tmsg0 = (v128_64)vrev64q_u8((uint8x16_t)msg);
+
+     msg = vaddq_u64(tmsg0, keys);
+
+     tmp = vsha512hq_u64(state0, state2, msg);
+     state2 = vsha512h2q_u64(state2, state3, state0);
+     state0 = tmp;
+
+     /* Rounds 2 & 3 */
+     keys = vld1q_u64((const u64 *)&fast_sha2_512_k_table[2]);
+     msg = vld1q_u64((const u64 *)(p + 16)); /* bytes 16-31 of the block */
+     tmsg1 = (v128_64)vrev64q_u8((uint8x16_t)msg);
+     msg = vaddq_u64(tmsg1, keys);
+
+     tmp = vsha512hq_u64(state0, state2, msg);
+     state2 = vsha512h2q_u64(state2, state3, state0);
+     state0 = tmp;
+
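+     /*
+        A minimal sketch of how rounds 4 & 5 would continue, assuming the
+        same register rotation as above (untested; tmsg2 is a hypothetical
+        third schedule register):
+
+        keys = vld1q_u64((const u64 *)&fast_sha2_512_k_table[4]);
+        msg = vld1q_u64((const u64 *)(p + 32));
+        tmsg2 = (v128_64)vrev64q_u8((uint8x16_t)msg);
+        msg = vaddq_u64(tmsg2, keys);
+
+        tmp = vsha512hq_u64(state0, state2, msg);
+        state2 = vsha512h2q_u64(state2, state3, state0);
+        state0 = tmp;
+     */
+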
+ printf("State 'a' after round 3: %016lx\n", vgetq_lane_u64(state0, 0));
1553
+
1554
+
1555
+ #endif
1556
+
+
+ #ifdef FAST_SHA2_DEBUG
+ #if defined(FAST_SHA2_NI)
+     _mm256_storeu_si256((v256 *)&s0, state0);
+     _mm256_storeu_si256((v256 *)&s1, state1);
+ #elif defined(FAST_SHA2_ARM)
+     /* Each NEON register holds only two lanes, so all four state
+        registers are needed to capture the eight hash words. */
+     vst1q_u64((u64 *)&s0, state0);
+     vst1q_u64((u64 *)&s0 + 2, state1);
+     vst1q_u64((u64 *)&s1, state2);
+     vst1q_u64((u64 *)&s1 + 2, state3);
+ #endif
+
+ #if defined(FAST_SHA2_NI) || defined(FAST_SHA2_ARM)
+     context->hash[0] = s0[0];
+     context->hash[1] = s0[1];
+     context->hash[2] = s0[2];
+     context->hash[3] = s0[3];
+     context->hash[4] = s1[0];
+     context->hash[5] = s1[1];
+     context->hash[6] = s1[2];
+     context->hash[7] = s1[3];
+     printf("t76-79 = %016lx %016lx %016lx %016lx\n"
+            "         %016lx %016lx %016lx %016lx\n",
+            context->hash[0], context->hash[1], context->hash[2],
+            context->hash[3], context->hash[4], context->hash[5],
+            context->hash[6], context->hash[7]);
+ #endif
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+     state0 = _mm256_add_epi64(state0, save0);
+     state1 = _mm256_add_epi64(state1, save1);
+ #elif defined(FAST_SHA2_ARM)
+     state0 = vaddq_u64(state0, save0);
+     state1 = vaddq_u64(state1, save1);
+     /* Assumes save2/save3 capture state2/state3 at loop entry, mirroring
+        save0/save1; all four register pairs must be accumulated. */
+     state2 = vaddq_u64(state2, save2);
+     state3 = vaddq_u64(state3, save3);
+ #endif
+
+ #ifdef FAST_SHA2_DEBUG
+ #if defined(FAST_SHA2_NI)
+     printf("s0 = %016llx %016llx %016llx %016llx\n",
+            state0[0], state0[1], state0[2], state0[3]);
+     printf("s1 = %016llx %016llx %016llx %016llx\n",
+            state1[0], state1[1], state1[2], state1[3]);
+ #elif defined(FAST_SHA2_ARM)
+     printf("abcd = %016lx %016lx %016lx %016lx\n",
+            state0[0], state0[1], state1[0], state1[1]);
+     printf("efgh = %016lx %016lx %016lx %016lx\n",
+            state2[0], state2[1], state3[0], state3[1]);
+ #endif
+ #endif
+
+ #if defined(FAST_SHA2_NI) || defined(FAST_SHA2_ARM)
+   }
+ #endif
+
+ #if defined(FAST_SHA2_NI)
+   /* VSHA512RNDS2 keeps the state as {f, e, b, a} / {h, g, d, c}
+      (low to high qword), so the words are reordered on the way out. */
+   _mm256_storeu_si256((v256 *)&s0, state0);
+   _mm256_storeu_si256((v256 *)&s1, state1);
+   context->hash[0] = s0[3];
+   context->hash[1] = s0[2];
+   context->hash[2] = s1[3];
+   context->hash[3] = s1[2];
+   context->hash[4] = s0[1];
+   context->hash[5] = s0[0];
+   context->hash[6] = s1[1];
+   context->hash[7] = s1[0];
+ #elif defined(FAST_SHA2_ARM)
+   /* state0..state3 hold {a,b}, {c,d}, {e,f}, {g,h}; store them straight
+      into the context. */
+   vst1q_u64((u64 *)&context->hash[0], state0);
+   vst1q_u64((u64 *)&context->hash[2], state1);
+   vst1q_u64((u64 *)&context->hash[4], state2);
+   vst1q_u64((u64 *)&context->hash[6], state3);
+ #endif
+   return;
+ }
+
+ void fast_sha2_512_process_software_hash(fast_sha2_hash_context512 *context,
+                                          uc8 *message, u64 n)
+ {
+   /*
+   #if defined(FAST_SHA2_ARM)
+   v128_64 tmsg = {0};
+   #endif
+   */
+   u64 a = 0, b = 0, c = 0, d = 0, e = 0, f = 0, g = 0, h = 0, i = 0, j = 0,
+       t1 = 0, t2 = 0, w[80];
+   uc8 *p = NULL;
+
+   for(i = 0; i < n; i++) {
+     p = message + (i << 7);
+     /*
+     #if defined(FAST_SHA2_ARM)
+     for(j = 0; j < 16; j += 2) {
+       tmsg = (v128_64)vrev64q_u8(vld1q_u8(p + (j << 3)));
+       vst1q_u64(&w[j], tmsg);
+     }
+     #else
+     for(j = 0; j < 16; j++) {
+       FAST_SHA2_PACK64(&w[j], &p[j << 3]);
+     }
+     #endif
+     */
+
+     for(j = 0; j < 16; j++) {
+       FAST_SHA2_PACK64(&w[j], &p[j << 3]);
+     }
+     a = context->hash[0];
+     b = context->hash[1];
+     c = context->hash[2];
+     d = context->hash[3];
+     e = context->hash[4];
+     f = context->hash[5];
+     g = context->hash[6];
+     h = context->hash[7];
+
+     for(j = 0; j < 80; j++) {
+       /* FIPS 180-4, section 6.4.2: extend the schedule past w[15], then
+          T1 = h + S1(e) + Ch(e,f,g) + K[j] + w[j] and
+          T2 = S0(a) + Maj(a,b,c). */
+       if(j >= 16)
+         w[j] = FAST_SHA2_G1_64(w[j - 2]) + w[j - 7] + FAST_SHA2_G0_64(w[j - 15]) + w[j - 16];
+       t1 = h + FAST_SHA2_S1_64(e) + FAST_SHA2_CH(e, f, g) + fast_sha2_512_k_table[j] + w[j];
+       t2 = FAST_SHA2_S0_64(a) + FAST_SHA2_MAJ(a, b, c);
+       h = g;
+       g = f;
+       f = e;
+       e = d + t1;
+       d = c;
+       c = b;
+       b = a;
+       a = t1 + t2;
+ #ifdef FAST_SHA2_DEBUG
+       printf("t = %2lu %16.16lx %16.16lx %16.16lx %16.16lx\n"
+              "        %16.16lx %16.16lx %16.16lx %16.16lx, "
+              "k = %16.16lx, w = %16.16lx\n", j, a, b, c, d, e, f, g, h,
+              fast_sha2_512_k_table[j], w[j]);
+ #endif
+     }
+     context->hash[0] += a;
+     context->hash[1] += b;
+     context->hash[2] += c;
+     context->hash[3] += d;
+     context->hash[4] += e;
+     context->hash[5] += f;
+     context->hash[6] += g;
+     context->hash[7] += h;
+   }
+ }
+
+ void fast_sha2_512_create_hash(fast_sha2_hash_context512 *context, uc8 *message,
+                                uc8 *digest, u64 length, b8 software_only)
+ {
+   u64 i = 0, l = 0, n = 0;
+   uc8 *p = NULL;
+   u32 hardware_support = fast_sha2_supported_by_cpu();
+
+   if(software_only)
+     hardware_support = FALSE;
+   l = 128 - context->length;
+   if(length < l)
+     l = length;
+   memcpy(&context->buf[context->length], message, l);
+   if(context->length + length < 128)
+     context->length += length;
+   else {
+     length -= l;
+     n = length / 128;
+     p = (uc8 *)message + l;
+     if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_SHA512))
+       fast_sha2_512_process_hardware_hash(context, context->buf, 1, hardware_support);
+     else
+       fast_sha2_512_process_software_hash(context, context->buf, 1);
+     if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_SHA512))
+       fast_sha2_512_process_hardware_hash(context, p, n, hardware_support);
+     else
+       fast_sha2_512_process_software_hash(context, p, n);
+     length = length % 128;
+     memcpy(context->buf, &p[n << 7], length);
+     context->length = length;
+     context->total += (n + 1) << 7;
+   }
+   /* One padding block suffices while the pad marker and the 16-byte
+      length field still fit, i.e. up to 111 buffered bytes; beyond that a
+      second block is required. */
+   n = 1 + (111 < (context->length % 128));
+   length = (context->total + context->length) << 3;
+   l = n << 7;
+   memset(context->buf + context->length, 0, l - context->length);
+   context->buf[context->length] = FAST_SHA2_PAD_MARK;
+   /* Big-endian bit count in the last 8 bytes; the upper 8 bytes of the
+      16-byte length field stay zero from the memset above. */
+   FAST_SHA2_UNPACK64(length, (context->buf + l - 8));
+   if(FAST_SHA2_BIT_IS_SET(hardware_support, FAST_SHA2_CPU_SUPPORTS_SHA512))
+     fast_sha2_512_process_hardware_hash(context, context->buf, n, hardware_support);
+   else
+     fast_sha2_512_process_software_hash(context, context->buf, n);
+   for(i = 0; i < 8; i++)
+     FAST_SHA2_UNPACK64(context->hash[i], &digest[i << 3]);
+ }
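+
+ /*
+    Illustrative finalization arithmetic (comment only): with the pad
+    marker taking one byte and the bit count the last 8 bytes of the
+    16-byte length field,
+
+      context->length = 111  ->  n = 1 block  (111 + 1 + 16 = 128)
+      context->length = 112  ->  n = 2 blocks (112 + 1 + 16 > 128)
+ */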
+
+ uc8 *fast_sha2_512_hash(uc8 *message, u64 length, b8 software_only)
+ {
+   fast_sha2_hash_context512 context;
+   uc8 *digest = NULL;
+
+   memcpy(context.hash, fast_sha2_512_initial_states, (size_t)64);
+   memset(context.buf, 0, (size_t)256);
+   context.length = 0;
+   context.total = 0;
+   /* 64 digest bytes plus one spare byte, zero-initialized. */
+   digest = calloc((size_t)65, sizeof(c8));
+ #ifdef FAST_SHA2_DEBUG
+   printf("init %16.16lx %16.16lx %16.16lx %16.16lx\n"
+          "     %16.16lx %16.16lx %16.16lx %16.16lx\n",
+          context.hash[0], context.hash[1], context.hash[2], context.hash[3],
+          context.hash[4], context.hash[5], context.hash[6], context.hash[7]);
+ #endif
+   fast_sha2_512_create_hash(&context, message, digest, length, software_only);
+
+   return(digest);
+ }
+
+ uc8 *fast_sha2_512(uc8 *string, b8 software_only)
+ {
+   if(string == NULL)
+     return(fast_sha2_512_hash(string, 0, software_only));
+   else
+     return(fast_sha2_512_hash(string, (u64)strlen((c8 *)string), software_only));
+ }
+
+ c8 *fast_sha2_512_digest(uc8 *hash)
+ {
+   c8 *digest = NULL;
+
+   /* 128 hex characters plus the terminating NUL. */
+   digest = calloc((size_t)129, sizeof(c8));
+   /*
+   i32 i = 0;
+   for(i = 0; i < 64; i++)
+     sprintf((c8 *)(digest + (i * 2)), "%02x", hash[i]);
+   */
+   sprintf(digest, "%02x%02x%02x%02x%02x%02x%02x%02x"
+           "%02x%02x%02x%02x%02x%02x%02x%02x"
+           "%02x%02x%02x%02x%02x%02x%02x%02x"
+           "%02x%02x%02x%02x%02x%02x%02x%02x"
+           "%02x%02x%02x%02x%02x%02x%02x%02x"
+           "%02x%02x%02x%02x%02x%02x%02x%02x"
+           "%02x%02x%02x%02x%02x%02x%02x%02x"
+           "%02x%02x%02x%02x%02x%02x%02x%02x",
+           hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7],
+           hash[8], hash[9], hash[10], hash[11], hash[12], hash[13], hash[14], hash[15],
+           hash[16], hash[17], hash[18], hash[19], hash[20], hash[21], hash[22], hash[23],
+           hash[24], hash[25], hash[26], hash[27], hash[28], hash[29], hash[30], hash[31],
+           hash[32], hash[33], hash[34], hash[35], hash[36], hash[37], hash[38], hash[39],
+           hash[40], hash[41], hash[42], hash[43], hash[44], hash[45], hash[46], hash[47],
+           hash[48], hash[49], hash[50], hash[51], hash[52], hash[53], hash[54], hash[55],
+           hash[56], hash[57], hash[58], hash[59], hash[60], hash[61], hash[62], hash[63]);
+   digest[128] = '\0';
+
+   return(digest);
+ }
+
+ void fast_sha2_512_show_hash(uc8 *hash)
+ {
+   printf("%02x%02x%02x%02x%02x%02x%02x%02x"
+          "%02x%02x%02x%02x%02x%02x%02x%02x"
+          "%02x%02x%02x%02x%02x%02x%02x%02x"
+          "%02x%02x%02x%02x%02x%02x%02x%02x"
+          "%02x%02x%02x%02x%02x%02x%02x%02x"
+          "%02x%02x%02x%02x%02x%02x%02x%02x"
+          "%02x%02x%02x%02x%02x%02x%02x%02x"
+          "%02x%02x%02x%02x%02x%02x%02x%02x\n",
+          hash[0], hash[1], hash[2], hash[3], hash[4], hash[5], hash[6], hash[7],
+          hash[8], hash[9], hash[10], hash[11], hash[12], hash[13], hash[14], hash[15],
+          hash[16], hash[17], hash[18], hash[19], hash[20], hash[21], hash[22], hash[23],
+          hash[24], hash[25], hash[26], hash[27], hash[28], hash[29], hash[30], hash[31],
+          hash[32], hash[33], hash[34], hash[35], hash[36], hash[37], hash[38], hash[39],
+          hash[40], hash[41], hash[42], hash[43], hash[44], hash[45], hash[46], hash[47],
+          hash[48], hash[49], hash[50], hash[51], hash[52], hash[53], hash[54], hash[55],
+          hash[56], hash[57], hash[58], hash[59], hash[60], hash[61], hash[62], hash[63]);
+   /*
+   i32 i = 0;
+   for(i = 0; i < 64; i++)
+     printf("%02x", hash[i]);
+   printf("\n");
+   */
+ }
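+
+ /*
+    Example usage (an illustrative sketch, not part of the library):
+
+    #define FAST_SHA2_IMPLEMENTATION
+    #include "fast_sha2.h"
+
+    int main(void)
+    {
+      uc8 *hash = fast_sha2_512((uc8 *)"abc", FALSE);
+      c8 *digest = fast_sha2_512_digest(hash);
+
+      printf("%s\n", digest);
+      free(hash);
+      free(digest);
+      return(0);
+    }
+ */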
+
+ #endif /* FAST_SHA2_IMPLEMENTATION */
+ #endif /* FAST_SHA2_H_Minaswan */