@pinkparrot/qsafe-mayo-wasm 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitmodules +3 -0
- package/.vscode/launch.json +12 -0
- package/LICENSE +201 -0
- package/bridge/mayo1_bridge.c +26 -0
- package/bridge/mayo2_bridge.c +26 -0
- package/bridge/randombytes_inject.c +44 -0
- package/build_mayo1.ps1 +36 -0
- package/build_mayo2.ps1 +36 -0
- package/dist/mayo.browser.min.js +216 -0
- package/dist/mayo1.js +0 -0
- package/dist/mayo2.js +0 -0
- package/dist/mayo_api.js +139 -0
- package/dist/package.json +1 -0
- package/gitignore +2 -0
- package/index.mjs +1 -0
- package/mayo-c/.astylerc +16 -0
- package/mayo-c/.cmake/flags.cmake +45 -0
- package/mayo-c/.cmake/sanitizers.cmake +81 -0
- package/mayo-c/.cmake/target.cmake +71 -0
- package/mayo-c/.github/workflows/ci_clang.yml +61 -0
- package/mayo-c/.github/workflows/ci_gcc.yml +60 -0
- package/mayo-c/.github/workflows/cmake.yml +160 -0
- package/mayo-c/.github/workflows/macos_m1.yml +68 -0
- package/mayo-c/CMakeLists.txt +35 -0
- package/mayo-c/KAT/PQCsignKAT_24_MAYO_1.req +900 -0
- package/mayo-c/KAT/PQCsignKAT_24_MAYO_1.rsp +902 -0
- package/mayo-c/KAT/PQCsignKAT_24_MAYO_2.req +900 -0
- package/mayo-c/KAT/PQCsignKAT_24_MAYO_2.rsp +902 -0
- package/mayo-c/KAT/PQCsignKAT_32_MAYO_3.req +900 -0
- package/mayo-c/KAT/PQCsignKAT_32_MAYO_3.rsp +902 -0
- package/mayo-c/KAT/PQCsignKAT_40_MAYO_5.req +900 -0
- package/mayo-c/KAT/PQCsignKAT_40_MAYO_5.rsp +902 -0
- package/mayo-c/LICENSE +202 -0
- package/mayo-c/META/MAYO-1_META.yml +52 -0
- package/mayo-c/META/MAYO-2_META.yml +52 -0
- package/mayo-c/META/MAYO-3_META.yml +52 -0
- package/mayo-c/META/MAYO-5_META.yml +52 -0
- package/mayo-c/NOTICE +13 -0
- package/mayo-c/README.md +183 -0
- package/mayo-c/apps/CMakeLists.txt +31 -0
- package/mayo-c/apps/PQCgenKAT_sign.c +281 -0
- package/mayo-c/apps/example.c +151 -0
- package/mayo-c/apps/example_nistapi.c +124 -0
- package/mayo-c/include/mayo.h +442 -0
- package/mayo-c/include/mem.h +25 -0
- package/mayo-c/include/randombytes.h +31 -0
- package/mayo-c/scripts/contstants.py +141 -0
- package/mayo-c/scripts/find_irred_poly.sage +39 -0
- package/mayo-c/src/AVX2/arithmetic_common.h +159 -0
- package/mayo-c/src/AVX2/echelon_form.h +91 -0
- package/mayo-c/src/AVX2/echelon_form_loop.h +58 -0
- package/mayo-c/src/AVX2/shuffle_arithmetic.h +442 -0
- package/mayo-c/src/CMakeLists.txt +98 -0
- package/mayo-c/src/arithmetic.c +128 -0
- package/mayo-c/src/arithmetic.h +124 -0
- package/mayo-c/src/common/aes128ctr.c +293 -0
- package/mayo-c/src/common/aes_c.c +741 -0
- package/mayo-c/src/common/aes_ctr.h +32 -0
- package/mayo-c/src/common/aes_neon.c +201 -0
- package/mayo-c/src/common/debug_bench_tools.h +69 -0
- package/mayo-c/src/common/fips202.c +1093 -0
- package/mayo-c/src/common/fips202.h +12 -0
- package/mayo-c/src/common/mem.c +19 -0
- package/mayo-c/src/common/randombytes_ctrdrbg.c +141 -0
- package/mayo-c/src/common/randombytes_system.c +399 -0
- package/mayo-c/src/generic/arithmetic_dynamic.h +68 -0
- package/mayo-c/src/generic/arithmetic_fixed.h +84 -0
- package/mayo-c/src/generic/echelon_form.h +152 -0
- package/mayo-c/src/generic/ef_inner_loop.h +56 -0
- package/mayo-c/src/generic/generic_arithmetic.h +294 -0
- package/mayo-c/src/mayo.c +675 -0
- package/mayo-c/src/mayo_1/api.c +46 -0
- package/mayo-c/src/mayo_1/api.h +43 -0
- package/mayo-c/src/mayo_2/api.c +46 -0
- package/mayo-c/src/mayo_2/api.h +43 -0
- package/mayo-c/src/mayo_3/api.c +46 -0
- package/mayo-c/src/mayo_3/api.h +43 -0
- package/mayo-c/src/mayo_5/api.c +46 -0
- package/mayo-c/src/mayo_5/api.h +43 -0
- package/mayo-c/src/neon/arithmetic_common.h +132 -0
- package/mayo-c/src/neon/echelon_form.h +55 -0
- package/mayo-c/src/neon/echelon_form_loop.h +58 -0
- package/mayo-c/src/neon/shuffle_arithmetic.h +462 -0
- package/mayo-c/src/params.c +42 -0
- package/mayo-c/src/simple_arithmetic.h +138 -0
- package/mayo-c/test/CMakeLists.txt +51 -0
- package/mayo-c/test/bench.c +166 -0
- package/mayo-c/test/m1cycles.c +155 -0
- package/mayo-c/test/m1cycles.h +13 -0
- package/mayo-c/test/test_kat.c +271 -0
- package/mayo-c/test/test_mayo.c +139 -0
- package/mayo-c/test/test_sample_solution.c +75 -0
- package/mayo-c/test/test_various.c +680 -0
- package/package.json +39 -0
- package/publish.bat +22 -0
- package/readme.md +80 -0
- package/test/test.mjs +42 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
|
|
3
|
+
#ifndef AESCTR_H
#define AESCTR_H

#include <stddef.h>
#include <stdint.h>

/* AES-256 ECB entry point; AES_ECB_encrypt is an alias for the same symbol. */
void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
#define AES_ECB_encrypt AES_256_ECB

/*
 * AES-128 CTR backends. The build selects one with ENABLE_AESNI or
 * ENABLE_AESNEON (AES-NI wins if both are defined); without either flag the
 * plain AES_128_CTR function is declared. In every configuration the name
 * AES_128_CTR resolves to the selected backend.
 */
#if defined(ENABLE_AESNI)

int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
                   const unsigned char *input, size_t inputByteLen);
int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
                      const unsigned char *input, size_t inputByteLen);
#define AES_128_CTR AES_128_CTR_NI

#elif defined(ENABLE_AESNEON)

int AES_128_CTR_NEON(unsigned char *output, size_t outputByteLen,
                     const unsigned char *input, size_t inputByteLen);
/* NOTE(review): the "_4R_NI" name is also declared on the NEON path;
 * confirm that a NEON build actually provides a definition for it. */
int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
                      const unsigned char *input, size_t inputByteLen);
#define AES_128_CTR AES_128_CTR_NEON

#else

int AES_128_CTR(unsigned char *output, size_t outputByteLen,
                const unsigned char *input, size_t inputByteLen);

#endif /* backend selection */

#endif /* AESCTR_H */
|
|
32
|
+
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
// SPDX-License-Identifier: MIT and Public Domain
|
|
2
|
+
|
|
3
|
+
#ifdef ENABLE_AESNEON
|
|
4
|
+
|
|
5
|
+
// Code taken from https://github.com/ChristerKnorborg/post-quantum-signature-schemes/blob/main/src/genkat/aes_arm.c
|
|
6
|
+
|
|
7
|
+
/* ARMv8 AES Implementation adapted
|
|
8
|
+
from liboqs/src/common/aes which
|
|
9
|
+
in turn takes it from:
|
|
10
|
+
crypto_core/aes128encrypt/dolbeau/aesenc-int
|
|
11
|
+
(https://bench.cr.yp.to/supercop.html) */
|
|
12
|
+
|
|
13
|
+
#include "mem.h"
|
|
14
|
+
#include <string.h>
|
|
15
|
+
#include <stdlib.h>
|
|
16
|
+
|
|
17
|
+
#if defined(__arm__) || defined(__aarch32__) || defined(__arm64__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
|
|
18
|
+
# if defined(__GNUC__)
|
|
19
|
+
# include <stdint.h>
|
|
20
|
+
# endif
|
|
21
|
+
# if defined(__ARM_NEON) || defined(_MSC_VER)
|
|
22
|
+
# include <arm_neon.h>
|
|
23
|
+
# endif
|
|
24
|
+
/* GCC and LLVM Clang, but not Apple Clang */
|
|
25
|
+
# if defined(__GNUC__) && !defined(__apple_build_version__)
|
|
26
|
+
# if defined(__ARM_ACLE) || defined(__ARM_FEATURE_CRYPTO)
|
|
27
|
+
# include <arm_acle.h>
|
|
28
|
+
# endif
|
|
29
|
+
# endif
|
|
30
|
+
#endif /* ARM Headers */
|
|
31
|
+
|
|
32
|
+
/*
 * AES forward substitution box. Indexed by the full input byte; each row
 * below holds the 16 entries that share the same high nibble.
 */
static const uint8_t sbox[256] = {
    /* 0x0_ */ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
    /* 0x1_ */ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
    /* 0x2_ */ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
    /* 0x3_ */ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
    /* 0x4_ */ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
    /* 0x5_ */ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
    /* 0x6_ */ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
    /* 0x7_ */ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
    /* 0x8_ */ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
    /* 0x9_ */ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
    /* 0xA_ */ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
    /* 0xB_ */ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
    /* 0xC_ */ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
    /* 0xD_ */ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
    /* 0xE_ */ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
    /* 0xF_ */ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
};
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
// subword algorithm used in the aes key scheduling.
|
|
56
|
+
uint32_t subword(uint32_t word) {
|
|
57
|
+
return (uint32_t)sbox[(word >> 24) & 0xFF] << 24 |
|
|
58
|
+
(uint32_t)sbox[(word >> 16) & 0xFF] << 16 |
|
|
59
|
+
(uint32_t)sbox[(word >> 8) & 0xFF] << 8 |
|
|
60
|
+
(uint32_t)sbox[word & 0xFF];
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/*
 * Key-generation helper for the AES key schedule (the name and lane layout
 * appear to mirror the x86 AESKEYGENASSIST instruction).
 *
 * Only 32-bit lanes 1 and 3 of `a32` are read. The result is laid out as:
 *   lane 0: SubWord(a32[1])
 *   lane 1: (SubWord(a32[1]) rotated right by 8 bits) ^ rcon
 *   lane 2: SubWord(a32[3])
 *   lane 3: (SubWord(a32[3]) rotated right by 8 bits) ^ rcon
 */
uint32x4_t aeskeygenassist(uint32x4_t a32, uint8_t rcon) {
    const uint32_t sub_lane1 = subword(vgetq_lane_u32(a32, 1));
    const uint32_t sub_lane3 = subword(vgetq_lane_u32(a32, 3));

    /* All four output lanes are overwritten, so build them in memory and
     * load the vector once. */
    const uint32_t lanes[4] = {
        sub_lane1,
        ((sub_lane1 >> 8) | (sub_lane1 << 24)) ^ (uint32_t)rcon,
        sub_lane3,
        ((sub_lane3 >> 8) | (sub_lane3 << 24)) ^ (uint32_t)rcon
    };

    return vld1q_u32(lanes);
}
|
|
89
|
+
|
|
90
|
+
/*
 * Expands a 16-byte AES-128 key into 11 round keys.
 *
 * key   - 16 bytes are read from this pointer.
 * rkeys - receives rkeys[0] .. rkeys[10]; rkeys[0] is the key itself.
 */
static void aes_setkey_encrypt(const unsigned char *key, uint8x16_t rkeys[]) {
    static const uint8_t round_constants[10] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
    };
    const uint32x4_t zero = vdupq_n_u32(0);
    uint32x4_t words = vreinterpretq_u32_u8(vld1q_u8((const uint8_t *)(key)));

    for (int round = 0; round < 10; round++) {
        rkeys[round] = vreinterpretq_u8_u32(words);

        /* Lane 3 of the assist vector is the rotated, substituted last word
         * XOR the round constant; it must be taken from the current round
         * key before `words` is modified below. */
        const uint32_t mix = vgetq_lane_u32(aeskeygenassist(words, round_constants[round]), 3);

        /* Prefix XOR over the four words [a, b, c, d]:
         *   ^= [0, a, b, c]      -> [a, a^b, b^c, c^d]
         *   ^= [0, 0, a, a^b]    -> [a, a^b, a^b^c, a^b^c^d] */
        words = veorq_u32(words, vextq_u32(zero, words, 3));
        words = veorq_u32(words, vextq_u32(zero, words, 2));

        /* Fold the mixed word into every lane to obtain the next round key. */
        words = veorq_u32(words, vdupq_n_u32(mix));
    }

    rkeys[10] = vreinterpretq_u8_u32(words);
}
|
|
121
|
+
|
|
122
|
+
void arm_aes128_load_schedule(const uint8_t *key, void **_schedule) {
|
|
123
|
+
*_schedule = malloc(11 * sizeof(uint8x16_t));
|
|
124
|
+
// assert(*_schedule != NULL);
|
|
125
|
+
uint8x16_t *schedule = (uint8x16_t *)*_schedule;
|
|
126
|
+
aes_setkey_encrypt(key, schedule);
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
// AES encryption using NEON intrinsics. Round constants are 0 as the function mimics Intel AVX2 implementation,
|
|
131
|
+
// which applies in order: ShiftRows, SubBytes, MixColumns, AddRoundKey.
|
|
132
|
+
// vaeseq_u8 applies SubBytes, ShiftRows, AddRoundKey.
|
|
133
|
+
static void arm_aes128_encrypt(const uint8x16_t rkeys[11], uint8x16_t nv, unsigned char *out) {
|
|
134
|
+
|
|
135
|
+
uint8x16_t temp = vaeseq_u8(nv, rkeys[0]);
|
|
136
|
+
|
|
137
|
+
temp = vaesmcq_u8(temp);
|
|
138
|
+
temp = vaeseq_u8(temp, rkeys[1]);
|
|
139
|
+
|
|
140
|
+
temp = vaesmcq_u8(temp);
|
|
141
|
+
temp = vaeseq_u8(temp, rkeys[2]);
|
|
142
|
+
|
|
143
|
+
temp = vaesmcq_u8(temp);
|
|
144
|
+
temp = vaeseq_u8(temp, rkeys[3]);
|
|
145
|
+
|
|
146
|
+
temp = vaesmcq_u8(temp);
|
|
147
|
+
temp = vaeseq_u8(temp, rkeys[4]);
|
|
148
|
+
|
|
149
|
+
temp = vaesmcq_u8(temp);
|
|
150
|
+
temp = vaeseq_u8(temp, rkeys[5]);
|
|
151
|
+
|
|
152
|
+
temp = vaesmcq_u8(temp);
|
|
153
|
+
temp = vaeseq_u8(temp, rkeys[6]);
|
|
154
|
+
|
|
155
|
+
temp = vaesmcq_u8(temp);
|
|
156
|
+
temp = vaeseq_u8(temp, rkeys[7]);
|
|
157
|
+
|
|
158
|
+
temp = vaesmcq_u8(temp);
|
|
159
|
+
temp = vaeseq_u8(temp, rkeys[8]);
|
|
160
|
+
|
|
161
|
+
temp = vaesmcq_u8(temp);
|
|
162
|
+
temp = vaeseq_u8(temp, rkeys[9]);
|
|
163
|
+
|
|
164
|
+
temp = veorq_u8(temp, rkeys[10]);
|
|
165
|
+
vst1q_u8(out, temp);
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
void arm_aes128_free_schedule(void *schedule) {
|
|
169
|
+
if (schedule != NULL) {
|
|
170
|
+
mayo_secure_free(schedule, 11 * sizeof(uint16x8_t));
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
static void arm_aes128_ctr_enc_sch(const void *schedule, uint8_t *out,
|
|
176
|
+
size_t out_len) {
|
|
177
|
+
uint8x16_t mask = {0, 1, 2, 3, 4, 5, 6, 7, 15, 14, 13, 12, 11, 10, 9, 8};
|
|
178
|
+
uint8x16_t block = vdupq_n_u8(0); // Initialize block to zero
|
|
179
|
+
while (out_len >= 16) {
|
|
180
|
+
arm_aes128_encrypt(schedule, block, out);
|
|
181
|
+
out += 16;
|
|
182
|
+
out_len -= 16;
|
|
183
|
+
block = vqtbl1q_u8(vreinterpretq_u8_u64(vaddq_u64(vreinterpretq_u64_u8(vqtbl1q_u8(block, mask)), (uint64x2_t) {0,1}) ), mask);
|
|
184
|
+
}
|
|
185
|
+
if (out_len > 0) {
|
|
186
|
+
uint8_t tmp[16];
|
|
187
|
+
arm_aes128_encrypt(schedule, block, tmp);
|
|
188
|
+
memcpy(out, tmp, out_len);
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
/*
 * AES-128 CTR output generator, NEON backend.
 *
 * output        - buffer that receives the generated bytes.
 * outputByteLen - number of bytes to generate.
 * input         - used as the AES-128 key; the key expansion reads 16 bytes.
 * inputByteLen  - accepted so that the definition matches the four-argument
 *                 prototype in aes_ctr.h; the value is not used.
 *
 * Returns outputByteLen converted to int.
 */
int AES_128_CTR_NEON(unsigned char *output, size_t outputByteLen,
                     const unsigned char *input, size_t inputByteLen) {
    void *schedule = NULL;

    (void)inputByteLen;

    arm_aes128_load_schedule(input, &schedule);
    arm_aes128_ctr_enc_sch(schedule, output, outputByteLen);
    arm_aes128_free_schedule(schedule);
    return (int)outputByteLen;
}
|
|
200
|
+
|
|
201
|
+
#endif
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
// SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
|
|
3
|
+
#ifndef DEBUG_BENCH_TOOLS_H
|
|
4
|
+
#define DEBUG_BENCH_TOOLS_H
|
|
5
|
+
|
|
6
|
+
#include <stdio.h>
|
|
7
|
+
#include <sys/time.h>
|
|
8
|
+
|
|
9
|
+
/*
 * Returns a timestamp for benchmarking.
 *
 * - TARGET_AMD64 / TARGET_X86: the 64-bit value read with the rdtsc
 *   instruction.
 * - TARGET_S390X: the value stored by stckf, scaled by 1000/4096.
 * - otherwise: CLOCK_REALTIME converted to nanoseconds.
 *
 * The unit therefore depends on the target; only differences between two
 * calls on the same machine are meaningful.
 */
static inline int64_t cpucycles(void) {
#if (defined(TARGET_AMD64) || defined(TARGET_X86))
    unsigned int hi, lo;

    /* __asm__ is accepted in strict ISO modes as well, unlike plain asm. */
    __asm__ volatile ("rdtsc" : "=a" (lo), "=d"(hi));
    /* Combine the halves as unsigned so nothing is shifted into a sign bit. */
    return (int64_t) ((((uint64_t) hi) << 32) | ((uint64_t) lo));
#elif (defined(TARGET_S390X))
    uint64_t tod;
    __asm__ volatile("stckf %0\n" : "=Q" (tod) : : "cc");
    return (tod * 1000 / 4096);
#else
    struct timespec time;
    clock_gettime(CLOCK_REALTIME, &time);
    /* Integer arithmetic only: going through a double (tv_sec * 1e9) cannot
     * represent present-day epoch values to nanosecond resolution. */
    return (int64_t) time.tv_sec * 1000000000 + (int64_t) time.tv_nsec;
#endif
}
|
|
25
|
+
|
|
26
|
+
#ifdef TICTOC
/*
 * TIC prints a blank line and declares the local `tic_toc_cycles`,
 * initialised with the current cpucycles() value. Because it declares a
 * variable it can be used once per scope.
 */
#define TIC printf("\n"); \
    int64_t tic_toc_cycles = cpucycles();

/*
 * TOC(name) prints the cpucycles() delta since the previous TIC/TOC and
 * restarts the measurement. The int64_t difference is cast to long long so
 * that the argument matches the %lld conversion on every platform.
 */
#define TOC(name) printf(" %-30s cycles: %lld \n", name, (long long) (cpucycles() - tic_toc_cycles)); \
    tic_toc_cycles = cpucycles();
#else
/* Timing disabled: both macros expand to nothing. */
#define TIC
#define TOC(name)
#endif
|
|
36
|
+
|
|
37
|
+
#ifdef MAYO_AVX
|
|
38
|
+
|
|
39
|
+
#include <immintrin.h>
|
|
40
|
+
|
|
41
|
+
/*
 * Debug helper: prints the 32 bytes of `a` to stdout as hex nibbles, low
 * nibble first for every byte, with a space after each group of four bytes
 * and a newline at the end.
 */
static inline void print_avx2(__m256i a){
    const unsigned char *bytes = (const unsigned char *) &a;
    size_t pos = 0;

    while (pos < 32) {
        printf("%X%X", bytes[pos] & 0xf, bytes[pos] >> 4);
        pos++;
        if (pos % 4 == 0) {
            putchar(' ');
        }
    }
    putchar('\n');
}
|
|
53
|
+
|
|
54
|
+
/*
 * Debug helper: prints the 32 bytes of `a` to stdout as hex nibbles, low
 * nibble first for every byte, with a space after each group of four bytes.
 * No trailing newline is written.
 */
static inline void print_avx2_(__m256i a){
    const unsigned char *bytes = (const unsigned char *) &a;
    size_t pos = 0;

    while (pos < 32) {
        printf("%X%X", bytes[pos] & 0xf, bytes[pos] >> 4);
        pos++;
        if (pos % 4 == 0) {
            putchar(' ');
        }
    }
}
|
|
65
|
+
|
|
66
|
+
#endif
|
|
67
|
+
|
|
68
|
+
#endif
|
|
69
|
+
|