blake2b 0.9.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1c78ef171664fbbac65fc70a86d9b123356a768cdb46ad30fafd82bf3399deea
4
- data.tar.gz: 7ca48369241e2a3ec69de3acd39eb2828f5178ccae2a09b3254a932409a313fa
3
+ metadata.gz: 1dc7b3e51fa897480c2b5c8e255b2b0fba72fde3945612bbcdd333e79b5254da
4
+ data.tar.gz: 06637f83a411f3b24b9b31b09b997e1cbc2fe264d583d52fe646d97c7f88b63e
5
5
  SHA512:
6
- metadata.gz: 2d37aa6fc2b2eaf050c8885f1a8340625bb038636fe4c217728d2c71a198b4a03018f22387a411a09704e8b2d1f88755579d739bf90c172eff14b32af4bfd96d
7
- data.tar.gz: 15146aff85dcccd447bcc1d2108951191861ced8863cd69c44809ce03c669da37922fbd8f8f56838ecdd2c3e613fb6f5c29b61654c4032e5dd65adba5d949b71
6
+ metadata.gz: 5ae81bc780bf5f4391bcd90f85cabefb3c1c735555df20a3dec205cbed741c298631566c0f05bf850fa13a50074a7311a061f84fa7a2d17c33f58600be25a19f
7
+ data.tar.gz: 5d4bd5931b54044d5729fea12b1d65f22d7643a111a1967d28a2907b1e86aa070855f06194c4fcd4d9fcc2f20fea84319d4af6022ef54660f1efd2efb44ff2c4
data/README.md CHANGED
@@ -4,19 +4,19 @@ BLAKE2 is a cryptographic hash function faster than MD5, SHA-1, SHA-2, and SHA-3
4
4
 
5
5
  More info at: [https://blake2.net](https://blake2.net).
6
6
 
7
- ## SUMMARY
7
+ ## Summary
8
8
 
9
- This gem is a C-extension for using BLAKE2b in Ruby. BLAKE2b (or just BLAKE2) is optimized for 64-bit platforms—including NEON-enabled ARMs—and produces digests of any size between 1 and 64 bytes.
9
+ This gem is a C extension that enables the use of BLAKE2b in Ruby. This implementation of BLAKE2b (often called just BLAKE2) is optimized for 64-bit platforms with SSE support (excluding NEON-enabled ARMs). It produces digests of any size between 1 and 64 bytes.
10
10
 
11
11
  The C code for this gem is taken from the [official reference C implementation](https://github.com/BLAKE2/BLAKE2) as of commit [ca4c89314abff54e3806b44e4a08164f8204f09a](https://github.com/BLAKE2/BLAKE2/tree/ca4c89314abff54e3806b44e4a08164f8204f09a).
12
12
 
13
- ## INSTALL
13
+ ## Install
14
14
 
15
15
  ```
16
16
  gem install blake2b
17
17
  ```
18
18
 
19
- ## USAGE
19
+ ## Usage
20
20
 
21
21
  ``` ruby
22
22
  require 'blake2b'
@@ -60,7 +60,41 @@ Blake2b.bytes(input, key, out_len)
60
60
 
61
61
  ```
62
62
 
63
- ## DEVELOPMENT
63
+ ## Performance
64
+
65
+ `Blake2b` really shines on larger inputs. Here are some benchmarks for various input sizes (elapsed time in seconds for the stated number of digests; lower is better). You can find the performance suite used for these benchmarks at `performance/performance_suite.rb`. All tests were run on a Late 2014 27" iMac with a 4GHz Core i7 CPU (4790K) supporting SSE4.1 and SSE4.2, and 32GB of DDR3 RAM.
66
+
67
+ ### 1KB (1M digests)
68
+
69
+ ```
70
+ MD5 result: 2.694545999998809 seconds.
71
+ SHA2 result: 4.037195000011707 seconds.
72
+ SHA512 result: 3.213850000000093 seconds.
73
+ BLAKE2s result: 5.6867979999951785 seconds.
74
+ BLAKE2b result: 4.375018999999156 seconds.
75
+ ```
76
+
77
+ ### 50KB (500k digests)
78
+
79
+ ```
80
+ MD5 result: 34.33997299999464 seconds.
81
+ SHA2 result: 50.161426999999094 seconds.
82
+ SHA512 result: 35.24845699999423 seconds.
83
+ BLAKE2s result: 64.8592859999917 seconds.
84
+ BLAKE2b result: 30.783814999987953 seconds.
85
+ ```
86
+
87
+ ### 250KB (500k digests)
88
+
89
+ ```
90
+ MD5 result: 67.89016799999808 seconds.
91
+ SHA2 result: 103.09026799999992 seconds.
92
+ SHA512 result: 72.46762200001103 seconds.
93
+ BLAKE2s result: 133.5229810000019 seconds.
94
+ BLAKE2b result: 64.30263599999307 seconds.
95
+ ```
96
+
97
+ ## Development
64
98
 
65
99
  After checking out the repo, run `bundle` to install dependencies. Then,
66
100
  run `rake full` to build and test, or `rake test` to only run the tests.
@@ -1,7 +1,7 @@
1
1
  # coding: utf-8
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "blake2b"
4
- spec.version = "0.9.0"
4
+ spec.version = "0.10.0"
5
5
  spec.authors = ["Franck Verrot", "Mauricio Gomes"]
6
6
  spec.email = ["mauricio@edge14.com"]
7
7
  spec.homepage = "https://github.com/mgomes/blake2b"
@@ -0,0 +1,72 @@
1
+ /*
2
+ BLAKE2 reference source code package - optimized C implementations
3
+
4
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
5
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
6
+ your option. The terms of these licenses can be found at:
7
+
8
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
9
+ - OpenSSL license : https://www.openssl.org/source/license.html
10
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ More information about the BLAKE2 hash function can be found at
13
+ https://blake2.net.
14
+ */
15
+ #ifndef BLAKE2_CONFIG_H
16
+ #define BLAKE2_CONFIG_H
17
+
18
+ /* These don't work everywhere */
19
+ #if defined(__SSE2__) || defined(__x86_64__) || defined(__amd64__)
20
+ #define HAVE_SSE2
21
+ #endif
22
+
23
+ #if defined(__SSSE3__)
24
+ #define HAVE_SSSE3
25
+ #endif
26
+
27
+ #if defined(__SSE4_1__)
28
+ #define HAVE_SSE41
29
+ #endif
30
+
31
+ #if defined(__AVX__)
32
+ #define HAVE_AVX
33
+ #endif
34
+
35
+ #if defined(__XOP__)
36
+ #define HAVE_XOP
37
+ #endif
38
+
39
+
40
+ #ifdef HAVE_AVX2
41
+ #ifndef HAVE_AVX
42
+ #define HAVE_AVX
43
+ #endif
44
+ #endif
45
+
46
+ #ifdef HAVE_XOP
47
+ #ifndef HAVE_AVX
48
+ #define HAVE_AVX
49
+ #endif
50
+ #endif
51
+
52
+ #ifdef HAVE_AVX
53
+ #ifndef HAVE_SSE41
54
+ #define HAVE_SSE41
55
+ #endif
56
+ #endif
57
+
58
+ #ifdef HAVE_SSE41
59
+ #ifndef HAVE_SSSE3
60
+ #define HAVE_SSSE3
61
+ #endif
62
+ #endif
63
+
64
+ #ifdef HAVE_SSSE3
65
+ #define HAVE_SSE2
66
+ #endif
67
+
68
+ #if !defined(HAVE_SSE2)
69
+ #error "This code requires at least SSE2."
70
+ #endif
71
+
72
+ #endif
@@ -0,0 +1,68 @@
1
+ /*
2
+ BLAKE2 reference source code package - optimized C implementations
3
+
4
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
5
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
6
+ your option. The terms of these licenses can be found at:
7
+
8
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
9
+ - OpenSSL license : https://www.openssl.org/source/license.html
10
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ More information about the BLAKE2 hash function can be found at
13
+ https://blake2.net.
14
+ */
15
+ #ifndef BLAKE2B_LOAD_SSE2_H
16
+ #define BLAKE2B_LOAD_SSE2_H
17
+
18
+ #define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
19
+ #define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
20
+ #define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
21
+ #define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
22
+ #define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
23
+ #define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
24
+ #define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
25
+ #define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
26
+ #define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5)
27
+ #define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2)
28
+ #define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7)
29
+ #define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1)
30
+ #define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13)
31
+ #define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12)
32
+ #define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4)
33
+ #define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0)
34
+ #define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2)
35
+ #define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4)
36
+ #define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6)
37
+ #define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8)
38
+ #define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0)
39
+ #define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11)
40
+ #define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15)
41
+ #define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14)
42
+ #define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14)
43
+ #define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13)
44
+ #define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9)
45
+ #define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2)
46
+ #define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12)
47
+ #define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1)
48
+ #define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8)
49
+ #define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6)
50
+ #define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11)
51
+ #define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3)
52
+ #define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1)
53
+ #define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4)
54
+ #define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7)
55
+ #define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6)
56
+ #define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3)
57
+ #define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12)
58
+ #define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4)
59
+ #define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5)
60
+ #define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12)
61
+ #define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13)
62
+ #define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9)
63
+ #define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15)
64
+ #define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11)
65
+ #define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7)
66
+
67
+
68
+ #endif
@@ -0,0 +1,402 @@
1
+ /*
2
+ BLAKE2 reference source code package - optimized C implementations
3
+
4
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
5
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
6
+ your option. The terms of these licenses can be found at:
7
+
8
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
9
+ - OpenSSL license : https://www.openssl.org/source/license.html
10
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ More information about the BLAKE2 hash function can be found at
13
+ https://blake2.net.
14
+ */
15
+ #ifndef BLAKE2B_LOAD_SSE41_H
16
+ #define BLAKE2B_LOAD_SSE41_H
17
+
18
+ #define LOAD_MSG_0_1(b0, b1) \
19
+ do \
20
+ { \
21
+ b0 = _mm_unpacklo_epi64(m0, m1); \
22
+ b1 = _mm_unpacklo_epi64(m2, m3); \
23
+ } while(0)
24
+
25
+
26
+ #define LOAD_MSG_0_2(b0, b1) \
27
+ do \
28
+ { \
29
+ b0 = _mm_unpackhi_epi64(m0, m1); \
30
+ b1 = _mm_unpackhi_epi64(m2, m3); \
31
+ } while(0)
32
+
33
+
34
+ #define LOAD_MSG_0_3(b0, b1) \
35
+ do \
36
+ { \
37
+ b0 = _mm_unpacklo_epi64(m4, m5); \
38
+ b1 = _mm_unpacklo_epi64(m6, m7); \
39
+ } while(0)
40
+
41
+
42
+ #define LOAD_MSG_0_4(b0, b1) \
43
+ do \
44
+ { \
45
+ b0 = _mm_unpackhi_epi64(m4, m5); \
46
+ b1 = _mm_unpackhi_epi64(m6, m7); \
47
+ } while(0)
48
+
49
+
50
+ #define LOAD_MSG_1_1(b0, b1) \
51
+ do \
52
+ { \
53
+ b0 = _mm_unpacklo_epi64(m7, m2); \
54
+ b1 = _mm_unpackhi_epi64(m4, m6); \
55
+ } while(0)
56
+
57
+
58
+ #define LOAD_MSG_1_2(b0, b1) \
59
+ do \
60
+ { \
61
+ b0 = _mm_unpacklo_epi64(m5, m4); \
62
+ b1 = _mm_alignr_epi8(m3, m7, 8); \
63
+ } while(0)
64
+
65
+
66
+ #define LOAD_MSG_1_3(b0, b1) \
67
+ do \
68
+ { \
69
+ b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
70
+ b1 = _mm_unpackhi_epi64(m5, m2); \
71
+ } while(0)
72
+
73
+
74
+ #define LOAD_MSG_1_4(b0, b1) \
75
+ do \
76
+ { \
77
+ b0 = _mm_unpacklo_epi64(m6, m1); \
78
+ b1 = _mm_unpackhi_epi64(m3, m1); \
79
+ } while(0)
80
+
81
+
82
+ #define LOAD_MSG_2_1(b0, b1) \
83
+ do \
84
+ { \
85
+ b0 = _mm_alignr_epi8(m6, m5, 8); \
86
+ b1 = _mm_unpackhi_epi64(m2, m7); \
87
+ } while(0)
88
+
89
+
90
+ #define LOAD_MSG_2_2(b0, b1) \
91
+ do \
92
+ { \
93
+ b0 = _mm_unpacklo_epi64(m4, m0); \
94
+ b1 = _mm_blend_epi16(m1, m6, 0xF0); \
95
+ } while(0)
96
+
97
+
98
+ #define LOAD_MSG_2_3(b0, b1) \
99
+ do \
100
+ { \
101
+ b0 = _mm_blend_epi16(m5, m1, 0xF0); \
102
+ b1 = _mm_unpackhi_epi64(m3, m4); \
103
+ } while(0)
104
+
105
+
106
+ #define LOAD_MSG_2_4(b0, b1) \
107
+ do \
108
+ { \
109
+ b0 = _mm_unpacklo_epi64(m7, m3); \
110
+ b1 = _mm_alignr_epi8(m2, m0, 8); \
111
+ } while(0)
112
+
113
+
114
+ #define LOAD_MSG_3_1(b0, b1) \
115
+ do \
116
+ { \
117
+ b0 = _mm_unpackhi_epi64(m3, m1); \
118
+ b1 = _mm_unpackhi_epi64(m6, m5); \
119
+ } while(0)
120
+
121
+
122
+ #define LOAD_MSG_3_2(b0, b1) \
123
+ do \
124
+ { \
125
+ b0 = _mm_unpackhi_epi64(m4, m0); \
126
+ b1 = _mm_unpacklo_epi64(m6, m7); \
127
+ } while(0)
128
+
129
+
130
+ #define LOAD_MSG_3_3(b0, b1) \
131
+ do \
132
+ { \
133
+ b0 = _mm_blend_epi16(m1, m2, 0xF0); \
134
+ b1 = _mm_blend_epi16(m2, m7, 0xF0); \
135
+ } while(0)
136
+
137
+
138
+ #define LOAD_MSG_3_4(b0, b1) \
139
+ do \
140
+ { \
141
+ b0 = _mm_unpacklo_epi64(m3, m5); \
142
+ b1 = _mm_unpacklo_epi64(m0, m4); \
143
+ } while(0)
144
+
145
+
146
+ #define LOAD_MSG_4_1(b0, b1) \
147
+ do \
148
+ { \
149
+ b0 = _mm_unpackhi_epi64(m4, m2); \
150
+ b1 = _mm_unpacklo_epi64(m1, m5); \
151
+ } while(0)
152
+
153
+
154
+ #define LOAD_MSG_4_2(b0, b1) \
155
+ do \
156
+ { \
157
+ b0 = _mm_blend_epi16(m0, m3, 0xF0); \
158
+ b1 = _mm_blend_epi16(m2, m7, 0xF0); \
159
+ } while(0)
160
+
161
+
162
+ #define LOAD_MSG_4_3(b0, b1) \
163
+ do \
164
+ { \
165
+ b0 = _mm_blend_epi16(m7, m5, 0xF0); \
166
+ b1 = _mm_blend_epi16(m3, m1, 0xF0); \
167
+ } while(0)
168
+
169
+
170
+ #define LOAD_MSG_4_4(b0, b1) \
171
+ do \
172
+ { \
173
+ b0 = _mm_alignr_epi8(m6, m0, 8); \
174
+ b1 = _mm_blend_epi16(m4, m6, 0xF0); \
175
+ } while(0)
176
+
177
+
178
+ #define LOAD_MSG_5_1(b0, b1) \
179
+ do \
180
+ { \
181
+ b0 = _mm_unpacklo_epi64(m1, m3); \
182
+ b1 = _mm_unpacklo_epi64(m0, m4); \
183
+ } while(0)
184
+
185
+
186
+ #define LOAD_MSG_5_2(b0, b1) \
187
+ do \
188
+ { \
189
+ b0 = _mm_unpacklo_epi64(m6, m5); \
190
+ b1 = _mm_unpackhi_epi64(m5, m1); \
191
+ } while(0)
192
+
193
+
194
+ #define LOAD_MSG_5_3(b0, b1) \
195
+ do \
196
+ { \
197
+ b0 = _mm_blend_epi16(m2, m3, 0xF0); \
198
+ b1 = _mm_unpackhi_epi64(m7, m0); \
199
+ } while(0)
200
+
201
+
202
+ #define LOAD_MSG_5_4(b0, b1) \
203
+ do \
204
+ { \
205
+ b0 = _mm_unpackhi_epi64(m6, m2); \
206
+ b1 = _mm_blend_epi16(m7, m4, 0xF0); \
207
+ } while(0)
208
+
209
+
210
+ #define LOAD_MSG_6_1(b0, b1) \
211
+ do \
212
+ { \
213
+ b0 = _mm_blend_epi16(m6, m0, 0xF0); \
214
+ b1 = _mm_unpacklo_epi64(m7, m2); \
215
+ } while(0)
216
+
217
+
218
+ #define LOAD_MSG_6_2(b0, b1) \
219
+ do \
220
+ { \
221
+ b0 = _mm_unpackhi_epi64(m2, m7); \
222
+ b1 = _mm_alignr_epi8(m5, m6, 8); \
223
+ } while(0)
224
+
225
+
226
+ #define LOAD_MSG_6_3(b0, b1) \
227
+ do \
228
+ { \
229
+ b0 = _mm_unpacklo_epi64(m0, m3); \
230
+ b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \
231
+ } while(0)
232
+
233
+
234
+ #define LOAD_MSG_6_4(b0, b1) \
235
+ do \
236
+ { \
237
+ b0 = _mm_unpackhi_epi64(m3, m1); \
238
+ b1 = _mm_blend_epi16(m1, m5, 0xF0); \
239
+ } while(0)
240
+
241
+
242
+ #define LOAD_MSG_7_1(b0, b1) \
243
+ do \
244
+ { \
245
+ b0 = _mm_unpackhi_epi64(m6, m3); \
246
+ b1 = _mm_blend_epi16(m6, m1, 0xF0); \
247
+ } while(0)
248
+
249
+
250
+ #define LOAD_MSG_7_2(b0, b1) \
251
+ do \
252
+ { \
253
+ b0 = _mm_alignr_epi8(m7, m5, 8); \
254
+ b1 = _mm_unpackhi_epi64(m0, m4); \
255
+ } while(0)
256
+
257
+
258
+ #define LOAD_MSG_7_3(b0, b1) \
259
+ do \
260
+ { \
261
+ b0 = _mm_unpackhi_epi64(m2, m7); \
262
+ b1 = _mm_unpacklo_epi64(m4, m1); \
263
+ } while(0)
264
+
265
+
266
+ #define LOAD_MSG_7_4(b0, b1) \
267
+ do \
268
+ { \
269
+ b0 = _mm_unpacklo_epi64(m0, m2); \
270
+ b1 = _mm_unpacklo_epi64(m3, m5); \
271
+ } while(0)
272
+
273
+
274
+ #define LOAD_MSG_8_1(b0, b1) \
275
+ do \
276
+ { \
277
+ b0 = _mm_unpacklo_epi64(m3, m7); \
278
+ b1 = _mm_alignr_epi8(m0, m5, 8); \
279
+ } while(0)
280
+
281
+
282
+ #define LOAD_MSG_8_2(b0, b1) \
283
+ do \
284
+ { \
285
+ b0 = _mm_unpackhi_epi64(m7, m4); \
286
+ b1 = _mm_alignr_epi8(m4, m1, 8); \
287
+ } while(0)
288
+
289
+
290
+ #define LOAD_MSG_8_3(b0, b1) \
291
+ do \
292
+ { \
293
+ b0 = m6; \
294
+ b1 = _mm_alignr_epi8(m5, m0, 8); \
295
+ } while(0)
296
+
297
+
298
+ #define LOAD_MSG_8_4(b0, b1) \
299
+ do \
300
+ { \
301
+ b0 = _mm_blend_epi16(m1, m3, 0xF0); \
302
+ b1 = m2; \
303
+ } while(0)
304
+
305
+
306
+ #define LOAD_MSG_9_1(b0, b1) \
307
+ do \
308
+ { \
309
+ b0 = _mm_unpacklo_epi64(m5, m4); \
310
+ b1 = _mm_unpackhi_epi64(m3, m0); \
311
+ } while(0)
312
+
313
+
314
+ #define LOAD_MSG_9_2(b0, b1) \
315
+ do \
316
+ { \
317
+ b0 = _mm_unpacklo_epi64(m1, m2); \
318
+ b1 = _mm_blend_epi16(m3, m2, 0xF0); \
319
+ } while(0)
320
+
321
+
322
+ #define LOAD_MSG_9_3(b0, b1) \
323
+ do \
324
+ { \
325
+ b0 = _mm_unpackhi_epi64(m7, m4); \
326
+ b1 = _mm_unpackhi_epi64(m1, m6); \
327
+ } while(0)
328
+
329
+
330
+ #define LOAD_MSG_9_4(b0, b1) \
331
+ do \
332
+ { \
333
+ b0 = _mm_alignr_epi8(m7, m5, 8); \
334
+ b1 = _mm_unpacklo_epi64(m6, m0); \
335
+ } while(0)
336
+
337
+
338
+ #define LOAD_MSG_10_1(b0, b1) \
339
+ do \
340
+ { \
341
+ b0 = _mm_unpacklo_epi64(m0, m1); \
342
+ b1 = _mm_unpacklo_epi64(m2, m3); \
343
+ } while(0)
344
+
345
+
346
+ #define LOAD_MSG_10_2(b0, b1) \
347
+ do \
348
+ { \
349
+ b0 = _mm_unpackhi_epi64(m0, m1); \
350
+ b1 = _mm_unpackhi_epi64(m2, m3); \
351
+ } while(0)
352
+
353
+
354
+ #define LOAD_MSG_10_3(b0, b1) \
355
+ do \
356
+ { \
357
+ b0 = _mm_unpacklo_epi64(m4, m5); \
358
+ b1 = _mm_unpacklo_epi64(m6, m7); \
359
+ } while(0)
360
+
361
+
362
+ #define LOAD_MSG_10_4(b0, b1) \
363
+ do \
364
+ { \
365
+ b0 = _mm_unpackhi_epi64(m4, m5); \
366
+ b1 = _mm_unpackhi_epi64(m6, m7); \
367
+ } while(0)
368
+
369
+
370
+ #define LOAD_MSG_11_1(b0, b1) \
371
+ do \
372
+ { \
373
+ b0 = _mm_unpacklo_epi64(m7, m2); \
374
+ b1 = _mm_unpackhi_epi64(m4, m6); \
375
+ } while(0)
376
+
377
+
378
+ #define LOAD_MSG_11_2(b0, b1) \
379
+ do \
380
+ { \
381
+ b0 = _mm_unpacklo_epi64(m5, m4); \
382
+ b1 = _mm_alignr_epi8(m3, m7, 8); \
383
+ } while(0)
384
+
385
+
386
+ #define LOAD_MSG_11_3(b0, b1) \
387
+ do \
388
+ { \
389
+ b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \
390
+ b1 = _mm_unpackhi_epi64(m5, m2); \
391
+ } while(0)
392
+
393
+
394
+ #define LOAD_MSG_11_4(b0, b1) \
395
+ do \
396
+ { \
397
+ b0 = _mm_unpacklo_epi64(m6, m1); \
398
+ b1 = _mm_unpackhi_epi64(m3, m1); \
399
+ } while(0)
400
+
401
+
402
+ #endif
@@ -1,5 +1,5 @@
1
1
  /*
2
- BLAKE2 reference source code package - reference C implementations
2
+ BLAKE2 reference source code package - optimized C implementations
3
3
 
4
4
  Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
5
5
  terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
@@ -20,6 +20,27 @@
20
20
  #include "blake2.h"
21
21
  #include "blake2-impl.h"
22
22
 
23
+ #include "blake2-config.h"
24
+
25
+ #ifdef _MSC_VER
26
+ #include <intrin.h> /* for _mm_set_epi64x */
27
+ #endif
28
+ #include <emmintrin.h>
29
+ #if defined(HAVE_SSSE3)
30
+ #include <tmmintrin.h>
31
+ #endif
32
+ #if defined(HAVE_SSE41)
33
+ #include <smmintrin.h>
34
+ #endif
35
+ #if defined(HAVE_AVX)
36
+ #include <immintrin.h>
37
+ #endif
38
+ #if defined(HAVE_XOP)
39
+ #include <x86intrin.h>
40
+ #endif
41
+
42
+ #include "blake2b-round.h"
43
+
23
44
  static const uint64_t blake2b_IV[8] =
24
45
  {
25
46
  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
@@ -28,29 +49,12 @@ static const uint64_t blake2b_IV[8] =
28
49
  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
29
50
  };
30
51
 
31
- static const uint8_t blake2b_sigma[12][16] =
32
- {
33
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
34
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
35
- { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
36
- { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
37
- { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
38
- { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
39
- { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
40
- { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
41
- { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
42
- { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
43
- { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
44
- { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
45
- };
46
-
47
-
52
+ /* Some helper functions */
48
53
  static void blake2b_set_lastnode( blake2b_state *S )
49
54
  {
50
55
  S->f[1] = (uint64_t)-1;
51
56
  }
52
57
 
53
- /* Some helper functions, not necessarily useful */
54
58
  static int blake2b_is_lastblock( const blake2b_state *S )
55
59
  {
56
60
  return S->f[0] != 0;
@@ -69,32 +73,25 @@ static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
69
73
  S->t[1] += ( S->t[0] < inc );
70
74
  }
71
75
 
72
- static void blake2b_init0( blake2b_state *S )
73
- {
74
- size_t i;
75
- memset( S, 0, sizeof( blake2b_state ) );
76
-
77
- for( i = 0; i < 8; ++i ) S->h[i] = blake2b_IV[i];
78
- }
79
-
80
76
  /* init xors IV with input parameter block */
81
77
  int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
82
78
  {
83
- const uint8_t *p = ( const uint8_t * )( P );
84
79
  size_t i;
85
-
86
- blake2b_init0( S );
87
-
80
+ /*blake2b_init0( S ); */
81
+ const unsigned char * v = ( const unsigned char * )( blake2b_IV );
82
+ const unsigned char * p = ( const unsigned char * )( P );
83
+ unsigned char * h = ( unsigned char * )( S->h );
88
84
  /* IV XOR ParamBlock */
89
- for( i = 0; i < 8; ++i )
90
- S->h[i] ^= load64( p + sizeof( S->h[i] ) * i );
85
+ memset( S, 0, sizeof( blake2b_state ) );
86
+
87
+ for( i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
91
88
 
92
89
  S->outlen = P->digest_length;
93
90
  return 0;
94
91
  }
95
92
 
96
93
 
97
-
94
+ /* Some sort of default parameter block initialization, for sequential blake2b */
98
95
  int blake2b_init( blake2b_state *S, size_t outlen )
99
96
  {
100
97
  blake2b_param P[1];
@@ -113,17 +110,17 @@ int blake2b_init( blake2b_state *S, size_t outlen )
113
110
  memset( P->reserved, 0, sizeof( P->reserved ) );
114
111
  memset( P->salt, 0, sizeof( P->salt ) );
115
112
  memset( P->personal, 0, sizeof( P->personal ) );
113
+
116
114
  return blake2b_init_param( S, P );
117
115
  }
118
116
 
119
-
120
117
  int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
121
118
  {
122
119
  blake2b_param P[1];
123
120
 
124
121
  if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
125
122
 
126
- if ( !key || !keylen || keylen > BLAKE2B_KEYBYTES ) return -1;
123
+ if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
127
124
 
128
125
  P->digest_length = (uint8_t)outlen;
129
126
  P->key_length = (uint8_t)keylen;
@@ -138,7 +135,8 @@ int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t k
138
135
  memset( P->salt, 0, sizeof( P->salt ) );
139
136
  memset( P->personal, 0, sizeof( P->personal ) );
140
137
 
141
- if( blake2b_init_param( S, P ) < 0 ) return -1;
138
+ if( blake2b_init_param( S, P ) < 0 )
139
+ return 0;
142
140
 
143
141
  {
144
142
  uint8_t block[BLAKE2B_BLOCKBYTES];
@@ -150,53 +148,53 @@ int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t k
150
148
  return 0;
151
149
  }
152
150
 
153
- #define G(r,i,a,b,c,d) \
154
- do { \
155
- a = a + b + m[blake2b_sigma[r][2*i+0]]; \
156
- d = rotr64(d ^ a, 32); \
157
- c = c + d; \
158
- b = rotr64(b ^ c, 24); \
159
- a = a + b + m[blake2b_sigma[r][2*i+1]]; \
160
- d = rotr64(d ^ a, 16); \
161
- c = c + d; \
162
- b = rotr64(b ^ c, 63); \
163
- } while(0)
164
-
165
- #define ROUND(r) \
166
- do { \
167
- G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
168
- G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
169
- G(r,2,v[ 2],v[ 6],v[10],v[14]); \
170
- G(r,3,v[ 3],v[ 7],v[11],v[15]); \
171
- G(r,4,v[ 0],v[ 5],v[10],v[15]); \
172
- G(r,5,v[ 1],v[ 6],v[11],v[12]); \
173
- G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
174
- G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
175
- } while(0)
176
-
177
151
  static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
178
152
  {
179
- uint64_t m[16];
180
- uint64_t v[16];
181
- size_t i;
182
-
183
- for( i = 0; i < 16; ++i ) {
184
- m[i] = load64( block + i * sizeof( m[i] ) );
185
- }
186
-
187
- for( i = 0; i < 8; ++i ) {
188
- v[i] = S->h[i];
189
- }
190
-
191
- v[ 8] = blake2b_IV[0];
192
- v[ 9] = blake2b_IV[1];
193
- v[10] = blake2b_IV[2];
194
- v[11] = blake2b_IV[3];
195
- v[12] = blake2b_IV[4] ^ S->t[0];
196
- v[13] = blake2b_IV[5] ^ S->t[1];
197
- v[14] = blake2b_IV[6] ^ S->f[0];
198
- v[15] = blake2b_IV[7] ^ S->f[1];
199
-
153
+ __m128i row1l, row1h;
154
+ __m128i row2l, row2h;
155
+ __m128i row3l, row3h;
156
+ __m128i row4l, row4h;
157
+ __m128i b0, b1;
158
+ __m128i t0, t1;
159
+ #if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
160
+ const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
161
+ const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
162
+ #endif
163
+ #if defined(HAVE_SSE41)
164
+ const __m128i m0 = LOADU( block + 00 );
165
+ const __m128i m1 = LOADU( block + 16 );
166
+ const __m128i m2 = LOADU( block + 32 );
167
+ const __m128i m3 = LOADU( block + 48 );
168
+ const __m128i m4 = LOADU( block + 64 );
169
+ const __m128i m5 = LOADU( block + 80 );
170
+ const __m128i m6 = LOADU( block + 96 );
171
+ const __m128i m7 = LOADU( block + 112 );
172
+ #else
173
+ const uint64_t m0 = load64(block + 0 * sizeof(uint64_t));
174
+ const uint64_t m1 = load64(block + 1 * sizeof(uint64_t));
175
+ const uint64_t m2 = load64(block + 2 * sizeof(uint64_t));
176
+ const uint64_t m3 = load64(block + 3 * sizeof(uint64_t));
177
+ const uint64_t m4 = load64(block + 4 * sizeof(uint64_t));
178
+ const uint64_t m5 = load64(block + 5 * sizeof(uint64_t));
179
+ const uint64_t m6 = load64(block + 6 * sizeof(uint64_t));
180
+ const uint64_t m7 = load64(block + 7 * sizeof(uint64_t));
181
+ const uint64_t m8 = load64(block + 8 * sizeof(uint64_t));
182
+ const uint64_t m9 = load64(block + 9 * sizeof(uint64_t));
183
+ const uint64_t m10 = load64(block + 10 * sizeof(uint64_t));
184
+ const uint64_t m11 = load64(block + 11 * sizeof(uint64_t));
185
+ const uint64_t m12 = load64(block + 12 * sizeof(uint64_t));
186
+ const uint64_t m13 = load64(block + 13 * sizeof(uint64_t));
187
+ const uint64_t m14 = load64(block + 14 * sizeof(uint64_t));
188
+ const uint64_t m15 = load64(block + 15 * sizeof(uint64_t));
189
+ #endif
190
+ row1l = LOADU( &S->h[0] );
191
+ row1h = LOADU( &S->h[2] );
192
+ row2l = LOADU( &S->h[4] );
193
+ row2h = LOADU( &S->h[6] );
194
+ row3l = LOADU( &blake2b_IV[0] );
195
+ row3h = LOADU( &blake2b_IV[2] );
196
+ row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
197
+ row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
200
198
  ROUND( 0 );
201
199
  ROUND( 1 );
202
200
  ROUND( 2 );
@@ -209,14 +207,16 @@ static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOC
209
207
  ROUND( 9 );
210
208
  ROUND( 10 );
211
209
  ROUND( 11 );
212
-
213
- for( i = 0; i < 8; ++i ) {
214
- S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
215
- }
210
+ row1l = _mm_xor_si128( row3l, row1l );
211
+ row1h = _mm_xor_si128( row3h, row1h );
212
+ STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
213
+ STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
214
+ row2l = _mm_xor_si128( row4l, row2l );
215
+ row2h = _mm_xor_si128( row4h, row2h );
216
+ STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
217
+ STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
216
218
  }
217
219
 
218
- #undef G
219
- #undef ROUND
220
220
 
221
221
  int blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
222
222
  {
@@ -245,11 +245,9 @@ int blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
245
245
  return 0;
246
246
  }
247
247
 
248
+
248
249
  int blake2b_final( blake2b_state *S, void *out, size_t outlen )
249
250
  {
250
- uint8_t buffer[BLAKE2B_OUTBYTES] = {0};
251
- size_t i;
252
-
253
251
  if( out == NULL || outlen < S->outlen )
254
252
  return -1;
255
253
 
@@ -261,15 +259,11 @@ int blake2b_final( blake2b_state *S, void *out, size_t outlen )
261
259
  memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
262
260
  blake2b_compress( S, S->buf );
263
261
 
264
- for( i = 0; i < 8; ++i ) /* Output full hash to temp buffer */
265
- store64( buffer + sizeof( S->h[i] ) * i, S->h[i] );
266
-
267
- memcpy( out, buffer, S->outlen );
268
- secure_zero_memory(buffer, sizeof(buffer));
262
+ memcpy( out, &S->h[0], S->outlen );
269
263
  return 0;
270
264
  }
271
265
 
272
- /* inlen, at least, should be uint64_t. Others can be size_t. */
266
+
273
267
  int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
274
268
  {
275
269
  blake2b_state S[1];
@@ -285,7 +279,7 @@ int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void
285
279
 
286
280
  if( keylen > BLAKE2B_KEYBYTES ) return -1;
287
281
 
288
- if( keylen > 0 )
282
+ if( keylen )
289
283
  {
290
284
  if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
291
285
  }
@@ -0,0 +1,157 @@
1
+ /*
2
+ BLAKE2 reference source code package - optimized C implementations
3
+
4
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
5
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
6
+ your option. The terms of these licenses can be found at:
7
+
8
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
9
+ - OpenSSL license : https://www.openssl.org/source/license.html
10
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ More information about the BLAKE2 hash function can be found at
13
+ https://blake2.net.
14
+ */
15
+ #ifndef BLAKE2B_ROUND_H
16
+ #define BLAKE2B_ROUND_H
17
+
18
+ #define LOADU(p) _mm_loadu_si128( (const __m128i *)(p) )
19
+ #define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r)
20
+
21
+ #define TOF(reg) _mm_castsi128_ps((reg))
22
+ #define TOI(reg) _mm_castps_si128((reg))
23
+
24
+ #define LIKELY(x) __builtin_expect((x),1)
25
+
26
+
27
+ /* Microarchitecture-specific macros */
28
+ #ifndef HAVE_XOP
29
+ #ifdef HAVE_SSSE3
30
+ #define _mm_roti_epi64(x, c) \
31
+ (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \
32
+ : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
33
+ : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
34
+ : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \
35
+ : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c))))
36
+ #else
37
+ #define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-(c)) ))
38
+ #endif
39
+ #else
40
+ /* ... */
41
+ #endif
42
+
43
+
44
+
45
+ #define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
46
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
47
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
48
+ \
49
+ row4l = _mm_xor_si128(row4l, row1l); \
50
+ row4h = _mm_xor_si128(row4h, row1h); \
51
+ \
52
+ row4l = _mm_roti_epi64(row4l, -32); \
53
+ row4h = _mm_roti_epi64(row4h, -32); \
54
+ \
55
+ row3l = _mm_add_epi64(row3l, row4l); \
56
+ row3h = _mm_add_epi64(row3h, row4h); \
57
+ \
58
+ row2l = _mm_xor_si128(row2l, row3l); \
59
+ row2h = _mm_xor_si128(row2h, row3h); \
60
+ \
61
+ row2l = _mm_roti_epi64(row2l, -24); \
62
+ row2h = _mm_roti_epi64(row2h, -24); \
63
+
64
+ #define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \
65
+ row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \
66
+ row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \
67
+ \
68
+ row4l = _mm_xor_si128(row4l, row1l); \
69
+ row4h = _mm_xor_si128(row4h, row1h); \
70
+ \
71
+ row4l = _mm_roti_epi64(row4l, -16); \
72
+ row4h = _mm_roti_epi64(row4h, -16); \
73
+ \
74
+ row3l = _mm_add_epi64(row3l, row4l); \
75
+ row3h = _mm_add_epi64(row3h, row4h); \
76
+ \
77
+ row2l = _mm_xor_si128(row2l, row3l); \
78
+ row2h = _mm_xor_si128(row2h, row3h); \
79
+ \
80
+ row2l = _mm_roti_epi64(row2l, -63); \
81
+ row2h = _mm_roti_epi64(row2h, -63); \
82
+
83
+ #if defined(HAVE_SSSE3)
84
+ #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
85
+ t0 = _mm_alignr_epi8(row2h, row2l, 8); \
86
+ t1 = _mm_alignr_epi8(row2l, row2h, 8); \
87
+ row2l = t0; \
88
+ row2h = t1; \
89
+ \
90
+ t0 = row3l; \
91
+ row3l = row3h; \
92
+ row3h = t0; \
93
+ \
94
+ t0 = _mm_alignr_epi8(row4h, row4l, 8); \
95
+ t1 = _mm_alignr_epi8(row4l, row4h, 8); \
96
+ row4l = t1; \
97
+ row4h = t0;
98
+
99
+ #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
100
+ t0 = _mm_alignr_epi8(row2l, row2h, 8); \
101
+ t1 = _mm_alignr_epi8(row2h, row2l, 8); \
102
+ row2l = t0; \
103
+ row2h = t1; \
104
+ \
105
+ t0 = row3l; \
106
+ row3l = row3h; \
107
+ row3h = t0; \
108
+ \
109
+ t0 = _mm_alignr_epi8(row4l, row4h, 8); \
110
+ t1 = _mm_alignr_epi8(row4h, row4l, 8); \
111
+ row4l = t1; \
112
+ row4h = t0;
113
+ #else
114
+
115
+ #define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
116
+ t0 = row4l;\
117
+ t1 = row2l;\
118
+ row4l = row3l;\
119
+ row3l = row3h;\
120
+ row3h = row4l;\
121
+ row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \
122
+ row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \
123
+ row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \
124
+ row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1))
125
+
126
+ #define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \
127
+ t0 = row3l;\
128
+ row3l = row3h;\
129
+ row3h = t0;\
130
+ t0 = row2l;\
131
+ t1 = row4l;\
132
+ row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \
133
+ row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \
134
+ row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \
135
+ row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1))
136
+
137
+ #endif
138
+
139
+ #if defined(HAVE_SSE41)
140
+ #include "blake2b-load-sse41.h"
141
+ #else
142
+ #include "blake2b-load-sse2.h"
143
+ #endif
144
+
145
+ #define ROUND(r) \
146
+ LOAD_MSG_ ##r ##_1(b0, b1); \
147
+ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
148
+ LOAD_MSG_ ##r ##_2(b0, b1); \
149
+ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
150
+ DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
151
+ LOAD_MSG_ ##r ##_3(b0, b1); \
152
+ G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
153
+ LOAD_MSG_ ##r ##_4(b0, b1); \
154
+ G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
155
+ UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
156
+
157
+ #endif
@@ -1,3 +1,3 @@
1
1
  require 'mkmf'
2
- $CFLAGS += ' -std=c99'
2
+ $CFLAGS += ' -Wall -Wextra -std=c99 -pedantic -Wno-long-long'
3
3
  create_makefile 'blake2b_ext'
@@ -34,7 +34,7 @@ static VALUE blake2_alloc(VALUE klass) {
34
34
  VALUE m_blake2_initialize(VALUE self, VALUE _len, VALUE _key) {
35
35
  Blake2 *blake2;
36
36
  Data_Get_Struct(self, Blake2, blake2);
37
- int i;
37
+ unsigned int i;
38
38
 
39
39
  ID bytes_method = rb_intern("bytes");
40
40
  blake2->to_hex = ID2SYM(rb_intern("to_hex"));
@@ -44,7 +44,7 @@ VALUE m_blake2_initialize(VALUE self, VALUE _len, VALUE _key) {
44
44
  blake2->key_length = RARRAY_LEN(key_bytes_ary);
45
45
  blake2->key_bytes = (uint8_t*)malloc(blake2->key_length * sizeof(uint8_t));
46
46
 
47
- for(i = 0; (unsigned)i < blake2->key_length; i++) {
47
+ for(i = 0; i < blake2->key_length; i++) {
48
48
  VALUE byte = rb_ary_entry(key_bytes_ary, i);
49
49
  blake2->key_bytes[i] = NUM2INT(byte);
50
50
  }
@@ -61,7 +61,7 @@ VALUE m_blake2_digest(VALUE self, VALUE _input, VALUE _representation) {
61
61
 
62
62
  char *input = RSTRING_PTR(_input);
63
63
  uint64_t input_length = RSTRING_LEN(_input);
64
- int i;
64
+ unsigned int i;
65
65
 
66
66
  Data_Get_Struct(self, Blake2, blake2);
67
67
 
@@ -73,14 +73,14 @@ VALUE m_blake2_digest(VALUE self, VALUE _input, VALUE _representation) {
73
73
  if(_representation == blake2->to_bytes) {
74
74
  result = rb_ary_new2(blake2->output_length);
75
75
 
76
- for(i = 0; (unsigned)i < blake2->output_length; i++) {
76
+ for(i = 0; i < blake2->output_length; i++) {
77
77
  rb_ary_push(result, INT2NUM(blake2->output[i]));
78
78
  }
79
79
  } else if(_representation == blake2->to_hex) {
80
80
  unsigned long ary_len = blake2->output_length * (unsigned)sizeof(char) * 2;
81
81
  char *c_str = (char*)malloc(ary_len + 1);
82
82
 
83
- for(i = 0; (unsigned)i < blake2->output_length; i++) {
83
+ for(i = 0; i < blake2->output_length; i++) {
84
84
  sprintf(c_str + (i * 2), "%02x", blake2->output[i]);
85
85
  }
86
86
  c_str[ary_len] = 0;
@@ -0,0 +1,61 @@
1
+ require 'digest'
2
+ require 'blake2'
3
+ require 'blake2b'
4
+
5
+ sample_string = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec quis fermentum mauris, laoreet ultricies ipsum. Phasellus iaculis lacinia bibendum. Aenean eu lectus vitae nisi pellentesque condimentum. Cras imperdiet risus ut interdum dignissim. Nam ultricies vulputate varius. Morbi vehicula mi sit amet velit cursus, eu blandit dolor venenatis. Nunc vitae varius leo. Mauris metus nibh, ultrices nec odio in, viverra luctus purus. Duis luctus, dolor vel sodales semper, enim mauris sagittis dolor, at vehicula ligula ante eu lorem. Morbi porttitor lorem id turpis facilisis volutpat. Sed elementum porttitor sem, a ornare ligula. Integer tincidunt aliquam suscipit. Sed aliquam ligula id enim fringilla, vel ornare ante bibendum. Integer tincidunt, augue id condimentum fermentum, dolor urna molestie massa, sed congue enim quam eget arcu. Quisque feugiat purus sit amet porttitor tincidunt.
6
+
7
+ Fusce odio libero, lobortis quis ornare sit amet, dignissim sed erat. Praesent a iaculis ex. Ut libero amet."
8
+
9
+ run_count = 500_000
10
+
11
+ ### MD5
12
+ starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
13
+ run_count.times do |i|
14
+ Digest::MD5.hexdigest("#{i}#{sample_string}")
15
+ end
16
+ ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
17
+
18
+ puts "MD5 result: #{ending - starting} seconds."
19
+
20
+
21
+ ### SHA2
22
+ starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
23
+ run_count.times do |i|
24
+ Digest::SHA2.hexdigest("#{i}#{sample_string}")
25
+ end
26
+ ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
27
+
28
+ puts "SHA2 result: #{ending - starting} seconds."
29
+
30
+
31
+ ### SHA512
32
+ starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
33
+ run_count.times do |i|
34
+ Digest::SHA512.hexdigest("#{i}#{sample_string}")
35
+ end
36
+ ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
37
+
38
+ puts "SHA512 result: #{ending - starting} seconds."
39
+
40
+
41
+
42
+ ### BLAKE2s
43
+ starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
44
+ unkeyed = Blake2::Key.none
45
+ run_count.times do |i|
46
+ Blake2.new(32, unkeyed).digest("#{i}#{sample_string}", :to_hex)
47
+ end
48
+ ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
49
+
50
+ puts "BLAKE2s result: #{ending - starting} seconds."
51
+
52
+
53
+ ### BLAKE2b
54
+ starting = Process.clock_gettime(Process::CLOCK_MONOTONIC)
55
+ unkeyed = Blake2b::Key.none
56
+ run_count.times do |i|
57
+ Blake2b.new(32, unkeyed).digest("#{i}#{sample_string}", :to_hex)
58
+ end
59
+ ending = Process.clock_gettime(Process::CLOCK_MONOTONIC)
60
+
61
+ puts "BLAKE2b result: #{ending - starting} seconds."
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: blake2b
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Franck Verrot
@@ -83,13 +83,18 @@ files:
83
83
  - README.md
84
84
  - Rakefile
85
85
  - blake2b.gemspec
86
+ - ext/blake2b_ext/blake2-config.h
86
87
  - ext/blake2b_ext/blake2-impl.h
87
88
  - ext/blake2b_ext/blake2.h
89
+ - ext/blake2b_ext/blake2b-load-sse2.h
90
+ - ext/blake2b_ext/blake2b-load-sse41.h
88
91
  - ext/blake2b_ext/blake2b-ref.c
92
+ - ext/blake2b_ext/blake2b-round.h
89
93
  - ext/blake2b_ext/extconf.rb
90
94
  - ext/blake2b_ext/rbext.c
91
95
  - lib/blake2b.rb
92
96
  - lib/blake2b/key.rb
97
+ - performance/performance_suite.rb
93
98
  homepage: https://github.com/mgomes/blake2b
94
99
  licenses:
95
100
  - GPL-3.0