aez 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4df307fcf2d926a7d97d8a67b446653314c41da2ba48daddabaa2db99a005f6c
4
+ data.tar.gz: c753537e76da64402c0ccdbdd92c85ce5fe0a81ca6fc767462f80360e2da38ce
5
+ SHA512:
6
+ metadata.gz: 15148d0a90cebfeb9db8169fbda4812374bf1f6e7a5f455225e4b84662e7d98f8f7f11e35ffa8534ba983a86109805af5738677a8ac21266ea7de9f723db8496
7
+ data.tar.gz: d854f7d44f95b9711bcbd111fe91237056bf1772441f984df938171b4106cf1681bb7e354754009cc791e2dc1ab58e60cd958490fd6ebb2380bb0be42a1e3932
data/.github/workflows/ruby.yml ADDED
@@ -0,0 +1,37 @@
1
+ # This workflow uses actions that are not certified by GitHub.
2
+ # They are provided by a third-party and are governed by
3
+ # separate terms of service, privacy policy, and support
4
+ # documentation.
5
+ # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
6
+ # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
7
+
8
+ name: Ruby
9
+
10
+ on:
11
+ push:
12
+ branches: [ master ]
13
+ pull_request:
14
+ branches: [ master ]
15
+
16
+ jobs:
17
+ test:
18
+
19
+ runs-on: ubuntu-latest
20
+ strategy:
21
+ matrix:
22
+ ruby-version: ['2.6', '2.7', '3.0']
23
+
24
+ steps:
25
+ - uses: actions/checkout@v2
26
+ - name: Set up Ruby
27
+ # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
28
+ # change this to (see https://github.com/ruby/setup-ruby#versioning):
29
+ # uses: ruby/setup-ruby@v1
30
+ uses: ruby/setup-ruby@473e4d8fe5dd94ee328fdfca9f8c9c7afc9dae5e
31
+ with:
32
+ ruby-version: ${{ matrix.ruby-version }}
33
+ bundler-cache: true # runs 'bundle install' and caches installed gems automatically
34
+ - name: Compile extension
35
+ run: bundle exec rake compile
36
+ - name: Run tests
37
+ run: bundle exec rake spec
data/.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+ Gemfile.lock
13
+ *.so
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ aez
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ ruby-3.0.0
data/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at azuchi@haw.co.jp. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [http://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: http://contributor-covenant.org
74
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in aez.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Shigeyuki Azuchi.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,48 @@
1
+ # AEZ for Ruby [![Build Status](https://github.com/azuchi/aez/actions/workflows/ruby.yml/badge.svg?branch=master)](https://github.com/azuchi/aez/actions/workflows/ruby.yml) [![Gem Version](https://badge.fury.io/rb/aez.svg)](https://badge.fury.io/rb/aez) [![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)
2
+
3
+ [AEZ](http://web.cs.ucdavis.edu/~rogaway/aez/) binding for Ruby.
4
+ This library calls the AEZv5 C implementation, which uses AES-NI hardware optimizations, via FFI.
5
+
6
+ ## Requirements
7
+
8
+ The following limitations are inherited from Ted Krovetz's C implementation:
9
+
10
+ - Intel or ARM CPU supporting AES instructions
11
+ - Faster if all pointers are 16-byte aligned.
12
+ - Max 16 byte nonce, 16 byte authenticator
13
+ - Single AD (AEZ spec allows vector AD but this code doesn't)
14
+ - Max 2^32-1 byte buffers allowed (due to using unsigned int)
15
+
16
+ ## Installation
17
+
18
+ Add this line to your application's Gemfile:
19
+
20
+ ```ruby
21
+ gem 'aez'
22
+ ```
23
+
24
+ And then execute:
25
+
26
+ $ bundle
27
+
28
+ Or install it yourself as:
29
+
30
+ $ gem install aez
31
+
32
+ ## Usage
33
+
34
+ ```ruby
35
+ require 'aez'
36
+
37
+ key = ['9adf7a023fbc4e663695f627a8d5b5c45f6752e375d19e11a669e6b949347d0cf5e0e2516ee285af365224976afa60be'].pack('H*')
38
+ nonce = ['799de3d90fbd6fed93b5f96cf9f4e852'].pack('H*')
39
+ ad = ['d6e278e0c6ede09d302d6fde09de77711a9a02fc8a049fb34a5e3f00c1cfc336d0'].pack('H*')
40
+ message = ['efea7ecfa45f51b52ce038cf6c0704392c2211bfca17a36284f63a902b37f0ab'].pack('H*')
41
+ abyte = 16
42
+
43
+ # Encryption
44
+ cipher_text = AEZ.encrypt(key, message, ad, nonce, abyte)
45
+
46
+ # Decryption
47
+ plain_text = AEZ.decrypt(key, cipher_text, ad, nonce, abyte)
48
+ ```
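
A quick round-trip check of the usage above: the ciphertext produced by `AEZ.encrypt` should decrypt back to the original message, and (per the C implementation, which appends an `abyte`-byte authenticator) the ciphertext is `abyte` bytes longer than the plaintext. This is a minimal sketch assuming only the API shown in the README:

```ruby
require 'aez'

# Same test values as the README example above.
key     = ['9adf7a023fbc4e663695f627a8d5b5c45f6752e375d19e11a669e6b949347d0cf5e0e2516ee285af365224976afa60be'].pack('H*')
nonce   = ['799de3d90fbd6fed93b5f96cf9f4e852'].pack('H*')
ad      = ['d6e278e0c6ede09d302d6fde09de77711a9a02fc8a049fb34a5e3f00c1cfc336d0'].pack('H*')
message = ['efea7ecfa45f51b52ce038cf6c0704392c2211bfca17a36284f63a902b37f0ab'].pack('H*')
abyte   = 16

cipher_text = AEZ.encrypt(key, message, ad, nonce, abyte)
plain_text  = AEZ.decrypt(key, cipher_text, ad, nonce, abyte)

puts cipher_text.bytesize == message.bytesize + abyte # ciphertext carries an abyte-byte authenticator
puts plain_text == message                            # round trip recovers the original message
```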
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+ require 'rake/extensiontask'
6
+
7
+ RSpec::Core::RakeTask.new(:spec)
8
+
9
+ task default: :spec
10
+
11
+ Rake::ExtensionTask.new 'aezv5' do |ext|
12
+ ext.lib_dir = 'lib/aez'
13
+ end
data/aez.gemspec ADDED
@@ -0,0 +1,33 @@
1
+
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'aez/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'aez'
8
+ spec.version = AEZ::VERSION
9
+ spec.authors = ['Shigeyuki Azuchi']
10
+ spec.email = ['azuchi@chaintope.com']
11
+
12
+ spec.summary = 'AEZ binding for ruby.'
13
+ spec.description = 'AEZ binding for ruby.'
14
+ spec.homepage = 'https://github.com/azuchi/aez'
15
+ spec.license = 'MIT'
16
+
17
+ # Specify which files should be added to the gem when it is released.
18
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
19
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
20
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
21
+ end
22
+ spec.bindir = 'exe'
23
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
+ spec.require_paths = ['lib']
25
+ spec.extensions = ['ext/aezv5/extconf.rb']
26
+ spec.add_runtime_dependency 'ffi', '>= 1.15.1'
27
+
28
+ spec.add_development_dependency 'bundler'
29
+ spec.add_development_dependency 'rake', '>= 12.3.3'
30
+ spec.add_development_dependency 'rake-compiler', '>= 1.1.1'
31
+ spec.add_development_dependency 'rspec', '~> 3.0'
32
+
33
+ end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "aez"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/ext/aezv5/aezv5.c ADDED
@@ -0,0 +1,943 @@
1
+ /*
2
+ // AEZ v5 AES-NI version. AEZ info: http://www.cs.ucdavis.edu/~rogaway/aez
3
+ //
4
+ // REQUIREMENTS: - Intel or ARM CPU supporting AES instructions
5
+ // - Faster if all pointers are 16-byte aligned.
6
+ // - Max 16 byte nonce, 16 byte authenticator
7
+ // - Single AD (AEZ spec allows vector AD but this code doesn't)
8
+ // - Max 2^32-1 byte buffers allowed (due to using unsigned int)
9
+ //
10
+ // Written by Ted Krovetz (ted@krovetz.net). Last modified 21 March 2017.
11
+ //
12
+ // This is free and unencumbered software released into the public domain.
13
+ //
14
+ // Anyone is free to copy, modify, publish, use, compile, sell, or
15
+ // distribute this software, either in source code form or as a compiled
16
+ // binary, for any purpose, commercial or non-commercial, and by any
17
+ // means.
18
+ //
19
+ // In jurisdictions that recognize copyright laws, the author or authors
20
+ // of this software dedicate any and all copyright interest in the
21
+ // software to the public domain. We make this dedication for the benefit
22
+ // of the public at large and to the detriment of our heirs and
23
+ // successors. We intend this dedication to be an overt act of
24
+ // relinquishment in perpetuity of all present and future rights to this
25
+ // software under copyright law.
26
+ //
27
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28
+ // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29
+ // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
30
+ // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
31
+ // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
32
+ // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
33
+ // OTHER DEALINGS IN THE SOFTWARE.
34
+ //
35
+ // For more information, please refer to <http://unlicense.org/>
36
+ */
37
+
38
+ #include <stdint.h>
39
+ #include <stddef.h>
40
+
41
+ /* ------------------------------------------------------------------------- */
42
+ #if __AES__ /* Defined by gcc/clang when compiling for AES-NI */
43
+ /* ------------------------------------------------------------------------- */
44
+
45
+ #include <smmintrin.h>
46
+ #include <wmmintrin.h>
47
+ #define block __m128i
48
+
49
+ /* ------------------------------------------------------------------------- */
50
+
51
+ #define zero _mm_setzero_si128()
52
+ #define vadd(x,y) _mm_add_epi8(x,y)
53
+ #define vand(x,y) _mm_and_si128(x,y)
54
+ #define vandnot(x,y) _mm_andnot_si128(x,y) /* (~x)&y */
55
+ #define vor(x,y) _mm_or_si128(x,y)
56
+ #define vxor(x,y) _mm_xor_si128(x,y)
57
+
58
+ static int is_zero(block x) { return _mm_testz_si128(x,x); } /* 0 or 1 */
59
+
60
+ static block sll4(block x) {
61
+ return vor(_mm_srli_epi64(x, 4), _mm_slli_epi64(_mm_srli_si128(x, 8), 60));
62
+ }
63
+
64
+ static block srl4(block x) {
65
+ return vor(_mm_slli_epi64(x, 4), _mm_srli_epi64(_mm_slli_si128(x, 8), 60));
66
+ }
67
+
68
+ static __m128i bswap16(__m128i b) {
69
+ const __m128i t = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
70
+ return _mm_shuffle_epi8(b,t);
71
+ }
72
+
73
+ static __m128i double_block(__m128i bl) {
74
+ const __m128i mask = _mm_set_epi32(135,1,1,1);
75
+ __m128i tmp = _mm_srai_epi32(bl, 31);
76
+ tmp = _mm_and_si128(tmp, mask);
77
+ tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2,1,0,3));
78
+ bl = _mm_slli_epi32(bl, 1);
79
+ return _mm_xor_si128(bl,tmp);
80
+ }
81
+
82
+ static __m128i aes(__m128i *key, __m128i in, __m128i first_key) {
83
+ in = vxor(in, first_key);
84
+ in = _mm_aesenc_si128 (in,key[0]);
85
+ in = _mm_aesenc_si128 (in,key[2]);
86
+ in = _mm_aesenc_si128 (in,key[5]);
87
+ in = _mm_aesenc_si128 (in,key[0]);
88
+ in = _mm_aesenc_si128 (in,key[2]);
89
+ in = _mm_aesenc_si128 (in,key[5]);
90
+ in = _mm_aesenc_si128 (in,key[0]);
91
+ in = _mm_aesenc_si128 (in,key[2]);
92
+ in = _mm_aesenc_si128 (in,key[5]);
93
+ return _mm_aesenc_si128 (in,key[0]);
94
+ }
95
+
96
+ static __m128i aes4(__m128i in, __m128i a, __m128i b,
97
+ __m128i c, __m128i d, __m128i e) {
98
+ in = _mm_aesenc_si128(vxor(in,a),b);
99
+ in = _mm_aesenc_si128(in,c);
100
+ in = _mm_aesenc_si128(in,d);
101
+ return _mm_aesenc_si128 (in,e);
102
+ }
103
+
104
+ #define aes4pre(in,a,b,c,d) aes4(in,a,b,c,d,zero)
105
+
106
+ static __m128i loadu(const void *p) { return _mm_loadu_si128((__m128i*)p); }
107
+ static void storeu(const void *p, __m128i x) {_mm_storeu_si128((__m128i*)p,x);}
108
+
109
+ #define load loadu /* Intel with AES-NI has fast unaligned loads/stores */
110
+ #define store storeu
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ #elif __ARM_FEATURE_CRYPTO
114
+ /* ------------------------------------------------------------------------- */
115
+
116
+ #include <arm_neon.h>
117
+ #define block uint8x16_t
118
+
119
+ #define zero vmovq_n_u8(0)
120
+ #define vadd(x,y) vaddq_u8(x,y)
121
+ #define vand(x,y) vandq_u8(x,y)
122
+ #define vandnot(x,y) vbicq_u8(y,x) /* (~x)&y */
123
+ #define vor(x,y) vorrq_u8(x,y)
124
+ #define vxor(x,y) veorq_u8(x,y)
125
+
126
+ static int is_zero(block x) { /* 0 or 1 */
127
+ uint8x8_t t = vorr_u8(vget_high_u8(x), vget_low_u8(x));
128
+ return vget_lane_u64(vreinterpret_u64_u8(t),0) == 0;
129
+ }
130
+
131
+ static block srl4(block x) {
132
+ const block mask = {15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,0};
133
+ uint8x16_t tmp = vandq_u8(vshrq_n_u8(vextq_u8(x, x, 1),4),mask);
134
+ return veorq_u8(tmp,vshlq_n_u8(x,4));
135
+ }
136
+
137
+ static block sll4(block x) {
138
+ const block mask = {0,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15};
139
+ uint8x16_t tmp = vshlq_n_u8(vandq_u8(vextq_u8(x, x, 15),mask),4);
140
+ return veorq_u8(tmp,vshrq_n_u8(x,4));
141
+ }
142
+
143
+ static uint8x16_t bswap16(uint8x16_t b) { return b; } /* Not with uint8x16_t */
144
+
145
+ static block double_block(block b) {
146
+ const block mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
147
+ block tmp = (block)vshrq_n_s8((int8x16_t)b,7);
148
+ tmp = vandq_u8(tmp, mask);
149
+ tmp = vextq_u8(tmp, tmp, 1); /* Rotate high byte to low end */
150
+ b = vshlq_n_u8(b,1);
151
+ return veorq_u8(tmp,b);
152
+ }
153
+
154
+ static uint8x16_t aes(uint8x16_t *key, uint8x16_t in, uint8x16_t first_key) {
155
+ in = vaesmcq_u8(vaeseq_u8(in, first_key));
156
+ in = vaesmcq_u8(vaeseq_u8(in, key[0]));
157
+ in = vaesmcq_u8(vaeseq_u8(in, key[2]));
158
+ in = vaesmcq_u8(vaeseq_u8(in, key[5]));
159
+ in = vaesmcq_u8(vaeseq_u8(in, key[0]));
160
+ in = vaesmcq_u8(vaeseq_u8(in, key[2]));
161
+ in = vaesmcq_u8(vaeseq_u8(in, key[5]));
162
+ in = vaesmcq_u8(vaeseq_u8(in, key[0]));
163
+ in = vaesmcq_u8(vaeseq_u8(in, key[2]));
164
+ in = vaesmcq_u8(vaeseq_u8(in, key[5]));
165
+ return vxor(in, key[0]);
166
+ }
167
+
168
+ static uint8x16_t aes4pre(uint8x16_t in, uint8x16_t a, uint8x16_t b,
169
+ uint8x16_t c, uint8x16_t d) {
170
+ in = vaesmcq_u8(vaeseq_u8(in, a));
171
+ in = vaesmcq_u8(vaeseq_u8(in, b));
172
+ in = vaesmcq_u8(vaeseq_u8(in, c));
173
+ return vaesmcq_u8(vaeseq_u8(in, d));
174
+ }
175
+
176
+ #define aes4(in,a,b,c,d,e) vxor(aes4pre(in,a,b,c,d),e)
177
+
178
+ static uint8x16_t load(const void *p) { return *(uint8x16_t *)p; }
179
+ static void store(void *p, uint8x16_t x) { *(uint8x16_t *)p = x; }
180
+
181
+ #define loadu load /* ARMv8 allows unaligned loads/stores */
182
+ #define storeu store /* ARMv8 allows unaligned stores */
183
+
184
+ /* ------------------------------------------------------------------------- */
185
+ #else
186
+ #error - This implementation requires __AES__ or __ARM_FEATURE_CRYPTO
187
+ #endif
188
+ /* ------------------------------------------------------------------------- */
189
+
190
+ #define vxor3(x,y,z) vxor(vxor(x,y),z)
191
+ #define vxor4(w,x,y,z) vxor(vxor(w,x),vxor(y,z))
192
+ #define load_partial(p,n) loadu(p)
193
+
194
+ /*
195
+ Might need a version like this if, for example, we want to load a 12-byte nonce
196
+ into a 16-byte block.
197
+
198
+ static block load_partial(const void *p, unsigned n) {
199
+ if ((intptr_t)p % 16 == 0) return load(p);
200
+ else {
201
+ block tmp; unsigned i;
202
+ for (i=0; i<n; i++) ((char*)&tmp)[i] = ((char*)p)[i];
203
+ return tmp;
204
+ }
205
+ }
206
+ */
207
+
208
+ static const unsigned char pad[] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
209
+ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
210
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
211
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
212
+ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
213
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
214
+
215
+ static block zero_pad(block x, unsigned zero_bytes) {
216
+ return vand(x, loadu(pad + zero_bytes));
217
+ }
218
+
219
+ static block one_zero_pad(block x, unsigned one_zero_bytes) {
220
+ block *p = (block*)(pad + one_zero_bytes);
221
+ return vor(vand(x, loadu(p)), loadu(p+1));
222
+ }
223
+
224
+ static block zero_set_byte(char val, unsigned idx) {
225
+ block tmp = zero; ((char *)&tmp)[idx] = val; return tmp;
226
+ }
227
+
228
+ /* ------------------------------------------------------------------------- */
229
+
230
+ typedef struct { /* All data memory-correct except 2I register-correct */
231
+ block I[2]; /* 1I, 2I */
232
+ block J[3]; /* 1J,2J,4J */
233
+ block L[3]; /* 1L,2L,4L */
234
+ block delta3_cache;
235
+ } aez_ctx_t;
236
+
237
+ /* ------------------------------------------------------------------------- */
238
+
239
+ static int blake2b(void *out, size_t outlen,
240
+ const void *key, size_t keylen,
241
+ const void *in, size_t inlen);
242
+
243
+ /* ------------------------------------------------------------------------- */
244
+
245
+ void aez_setup(unsigned char *key, unsigned keylen, aez_ctx_t *ctx) {
246
+ block tmp;
247
+ if (keylen==48) {
248
+ ctx->I[0] = loadu(key);
249
+ ctx->J[0] = loadu(key+16);
250
+ ctx->L[0] = loadu(key+32);
251
+ } else {
252
+ blake2b(ctx, 48, 0, 0, key, keylen); /* Puts IJL into ctx */
253
+ ctx->L[0] = ctx->J[0]; /* Rearrange. */
254
+ ctx->J[0] = ctx->I[1]; /* Rearrange. */
255
+ }
256
+ /* Fill remaining ctx locations with doublings */
257
+ ctx->I[1] = double_block(bswap16(ctx->I[0])); /* No post-bswap */
258
+ ctx->J[1] = bswap16(tmp = double_block(bswap16(ctx->J[0])));
259
+ ctx->J[2] = bswap16(double_block(tmp));
260
+ ctx->L[1] = bswap16(tmp = double_block(bswap16(ctx->L[0])));
261
+ ctx->L[2] = bswap16(double_block(tmp));
262
+ ctx->delta3_cache = zero;
263
+ }
264
+
265
+ /* ------------------------------------------------------------------------- */
266
+
267
+ /* !! Warning !! Only handles nbytes <= 16 and abytes <= 16 */
268
+ static block aez_hash(aez_ctx_t *ctx, char *n, unsigned nbytes, char *ad,
269
+ unsigned adbytes, unsigned abytes) {
270
+ block o1, o2, o3, o4, o5, o6, o7, o8, sum, offset, tmp;
271
+ block I=ctx->I[0], Ifordoubling = ctx->I[1], I2 = bswap16(Ifordoubling);
272
+ block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2];
273
+ block J=ctx->J[0], J2 = ctx->J[1], J4 = ctx->J[2], J5 = vxor(J,J4);
274
+
275
+ /* Process abytes and nonce */
276
+ offset = vxor4(J, J2, I2, L);
277
+ tmp = zero_set_byte((char)(8*abytes),15);
278
+ sum = aes4pre(offset,tmp,J,I,L);
279
+
280
+ if (nbytes==16) sum = aes4(vxor(loadu(n), J4), vxor(I2, L),J,I,L,sum);
281
+ else sum = aes4(vxor(J4, I),
282
+ one_zero_pad(load_partial(n,nbytes),16-nbytes),J,I,L,sum);
283
+
284
+ if (ad) { /* Possible easy misuse: ad==null && adbytes==0 */
285
+ if (adbytes==0) {
286
+ ctx->delta3_cache = aes4pre(vxor(J5, I), loadu(pad+32),J,I,L);
287
+ } else {
288
+ block delta3 = zero;
289
+ offset = vxor(J5, I2);
290
+ while (adbytes >= 8*16) {
291
+ o1 = vxor(offset,L);
292
+ o2 = vxor(offset,L2);
293
+ o3 = vxor(o1,L2);
294
+ o4 = vxor(offset,L4);
295
+ o5 = vxor(o1,L4);
296
+ o6 = vxor(o2,L4);
297
+ o7 = vxor(o3,L4);
298
+ o8 = offset;
299
+ Ifordoubling = double_block(Ifordoubling);
300
+ offset = vxor(J5, bswap16(Ifordoubling));
301
+ delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L));
302
+ delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
303
+ delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L));
304
+ delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L));
305
+ delta3 = vxor(delta3, aes4pre(load(ad+ 64), o5, J, I, L));
306
+ delta3 = vxor(delta3, aes4pre(load(ad+ 80), o6, J, I, L));
307
+ delta3 = vxor(delta3, aes4pre(load(ad+ 96), o7, J, I, L));
308
+ delta3 = vxor(delta3, aes4pre(load(ad+112), o8, J, I, L));
309
+ adbytes-=8*16; ad+=8*16;
310
+ }
311
+ if (adbytes >= 4*16) {
312
+ o1 = vxor(offset,L);
313
+ o2 = vxor(offset,L2);
314
+ o3 = vxor(o1,L2);
315
+ o4 = offset = vxor(offset,L4);
316
+ delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L));
317
+ delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
318
+ delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L));
319
+ delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L));
320
+ adbytes-=4*16; ad+=4*16;
321
+ }
322
+ if (adbytes >= 2*16) {
323
+ o1 = vxor(offset,L);
324
+ o2 = offset = vxor(offset,L2);
325
+ delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L));
326
+ delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
327
+ adbytes-=2*16; ad+=2*16;
328
+ }
329
+ if (adbytes >= 1*16) {
330
+ o1 = vxor(offset,L);
331
+ delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L));
332
+ adbytes-=1*16; ad+=1*16;
333
+ }
334
+ if (adbytes) {
335
+ tmp = vxor3(J5, I, one_zero_pad(load(ad),16-adbytes));
336
+ delta3 = aes4(vxor(J5, I), one_zero_pad(load(ad),16-adbytes),
337
+ J, I, L, delta3);
338
+ }
339
+ ctx->delta3_cache = delta3;
340
+ }
341
+ }
342
+ return vxor(sum,ctx->delta3_cache);
343
+ }
344
+
345
+ /* ------------------------------------------------------------------------- */
346
+
347
+ static block pass_one(aez_ctx_t *ctx, block *src, unsigned bytes, block *dst) {
348
+ block o1, o2, o3, o4, o5, o6, o7, o8, offset, tmp, sum=zero;
349
+ block I=ctx->I[0], Ifordoubling = ctx->I[1];
350
+ block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2];
351
+ block J=ctx->J[0];
352
+ offset = vxor(J, bswap16(Ifordoubling));
353
+ while (bytes >= 16*16) {
354
+ o1 = vxor(offset,L);
355
+ o2 = vxor(offset,L2);
356
+ o3 = vxor(o1,L2);
357
+ o4 = vxor(offset,L4);
358
+ o5 = vxor(o1,L4);
359
+ o6 = vxor(o2,L4);
360
+ o7 = vxor(o3,L4);
361
+ o8 = offset;
362
+ Ifordoubling = double_block(Ifordoubling);
363
+ offset = vxor(J,bswap16(Ifordoubling));
364
+ store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0)));
365
+ store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2)));
366
+ store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4)));
367
+ store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6)));
368
+ store(dst+ 8, aes4(load(src + 9),o5, J, I, L, load(src+ 8)));
369
+ store(dst+10, aes4(load(src +11),o6, J, I, L, load(src+10)));
370
+ store(dst+12, aes4(load(src +13),o7, J, I, L, load(src+12)));
371
+ store(dst+14, aes4(load(src +15),o8, J, I, L, load(src+14)));
372
+ tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));store(dst+ 1,tmp);
373
+ sum=vxor(sum,tmp);
374
+ tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3));
375
+ store(dst+ 3,tmp);sum=vxor(sum,tmp);
376
+ tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5));
377
+ store(dst+ 5,tmp);sum=vxor(sum,tmp);
378
+ tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7));
379
+ store(dst+ 7,tmp);sum=vxor(sum,tmp);
380
+ tmp=aes4(I,load(dst+ 8),J,I,L,load(src+ 9));
381
+ store(dst+ 9,tmp);sum=vxor(sum,tmp);
382
+ tmp=aes4(I,load(dst+10),J,I,L,load(src+11));
383
+ store(dst+11,tmp);sum=vxor(sum,tmp);
384
+ tmp=aes4(I,load(dst+12),J,I,L,load(src+13));
385
+ store(dst+13,tmp);sum=vxor(sum,tmp);
386
+ tmp=aes4(I,load(dst+14),J,I,L,load(src+15));
387
+ store(dst+15,tmp);sum=vxor(sum,tmp);
388
+ bytes -= 16*16; dst += 16; src += 16;
389
+ }
390
+ if (bytes >= 8*16) {
391
+ o1 = vxor(offset,L);
392
+ o2 = vxor(offset,L2);
393
+ o3 = vxor(o1,L2);
394
+ o4 = offset = vxor(offset,L4);
395
+ store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0)));
396
+ store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2)));
397
+ store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4)));
398
+ store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6)));
399
+ tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));
400
+ store(dst+ 1,tmp);sum=vxor(sum,tmp);
401
+ tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3));
402
+ store(dst+ 3,tmp);sum=vxor(sum,tmp);
403
+ tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5));
404
+ store(dst+ 5,tmp);sum=vxor(sum,tmp);
405
+ tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7));
406
+ store(dst+ 7,tmp);sum=vxor(sum,tmp);
407
+ bytes -= 8*16; dst += 8; src += 8;
408
+ }
409
+ if (bytes >= 4*16) {
410
+ o1 = vxor(offset,L);
411
+ o2 = offset = vxor(offset,L2);
412
+ store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0)));
413
+ store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2)));
414
+ tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));
415
+ store(dst+ 1,tmp);sum=vxor(sum,tmp);
416
+ tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3));
417
+ store(dst+ 3,tmp);sum=vxor(sum,tmp);
418
+ bytes -= 4*16; dst += 4; src += 4;
419
+ }
420
+ if (bytes) {
421
+ o1 = vxor(offset,L);
422
+ store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0)));
423
+ tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));
424
+ store(dst+ 1,tmp);sum=vxor(sum,tmp);
425
+ }
426
+ return sum;
427
+ }
428
+
429
+ /* ------------------------------------------------------------------------- */
430
+
431
+ static block pass_two(aez_ctx_t *ctx, block s, unsigned bytes, block *dst) {
432
+ block o1, o2, o3, o4, o5, o6, o7, o8, sum=zero, offset, fs[8], tmp[8];
433
+ block I=ctx->I[0], Ifordoubling = ctx->I[1];
434
+ block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2];
435
+ block J=ctx->J[0], J2=ctx->J[1], J3=vxor(J,J2);
436
+ offset = vxor(J2, bswap16(Ifordoubling));
437
+ while (bytes >= 16*16) {
438
+ o1 = vxor(offset,L);
439
+ o2 = vxor(offset,L2);
440
+ o3 = vxor(o1,L2);
441
+ o4 = vxor(offset,L4);
442
+ o5 = vxor(o1,L4);
443
+ o6 = vxor(o2,L4);
444
+ o7 = vxor(o3,L4);
445
+ o8 = offset;
446
+ Ifordoubling = double_block(Ifordoubling);
447
+ offset = vxor(J2, bswap16(Ifordoubling));
448
+ fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L);
449
+ fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L);
450
+ fs[4] = aes4pre(s,o5,J,I,L); fs[5] = aes4pre(s,o6,J,I,L);
451
+ fs[6] = aes4pre(s,o7,J,I,L); fs[7] = aes4pre(s,o8,J,I,L);
452
+ o1 = vxor(J3,o1); o2 = vxor(J3,o2);
453
+ o3 = vxor(J3,o3); o4 = vxor(J3,o4);
454
+ o5 = vxor(J3,o5); o6 = vxor(J3,o6);
455
+ o7 = vxor(J3,o7); o8 = vxor(J3,o8);
456
+ tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]);
457
+ store(dst+ 0,vxor(load(dst+ 1),fs[0]));
458
+ tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]);
459
+ store(dst+ 2,vxor(load(dst+ 3),fs[1]));
460
+ tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]);
461
+ store(dst+ 4,vxor(load(dst+ 5),fs[2]));
462
+ tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]);
463
+ store(dst+ 6,vxor(load(dst+ 7),fs[3]));
464
+ tmp[4] = vxor(load(dst+ 8),fs[4]); sum = vxor(sum,tmp[4]);
465
+ store(dst+ 8,vxor(load(dst+ 9),fs[4]));
466
+ tmp[5] = vxor(load(dst+10),fs[5]); sum = vxor(sum,tmp[5]);
467
+ store(dst+10,vxor(load(dst+11),fs[5]));
468
+ tmp[6] = vxor(load(dst+12),fs[6]); sum = vxor(sum,tmp[6]);
469
+ store(dst+12,vxor(load(dst+13),fs[6]));
470
+ tmp[7] = vxor(load(dst+14),fs[7]); sum = vxor(sum,tmp[7]);
471
+ store(dst+14,vxor(load(dst+15),fs[7]));
472
+ store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0]));
473
+ store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1]));
474
+ store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2]));
475
+ store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3]));
476
+ store(dst+ 9, aes4(I,load(dst+ 8), J, I, L, tmp[4]));
477
+ store(dst+11, aes4(I,load(dst+10), J, I, L, tmp[5]));
478
+ store(dst+13, aes4(I,load(dst+12), J, I, L, tmp[6]));
479
+ store(dst+15, aes4(I,load(dst+14), J, I, L, tmp[7]));
480
+ store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0)));
481
+ store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2)));
482
+ store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4)));
483
+ store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6)));
484
+ store(dst+ 8, aes4(load(dst+ 9),o5, J, I, L, load(dst+ 8)));
485
+ store(dst+10, aes4(load(dst+11),o6, J, I, L, load(dst+10)));
486
+ store(dst+12, aes4(load(dst+13),o7, J, I, L, load(dst+12)));
487
+ store(dst+14, aes4(load(dst+15),o8, J, I, L, load(dst+14)));
488
+ bytes -= 16*16; dst += 16;
489
+ }
490
+ if (bytes >= 8*16) {
491
+ o1 = vxor(offset,L);
492
+ o2 = vxor(offset,L2);
493
+ o3 = vxor(o1,L2);
494
+ o4 = offset = vxor(offset,L4);
495
+ fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L);
496
+ fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L);
497
+ o1 = vxor(J3,o1); o2 = vxor(J3,o2);
498
+ o3 = vxor(J3,o3); o4 = vxor(J3,o4);
499
+ tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]);
500
+ store(dst+ 0,vxor(load(dst+ 1),fs[0]));
501
+ tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]);
502
+ store(dst+ 2,vxor(load(dst+ 3),fs[1]));
503
+ tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]);
504
+ store(dst+ 4,vxor(load(dst+ 5),fs[2]));
505
+ tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]);
506
+ store(dst+ 6,vxor(load(dst+ 7),fs[3]));
507
+ store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0]));
508
+ store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1]));
509
+ store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2]));
510
+ store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3]));
511
+ store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0)));
512
+ store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2)));
513
+ store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4)));
514
+ store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6)));
515
+ bytes -= 8*16; dst += 8;
516
+ }
517
+ if (bytes >= 4*16) {
518
+ o1 = vxor(offset,L);
519
+ o2 = offset = vxor(offset,L2);
520
+ fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L);
521
+ o1 = vxor(J3,o1); o2 = vxor(J3,o2);
522
+ tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]);
523
+ store(dst+ 0,vxor(load(dst+ 1),fs[0]));
524
+ tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]);
525
+ store(dst+ 2,vxor(load(dst+ 3),fs[1]));
526
+ store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0]));
527
+ store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1]));
528
+ store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0)));
529
+ store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2)));
530
+ bytes -= 4*16; dst += 4;
531
+ }
532
+ if (bytes) {
533
+ o1 = vxor(offset,L);
534
+ fs[0] = aes4pre(s,o1,J,I,L);
535
+ o1 = vxor(J3,o1);
536
+ tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]);
537
+ store(dst+ 0,vxor(load(dst+ 1),fs[0]));
538
+ store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0]));
539
+ store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0)));
540
+ }
541
+ return sum;
542
+ }
543
+
544
+ /* ------------------------------------------------------------------------- */
545
+
546
+ static int cipher_aez_core(aez_ctx_t *ctx, block t, int d, char *src,
547
+ unsigned bytes, unsigned abytes, char *dst) {
548
+ block s, x, y, frag0, frag1, final0, final1;
549
+ block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0];
550
+ block L4=ctx->L[2], I2 = bswap16(ctx->I[1]);
551
+ unsigned i, frag_bytes, initial_bytes;
552
+
553
+ if (!d) bytes += abytes;
554
+ frag_bytes = bytes % 32;
555
+ initial_bytes = bytes - frag_bytes - 32;
556
+
557
+ /* Compute x and store intermediate results */
558
+ x = pass_one(ctx, (block*)src, initial_bytes, (block*)dst);
559
+ if (frag_bytes >= 16) {
560
+ frag0 = load(src + initial_bytes);
561
+ frag1 = one_zero_pad(load(src + initial_bytes + 16), 32-frag_bytes);
562
+ x = aes4(frag0, vxor(L4, I2), J, I, L, x);
563
+ x = vxor(x, aes4pre(frag1, vxor3(I2, L4, L), J, I, L));
564
+ } else if (frag_bytes) {
565
+ frag0 = one_zero_pad(load(src + initial_bytes), 16-frag_bytes);
566
+ x = aes4(frag0, vxor(L4, I2), J, I, L, x);
567
+ }
568
+
569
+ /* Calculate s and final block values (y xor'd to final1 later) */
570
+ final0 = vxor3(loadu(src + (bytes - 32)), x, t);
571
+ if (d || !abytes) final1 = loadu(src+(bytes-32)+16);
572
+ else final1 = zero_pad(loadu(src+(bytes-32)+16), abytes);
573
+ final0 = aes4(final1, vxor(I2, ctx->L[d]), J, I, L, final0);
574
+ final1 = vxor(final1, aes((block*)ctx, final0, ctx->L[d]));
575
+ s = vxor(final0, final1);
576
+ final0 = vxor(final0, aes((block*)ctx, final1, ctx->L[d^1]));
577
+ /* Decryption: final0 should hold abytes zero bytes. If not, failure */
578
+ if (d && !is_zero(vandnot(loadu(pad+abytes),final0))) return -1;
579
+ final1 = aes4(final0, vxor(I2, ctx->L[d^1]), J, I, L, final1);
580
+
581
+ /* Compute y and store final results */
582
+ y = pass_two(ctx, s, initial_bytes, (block*)dst);
583
+ if (frag_bytes >= 16) {
584
+ frag0 = vxor(frag0, aes((block*)ctx, s, L4));
585
+ frag1 = vxor(frag1, aes((block*)ctx, s, vxor(L4, L)));
586
+ frag1 = one_zero_pad(frag1, 32-frag_bytes);
587
+ y = aes4(frag0, vxor(I2, L4), J, I, L, y);
588
+ y = vxor(y, aes4pre(frag1, vxor3(I2, L4, L), J, I, L));
589
+ store(dst + initial_bytes, frag0);
590
+ store(dst + initial_bytes + 16, frag1);
591
+ } else if (frag_bytes) {
592
+ frag0 = vxor(frag0, aes((block*)ctx, s, L4));
593
+ frag0 = one_zero_pad(frag0, 16-frag_bytes);
594
+ y = aes4(frag0, vxor(I2, L4), J, I, L, y);
595
+ store(dst + initial_bytes, frag0);
596
+ }
597
+
598
+ storeu(dst + (bytes - 32), vxor3(final1, y, t));
599
+ if (!d || !abytes)
600
+ storeu(dst + (bytes - 32) + 16, final0);
601
+ else {
602
+ for (i=0; i<16-abytes; i++)
603
+ ((char*)dst + (bytes - 16))[i] = ((char*)&final0)[i];
604
+ }
605
+ return 0;
606
+ }
607
+
608
+ /* ------------------------------------------------------------------------- */
609
+
610
+ static int cipher_aez_tiny(aez_ctx_t *ctx, block t, int d, char *src,
611
+ unsigned bytes, unsigned abytes, char *dst) {
612
+ block l, r, tmp, one, rcon, buf[2], mask_10, mask_ff;
613
+ block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0], t_orig = t;
614
+ block L2=ctx->L[1], L4=ctx->L[2], I2 = bswap16(ctx->I[1]);
615
+ unsigned rnds, i;
616
+
617
+ /* load src into buf, zero pad, update bytes for abytes */
618
+ if (bytes >= 16) {
619
+ buf[0] = load(src);
620
+ buf[1] = zero_pad(load_partial(src+16,bytes-16),32-bytes);
621
+ } else {
622
+ buf[0] = zero_pad(load_partial(src,bytes),16-bytes);
623
+ buf[1] = zero;
624
+ }
625
+ if (!d) bytes += abytes;
626
+
627
+ /* load l/r, create 10* padding masks, shift r 4 bits if odd length */
628
+ l = buf[0];
629
+ r = loadu((char*)buf+bytes/2);
630
+ mask_ff = loadu(pad+16-bytes/2);
631
+ mask_10 = loadu(pad+32-bytes/2);
632
+ if (bytes&1) { /* Odd length. Deal with nibbles. */
633
+ mask_10 = sll4(mask_10);
634
+ ((char*)&mask_ff)[bytes/2] = (char)0xf0;
635
+ r = bswap16(r);
636
+ r = srl4(r);
637
+ r = bswap16(r);
638
+ }
639
+ r = vor(vand(r, mask_ff), mask_10);
640
+
641
+ /* Add tweak offset into t, and determine the number of rounds */
642
+ if (bytes >= 16) {
643
+ t = vxor4(t, I2, L2, L4); /* (0,6) offset */
644
+ rnds = 8;
645
+ } else {
646
+ t = vxor(vxor4(t, I2, L2, L4), L); /* (0,7) offset */
647
+ if (bytes>=3) rnds = 10; else if (bytes==2) rnds = 16; else rnds = 24;
648
+ }
649
+
650
+ if (!d) {
651
+ one = zero_set_byte(1,15);
652
+ rcon = zero;
653
+ } else {
654
+ one = zero_set_byte(-1,15);
655
+ rcon = zero_set_byte((char)(rnds-1),15);
656
+ }
657
+
658
+ if ((d) && (bytes < 16)) {
659
+ block offset = vxor3(I2, L, L2);
660
+ tmp = vor(l, loadu(pad+32));
661
+ tmp = aes4pre(t_orig, vxor(tmp,offset), J, I, L);
662
+ tmp = vand(tmp, loadu(pad+32));
663
+ l = vxor(l, tmp);
664
+ }
665
+
666
+ /* Feistel */
667
+ for (i=0; i<rnds; i+=2) {
668
+ l = vor(vand(aes4(t,vxor(r,rcon), J, I, L, l), mask_ff), mask_10);
669
+ rcon = vadd(rcon,one);
670
+ r = vor(vand(aes4(t,vxor(l,rcon), J, I, L, r), mask_ff), mask_10);
671
+ rcon = vadd(rcon,one);
672
+ }
673
+ buf[0] = r;
674
+ if (bytes&1) {
675
+ l = bswap16(l);
676
+ l = sll4(l);
677
+ l = bswap16(l);
678
+ r = vand(loadu((char*)buf+bytes/2), zero_set_byte((char)0xf0,0));
679
+ l = vor(l, r);
680
+ }
681
+ storeu((char*)buf+bytes/2, l);
682
+ if (d) {
683
+ bytes -= abytes;
684
+ if (abytes==16) tmp = loadu((char*)buf+bytes);
685
+ else {
686
+ tmp = zero;
687
+ for (i=0; i<abytes; i++) ((char*)&tmp)[i] = ((char*)buf+bytes)[i];
688
+ }
689
+ if (!is_zero(tmp)) return -1;
690
+ } else if (bytes < 16) {
691
+ block offset = vxor3(I2, L, L2);
692
+ tmp = vor(zero_pad(buf[0], 16-bytes), loadu(pad+32));
693
+ tmp = aes4pre(t_orig,vxor(tmp,offset), J, I, L);
694
+ buf[0] = vxor(buf[0], vand(tmp, loadu(pad+32)));
695
+ }
696
+ for (i=0; i<bytes; i++) dst[i] = ((char*)buf)[i];
697
+ return 0;
698
+ }
699
+
700
+ /* ------------------------------------------------------------------------- */
701
+
702
+ void aez_encrypt(aez_ctx_t *ctx, char *n, unsigned nbytes,
703
+ char *ad, unsigned adbytes, unsigned abytes,
704
+ char *src, unsigned bytes, char *dst) {
705
+
706
+ block t = aez_hash(ctx, n, nbytes, ad, adbytes, abytes);
707
+ if (bytes==0) {
708
+ unsigned i;
709
+ t = aes((block*)ctx, t, vxor(ctx->L[0], ctx->L[1]));
710
+ for (i=0; i<abytes; i++) dst[i] = ((char*)&t)[i];
711
+ } else if (bytes+abytes < 32)
712
+ cipher_aez_tiny(ctx, t, 0, src, bytes, abytes, dst);
713
+ else
714
+ cipher_aez_core(ctx, t, 0, src, bytes, abytes, dst);
715
+ }
716
+
717
+ /* ------------------------------------------------------------------------- */
718
+
719
+ int aez_decrypt(aez_ctx_t *ctx, char *n, unsigned nbytes,
720
+ char *ad, unsigned adbytes, unsigned abytes,
721
+ char *src, unsigned bytes, char *dst) {
722
+
723
+ block t;
724
+ if (bytes < abytes) return -1;
725
+ t = aez_hash(ctx, n, nbytes, ad, adbytes, abytes);
726
+ if (bytes==abytes) {
727
+ block claimed = zero_pad(load_partial(src,abytes), 16-abytes);
728
+ t = zero_pad(aes((block*)ctx, t, vxor(ctx->L[0], ctx->L[1])), 16-abytes);
729
+ return is_zero(vandnot(t, claimed)) - 1; /* is_zero return 0 or 1 */
730
+ } else if (bytes < 32) {
731
+ return cipher_aez_tiny(ctx, t, 1, src, bytes, abytes, dst);
732
+ } else {
733
+ return cipher_aez_core(ctx, t, 1, src, bytes, abytes, dst);
734
+ }
735
+ }
736
+
737
+ /* ------------------------------------------------------------------------- */
738
+ /* Reference Blake2b code, here for convenience, and not for speed. */
739
+ /* Dowloaded Sep 2015 from https://github.com/mjosaarinen/blake2_mjosref */
740
+
741
+ #include <stdint.h>
742
+
743
+ typedef struct {
744
+ uint8_t b[128];
745
+ uint64_t h[8];
746
+ uint64_t t[2];
747
+ size_t c;
748
+ size_t outlen;
749
+ } blake2b_ctx;
750
+
751
+ #ifndef ROTR64
752
+ #define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
753
+ #endif
754
+
755
+ #define B2B_GET64(p) \
756
+ (((uint64_t) ((uint8_t *) (p))[0]) ^ \
757
+ (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
758
+ (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
759
+ (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
760
+ (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
761
+ (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
762
+ (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
763
+ (((uint64_t) ((uint8_t *) (p))[7]) << 56))
764
+
765
+ #define B2B_G(a, b, c, d, x, y) { \
766
+ v[a] = v[a] + v[b] + x; \
767
+ v[d] = ROTR64(v[d] ^ v[a], 32); \
768
+ v[c] = v[c] + v[d]; \
769
+ v[b] = ROTR64(v[b] ^ v[c], 24); \
770
+ v[a] = v[a] + v[b] + y; \
771
+ v[d] = ROTR64(v[d] ^ v[a], 16); \
772
+ v[c] = v[c] + v[d]; \
773
+ v[b] = ROTR64(v[b] ^ v[c], 63); }
774
+
775
+ static const uint64_t blake2b_iv[8] = {
776
+ 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
777
+ 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
778
+ 0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
779
+ 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
780
+ };
781
+
782
+ static void blake2b_compress(blake2b_ctx *ctx, int last)
783
+ {
784
+ const uint8_t sigma[12][16] = {
785
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
786
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
787
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
788
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
789
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
790
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
791
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
792
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
793
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
794
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
795
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
796
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
797
+ };
798
+ int i;
799
+ uint64_t v[16], m[16];
800
+
801
+ for (i = 0; i < 8; i++) {
802
+ v[i] = ctx->h[i];
803
+ v[i + 8] = blake2b_iv[i];
804
+ }
805
+
806
+ v[12] ^= ctx->t[0];
807
+ v[13] ^= ctx->t[1];
808
+ if (last)
809
+ v[14] = ~v[14];
810
+
811
+ for (i = 0; i < 16; i++)
812
+ m[i] = B2B_GET64(&ctx->b[8 * i]);
813
+
814
+ for (i = 0; i < 12; i++) {
815
+ B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
816
+ B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
817
+ B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
818
+ B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
819
+ B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
820
+ B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
821
+ B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
822
+ B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
823
+ }
824
+
825
+ for( i = 0; i < 8; ++i )
826
+ ctx->h[i] ^= v[i] ^ v[i + 8];
827
+ }
828
+
829
+ static void blake2b_update(blake2b_ctx *ctx,
830
+ const void *in, size_t inlen)
831
+ {
832
+ size_t i;
833
+
834
+ for (i = 0; i < inlen; i++) {
835
+ if (ctx->c == 128) {
836
+ ctx->t[0] += ctx->c;
837
+ if (ctx->t[0] < ctx->c)
838
+ ctx->t[1]++;
839
+ blake2b_compress(ctx, 0);
840
+ ctx->c = 0;
841
+ }
842
+ ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
843
+ }
844
+ }
845
+
846
+ static void blake2b_final(blake2b_ctx *ctx, void *out)
847
+ {
848
+ size_t i;
849
+
850
+ ctx->t[0] += ctx->c;
851
+ if (ctx->t[0] < ctx->c)
852
+ ctx->t[1]++;
853
+
854
+ while (ctx->c < 128)
855
+ ctx->b[ctx->c++] = 0;
856
+ blake2b_compress(ctx, 1);
857
+
858
+ for (i = 0; i < ctx->outlen; i++) {
859
+ ((uint8_t *) out)[i] =
860
+ (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
861
+ }
862
+ }
863
+
864
+ static int blake2b_init(blake2b_ctx *ctx, size_t outlen,
865
+ const void *key, size_t keylen)
866
+ {
867
+ size_t i;
868
+
869
+ if (outlen == 0 || outlen > 64 || keylen > 64)
870
+ return -1;
871
+
872
+ for (i = 0; i < 8; i++)
873
+ ctx->h[i] = blake2b_iv[i];
874
+ ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
875
+
876
+ ctx->t[0] = 0;
877
+ ctx->t[1] = 0;
878
+ ctx->c = 0;
879
+ ctx->outlen = outlen;
880
+
881
+ for (i = keylen; i < 128; i++)
882
+ ctx->b[i] = 0;
883
+ if (keylen > 0) {
884
+ blake2b_update(ctx, key, keylen);
885
+ ctx->c = 128;
886
+ }
887
+
888
+ return 0;
889
+ }
890
+
891
+ static int blake2b(void *out, size_t outlen,
892
+ const void *key, size_t keylen,
893
+ const void *in, size_t inlen)
894
+ {
895
+ blake2b_ctx ctx;
896
+
897
+ if (blake2b_init(&ctx, outlen, key, keylen))
898
+ return -1;
899
+ blake2b_update(&ctx, in, inlen);
900
+ blake2b_final(&ctx, out);
901
+
902
+ return 0;
903
+ }
904
+
905
+ /* ------------------------------------------------------------------------- */
906
+ /* aez mapping for CAESAR competition */
907
+
908
+ int crypto_aead_encrypt(
909
+ unsigned char *c,unsigned long long *clen,
910
+ const unsigned char *m,unsigned long long mlen,
911
+ const unsigned char *ad,unsigned long long adlen,
912
+ const unsigned char *nsec,
913
+ const unsigned char *npub,
914
+ const unsigned char *k
915
+ )
916
+ {
917
+ aez_ctx_t ctx;
918
+ (void)nsec;
919
+ if (clen) *clen = mlen+16;
920
+ aez_setup((unsigned char *)k, 48, &ctx);
921
+ aez_encrypt(&ctx, (char *)npub, 12,
922
+ (char *)ad, (unsigned)adlen, 16,
923
+ (char *)m, (unsigned)mlen, (char *)c);
924
+ return 0;
925
+ }
926
+
927
+ int crypto_aead_decrypt(
928
+ unsigned char *m,unsigned long long *mlen,
929
+ unsigned char *nsec,
930
+ const unsigned char *c,unsigned long long clen,
931
+ const unsigned char *ad,unsigned long long adlen,
932
+ const unsigned char *npub,
933
+ const unsigned char *k
934
+ )
935
+ {
936
+ aez_ctx_t ctx;
937
+ (void)nsec;
938
+ if (mlen) *mlen = clen-16;
939
+ aez_setup((unsigned char *)k, 48, &ctx);
940
+ return aez_decrypt(&ctx, (char *)npub, 12,
941
+ (char *)ad, (unsigned)adlen, 16,
942
+ (char *)c, (unsigned)clen, (char *)m);
943
+ }
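
The README states that the gem calls this C code via FFI, but the Ruby binding itself (lib/aez.rb) is not part of this diff. For orientation only, a hypothetical sketch of how the exported `aez_setup`, `aez_encrypt`, and `aez_decrypt` functions could be attached with the `ffi` gem is shown below; the module name, library path, and context-buffer size are assumptions read off the C signatures above, not the gem's actual code.

```ruby
require 'ffi'

# Hypothetical sketch: module name, .so path, and AEZ_CTX_SIZE are assumptions,
# mirroring the C signatures of aez_setup / aez_encrypt / aez_decrypt above.
module AEZExt
  extend FFI::Library
  ffi_lib File.expand_path('lib/aez/aezv5.so', __dir__) # assumed: rake-compiler places the extension in lib/aez

  AEZ_CTX_SIZE = 144 # aez_ctx_t: 9 x 16-byte blocks (I[2], J[3], L[3], delta3_cache)

  attach_function :aez_setup,   [:pointer, :uint, :pointer], :void
  attach_function :aez_encrypt, [:pointer, :pointer, :uint, :pointer, :uint, :uint,
                                 :pointer, :uint, :pointer], :void
  attach_function :aez_decrypt, [:pointer, :pointer, :uint, :pointer, :uint, :uint,
                                 :pointer, :uint, :pointer], :int
end
```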