aez 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 4df307fcf2d926a7d97d8a67b446653314c41da2ba48daddabaa2db99a005f6c
4
+ data.tar.gz: c753537e76da64402c0ccdbdd92c85ce5fe0a81ca6fc767462f80360e2da38ce
5
+ SHA512:
6
+ metadata.gz: 15148d0a90cebfeb9db8169fbda4812374bf1f6e7a5f455225e4b84662e7d98f8f7f11e35ffa8534ba983a86109805af5738677a8ac21266ea7de9f723db8496
7
+ data.tar.gz: d854f7d44f95b9711bcbd111fe91237056bf1772441f984df938171b4106cf1681bb7e354754009cc791e2dc1ab58e60cd958490fd6ebb2380bb0be42a1e3932
data/.github/workflows/ruby.yml ADDED
@@ -0,0 +1,37 @@
1
+ # This workflow uses actions that are not certified by GitHub.
2
+ # They are provided by a third-party and are governed by
3
+ # separate terms of service, privacy policy, and support
4
+ # documentation.
5
+ # This workflow will download a prebuilt Ruby version, install dependencies and run tests with Rake
6
+ # For more information see: https://github.com/marketplace/actions/setup-ruby-jruby-and-truffleruby
7
+
8
+ name: Ruby
9
+
10
+ on:
11
+ push:
12
+ branches: [ master ]
13
+ pull_request:
14
+ branches: [ master ]
15
+
16
+ jobs:
17
+ test:
18
+
19
+ runs-on: ubuntu-latest
20
+ strategy:
21
+ matrix:
22
+ ruby-version: ['2.6', '2.7', '3.0']
23
+
24
+ steps:
25
+ - uses: actions/checkout@v2
26
+ - name: Set up Ruby
27
+ # To automatically get bug fixes and new Ruby versions for ruby/setup-ruby,
28
+ # change this to (see https://github.com/ruby/setup-ruby#versioning):
29
+ # uses: ruby/setup-ruby@v1
30
+ uses: ruby/setup-ruby@473e4d8fe5dd94ee328fdfca9f8c9c7afc9dae5e
31
+ with:
32
+ ruby-version: ${{ matrix.ruby-version }}
33
+ bundler-cache: true # runs 'bundle install' and caches installed gems automatically
34
+ - name: Compile extension
35
+ run: bundle exec rake compile
36
+ - name: Run tests
37
+ run: bundle exec rake spec
data/.gitignore ADDED
@@ -0,0 +1,13 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
12
+ Gemfile.lock
13
+ *.so
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.ruby-gemset ADDED
@@ -0,0 +1 @@
1
+ aez
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ ruby-3.0.0
data/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,74 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to making participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, gender identity and expression, level of experience,
9
+ nationality, personal appearance, race, religion, or sexual identity and
10
+ orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies both within project spaces and in public spaces
49
+ when an individual is representing the project or its community. Examples of
50
+ representing a project or community include using an official project e-mail
51
+ address, posting via an official social media account, or acting as an appointed
52
+ representative at an online or offline event. Representation of a project may be
53
+ further defined and clarified by project maintainers.
54
+
55
+ ## Enforcement
56
+
57
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
58
+ reported by contacting the project team at azuchi@haw.co.jp. All
59
+ complaints will be reviewed and investigated and will result in a response that
60
+ is deemed necessary and appropriate to the circumstances. The project team is
61
+ obligated to maintain confidentiality with regard to the reporter of an incident.
62
+ Further details of specific enforcement policies may be posted separately.
63
+
64
+ Project maintainers who do not follow or enforce the Code of Conduct in good
65
+ faith may face temporary or permanent repercussions as determined by other
66
+ members of the project's leadership.
67
+
68
+ ## Attribution
69
+
70
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71
+ available at [http://contributor-covenant.org/version/1/4][version]
72
+
73
+ [homepage]: http://contributor-covenant.org
74
+ [version]: http://contributor-covenant.org/version/1/4/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in aez.gemspec
6
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2021 Shigeyuki Azuchi.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,48 @@
1
+ # AEZ for Ruby [![Build Status](https://github.com/azuchi/aez/actions/workflows/ruby.yml/badge.svg?branch=master)](https://github.com/azuchi/aez/actions/workflows/ruby.yml) [![Gem Version](https://badge.fury.io/rb/aez.svg)](https://badge.fury.io/rb/aez) [![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)
2
+
3
+ [AEZ](http://web.cs.ucdavis.edu/~rogaway/aez/) binding for Ruby.
4
+ This library calls the AEZv5 C implementation, which uses AES-NI hardware optimizations, via FFI.
5
+
6
+ ## Requirements
7
+
8
+ The following limitations are inherited from Ted Krovetz's C implementation:
9
+
10
+ - Intel or ARM CPU supporting AES instructions
11
+ - Faster if all pointers are 16-byte aligned.
12
+ - Max 16 byte nonce, 16 byte authenticator
13
+ - Single AD (AEZ spec allows vector AD but this code doesn't)
14
+ - Max 2^32-1 byte buffers allowed (due to using unsigned int)
15
+
16
+ ## Installation
17
+
18
+ Add this line to your application's Gemfile:
19
+
20
+ ```ruby
21
+ gem 'aez'
22
+ ```
23
+
24
+ And then execute:
25
+
26
+ $ bundle
27
+
28
+ Or install it yourself as:
29
+
30
+ $ gem install aez
31
+
32
+ ## Usage
33
+
34
+ ```ruby
35
+ require 'aez'
36
+
37
+ key = ['9adf7a023fbc4e663695f627a8d5b5c45f6752e375d19e11a669e6b949347d0cf5e0e2516ee285af365224976afa60be'].pack('H*')
38
+ nonce = ['799de3d90fbd6fed93b5f96cf9f4e852'].pack('H*')
39
+ ad = ['d6e278e0c6ede09d302d6fde09de77711a9a02fc8a049fb34a5e3f00c1cfc336d0'].pack('H*')
40
+ message = ['efea7ecfa45f51b52ce038cf6c0704392c2211bfca17a36284f63a902b37f0ab'].pack('H*')
41
+ abyte = 16
42
+
43
+ # Encryption
44
+ cipher_text = AEZ.encrypt(key, message, ad, nonce, abyte)
45
+
46
+ # Decryption
47
+ plain_text = AEZ.decrypt(key, cipher_text, ad, nonce, abyte)
48
+ ```
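
A quick round-trip check of the usage above: the ciphertext produced by `AEZ.encrypt` should decrypt back to the original message, and (per the C implementation, which appends an `abyte`-byte authenticator) the ciphertext is `abyte` bytes longer than the plaintext. This is a minimal sketch assuming only the API shown in the README:

```ruby
require 'aez'

# Same test values as the README example above.
key     = ['9adf7a023fbc4e663695f627a8d5b5c45f6752e375d19e11a669e6b949347d0cf5e0e2516ee285af365224976afa60be'].pack('H*')
nonce   = ['799de3d90fbd6fed93b5f96cf9f4e852'].pack('H*')
ad      = ['d6e278e0c6ede09d302d6fde09de77711a9a02fc8a049fb34a5e3f00c1cfc336d0'].pack('H*')
message = ['efea7ecfa45f51b52ce038cf6c0704392c2211bfca17a36284f63a902b37f0ab'].pack('H*')
abyte   = 16

cipher_text = AEZ.encrypt(key, message, ad, nonce, abyte)
plain_text  = AEZ.decrypt(key, cipher_text, ad, nonce, abyte)

puts cipher_text.bytesize == message.bytesize + abyte # ciphertext carries an abyte-byte authenticator
puts plain_text == message                            # round trip recovers the original message
```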
data/Rakefile ADDED
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+ require 'rake/extensiontask'
6
+
7
+ RSpec::Core::RakeTask.new(:spec)
8
+
9
+ task default: :spec
10
+
11
+ Rake::ExtensionTask.new 'aezv5' do |ext|
12
+ ext.lib_dir = 'lib/aez'
13
+ end
data/aez.gemspec ADDED
@@ -0,0 +1,33 @@
1
+
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'aez/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'aez'
8
+ spec.version = AEZ::VERSION
9
+ spec.authors = ['Shigeyuki Azuchi']
10
+ spec.email = ['azuchi@chaintope.com']
11
+
12
+ spec.summary = 'AEZ binding for ruby.'
13
+ spec.description = 'AEZ binding for ruby.'
14
+ spec.homepage = 'https://github.com/azuchi/aez'
15
+ spec.license = 'MIT'
16
+
17
+ # Specify which files should be added to the gem when it is released.
18
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
19
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
20
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
21
+ end
22
+ spec.bindir = 'exe'
23
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
24
+ spec.require_paths = ['lib']
25
+ spec.extensions = ['ext/aezv5/extconf.rb']
26
+ spec.add_runtime_dependency 'ffi', '>= 1.15.1'
27
+
28
+ spec.add_development_dependency 'bundler'
29
+ spec.add_development_dependency 'rake', '>= 12.3.3'
30
+ spec.add_development_dependency 'rake-compiler', '>= 1.1.1'
31
+ spec.add_development_dependency 'rspec', '~> 3.0'
32
+
33
+ end
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "aez"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
data/ext/aezv5/aezv5.c ADDED
@@ -0,0 +1,943 @@
1
+ /*
2
+ // AEZ v5 AES-NI version. AEZ info: http://www.cs.ucdavis.edu/~rogaway/aez
3
+ //
4
+ // REQUIREMENTS: - Intel or ARM CPU supporting AES instructions
5
+ // - Faster if all pointers are 16-byte aligned.
6
+ // - Max 16 byte nonce, 16 byte authenticator
7
+ // - Single AD (AEZ spec allows vector AD but this code doesn't)
8
+ // - Max 2^32-1 byte buffers allowed (due to using unsigned int)
9
+ //
10
+ // Written by Ted Krovetz (ted@krovetz.net). Last modified 21 March 2017.
11
+ //
12
+ // This is free and unencumbered software released into the public domain.
13
+ //
14
+ // Anyone is free to copy, modify, publish, use, compile, sell, or
15
+ // distribute this software, either in source code form or as a compiled
16
+ // binary, for any purpose, commercial or non-commercial, and by any
17
+ // means.
18
+ //
19
+ // In jurisdictions that recognize copyright laws, the author or authors
20
+ // of this software dedicate any and all copyright interest in the
21
+ // software to the public domain. We make this dedication for the benefit
22
+ // of the public at large and to the detriment of our heirs and
23
+ // successors. We intend this dedication to be an overt act of
24
+ // relinquishment in perpetuity of all present and future rights to this
25
+ // software under copyright law.
26
+ //
27
+ // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28
+ // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29
+ // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
30
+ // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
31
+ // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
32
+ // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
33
+ // OTHER DEALINGS IN THE SOFTWARE.
34
+ //
35
+ // For more information, please refer to <http://unlicense.org/>
36
+ */
37
+
38
+ #include <stdint.h>
39
+ #include <stddef.h>
40
+
41
+ /* ------------------------------------------------------------------------- */
42
+ #if __AES__ /* Defined by gcc/clang when compiling for AES-NI */
43
+ /* ------------------------------------------------------------------------- */
44
+
45
+ #include <smmintrin.h>
46
+ #include <wmmintrin.h>
47
+ #define block __m128i
48
+
49
+ /* ------------------------------------------------------------------------- */
50
+
51
+ #define zero _mm_setzero_si128()
52
+ #define vadd(x,y) _mm_add_epi8(x,y)
53
+ #define vand(x,y) _mm_and_si128(x,y)
54
+ #define vandnot(x,y) _mm_andnot_si128(x,y) /* (~x)&y */
55
+ #define vor(x,y) _mm_or_si128(x,y)
56
+ #define vxor(x,y) _mm_xor_si128(x,y)
57
+
58
+ static int is_zero(block x) { return _mm_testz_si128(x,x); } /* 0 or 1 */
59
+
60
+ static block sll4(block x) {
61
+ return vor(_mm_srli_epi64(x, 4), _mm_slli_epi64(_mm_srli_si128(x, 8), 60));
62
+ }
63
+
64
+ static block srl4(block x) {
65
+ return vor(_mm_slli_epi64(x, 4), _mm_srli_epi64(_mm_slli_si128(x, 8), 60));
66
+ }
67
+
68
+ static __m128i bswap16(__m128i b) {
69
+ const __m128i t = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
70
+ return _mm_shuffle_epi8(b,t);
71
+ }
72
+
73
+ static __m128i double_block(__m128i bl) {
74
+ const __m128i mask = _mm_set_epi32(135,1,1,1);
75
+ __m128i tmp = _mm_srai_epi32(bl, 31);
76
+ tmp = _mm_and_si128(tmp, mask);
77
+ tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2,1,0,3));
78
+ bl = _mm_slli_epi32(bl, 1);
79
+ return _mm_xor_si128(bl,tmp);
80
+ }
81
+
82
+ static __m128i aes(__m128i *key, __m128i in, __m128i first_key) {
83
+ in = vxor(in, first_key);
84
+ in = _mm_aesenc_si128 (in,key[0]);
85
+ in = _mm_aesenc_si128 (in,key[2]);
86
+ in = _mm_aesenc_si128 (in,key[5]);
87
+ in = _mm_aesenc_si128 (in,key[0]);
88
+ in = _mm_aesenc_si128 (in,key[2]);
89
+ in = _mm_aesenc_si128 (in,key[5]);
90
+ in = _mm_aesenc_si128 (in,key[0]);
91
+ in = _mm_aesenc_si128 (in,key[2]);
92
+ in = _mm_aesenc_si128 (in,key[5]);
93
+ return _mm_aesenc_si128 (in,key[0]);
94
+ }
95
+
96
+ static __m128i aes4(__m128i in, __m128i a, __m128i b,
97
+ __m128i c, __m128i d, __m128i e) {
98
+ in = _mm_aesenc_si128(vxor(in,a),b);
99
+ in = _mm_aesenc_si128(in,c);
100
+ in = _mm_aesenc_si128(in,d);
101
+ return _mm_aesenc_si128 (in,e);
102
+ }
103
+
104
+ #define aes4pre(in,a,b,c,d) aes4(in,a,b,c,d,zero)
105
+
106
+ static __m128i loadu(const void *p) { return _mm_loadu_si128((__m128i*)p); }
107
+ static void storeu(const void *p, __m128i x) {_mm_storeu_si128((__m128i*)p,x);}
108
+
109
+ #define load loadu /* Intel with AES-NI has fast unaligned loads/stores */
110
+ #define store storeu
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ #elif __ARM_FEATURE_CRYPTO
114
+ /* ------------------------------------------------------------------------- */
115
+
116
+ #include <arm_neon.h>
117
+ #define block uint8x16_t
118
+
119
+ #define zero vmovq_n_u8(0)
120
+ #define vadd(x,y) vaddq_u8(x,y)
121
+ #define vand(x,y) vandq_u8(x,y)
122
+ #define vandnot(x,y) vbicq_u8(y,x) /* (~x)&y */
123
+ #define vor(x,y) vorrq_u8(x,y)
124
+ #define vxor(x,y) veorq_u8(x,y)
125
+
126
+ static int is_zero(block x) { /* 0 or 1 */
127
+ uint8x8_t t = vorr_u8(vget_high_u8(x), vget_low_u8(x));
128
+ return vget_lane_u64(vreinterpret_u64_u8(t),0) == 0;
129
+ }
130
+
131
+ static block srl4(block x) {
132
+ const block mask = {15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,0};
133
+ uint8x16_t tmp = vandq_u8(vshrq_n_u8(vextq_u8(x, x, 1),4),mask);
134
+ return veorq_u8(tmp,vshlq_n_u8(x,4));
135
+ }
136
+
137
+ static block sll4(block x) {
138
+ const block mask = {0,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15};
139
+ uint8x16_t tmp = vshlq_n_u8(vandq_u8(vextq_u8(x, x, 15),mask),4);
140
+ return veorq_u8(tmp,vshrq_n_u8(x,4));
141
+ }
142
+
143
+ static uint8x16_t bswap16(uint8x16_t b) { return b; } /* Not with uint8x16_t */
144
+
145
+ static block double_block(block b) {
146
+ const block mask = {135,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
147
+ block tmp = (block)vshrq_n_s8((int8x16_t)b,7);
148
+ tmp = vandq_u8(tmp, mask);
149
+ tmp = vextq_u8(tmp, tmp, 1); /* Rotate high byte to low end */
150
+ b = vshlq_n_u8(b,1);
151
+ return veorq_u8(tmp,b);
152
+ }
153
+
154
+ static uint8x16_t aes(uint8x16_t *key, uint8x16_t in, uint8x16_t first_key) {
155
+ in = vaesmcq_u8(vaeseq_u8(in, first_key));
156
+ in = vaesmcq_u8(vaeseq_u8(in, key[0]));
157
+ in = vaesmcq_u8(vaeseq_u8(in, key[2]));
158
+ in = vaesmcq_u8(vaeseq_u8(in, key[5]));
159
+ in = vaesmcq_u8(vaeseq_u8(in, key[0]));
160
+ in = vaesmcq_u8(vaeseq_u8(in, key[2]));
161
+ in = vaesmcq_u8(vaeseq_u8(in, key[5]));
162
+ in = vaesmcq_u8(vaeseq_u8(in, key[0]));
163
+ in = vaesmcq_u8(vaeseq_u8(in, key[2]));
164
+ in = vaesmcq_u8(vaeseq_u8(in, key[5]));
165
+ return vxor(in, key[0]);
166
+ }
167
+
168
+ static uint8x16_t aes4pre(uint8x16_t in, uint8x16_t a, uint8x16_t b,
169
+ uint8x16_t c, uint8x16_t d) {
170
+ in = vaesmcq_u8(vaeseq_u8(in, a));
171
+ in = vaesmcq_u8(vaeseq_u8(in, b));
172
+ in = vaesmcq_u8(vaeseq_u8(in, c));
173
+ return vaesmcq_u8(vaeseq_u8(in, d));
174
+ }
175
+
176
+ #define aes4(in,a,b,c,d,e) vxor(aes4pre(in,a,b,c,d),e)
177
+
178
+ static uint8x16_t load(const void *p) { return *(uint8x16_t *)p; }
179
+ static void store(void *p, uint8x16_t x) { *(uint8x16_t *)p = x; }
180
+
181
+ #define loadu load /* ARMv8 allows unaligned loads/stores */
182
+ #define storeu store /* ARMv8 allows unaligned stores */
183
+
184
+ /* ------------------------------------------------------------------------- */
185
+ #else
186
+ #error - This implementation requires __AES__ or __ARM_FEATURE_CRYPTO
187
+ #endif
188
+ /* ------------------------------------------------------------------------- */
189
+
190
+ #define vxor3(x,y,z) vxor(vxor(x,y),z)
191
+ #define vxor4(w,x,y,z) vxor(vxor(w,x),vxor(y,z))
192
+ #define load_partial(p,n) loadu(p)
193
+
194
+ /*
195
+ Might need a version like this if, for example, we want to load a 12-byte nonce
196
+ into a 16-byte block.
197
+
198
+ static block load_partial(const void *p, unsigned n) {
199
+ if ((intptr_t)p % 16 == 0) return load(p);
200
+ else {
201
+ block tmp; unsigned i;
202
+ for (i=0; i<n; i++) ((char*)&tmp)[i] = ((char*)p)[i];
203
+ return tmp;
204
+ }
205
+ }
206
+ */
207
+
208
+ static const unsigned char pad[] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
209
+ 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
210
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
211
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
212
+ 0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
213
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
214
+
215
+ static block zero_pad(block x, unsigned zero_bytes) {
216
+ return vand(x, loadu(pad + zero_bytes));
217
+ }
218
+
219
+ static block one_zero_pad(block x, unsigned one_zero_bytes) {
220
+ block *p = (block*)(pad + one_zero_bytes);
221
+ return vor(vand(x, loadu(p)), loadu(p+1));
222
+ }
223
+
224
+ static block zero_set_byte(char val, unsigned idx) {
225
+ block tmp = zero; ((char *)&tmp)[idx] = val; return tmp;
226
+ }
227
+
228
+ /* ------------------------------------------------------------------------- */
229
+
230
+ typedef struct { /* All data memory-correct except 2I register-correct */
231
+ block I[2]; /* 1I, 2I */
232
+ block J[3]; /* 1J,2J,4J */
233
+ block L[3]; /* 1L,2L,4L */
234
+ block delta3_cache;
235
+ } aez_ctx_t;
236
+
237
+ /* ------------------------------------------------------------------------- */
238
+
239
+ static int blake2b(void *out, size_t outlen,
240
+ const void *key, size_t keylen,
241
+ const void *in, size_t inlen);
242
+
243
+ /* ------------------------------------------------------------------------- */
244
+
245
+ void aez_setup(unsigned char *key, unsigned keylen, aez_ctx_t *ctx) {
246
+ block tmp;
247
+ if (keylen==48) {
248
+ ctx->I[0] = loadu(key);
249
+ ctx->J[0] = loadu(key+16);
250
+ ctx->L[0] = loadu(key+32);
251
+ } else {
252
+ blake2b(ctx, 48, 0, 0, key, keylen); /* Puts IJL into ctx */
253
+ ctx->L[0] = ctx->J[0]; /* Rearrange. */
254
+ ctx->J[0] = ctx->I[1]; /* Rearrange. */
255
+ }
256
+ /* Fill remaining ctx locations with doublings */
257
+ ctx->I[1] = double_block(bswap16(ctx->I[0])); /* No post-bswap */
258
+ ctx->J[1] = bswap16(tmp = double_block(bswap16(ctx->J[0])));
259
+ ctx->J[2] = bswap16(double_block(tmp));
260
+ ctx->L[1] = bswap16(tmp = double_block(bswap16(ctx->L[0])));
261
+ ctx->L[2] = bswap16(double_block(tmp));
262
+ ctx->delta3_cache = zero;
263
+ }
264
+
265
+ /* ------------------------------------------------------------------------- */
266
+
267
+ /* !! Warning !! Only handles nbytes <= 16 and abytes <= 16 */
268
+ static block aez_hash(aez_ctx_t *ctx, char *n, unsigned nbytes, char *ad,
269
+ unsigned adbytes, unsigned abytes) {
270
+ block o1, o2, o3, o4, o5, o6, o7, o8, sum, offset, tmp;
271
+ block I=ctx->I[0], Ifordoubling = ctx->I[1], I2 = bswap16(Ifordoubling);
272
+ block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2];
273
+ block J=ctx->J[0], J2 = ctx->J[1], J4 = ctx->J[2], J5 = vxor(J,J4);
274
+
275
+ /* Process abytes and nonce */
276
+ offset = vxor4(J, J2, I2, L);
277
+ tmp = zero_set_byte((char)(8*abytes),15);
278
+ sum = aes4pre(offset,tmp,J,I,L);
279
+
280
+ if (nbytes==16) sum = aes4(vxor(loadu(n), J4), vxor(I2, L),J,I,L,sum);
281
+ else sum = aes4(vxor(J4, I),
282
+ one_zero_pad(load_partial(n,nbytes),16-nbytes),J,I,L,sum);
283
+
284
+ if (ad) { /* Possible easy misuse: ad==null && adbytes==0 */
285
+ if (adbytes==0) {
286
+ ctx->delta3_cache = aes4pre(vxor(J5, I), loadu(pad+32),J,I,L);
287
+ } else {
288
+ block delta3 = zero;
289
+ offset = vxor(J5, I2);
290
+ while (adbytes >= 8*16) {
291
+ o1 = vxor(offset,L);
292
+ o2 = vxor(offset,L2);
293
+ o3 = vxor(o1,L2);
294
+ o4 = vxor(offset,L4);
295
+ o5 = vxor(o1,L4);
296
+ o6 = vxor(o2,L4);
297
+ o7 = vxor(o3,L4);
298
+ o8 = offset;
299
+ Ifordoubling = double_block(Ifordoubling);
300
+ offset = vxor(J5, bswap16(Ifordoubling));
301
+ delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L));
302
+ delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
303
+ delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L));
304
+ delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L));
305
+ delta3 = vxor(delta3, aes4pre(load(ad+ 64), o5, J, I, L));
306
+ delta3 = vxor(delta3, aes4pre(load(ad+ 80), o6, J, I, L));
307
+ delta3 = vxor(delta3, aes4pre(load(ad+ 96), o7, J, I, L));
308
+ delta3 = vxor(delta3, aes4pre(load(ad+112), o8, J, I, L));
309
+ adbytes-=8*16; ad+=8*16;
310
+ }
311
+ if (adbytes >= 4*16) {
312
+ o1 = vxor(offset,L);
313
+ o2 = vxor(offset,L2);
314
+ o3 = vxor(o1,L2);
315
+ o4 = offset = vxor(offset,L4);
316
+ delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L));
317
+ delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
318
+ delta3 = vxor(delta3, aes4pre(load(ad+ 32), o3, J, I, L));
319
+ delta3 = vxor(delta3, aes4pre(load(ad+ 48), o4, J, I, L));
320
+ adbytes-=4*16; ad+=4*16;
321
+ }
322
+ if (adbytes >= 2*16) {
323
+ o1 = vxor(offset,L);
324
+ o2 = offset = vxor(offset,L2);
325
+ delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L));
326
+ delta3 = vxor(delta3, aes4pre(load(ad+ 16), o2, J, I, L));
327
+ adbytes-=2*16; ad+=2*16;
328
+ }
329
+ if (adbytes >= 1*16) {
330
+ o1 = vxor(offset,L);
331
+ delta3 = vxor(delta3, aes4pre(load(ad+ 0), o1, J, I, L));
332
+ adbytes-=1*16; ad+=1*16;
333
+ }
334
+ if (adbytes) {
335
+ tmp = vxor3(J5, I, one_zero_pad(load(ad),16-adbytes));
336
+ delta3 = aes4(vxor(J5, I), one_zero_pad(load(ad),16-adbytes),
337
+ J, I, L, delta3);
338
+ }
339
+ ctx->delta3_cache = delta3;
340
+ }
341
+ }
342
+ return vxor(sum,ctx->delta3_cache);
343
+ }
344
+
345
+ /* ------------------------------------------------------------------------- */
346
+
347
+ static block pass_one(aez_ctx_t *ctx, block *src, unsigned bytes, block *dst) {
348
+ block o1, o2, o3, o4, o5, o6, o7, o8, offset, tmp, sum=zero;
349
+ block I=ctx->I[0], Ifordoubling = ctx->I[1];
350
+ block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2];
351
+ block J=ctx->J[0];
352
+ offset = vxor(J, bswap16(Ifordoubling));
353
+ while (bytes >= 16*16) {
354
+ o1 = vxor(offset,L);
355
+ o2 = vxor(offset,L2);
356
+ o3 = vxor(o1,L2);
357
+ o4 = vxor(offset,L4);
358
+ o5 = vxor(o1,L4);
359
+ o6 = vxor(o2,L4);
360
+ o7 = vxor(o3,L4);
361
+ o8 = offset;
362
+ Ifordoubling = double_block(Ifordoubling);
363
+ offset = vxor(J,bswap16(Ifordoubling));
364
+ store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0)));
365
+ store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2)));
366
+ store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4)));
367
+ store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6)));
368
+ store(dst+ 8, aes4(load(src + 9),o5, J, I, L, load(src+ 8)));
369
+ store(dst+10, aes4(load(src +11),o6, J, I, L, load(src+10)));
370
+ store(dst+12, aes4(load(src +13),o7, J, I, L, load(src+12)));
371
+ store(dst+14, aes4(load(src +15),o8, J, I, L, load(src+14)));
372
+ tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));store(dst+ 1,tmp);
373
+ sum=vxor(sum,tmp);
374
+ tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3));
375
+ store(dst+ 3,tmp);sum=vxor(sum,tmp);
376
+ tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5));
377
+ store(dst+ 5,tmp);sum=vxor(sum,tmp);
378
+ tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7));
379
+ store(dst+ 7,tmp);sum=vxor(sum,tmp);
380
+ tmp=aes4(I,load(dst+ 8),J,I,L,load(src+ 9));
381
+ store(dst+ 9,tmp);sum=vxor(sum,tmp);
382
+ tmp=aes4(I,load(dst+10),J,I,L,load(src+11));
383
+ store(dst+11,tmp);sum=vxor(sum,tmp);
384
+ tmp=aes4(I,load(dst+12),J,I,L,load(src+13));
385
+ store(dst+13,tmp);sum=vxor(sum,tmp);
386
+ tmp=aes4(I,load(dst+14),J,I,L,load(src+15));
387
+ store(dst+15,tmp);sum=vxor(sum,tmp);
388
+ bytes -= 16*16; dst += 16; src += 16;
389
+ }
390
+ if (bytes >= 8*16) {
391
+ o1 = vxor(offset,L);
392
+ o2 = vxor(offset,L2);
393
+ o3 = vxor(o1,L2);
394
+ o4 = offset = vxor(offset,L4);
395
+ store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0)));
396
+ store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2)));
397
+ store(dst+ 4, aes4(load(src + 5),o3, J, I, L, load(src+ 4)));
398
+ store(dst+ 6, aes4(load(src + 7),o4, J, I, L, load(src+ 6)));
399
+ tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));
400
+ store(dst+ 1,tmp);sum=vxor(sum,tmp);
401
+ tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3));
402
+ store(dst+ 3,tmp);sum=vxor(sum,tmp);
403
+ tmp=aes4(I,load(dst+ 4),J,I,L,load(src+ 5));
404
+ store(dst+ 5,tmp);sum=vxor(sum,tmp);
405
+ tmp=aes4(I,load(dst+ 6),J,I,L,load(src+ 7));
406
+ store(dst+ 7,tmp);sum=vxor(sum,tmp);
407
+ bytes -= 8*16; dst += 8; src += 8;
408
+ }
409
+ if (bytes >= 4*16) {
410
+ o1 = vxor(offset,L);
411
+ o2 = offset = vxor(offset,L2);
412
+ store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0)));
413
+ store(dst+ 2, aes4(load(src + 3),o2, J, I, L, load(src+ 2)));
414
+ tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));
415
+ store(dst+ 1,tmp);sum=vxor(sum,tmp);
416
+ tmp=aes4(I,load(dst+ 2),J,I,L,load(src+ 3));
417
+ store(dst+ 3,tmp);sum=vxor(sum,tmp);
418
+ bytes -= 4*16; dst += 4; src += 4;
419
+ }
420
+ if (bytes) {
421
+ o1 = vxor(offset,L);
422
+ store(dst+ 0, aes4(load(src + 1),o1, J, I, L, load(src+ 0)));
423
+ tmp=aes4(I,load(dst+ 0),J,I,L,load(src+ 1));
424
+ store(dst+ 1,tmp);sum=vxor(sum,tmp);
425
+ }
426
+ return sum;
427
+ }
428
+
429
+ /* ------------------------------------------------------------------------- */
430
+
431
+ static block pass_two(aez_ctx_t *ctx, block s, unsigned bytes, block *dst) {
432
+ block o1, o2, o3, o4, o5, o6, o7, o8, sum=zero, offset, fs[8], tmp[8];
433
+ block I=ctx->I[0], Ifordoubling = ctx->I[1];
434
+ block L=ctx->L[0], L2=ctx->L[1],L4=ctx->L[2];
435
+ block J=ctx->J[0], J2=ctx->J[1], J3=vxor(J,J2);
436
+ offset = vxor(J2, bswap16(Ifordoubling));
437
+ while (bytes >= 16*16) {
438
+ o1 = vxor(offset,L);
439
+ o2 = vxor(offset,L2);
440
+ o3 = vxor(o1,L2);
441
+ o4 = vxor(offset,L4);
442
+ o5 = vxor(o1,L4);
443
+ o6 = vxor(o2,L4);
444
+ o7 = vxor(o3,L4);
445
+ o8 = offset;
446
+ Ifordoubling = double_block(Ifordoubling);
447
+ offset = vxor(J2, bswap16(Ifordoubling));
448
+ fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L);
449
+ fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L);
450
+ fs[4] = aes4pre(s,o5,J,I,L); fs[5] = aes4pre(s,o6,J,I,L);
451
+ fs[6] = aes4pre(s,o7,J,I,L); fs[7] = aes4pre(s,o8,J,I,L);
452
+ o1 = vxor(J3,o1); o2 = vxor(J3,o2);
453
+ o3 = vxor(J3,o3); o4 = vxor(J3,o4);
454
+ o5 = vxor(J3,o5); o6 = vxor(J3,o6);
455
+ o7 = vxor(J3,o7); o8 = vxor(J3,o8);
456
+ tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]);
457
+ store(dst+ 0,vxor(load(dst+ 1),fs[0]));
458
+ tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]);
459
+ store(dst+ 2,vxor(load(dst+ 3),fs[1]));
460
+ tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]);
461
+ store(dst+ 4,vxor(load(dst+ 5),fs[2]));
462
+ tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]);
463
+ store(dst+ 6,vxor(load(dst+ 7),fs[3]));
464
+ tmp[4] = vxor(load(dst+ 8),fs[4]); sum = vxor(sum,tmp[4]);
465
+ store(dst+ 8,vxor(load(dst+ 9),fs[4]));
466
+ tmp[5] = vxor(load(dst+10),fs[5]); sum = vxor(sum,tmp[5]);
467
+ store(dst+10,vxor(load(dst+11),fs[5]));
468
+ tmp[6] = vxor(load(dst+12),fs[6]); sum = vxor(sum,tmp[6]);
469
+ store(dst+12,vxor(load(dst+13),fs[6]));
470
+ tmp[7] = vxor(load(dst+14),fs[7]); sum = vxor(sum,tmp[7]);
471
+ store(dst+14,vxor(load(dst+15),fs[7]));
472
+ store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0]));
473
+ store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1]));
474
+ store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2]));
475
+ store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3]));
476
+ store(dst+ 9, aes4(I,load(dst+ 8), J, I, L, tmp[4]));
477
+ store(dst+11, aes4(I,load(dst+10), J, I, L, tmp[5]));
478
+ store(dst+13, aes4(I,load(dst+12), J, I, L, tmp[6]));
479
+ store(dst+15, aes4(I,load(dst+14), J, I, L, tmp[7]));
480
+ store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0)));
481
+ store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2)));
482
+ store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4)));
483
+ store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6)));
484
+ store(dst+ 8, aes4(load(dst+ 9),o5, J, I, L, load(dst+ 8)));
485
+ store(dst+10, aes4(load(dst+11),o6, J, I, L, load(dst+10)));
486
+ store(dst+12, aes4(load(dst+13),o7, J, I, L, load(dst+12)));
487
+ store(dst+14, aes4(load(dst+15),o8, J, I, L, load(dst+14)));
488
+ bytes -= 16*16; dst += 16;
489
+ }
490
+ if (bytes >= 8*16) {
491
+ o1 = vxor(offset,L);
492
+ o2 = vxor(offset,L2);
493
+ o3 = vxor(o1,L2);
494
+ o4 = offset = vxor(offset,L4);
495
+ fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L);
496
+ fs[2] = aes4pre(s,o3,J,I,L); fs[3] = aes4pre(s,o4,J,I,L);
497
+ o1 = vxor(J3,o1); o2 = vxor(J3,o2);
498
+ o3 = vxor(J3,o3); o4 = vxor(J3,o4);
499
+ tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]);
500
+ store(dst+ 0,vxor(load(dst+ 1),fs[0]));
501
+ tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]);
502
+ store(dst+ 2,vxor(load(dst+ 3),fs[1]));
503
+ tmp[2] = vxor(load(dst+ 4),fs[2]); sum = vxor(sum,tmp[2]);
504
+ store(dst+ 4,vxor(load(dst+ 5),fs[2]));
505
+ tmp[3] = vxor(load(dst+ 6),fs[3]); sum = vxor(sum,tmp[3]);
506
+ store(dst+ 6,vxor(load(dst+ 7),fs[3]));
507
+ store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0]));
508
+ store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1]));
509
+ store(dst+ 5, aes4(I,load(dst+ 4), J, I, L, tmp[2]));
510
+ store(dst+ 7, aes4(I,load(dst+ 6), J, I, L, tmp[3]));
511
+ store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0)));
512
+ store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2)));
513
+ store(dst+ 4, aes4(load(dst+ 5),o3, J, I, L, load(dst+ 4)));
514
+ store(dst+ 6, aes4(load(dst+ 7),o4, J, I, L, load(dst+ 6)));
515
+ bytes -= 8*16; dst += 8;
516
+ }
517
+ if (bytes >= 4*16) {
518
+ o1 = vxor(offset,L);
519
+ o2 = offset = vxor(offset,L2);
520
+ fs[0] = aes4pre(s,o1,J,I,L); fs[1] = aes4pre(s,o2,J,I,L);
521
+ o1 = vxor(J3,o1); o2 = vxor(J3,o2);
522
+ tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]);
523
+ store(dst+ 0,vxor(load(dst+ 1),fs[0]));
524
+ tmp[1] = vxor(load(dst+ 2),fs[1]); sum = vxor(sum,tmp[1]);
525
+ store(dst+ 2,vxor(load(dst+ 3),fs[1]));
526
+ store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0]));
527
+ store(dst+ 3, aes4(I,load(dst+ 2), J, I, L, tmp[1]));
528
+ store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0)));
529
+ store(dst+ 2, aes4(load(dst+ 3),o2, J, I, L, load(dst+ 2)));
530
+ bytes -= 4*16; dst += 4;
531
+ }
532
+ if (bytes) {
533
+ o1 = vxor(offset,L);
534
+ fs[0] = aes4pre(s,o1,J,I,L);
535
+ o1 = vxor(J3,o1);
536
+ tmp[0] = vxor(load(dst+ 0),fs[0]); sum = vxor(sum,tmp[0]);
537
+ store(dst+ 0,vxor(load(dst+ 1),fs[0]));
538
+ store(dst+ 1, aes4(I,load(dst+ 0), J, I, L, tmp[0]));
539
+ store(dst+ 0, aes4(load(dst+ 1),o1, J, I, L, load(dst+ 0)));
540
+ }
541
+ return sum;
542
+ }
543
+
544
+ /* ------------------------------------------------------------------------- */
545
+
546
+ static int cipher_aez_core(aez_ctx_t *ctx, block t, int d, char *src,
547
+ unsigned bytes, unsigned abytes, char *dst) {
548
+ block s, x, y, frag0, frag1, final0, final1;
549
+ block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0];
550
+ block L4=ctx->L[2], I2 = bswap16(ctx->I[1]);
551
+ unsigned i, frag_bytes, initial_bytes;
552
+
553
+ if (!d) bytes += abytes;
554
+ frag_bytes = bytes % 32;
555
+ initial_bytes = bytes - frag_bytes - 32;
556
+
557
+ /* Compute x and store intermediate results */
558
+ x = pass_one(ctx, (block*)src, initial_bytes, (block*)dst);
559
+ if (frag_bytes >= 16) {
560
+ frag0 = load(src + initial_bytes);
561
+ frag1 = one_zero_pad(load(src + initial_bytes + 16), 32-frag_bytes);
562
+ x = aes4(frag0, vxor(L4, I2), J, I, L, x);
563
+ x = vxor(x, aes4pre(frag1, vxor3(I2, L4, L), J, I, L));
564
+ } else if (frag_bytes) {
565
+ frag0 = one_zero_pad(load(src + initial_bytes), 16-frag_bytes);
566
+ x = aes4(frag0, vxor(L4, I2), J, I, L, x);
567
+ }
568
+
569
+ /* Calculate s and final block values (y xor'd to final1 later) */
570
+ final0 = vxor3(loadu(src + (bytes - 32)), x, t);
571
+ if (d || !abytes) final1 = loadu(src+(bytes-32)+16);
572
+ else final1 = zero_pad(loadu(src+(bytes-32)+16), abytes);
573
+ final0 = aes4(final1, vxor(I2, ctx->L[d]), J, I, L, final0);
574
+ final1 = vxor(final1, aes((block*)ctx, final0, ctx->L[d]));
575
+ s = vxor(final0, final1);
576
+ final0 = vxor(final0, aes((block*)ctx, final1, ctx->L[d^1]));
577
+ /* Decryption: final0 should hold abytes zero bytes. If not, failure */
578
+ if (d && !is_zero(vandnot(loadu(pad+abytes),final0))) return -1;
579
+ final1 = aes4(final0, vxor(I2, ctx->L[d^1]), J, I, L, final1);
580
+
581
+ /* Compute y and store final results */
582
+ y = pass_two(ctx, s, initial_bytes, (block*)dst);
583
+ if (frag_bytes >= 16) {
584
+ frag0 = vxor(frag0, aes((block*)ctx, s, L4));
585
+ frag1 = vxor(frag1, aes((block*)ctx, s, vxor(L4, L)));
586
+ frag1 = one_zero_pad(frag1, 32-frag_bytes);
587
+ y = aes4(frag0, vxor(I2, L4), J, I, L, y);
588
+ y = vxor(y, aes4pre(frag1, vxor3(I2, L4, L), J, I, L));
589
+ store(dst + initial_bytes, frag0);
590
+ store(dst + initial_bytes + 16, frag1);
591
+ } else if (frag_bytes) {
592
+ frag0 = vxor(frag0, aes((block*)ctx, s, L4));
593
+ frag0 = one_zero_pad(frag0, 16-frag_bytes);
594
+ y = aes4(frag0, vxor(I2, L4), J, I, L, y);
595
+ store(dst + initial_bytes, frag0);
596
+ }
597
+
598
+ storeu(dst + (bytes - 32), vxor3(final1, y, t));
599
+ if (!d || !abytes)
600
+ storeu(dst + (bytes - 32) + 16, final0);
601
+ else {
602
+ for (i=0; i<16-abytes; i++)
603
+ ((char*)dst + (bytes - 16))[i] = ((char*)&final0)[i];
604
+ }
605
+ return 0;
606
+ }
607
+
608
+ /* ------------------------------------------------------------------------- */
609
+
610
+ static int cipher_aez_tiny(aez_ctx_t *ctx, block t, int d, char *src,
611
+ unsigned bytes, unsigned abytes, char *dst) {
612
+ block l, r, tmp, one, rcon, buf[2], mask_10, mask_ff;
613
+ block I=ctx->I[0], L=ctx->L[0], J=ctx->J[0], t_orig = t;
614
+ block L2=ctx->L[1], L4=ctx->L[2], I2 = bswap16(ctx->I[1]);
615
+ unsigned rnds, i;
616
+
617
+ /* load src into buf, zero pad, update bytes for abytes */
618
+ if (bytes >= 16) {
619
+ buf[0] = load(src);
620
+ buf[1] = zero_pad(load_partial(src+16,bytes-16),32-bytes);
621
+ } else {
622
+ buf[0] = zero_pad(load_partial(src,bytes),16-bytes);
623
+ buf[1] = zero;
624
+ }
625
+ if (!d) bytes += abytes;
626
+
627
+ /* load l/r, create 10* padding masks, shift r 4 bits if odd length */
628
+ l = buf[0];
629
+ r = loadu((char*)buf+bytes/2);
630
+ mask_ff = loadu(pad+16-bytes/2);
631
+ mask_10 = loadu(pad+32-bytes/2);
632
+ if (bytes&1) { /* Odd length. Deal with nibbles. */
633
+ mask_10 = sll4(mask_10);
634
+ ((char*)&mask_ff)[bytes/2] = (char)0xf0;
635
+ r = bswap16(r);
636
+ r = srl4(r);
637
+ r = bswap16(r);
638
+ }
639
+ r = vor(vand(r, mask_ff), mask_10);
640
+
641
+ /* Add tweak offset into t, and determine the number of rounds */
642
+ if (bytes >= 16) {
643
+ t = vxor4(t, I2, L2, L4); /* (0,6) offset */
644
+ rnds = 8;
645
+ } else {
646
+ t = vxor(vxor4(t, I2, L2, L4), L); /* (0,7) offset */
647
+ if (bytes>=3) rnds = 10; else if (bytes==2) rnds = 16; else rnds = 24;
648
+ }
649
+
650
+ if (!d) {
651
+ one = zero_set_byte(1,15);
652
+ rcon = zero;
653
+ } else {
654
+ one = zero_set_byte(-1,15);
655
+ rcon = zero_set_byte((char)(rnds-1),15);
656
+ }
657
+
658
+ if ((d) && (bytes < 16)) {
659
+ block offset = vxor3(I2, L, L2);
660
+ tmp = vor(l, loadu(pad+32));
661
+ tmp = aes4pre(t_orig, vxor(tmp,offset), J, I, L);
662
+ tmp = vand(tmp, loadu(pad+32));
663
+ l = vxor(l, tmp);
664
+ }
665
+
666
+ /* Feistel */
667
+ for (i=0; i<rnds; i+=2) {
668
+ l = vor(vand(aes4(t,vxor(r,rcon), J, I, L, l), mask_ff), mask_10);
669
+ rcon = vadd(rcon,one);
670
+ r = vor(vand(aes4(t,vxor(l,rcon), J, I, L, r), mask_ff), mask_10);
671
+ rcon = vadd(rcon,one);
672
+ }
673
+ buf[0] = r;
674
+ if (bytes&1) {
675
+ l = bswap16(l);
676
+ l = sll4(l);
677
+ l = bswap16(l);
678
+ r = vand(loadu((char*)buf+bytes/2), zero_set_byte((char)0xf0,0));
679
+ l = vor(l, r);
680
+ }
681
+ storeu((char*)buf+bytes/2, l);
682
+ if (d) {
683
+ bytes -= abytes;
684
+ if (abytes==16) tmp = loadu((char*)buf+bytes);
685
+ else {
686
+ tmp = zero;
687
+ for (i=0; i<abytes; i++) ((char*)&tmp)[i] = ((char*)buf+bytes)[i];
688
+ }
689
+ if (!is_zero(tmp)) return -1;
690
+ } else if (bytes < 16) {
691
+ block offset = vxor3(I2, L, L2);
692
+ tmp = vor(zero_pad(buf[0], 16-bytes), loadu(pad+32));
693
+ tmp = aes4pre(t_orig,vxor(tmp,offset), J, I, L);
694
+ buf[0] = vxor(buf[0], vand(tmp, loadu(pad+32)));
695
+ }
696
+ for (i=0; i<bytes; i++) dst[i] = ((char*)buf)[i];
697
+ return 0;
698
+ }
699
+
700
+ /* ------------------------------------------------------------------------- */
701
+
702
+ void aez_encrypt(aez_ctx_t *ctx, char *n, unsigned nbytes,
703
+ char *ad, unsigned adbytes, unsigned abytes,
704
+ char *src, unsigned bytes, char *dst) {
705
+
706
+ block t = aez_hash(ctx, n, nbytes, ad, adbytes, abytes);
707
+ if (bytes==0) {
708
+ unsigned i;
709
+ t = aes((block*)ctx, t, vxor(ctx->L[0], ctx->L[1]));
710
+ for (i=0; i<abytes; i++) dst[i] = ((char*)&t)[i];
711
+ } else if (bytes+abytes < 32)
712
+ cipher_aez_tiny(ctx, t, 0, src, bytes, abytes, dst);
713
+ else
714
+ cipher_aez_core(ctx, t, 0, src, bytes, abytes, dst);
715
+ }
716
+
717
+ /* ------------------------------------------------------------------------- */
718
+
719
+ int aez_decrypt(aez_ctx_t *ctx, char *n, unsigned nbytes,
720
+ char *ad, unsigned adbytes, unsigned abytes,
721
+ char *src, unsigned bytes, char *dst) {
722
+
723
+ block t;
724
+ if (bytes < abytes) return -1;
725
+ t = aez_hash(ctx, n, nbytes, ad, adbytes, abytes);
726
+ if (bytes==abytes) {
727
+ block claimed = zero_pad(load_partial(src,abytes), 16-abytes);
728
+ t = zero_pad(aes((block*)ctx, t, vxor(ctx->L[0], ctx->L[1])), 16-abytes);
729
+ return is_zero(vandnot(t, claimed)) - 1; /* is_zero return 0 or 1 */
730
+ } else if (bytes < 32) {
731
+ return cipher_aez_tiny(ctx, t, 1, src, bytes, abytes, dst);
732
+ } else {
733
+ return cipher_aez_core(ctx, t, 1, src, bytes, abytes, dst);
734
+ }
735
+ }
736
+
737
+ /* ------------------------------------------------------------------------- */
738
+ /* Reference Blake2b code, here for convenience, and not for speed. */
739
+ /* Dowloaded Sep 2015 from https://github.com/mjosaarinen/blake2_mjosref */
740
+
741
+ #include <stdint.h>
742
+
743
+ typedef struct {
744
+ uint8_t b[128];
745
+ uint64_t h[8];
746
+ uint64_t t[2];
747
+ size_t c;
748
+ size_t outlen;
749
+ } blake2b_ctx;
750
+
751
+ #ifndef ROTR64
752
+ #define ROTR64(x, y) (((x) >> (y)) ^ ((x) << (64 - (y))))
753
+ #endif
754
+
755
+ #define B2B_GET64(p) \
756
+ (((uint64_t) ((uint8_t *) (p))[0]) ^ \
757
+ (((uint64_t) ((uint8_t *) (p))[1]) << 8) ^ \
758
+ (((uint64_t) ((uint8_t *) (p))[2]) << 16) ^ \
759
+ (((uint64_t) ((uint8_t *) (p))[3]) << 24) ^ \
760
+ (((uint64_t) ((uint8_t *) (p))[4]) << 32) ^ \
761
+ (((uint64_t) ((uint8_t *) (p))[5]) << 40) ^ \
762
+ (((uint64_t) ((uint8_t *) (p))[6]) << 48) ^ \
763
+ (((uint64_t) ((uint8_t *) (p))[7]) << 56))
764
+
765
+ #define B2B_G(a, b, c, d, x, y) { \
766
+ v[a] = v[a] + v[b] + x; \
767
+ v[d] = ROTR64(v[d] ^ v[a], 32); \
768
+ v[c] = v[c] + v[d]; \
769
+ v[b] = ROTR64(v[b] ^ v[c], 24); \
770
+ v[a] = v[a] + v[b] + y; \
771
+ v[d] = ROTR64(v[d] ^ v[a], 16); \
772
+ v[c] = v[c] + v[d]; \
773
+ v[b] = ROTR64(v[b] ^ v[c], 63); }
774
+
775
+ static const uint64_t blake2b_iv[8] = {
776
+ 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
777
+ 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
778
+ 0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
779
+ 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
780
+ };
781
+
782
+ static void blake2b_compress(blake2b_ctx *ctx, int last)
783
+ {
784
+ const uint8_t sigma[12][16] = {
785
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
786
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
787
+ { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
788
+ { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
789
+ { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
790
+ { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
791
+ { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
792
+ { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
793
+ { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
794
+ { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
795
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
796
+ { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
797
+ };
798
+ int i;
799
+ uint64_t v[16], m[16];
800
+
801
+ for (i = 0; i < 8; i++) {
802
+ v[i] = ctx->h[i];
803
+ v[i + 8] = blake2b_iv[i];
804
+ }
805
+
806
+ v[12] ^= ctx->t[0];
807
+ v[13] ^= ctx->t[1];
808
+ if (last)
809
+ v[14] = ~v[14];
810
+
811
+ for (i = 0; i < 16; i++)
812
+ m[i] = B2B_GET64(&ctx->b[8 * i]);
813
+
814
+ for (i = 0; i < 12; i++) {
815
+ B2B_G( 0, 4, 8, 12, m[sigma[i][ 0]], m[sigma[i][ 1]]);
816
+ B2B_G( 1, 5, 9, 13, m[sigma[i][ 2]], m[sigma[i][ 3]]);
817
+ B2B_G( 2, 6, 10, 14, m[sigma[i][ 4]], m[sigma[i][ 5]]);
818
+ B2B_G( 3, 7, 11, 15, m[sigma[i][ 6]], m[sigma[i][ 7]]);
819
+ B2B_G( 0, 5, 10, 15, m[sigma[i][ 8]], m[sigma[i][ 9]]);
820
+ B2B_G( 1, 6, 11, 12, m[sigma[i][10]], m[sigma[i][11]]);
821
+ B2B_G( 2, 7, 8, 13, m[sigma[i][12]], m[sigma[i][13]]);
822
+ B2B_G( 3, 4, 9, 14, m[sigma[i][14]], m[sigma[i][15]]);
823
+ }
824
+
825
+ for( i = 0; i < 8; ++i )
826
+ ctx->h[i] ^= v[i] ^ v[i + 8];
827
+ }
828
+
829
+ static void blake2b_update(blake2b_ctx *ctx,
830
+ const void *in, size_t inlen)
831
+ {
832
+ size_t i;
833
+
834
+ for (i = 0; i < inlen; i++) {
835
+ if (ctx->c == 128) {
836
+ ctx->t[0] += ctx->c;
837
+ if (ctx->t[0] < ctx->c)
838
+ ctx->t[1]++;
839
+ blake2b_compress(ctx, 0);
840
+ ctx->c = 0;
841
+ }
842
+ ctx->b[ctx->c++] = ((const uint8_t *) in)[i];
843
+ }
844
+ }
845
+
846
+ static void blake2b_final(blake2b_ctx *ctx, void *out)
847
+ {
848
+ size_t i;
849
+
850
+ ctx->t[0] += ctx->c;
851
+ if (ctx->t[0] < ctx->c)
852
+ ctx->t[1]++;
853
+
854
+ while (ctx->c < 128)
855
+ ctx->b[ctx->c++] = 0;
856
+ blake2b_compress(ctx, 1);
857
+
858
+ for (i = 0; i < ctx->outlen; i++) {
859
+ ((uint8_t *) out)[i] =
860
+ (ctx->h[i >> 3] >> (8 * (i & 7))) & 0xFF;
861
+ }
862
+ }
863
+
864
+ static int blake2b_init(blake2b_ctx *ctx, size_t outlen,
865
+ const void *key, size_t keylen)
866
+ {
867
+ size_t i;
868
+
869
+ if (outlen == 0 || outlen > 64 || keylen > 64)
870
+ return -1;
871
+
872
+ for (i = 0; i < 8; i++)
873
+ ctx->h[i] = blake2b_iv[i];
874
+ ctx->h[0] ^= 0x01010000 ^ (keylen << 8) ^ outlen;
875
+
876
+ ctx->t[0] = 0;
877
+ ctx->t[1] = 0;
878
+ ctx->c = 0;
879
+ ctx->outlen = outlen;
880
+
881
+ for (i = keylen; i < 128; i++)
882
+ ctx->b[i] = 0;
883
+ if (keylen > 0) {
884
+ blake2b_update(ctx, key, keylen);
885
+ ctx->c = 128;
886
+ }
887
+
888
+ return 0;
889
+ }
890
+
891
+ static int blake2b(void *out, size_t outlen,
892
+ const void *key, size_t keylen,
893
+ const void *in, size_t inlen)
894
+ {
895
+ blake2b_ctx ctx;
896
+
897
+ if (blake2b_init(&ctx, outlen, key, keylen))
898
+ return -1;
899
+ blake2b_update(&ctx, in, inlen);
900
+ blake2b_final(&ctx, out);
901
+
902
+ return 0;
903
+ }
904
+
905
+ /* ------------------------------------------------------------------------- */
906
+ /* aez mapping for CAESAR competition */
907
+
908
+ int crypto_aead_encrypt(
909
+ unsigned char *c,unsigned long long *clen,
910
+ const unsigned char *m,unsigned long long mlen,
911
+ const unsigned char *ad,unsigned long long adlen,
912
+ const unsigned char *nsec,
913
+ const unsigned char *npub,
914
+ const unsigned char *k
915
+ )
916
+ {
917
+ aez_ctx_t ctx;
918
+ (void)nsec;
919
+ if (clen) *clen = mlen+16;
920
+ aez_setup((unsigned char *)k, 48, &ctx);
921
+ aez_encrypt(&ctx, (char *)npub, 12,
922
+ (char *)ad, (unsigned)adlen, 16,
923
+ (char *)m, (unsigned)mlen, (char *)c);
924
+ return 0;
925
+ }
926
+
927
+ int crypto_aead_decrypt(
928
+ unsigned char *m,unsigned long long *mlen,
929
+ unsigned char *nsec,
930
+ const unsigned char *c,unsigned long long clen,
931
+ const unsigned char *ad,unsigned long long adlen,
932
+ const unsigned char *npub,
933
+ const unsigned char *k
934
+ )
935
+ {
936
+ aez_ctx_t ctx;
937
+ (void)nsec;
938
+ if (mlen) *mlen = clen-16;
939
+ aez_setup((unsigned char *)k, 48, &ctx);
940
+ return aez_decrypt(&ctx, (char *)npub, 12,
941
+ (char *)ad, (unsigned)adlen, 16,
942
+ (char *)c, (unsigned)clen, (char *)m);
943
+ }
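
The README states that the gem calls this C code via FFI, but the Ruby binding itself (lib/aez.rb) is not part of this diff. For orientation only, a hypothetical sketch of how the exported `aez_setup`, `aez_encrypt`, and `aez_decrypt` functions could be attached with the `ffi` gem is shown below; the module name, library path, and context-buffer size are assumptions read off the C signatures above, not the gem's actual code.

```ruby
require 'ffi'

# Hypothetical sketch: module name, .so path, and AEZ_CTX_SIZE are assumptions,
# mirroring the C signatures of aez_setup / aez_encrypt / aez_decrypt above.
module AEZExt
  extend FFI::Library
  ffi_lib File.expand_path('lib/aez/aezv5.so', __dir__) # assumed: rake-compiler places the extension in lib/aez

  AEZ_CTX_SIZE = 144 # aez_ctx_t: 9 x 16-byte blocks (I[2], J[3], L[3], delta3_cache)

  attach_function :aez_setup,   [:pointer, :uint, :pointer], :void
  attach_function :aez_encrypt, [:pointer, :pointer, :uint, :pointer, :uint, :uint,
                                 :pointer, :uint, :pointer], :void
  attach_function :aez_decrypt, [:pointer, :pointer, :uint, :pointer, :uint, :uint,
                                 :pointer, :uint, :pointer], :int
end
```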