ob64 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ob64/version.rb +1 -1
- data/ob64.gemspec +2 -0
- data/vendor/libbase64/.gitignore +12 -0
- data/vendor/libbase64/.travis.yml +71 -0
- data/vendor/libbase64/CMakeLists.txt +264 -0
- data/vendor/libbase64/LICENSE +28 -0
- data/vendor/libbase64/Makefile +93 -0
- data/vendor/libbase64/README.md +474 -0
- data/vendor/libbase64/base64-benchmarks.png +0 -0
- data/vendor/libbase64/bin/base64.c +132 -0
- data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
- data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
- data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
- data/vendor/libbase64/cmake/config.h.in +25 -0
- data/vendor/libbase64/cmake/test-arch.c +35 -0
- data/vendor/libbase64/include/libbase64.h +145 -0
- data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
- data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
- data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
- data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
- data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
- data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
- data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
- data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
- data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
- data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
- data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
- data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
- data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
- data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
- data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
- data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
- data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
- data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
- data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
- data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
- data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
- data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
- data/vendor/libbase64/lib/codec_choose.c +281 -0
- data/vendor/libbase64/lib/codecs.h +65 -0
- data/vendor/libbase64/lib/env.h +67 -0
- data/vendor/libbase64/lib/exports.txt +7 -0
- data/vendor/libbase64/lib/lib.c +164 -0
- data/vendor/libbase64/lib/lib_openmp.c +149 -0
- data/vendor/libbase64/lib/tables/.gitignore +1 -0
- data/vendor/libbase64/lib/tables/Makefile +17 -0
- data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
- data/vendor/libbase64/lib/tables/table_generator.c +184 -0
- data/vendor/libbase64/lib/tables/tables.c +40 -0
- data/vendor/libbase64/lib/tables/tables.h +23 -0
- metadata +64 -4
@@ -0,0 +1,25 @@
|
|
1
|
+
#ifndef BASE64_CONFIG_H
|
2
|
+
#define BASE64_CONFIG_H
|
3
|
+
|
4
|
+
#cmakedefine01 BASE64_WITH_SSSE3
|
5
|
+
#define HAVE_SSSE3 BASE64_WITH_SSSE3
|
6
|
+
|
7
|
+
#cmakedefine01 BASE64_WITH_SSE41
|
8
|
+
#define HAVE_SSE41 BASE64_WITH_SSE41
|
9
|
+
|
10
|
+
#cmakedefine01 BASE64_WITH_SSE42
|
11
|
+
#define HAVE_SSE42 BASE64_WITH_SSE42
|
12
|
+
|
13
|
+
#cmakedefine01 BASE64_WITH_AVX
|
14
|
+
#define HAVE_AVX BASE64_WITH_AVX
|
15
|
+
|
16
|
+
#cmakedefine01 BASE64_WITH_AVX2
|
17
|
+
#define HAVE_AVX2 BASE64_WITH_AVX2
|
18
|
+
|
19
|
+
#cmakedefine01 BASE64_WITH_NEON32
|
20
|
+
#define HAVE_NEON32 BASE64_WITH_NEON32
|
21
|
+
|
22
|
+
#cmakedefine01 BASE64_WITH_NEON64
|
23
|
+
#define HAVE_NEON64 BASE64_WITH_NEON64
|
24
|
+
|
25
|
+
#endif // BASE64_CONFIG_H
|
@@ -0,0 +1,35 @@
|
|
1
|
+
// Written in 2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
|
2
|
+
//
|
3
|
+
// To the extent possible under law, the author(s) have dedicated all
|
4
|
+
// copyright and related and neighboring rights to this software to the
|
5
|
+
// public domain worldwide. This software is distributed without any warranty.
|
6
|
+
//
|
7
|
+
// You should have received a copy of the CC0 Public Domain Dedication
|
8
|
+
// along with this software. If not, see
|
9
|
+
//
|
10
|
+
// http://creativecommons.org/publicdomain/zero/1.0/
|
11
|
+
//
|
12
|
+
////////////////////////////////////////////////////////////////////////////////
|
13
|
+
|
14
|
+
// ARM 64-Bit
|
15
|
+
#if defined(__aarch64__)
|
16
|
+
#error ##arch=arm64##
|
17
|
+
|
18
|
+
// ARM 32-Bit
|
19
|
+
#elif defined(__arm__) \
|
20
|
+
|| defined(_M_ARM)
|
21
|
+
#error ##arch=arm##
|
22
|
+
|
23
|
+
// x86 64-Bit
|
24
|
+
#elif defined(__x86_64__) \
|
25
|
+
|| defined(_M_X64)
|
26
|
+
#error ##arch=x64##
|
27
|
+
|
28
|
+
// x86 32-Bit
|
29
|
+
#elif defined(__i386__) \
|
30
|
+
|| defined(_M_IX86)
|
31
|
+
#error ##arch=x86##
|
32
|
+
|
33
|
+
#else
|
34
|
+
#error ##arch=unknown##
|
35
|
+
#endif
|
@@ -0,0 +1,145 @@
|
|
1
|
+
#ifndef LIBBASE64_H
|
2
|
+
#define LIBBASE64_H
|
3
|
+
|
4
|
+
#include <stddef.h> /* size_t */
|
5
|
+
|
6
|
+
|
7
|
+
#if defined(_WIN32) || defined(__CYGWIN__)
|
8
|
+
#define BASE64_SYMBOL_IMPORT __declspec(dllimport)
|
9
|
+
#define BASE64_SYMBOL_EXPORT __declspec(dllexport)
|
10
|
+
#define BASE64_SYMBOL_PRIVATE
|
11
|
+
|
12
|
+
#elif __GNUC__ >= 4
|
13
|
+
#define BASE64_SYMBOL_IMPORT __attribute__ ((visibility ("default")))
|
14
|
+
#define BASE64_SYMBOL_EXPORT __attribute__ ((visibility ("default")))
|
15
|
+
#define BASE64_SYMBOL_PRIVATE __attribute__ ((visibility ("hidden")))
|
16
|
+
|
17
|
+
#else
|
18
|
+
#define BASE64_SYMBOL_IMPORT
|
19
|
+
#define BASE64_SYMBOL_EXPORT
|
20
|
+
#define BASE64_SYMBOL_PRIVATE
|
21
|
+
#endif
|
22
|
+
|
23
|
+
#if defined(BASE64_STATIC_DEFINE)
|
24
|
+
#define BASE64_EXPORT
|
25
|
+
#define BASE64_NO_EXPORT
|
26
|
+
|
27
|
+
#else
|
28
|
+
#if defined(BASE64_EXPORTS) // defined if we are building the shared library
|
29
|
+
#define BASE64_EXPORT BASE64_SYMBOL_EXPORT
|
30
|
+
|
31
|
+
#else
|
32
|
+
#define BASE64_EXPORT BASE64_SYMBOL_IMPORT
|
33
|
+
#endif
|
34
|
+
|
35
|
+
#define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
|
36
|
+
#endif
|
37
|
+
|
38
|
+
|
39
|
+
#ifdef __cplusplus
|
40
|
+
extern "C" {
|
41
|
+
#endif
|
42
|
+
|
43
|
+
/* These are the flags that can be passed in the `flags` argument. The values
|
44
|
+
* below force the use of a given codec, even if that codec is a no-op in the
|
45
|
+
* current build. Used in testing. Set to 0 for the default behavior, which is
|
46
|
+
* runtime feature detection on x86, a compile-time fixed codec on ARM, and
|
47
|
+
* the plain codec on other platforms: */
|
48
|
+
#define BASE64_FORCE_AVX2 (1 << 0)
|
49
|
+
#define BASE64_FORCE_NEON32 (1 << 1)
|
50
|
+
#define BASE64_FORCE_NEON64 (1 << 2)
|
51
|
+
#define BASE64_FORCE_PLAIN (1 << 3)
|
52
|
+
#define BASE64_FORCE_SSSE3 (1 << 4)
|
53
|
+
#define BASE64_FORCE_SSE41 (1 << 5)
|
54
|
+
#define BASE64_FORCE_SSE42 (1 << 6)
|
55
|
+
#define BASE64_FORCE_AVX (1 << 7)
|
56
|
+
|
57
|
+
struct base64_state {
|
58
|
+
int eof;
|
59
|
+
int bytes;
|
60
|
+
int flags;
|
61
|
+
unsigned char carry;
|
62
|
+
};
|
63
|
+
|
64
|
+
/* Wrapper function to encode a plain string of given length. Output is written
|
65
|
+
* to *out without trailing zero. Output length in bytes is written to *outlen.
|
66
|
+
* The buffer in `out` has been allocated by the caller and is at least 4/3 the
|
67
|
+
* size of the input. See above for `flags`; set to 0 for default operation: */
|
68
|
+
void BASE64_EXPORT base64_encode
|
69
|
+
( const char *src
|
70
|
+
, size_t srclen
|
71
|
+
, char *out
|
72
|
+
, size_t *outlen
|
73
|
+
, int flags
|
74
|
+
) ;
|
75
|
+
|
76
|
+
/* Call this before calling base64_stream_encode() to init the state. See above
|
77
|
+
* for `flags`; set to 0 for default operation: */
|
78
|
+
void BASE64_EXPORT base64_stream_encode_init
|
79
|
+
( struct base64_state *state
|
80
|
+
, int flags
|
81
|
+
) ;
|
82
|
+
|
83
|
+
/* Encodes the block of data of given length at `src`, into the buffer at
|
84
|
+
* `out`. Caller is responsible for allocating a large enough out-buffer; it
|
85
|
+
* must be at least 4/3 the size of the in-buffer, but take some margin. Places
|
86
|
+
* the number of new bytes written into `outlen` (which is set to zero when the
|
87
|
+
* function starts). Does not zero-terminate or finalize the output. */
|
88
|
+
void BASE64_EXPORT base64_stream_encode
|
89
|
+
( struct base64_state *state
|
90
|
+
, const char *src
|
91
|
+
, size_t srclen
|
92
|
+
, char *out
|
93
|
+
, size_t *outlen
|
94
|
+
) ;
|
95
|
+
|
96
|
+
/* Finalizes the output begun by previous calls to `base64_stream_encode()`.
|
97
|
+
* Adds the required end-of-stream markers if appropriate. `outlen` is modified
|
98
|
+
* and will contain the number of new bytes written at `out` (which will quite
|
99
|
+
* often be zero). */
|
100
|
+
void BASE64_EXPORT base64_stream_encode_final
|
101
|
+
( struct base64_state *state
|
102
|
+
, char *out
|
103
|
+
, size_t *outlen
|
104
|
+
) ;
|
105
|
+
|
106
|
+
/* Wrapper function to decode a plain string of given length. Output is written
|
107
|
+
* to *out without trailing zero. Output length in bytes is written to *outlen.
|
108
|
+
* The buffer in `out` has been allocated by the caller and is at least 3/4 the
|
109
|
+
* size of the input. See above for `flags`; set to 0 for default operation: */
|
110
|
+
int BASE64_EXPORT base64_decode
|
111
|
+
( const char *src
|
112
|
+
, size_t srclen
|
113
|
+
, char *out
|
114
|
+
, size_t *outlen
|
115
|
+
, int flags
|
116
|
+
) ;
|
117
|
+
|
118
|
+
/* Call this before calling base64_stream_decode() to init the state. See above
|
119
|
+
* for `flags`; set to 0 for default operation: */
|
120
|
+
void BASE64_EXPORT base64_stream_decode_init
|
121
|
+
( struct base64_state *state
|
122
|
+
, int flags
|
123
|
+
) ;
|
124
|
+
|
125
|
+
/* Decodes the block of data of given length at `src`, into the buffer at
|
126
|
+
* `out`. Caller is responsible for allocating a large enough out-buffer; it
|
127
|
+
* must be at least 3/4 the size of the in-buffer, but take some margin. Places
|
128
|
+
* the number of new bytes written into `outlen` (which is set to zero when the
|
129
|
+
* function starts). Does not zero-terminate the output. Returns 1 if all is
|
130
|
+
* well, and 0 if a decoding error was found, such as an invalid character.
|
131
|
+
* Returns -1 if the chosen codec is not included in the current build. Used by
|
132
|
+
* the test harness to check whether a codec is available for testing. */
|
133
|
+
int BASE64_EXPORT base64_stream_decode
|
134
|
+
( struct base64_state *state
|
135
|
+
, const char *src
|
136
|
+
, size_t srclen
|
137
|
+
, char *out
|
138
|
+
, size_t *outlen
|
139
|
+
) ;
|
140
|
+
|
141
|
+
#ifdef __cplusplus
|
142
|
+
}
|
143
|
+
#endif
|
144
|
+
|
145
|
+
#endif /* LIBBASE64_H */
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#include <stdint.h>
|
2
|
+
#include <stddef.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
|
5
|
+
#include "../../../include/libbase64.h"
|
6
|
+
#include "../../tables/tables.h"
|
7
|
+
#include "../../codecs.h"
|
8
|
+
#include "config.h"
|
9
|
+
#include "../../env.h"
|
10
|
+
|
11
|
+
#if HAVE_AVX
|
12
|
+
#include <immintrin.h>
|
13
|
+
|
14
|
+
#include "../ssse3/dec_reshuffle.c"
|
15
|
+
#include "../ssse3/dec_loop.c"
|
16
|
+
#include "../ssse3/enc_translate.c"
|
17
|
+
#include "../ssse3/enc_reshuffle.c"
|
18
|
+
#include "../ssse3/enc_loop.c"
|
19
|
+
|
20
|
+
#endif // HAVE_AVX
|
21
|
+
|
22
|
+
BASE64_ENC_FUNCTION(avx)
|
23
|
+
{
|
24
|
+
#if HAVE_AVX
|
25
|
+
#include "../generic/enc_head.c"
|
26
|
+
enc_loop_ssse3(&s, &slen, &o, &olen);
|
27
|
+
#include "../generic/enc_tail.c"
|
28
|
+
#else
|
29
|
+
BASE64_ENC_STUB
|
30
|
+
#endif
|
31
|
+
}
|
32
|
+
|
33
|
+
BASE64_DEC_FUNCTION(avx)
|
34
|
+
{
|
35
|
+
#if HAVE_AVX
|
36
|
+
#include "../generic/dec_head.c"
|
37
|
+
dec_loop_ssse3(&s, &slen, &o, &olen);
|
38
|
+
#include "../generic/dec_tail.c"
|
39
|
+
#else
|
40
|
+
BASE64_DEC_STUB
|
41
|
+
#endif
|
42
|
+
}
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#include <stdint.h>
|
2
|
+
#include <stddef.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
|
5
|
+
#include "../../../include/libbase64.h"
|
6
|
+
#include "../../tables/tables.h"
|
7
|
+
#include "../../codecs.h"
|
8
|
+
#include "config.h"
|
9
|
+
#include "../../env.h"
|
10
|
+
|
11
|
+
#if HAVE_AVX2
|
12
|
+
#include <immintrin.h>
|
13
|
+
|
14
|
+
#include "dec_reshuffle.c"
|
15
|
+
#include "dec_loop.c"
|
16
|
+
#include "enc_translate.c"
|
17
|
+
#include "enc_reshuffle.c"
|
18
|
+
#include "enc_loop.c"
|
19
|
+
|
20
|
+
#endif // HAVE_AVX2
|
21
|
+
|
22
|
+
BASE64_ENC_FUNCTION(avx2)
|
23
|
+
{
|
24
|
+
#if HAVE_AVX2
|
25
|
+
#include "../generic/enc_head.c"
|
26
|
+
enc_loop_avx2(&s, &slen, &o, &olen);
|
27
|
+
#include "../generic/enc_tail.c"
|
28
|
+
#else
|
29
|
+
BASE64_ENC_STUB
|
30
|
+
#endif
|
31
|
+
}
|
32
|
+
|
33
|
+
BASE64_DEC_FUNCTION(avx2)
|
34
|
+
{
|
35
|
+
#if HAVE_AVX2
|
36
|
+
#include "../generic/dec_head.c"
|
37
|
+
dec_loop_avx2(&s, &slen, &o, &olen);
|
38
|
+
#include "../generic/dec_tail.c"
|
39
|
+
#else
|
40
|
+
BASE64_DEC_STUB
|
41
|
+
#endif
|
42
|
+
}
|
@@ -0,0 +1,110 @@
|
|
1
|
+
static inline int
|
2
|
+
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
|
3
|
+
{
|
4
|
+
const __m256i lut_lo = _mm256_setr_epi8(
|
5
|
+
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
|
6
|
+
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
|
7
|
+
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
|
8
|
+
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
|
9
|
+
|
10
|
+
const __m256i lut_hi = _mm256_setr_epi8(
|
11
|
+
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
|
12
|
+
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
|
13
|
+
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
|
14
|
+
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
|
15
|
+
|
16
|
+
const __m256i lut_roll = _mm256_setr_epi8(
|
17
|
+
0, 16, 19, 4, -65, -65, -71, -71,
|
18
|
+
0, 0, 0, 0, 0, 0, 0, 0,
|
19
|
+
0, 16, 19, 4, -65, -65, -71, -71,
|
20
|
+
0, 0, 0, 0, 0, 0, 0, 0);
|
21
|
+
|
22
|
+
const __m256i mask_2F = _mm256_set1_epi8(0x2F);
|
23
|
+
|
24
|
+
// Load input:
|
25
|
+
__m256i str = _mm256_loadu_si256((__m256i *) *s);
|
26
|
+
|
27
|
+
// See the SSSE3 decoder for an explanation of the algorithm.
|
28
|
+
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
|
29
|
+
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
|
30
|
+
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
|
31
|
+
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
|
32
|
+
|
33
|
+
if (!_mm256_testz_si256(lo, hi)) {
|
34
|
+
return 0;
|
35
|
+
}
|
36
|
+
|
37
|
+
const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
|
38
|
+
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
|
39
|
+
|
40
|
+
// Now simply add the delta values to the input:
|
41
|
+
str = _mm256_add_epi8(str, roll);
|
42
|
+
|
43
|
+
// Reshuffle the input to packed 24-byte output format (12 bytes per lane):
|
44
|
+
str = dec_reshuffle(str);
|
45
|
+
|
46
|
+
// Store the output:
|
47
|
+
_mm256_storeu_si256((__m256i *) *o, str);
|
48
|
+
|
49
|
+
*s += 32;
|
50
|
+
*o += 24;
|
51
|
+
*rounds -= 1;
|
52
|
+
|
53
|
+
return 1;
|
54
|
+
}
|
55
|
+
|
56
|
+
static inline void
|
57
|
+
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
|
58
|
+
{
|
59
|
+
if (*slen < 45) {
|
60
|
+
return;
|
61
|
+
}
|
62
|
+
|
63
|
+
// Process blocks of 32 bytes per round. Because 8 extra zero bytes are
|
64
|
+
// written after the output, ensure that there will be at least 13
|
65
|
+
// bytes of input data left to cover the gap. (11 data bytes and up to
|
66
|
+
// two end-of-string markers.)
|
67
|
+
size_t rounds = (*slen - 13) / 32;
|
68
|
+
|
69
|
+
*slen -= rounds * 32; // 32 bytes consumed per round
|
70
|
+
*olen += rounds * 24; // 24 bytes produced per round
|
71
|
+
|
72
|
+
do {
|
73
|
+
if (rounds >= 8) {
|
74
|
+
if (dec_loop_avx2_inner(s, o, &rounds) &&
|
75
|
+
dec_loop_avx2_inner(s, o, &rounds) &&
|
76
|
+
dec_loop_avx2_inner(s, o, &rounds) &&
|
77
|
+
dec_loop_avx2_inner(s, o, &rounds) &&
|
78
|
+
dec_loop_avx2_inner(s, o, &rounds) &&
|
79
|
+
dec_loop_avx2_inner(s, o, &rounds) &&
|
80
|
+
dec_loop_avx2_inner(s, o, &rounds) &&
|
81
|
+
dec_loop_avx2_inner(s, o, &rounds)) {
|
82
|
+
continue;
|
83
|
+
}
|
84
|
+
break;
|
85
|
+
}
|
86
|
+
if (rounds >= 4) {
|
87
|
+
if (dec_loop_avx2_inner(s, o, &rounds) &&
|
88
|
+
dec_loop_avx2_inner(s, o, &rounds) &&
|
89
|
+
dec_loop_avx2_inner(s, o, &rounds) &&
|
90
|
+
dec_loop_avx2_inner(s, o, &rounds)) {
|
91
|
+
continue;
|
92
|
+
}
|
93
|
+
break;
|
94
|
+
}
|
95
|
+
if (rounds >= 2) {
|
96
|
+
if (dec_loop_avx2_inner(s, o, &rounds) &&
|
97
|
+
dec_loop_avx2_inner(s, o, &rounds)) {
|
98
|
+
continue;
|
99
|
+
}
|
100
|
+
break;
|
101
|
+
}
|
102
|
+
dec_loop_avx2_inner(s, o, &rounds);
|
103
|
+
break;
|
104
|
+
|
105
|
+
} while (rounds > 0);
|
106
|
+
|
107
|
+
// Adjust for any rounds that were skipped:
|
108
|
+
*slen += rounds * 32;
|
109
|
+
*olen -= rounds * 24;
|
110
|
+
}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
static inline __m256i
|
2
|
+
dec_reshuffle (const __m256i in)
|
3
|
+
{
|
4
|
+
// in, lower lane, bits, upper case are most significant bits, lower
|
5
|
+
// case are least significant bits:
|
6
|
+
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
|
7
|
+
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
|
8
|
+
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
|
9
|
+
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
|
10
|
+
|
11
|
+
const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
|
12
|
+
// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
|
13
|
+
// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
|
14
|
+
// 0000eeee FFffffff 0000DDDD DDddEEEE
|
15
|
+
// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
|
16
|
+
|
17
|
+
__m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
|
18
|
+
// 00000000 JJJJJJjj KKKKkkkk LLllllll
|
19
|
+
// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
|
20
|
+
// 00000000 DDDDDDdd EEEEeeee FFffffff
|
21
|
+
// 00000000 AAAAAAaa BBBBbbbb CCcccccc
|
22
|
+
|
23
|
+
// Pack bytes together in each lane:
|
24
|
+
out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
|
25
|
+
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
|
26
|
+
2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
|
27
|
+
// 00000000 00000000 00000000 00000000
|
28
|
+
// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
|
29
|
+
// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
|
30
|
+
// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
|
31
|
+
|
32
|
+
// Pack lanes:
|
33
|
+
return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
|
34
|
+
}
|
@@ -0,0 +1,89 @@
|
|
1
|
+
static inline void
|
2
|
+
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
|
3
|
+
{
|
4
|
+
// First load is done at s - 0 (no negative offset) so that it cannot read before the start of the input buffer:
|
5
|
+
__m256i src = _mm256_loadu_si256((__m256i *) *s);
|
6
|
+
|
7
|
+
// Shift by 4 bytes, as required by enc_reshuffle:
|
8
|
+
src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
|
9
|
+
|
10
|
+
// Reshuffle, translate, store:
|
11
|
+
src = enc_reshuffle(src);
|
12
|
+
src = enc_translate(src);
|
13
|
+
_mm256_storeu_si256((__m256i *) *o, src);
|
14
|
+
|
15
|
+
// Subsequent loads will be done at s - 4, set pointer for next round:
|
16
|
+
*s += 20;
|
17
|
+
*o += 32;
|
18
|
+
}
|
19
|
+
|
20
|
+
static inline void
|
21
|
+
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
|
22
|
+
{
|
23
|
+
// Load input:
|
24
|
+
__m256i src = _mm256_loadu_si256((__m256i *) *s);
|
25
|
+
|
26
|
+
// Reshuffle, translate, store:
|
27
|
+
src = enc_reshuffle(src);
|
28
|
+
src = enc_translate(src);
|
29
|
+
_mm256_storeu_si256((__m256i *) *o, src);
|
30
|
+
|
31
|
+
*s += 24;
|
32
|
+
*o += 32;
|
33
|
+
}
|
34
|
+
|
35
|
+
static inline void
|
36
|
+
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
|
37
|
+
{
|
38
|
+
if (*slen < 32) {
|
39
|
+
return;
|
40
|
+
}
|
41
|
+
|
42
|
+
// Process blocks of 24 bytes at a time. Because blocks are loaded 32
|
43
|
+
// bytes at a time at an offset of -4, ensure that there will be at least
|
44
|
+
// 4 remaining bytes after the last round, so that the final read will
|
45
|
+
// not pass beyond the bounds of the input buffer:
|
46
|
+
size_t rounds = (*slen - 4) / 24;
|
47
|
+
|
48
|
+
*slen -= rounds * 24; // 24 bytes consumed per round
|
49
|
+
*olen += rounds * 32; // 32 bytes produced per round
|
50
|
+
|
51
|
+
// The first loop iteration requires special handling to ensure that
|
52
|
+
// the read, which is done at an offset, does not underflow the buffer:
|
53
|
+
enc_loop_avx2_inner_first(s, o);
|
54
|
+
rounds--;
|
55
|
+
|
56
|
+
while (rounds > 0) {
|
57
|
+
if (rounds >= 8) {
|
58
|
+
enc_loop_avx2_inner(s, o);
|
59
|
+
enc_loop_avx2_inner(s, o);
|
60
|
+
enc_loop_avx2_inner(s, o);
|
61
|
+
enc_loop_avx2_inner(s, o);
|
62
|
+
enc_loop_avx2_inner(s, o);
|
63
|
+
enc_loop_avx2_inner(s, o);
|
64
|
+
enc_loop_avx2_inner(s, o);
|
65
|
+
enc_loop_avx2_inner(s, o);
|
66
|
+
rounds -= 8;
|
67
|
+
continue;
|
68
|
+
}
|
69
|
+
if (rounds >= 4) {
|
70
|
+
enc_loop_avx2_inner(s, o);
|
71
|
+
enc_loop_avx2_inner(s, o);
|
72
|
+
enc_loop_avx2_inner(s, o);
|
73
|
+
enc_loop_avx2_inner(s, o);
|
74
|
+
rounds -= 4;
|
75
|
+
continue;
|
76
|
+
}
|
77
|
+
if (rounds >= 2) {
|
78
|
+
enc_loop_avx2_inner(s, o);
|
79
|
+
enc_loop_avx2_inner(s, o);
|
80
|
+
rounds -= 2;
|
81
|
+
continue;
|
82
|
+
}
|
83
|
+
enc_loop_avx2_inner(s, o);
|
84
|
+
break;
|
85
|
+
}
|
86
|
+
|
87
|
+
// Add the offset back:
|
88
|
+
*s += 4;
|
89
|
+
}
|
@@ -0,0 +1,83 @@
|
|
1
|
+
static inline __m256i
|
2
|
+
enc_reshuffle (const __m256i input)
|
3
|
+
{
|
4
|
+
// Translation of the SSSE3 reshuffling algorithm to AVX2. This one
|
5
|
+
// works with shifted (4 bytes) input in order to be able to work
|
6
|
+
// efficiently in the two 128-bit lanes.
|
7
|
+
|
8
|
+
// Input, bytes MSB to LSB:
|
9
|
+
// 0 0 0 0 x w v u t s r q p o n m
|
10
|
+
// l k j i h g f e d c b a 0 0 0 0
|
11
|
+
|
12
|
+
const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
|
13
|
+
10, 11, 9, 10,
|
14
|
+
7, 8, 6, 7,
|
15
|
+
4, 5, 3, 4,
|
16
|
+
1, 2, 0, 1,
|
17
|
+
|
18
|
+
14, 15, 13, 14,
|
19
|
+
11, 12, 10, 11,
|
20
|
+
8, 9, 7, 8,
|
21
|
+
5, 6, 4, 5));
|
22
|
+
// in, bytes MSB to LSB:
|
23
|
+
// w x v w
|
24
|
+
// t u s t
|
25
|
+
// q r p q
|
26
|
+
// n o m n
|
27
|
+
// k l j k
|
28
|
+
// h i g h
|
29
|
+
// e f d e
|
30
|
+
// b c a b
|
31
|
+
|
32
|
+
const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
|
33
|
+
// bits, upper case are most significant bits, lower case are least
|
34
|
+
// significant bits.
|
35
|
+
// 0000wwww XX000000 VVVVVV00 00000000
|
36
|
+
// 0000tttt UU000000 SSSSSS00 00000000
|
37
|
+
// 0000qqqq RR000000 PPPPPP00 00000000
|
38
|
+
// 0000nnnn OO000000 MMMMMM00 00000000
|
39
|
+
// 0000kkkk LL000000 JJJJJJ00 00000000
|
40
|
+
// 0000hhhh II000000 GGGGGG00 00000000
|
41
|
+
// 0000eeee FF000000 DDDDDD00 00000000
|
42
|
+
// 0000bbbb CC000000 AAAAAA00 00000000
|
43
|
+
|
44
|
+
const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
|
45
|
+
// 00000000 00wwwwXX 00000000 00VVVVVV
|
46
|
+
// 00000000 00ttttUU 00000000 00SSSSSS
|
47
|
+
// 00000000 00qqqqRR 00000000 00PPPPPP
|
48
|
+
// 00000000 00nnnnOO 00000000 00MMMMMM
|
49
|
+
// 00000000 00kkkkLL 00000000 00JJJJJJ
|
50
|
+
// 00000000 00hhhhII 00000000 00GGGGGG
|
51
|
+
// 00000000 00eeeeFF 00000000 00DDDDDD
|
52
|
+
// 00000000 00bbbbCC 00000000 00AAAAAA
|
53
|
+
|
54
|
+
const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
|
55
|
+
// 00000000 00xxxxxx 000000vv WWWW0000
|
56
|
+
// 00000000 00uuuuuu 000000ss TTTT0000
|
57
|
+
// 00000000 00rrrrrr 000000pp QQQQ0000
|
58
|
+
// 00000000 00oooooo 000000mm NNNN0000
|
59
|
+
// 00000000 00llllll 000000jj KKKK0000
|
60
|
+
// 00000000 00iiiiii 000000gg HHHH0000
|
61
|
+
// 00000000 00ffffff 000000dd EEEE0000
|
62
|
+
// 00000000 00cccccc 000000aa BBBB0000
|
63
|
+
|
64
|
+
const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
|
65
|
+
// 00xxxxxx 00000000 00vvWWWW 00000000
|
66
|
+
// 00uuuuuu 00000000 00ssTTTT 00000000
|
67
|
+
// 00rrrrrr 00000000 00ppQQQQ 00000000
|
68
|
+
// 00oooooo 00000000 00mmNNNN 00000000
|
69
|
+
// 00llllll 00000000 00jjKKKK 00000000
|
70
|
+
// 00iiiiii 00000000 00ggHHHH 00000000
|
71
|
+
// 00ffffff 00000000 00ddEEEE 00000000
|
72
|
+
// 00cccccc 00000000 00aaBBBB 00000000
|
73
|
+
|
74
|
+
return _mm256_or_si256(t1, t3);
|
75
|
+
// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
|
76
|
+
// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
|
77
|
+
// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
|
78
|
+
// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
|
79
|
+
// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
|
80
|
+
// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
|
81
|
+
// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
|
82
|
+
// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
|
83
|
+
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
static inline __m256i
|
2
|
+
enc_translate (const __m256i in)
|
3
|
+
{
|
4
|
+
// A lookup table containing the absolute offsets for all ranges:
|
5
|
+
const __m256i lut = _mm256_setr_epi8(
|
6
|
+
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
|
7
|
+
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
|
8
|
+
|
9
|
+
// Translate values 0..63 to the Base64 alphabet. There are five sets:
|
10
|
+
// # From To Abs Index Characters
|
11
|
+
// 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
|
12
|
+
// 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
|
13
|
+
// 2 [52..61] [48..57] -4 [2..11] 0123456789
|
14
|
+
// 3 [62] [43] -19 12 +
|
15
|
+
// 4 [63] [47] -16 13 /
|
16
|
+
|
17
|
+
// Create LUT indices from the input. The index for range #0 is right,
|
18
|
+
// others are 1 less than expected:
|
19
|
+
__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
|
20
|
+
|
21
|
+
// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
|
22
|
+
const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
|
23
|
+
|
24
|
+
// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
|
25
|
+
// now correct:
|
26
|
+
indices = _mm256_sub_epi8(indices, mask);
|
27
|
+
|
28
|
+
// Add offsets to input values:
|
29
|
+
return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
|
30
|
+
}
|