ob64 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ob64/version.rb +1 -1
- data/ob64.gemspec +2 -0
- data/vendor/libbase64/.gitignore +12 -0
- data/vendor/libbase64/.travis.yml +71 -0
- data/vendor/libbase64/CMakeLists.txt +264 -0
- data/vendor/libbase64/LICENSE +28 -0
- data/vendor/libbase64/Makefile +93 -0
- data/vendor/libbase64/README.md +474 -0
- data/vendor/libbase64/base64-benchmarks.png +0 -0
- data/vendor/libbase64/bin/base64.c +132 -0
- data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
- data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
- data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
- data/vendor/libbase64/cmake/config.h.in +25 -0
- data/vendor/libbase64/cmake/test-arch.c +35 -0
- data/vendor/libbase64/include/libbase64.h +145 -0
- data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
- data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
- data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
- data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
- data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
- data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
- data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
- data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
- data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
- data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
- data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
- data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
- data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
- data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
- data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
- data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
- data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
- data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
- data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
- data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
- data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
- data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
- data/vendor/libbase64/lib/codec_choose.c +281 -0
- data/vendor/libbase64/lib/codecs.h +65 -0
- data/vendor/libbase64/lib/env.h +67 -0
- data/vendor/libbase64/lib/exports.txt +7 -0
- data/vendor/libbase64/lib/lib.c +164 -0
- data/vendor/libbase64/lib/lib_openmp.c +149 -0
- data/vendor/libbase64/lib/tables/.gitignore +1 -0
- data/vendor/libbase64/lib/tables/Makefile +17 -0
- data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
- data/vendor/libbase64/lib/tables/table_generator.c +184 -0
- data/vendor/libbase64/lib/tables/tables.c +40 -0
- data/vendor/libbase64/lib/tables/tables.h +23 -0
- metadata +64 -4
@@ -0,0 +1,25 @@
|
|
1
|
+
#ifndef BASE64_CONFIG_H
#define BASE64_CONFIG_H

/*
 * Build-time codec switches (CMake template).
 *
 * Each `#cmakedefine01 BASE64_WITH_<X>` line is rewritten by CMake's
 * configure_file() into `#define BASE64_WITH_<X> 0` or `... 1`. The matching
 * HAVE_<X> macro is an alias that the codec sources test with `#if HAVE_<X>`.
 */

/* x86: SSSE3 */
#cmakedefine01 BASE64_WITH_SSSE3
#define HAVE_SSSE3 BASE64_WITH_SSSE3

/* x86: SSE4.1 */
#cmakedefine01 BASE64_WITH_SSE41
#define HAVE_SSE41 BASE64_WITH_SSE41

/* x86: SSE4.2 */
#cmakedefine01 BASE64_WITH_SSE42
#define HAVE_SSE42 BASE64_WITH_SSE42

/* x86: AVX */
#cmakedefine01 BASE64_WITH_AVX
#define HAVE_AVX BASE64_WITH_AVX

/* x86: AVX2 */
#cmakedefine01 BASE64_WITH_AVX2
#define HAVE_AVX2 BASE64_WITH_AVX2

/* ARM: 32-bit NEON */
#cmakedefine01 BASE64_WITH_NEON32
#define HAVE_NEON32 BASE64_WITH_NEON32

/* ARM: 64-bit NEON */
#cmakedefine01 BASE64_WITH_NEON64
#define HAVE_NEON64 BASE64_WITH_NEON64

#endif // BASE64_CONFIG_H
|
@@ -0,0 +1,35 @@
|
|
1
|
+
// Written in 2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
|
2
|
+
//
|
3
|
+
// To the extent possible under law, the author(s) have dedicated all
|
4
|
+
// copyright and related and neighboring rights to this software to the
|
5
|
+
// public domain worldwide. This software is distributed without any warranty.
|
6
|
+
//
|
7
|
+
// You should have received a copy of the CC0 Public Domain Dedication
|
8
|
+
// along with this software. If not, see
|
9
|
+
//
|
10
|
+
// http://creativecommons.org/publicdomain/zero/1.0/
|
11
|
+
//
|
12
|
+
////////////////////////////////////////////////////////////////////////////////
|
13
|
+
|
14
|
+
// ARM 64-Bit
|
15
|
+
#if defined(__aarch64__)
|
16
|
+
#error ##arch=arm64##
|
17
|
+
|
18
|
+
// ARM 32-Bit
|
19
|
+
#elif defined(__arm__) \
|
20
|
+
|| defined(_M_ARM)
|
21
|
+
#error ##arch=arm##
|
22
|
+
|
23
|
+
// x86 64-Bit
|
24
|
+
#elif defined(__x86_64__) \
|
25
|
+
|| defined(_M_X64)
|
26
|
+
#error ##arch=x64##
|
27
|
+
|
28
|
+
// x86 32-Bit
|
29
|
+
#elif defined(__i386__) \
|
30
|
+
|| defined(_M_IX86)
|
31
|
+
#error ##arch=x86##
|
32
|
+
|
33
|
+
#else
|
34
|
+
#error ##arch=unknown##
|
35
|
+
#endif
|
@@ -0,0 +1,145 @@
|
|
1
|
+
#ifndef LIBBASE64_H
#define LIBBASE64_H

#include <stddef.h>	/* size_t */

/*
 * Per-toolchain spelling of "import", "export" and "private" symbol
 * annotations. Toolchains that match neither branch get empty macros.
 */
#if defined(_WIN32) || defined(__CYGWIN__)
#  define BASE64_SYMBOL_IMPORT  __declspec(dllimport)
#  define BASE64_SYMBOL_EXPORT  __declspec(dllexport)
#  define BASE64_SYMBOL_PRIVATE
#elif __GNUC__ >= 4
#  define BASE64_SYMBOL_IMPORT  __attribute__ ((visibility ("default")))
#  define BASE64_SYMBOL_EXPORT  __attribute__ ((visibility ("default")))
#  define BASE64_SYMBOL_PRIVATE __attribute__ ((visibility ("hidden")))
#else
#  define BASE64_SYMBOL_IMPORT
#  define BASE64_SYMBOL_EXPORT
#  define BASE64_SYMBOL_PRIVATE
#endif

/*
 * BASE64_EXPORT / BASE64_NO_EXPORT as used on the declarations below:
 *   - BASE64_STATIC_DEFINE set: both are empty (static library);
 *   - BASE64_EXPORTS set:       we are building the shared library, so export;
 *   - neither set:              we are a consumer of the shared library, so
 *                               import.
 */
#if defined(BASE64_STATIC_DEFINE)
#  define BASE64_EXPORT
#  define BASE64_NO_EXPORT
#elif defined(BASE64_EXPORTS)
#  define BASE64_EXPORT    BASE64_SYMBOL_EXPORT
#  define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
#else
#  define BASE64_EXPORT    BASE64_SYMBOL_IMPORT
#  define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
#endif

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Values for the `flags` argument. Each one forces a specific codec, even
 * when that codec is a no-op in the current build; this is used in testing.
 * Pass 0 for the default behavior: runtime feature detection on x86, a codec
 * fixed at compile time on ARM, and the plain codec on every other platform.
 */
#define BASE64_FORCE_AVX2	(1 << 0)
#define BASE64_FORCE_NEON32	(1 << 1)
#define BASE64_FORCE_NEON64	(1 << 2)
#define BASE64_FORCE_PLAIN	(1 << 3)
#define BASE64_FORCE_SSSE3	(1 << 4)
#define BASE64_FORCE_SSE41	(1 << 5)
#define BASE64_FORCE_SSE42	(1 << 6)
#define BASE64_FORCE_AVX	(1 << 7)

/*
 * State carried between calls of the streaming API. Initialize it with
 * base64_stream_encode_init() or base64_stream_decode_init() before use.
 * NOTE(review): the meaning of the individual fields is defined by the codec
 * implementations, which are not visible from this header.
 */
struct base64_state {
	int eof;
	int bytes;
	int flags;
	unsigned char carry;
};

/*
 * One-shot encoder for a buffer of known length. The result is written to
 * `out` without a trailing zero, and its length in bytes is stored in
 * `*outlen`. The caller allocates `out`; it must be at least 4/3 the size of
 * the input (rounded up to a whole 4-character group once end-of-stream
 * padding is counted). `flags` is described above; pass 0 for default
 * operation.
 */
void BASE64_EXPORT base64_encode
	( const char	*src
	, size_t	 srclen
	, char		*out
	, size_t	*outlen
	, int		 flags
	);

/*
 * Prepares `state` for a series of base64_stream_encode() calls. `flags` is
 * described above; pass 0 for default operation.
 */
void BASE64_EXPORT base64_stream_encode_init
	( struct base64_state	*state
	, int			 flags
	);

/*
 * Encodes `srclen` bytes at `src` into `out`. The caller must provide a large
 * enough output buffer: at least 4/3 the size of the input buffer, plus some
 * margin. `*outlen` is set to zero when the function starts and receives the
 * number of new bytes written. The output is neither zero-terminated nor
 * finalized.
 */
void BASE64_EXPORT base64_stream_encode
	( struct base64_state	*state
	, const char		*src
	, size_t		 srclen
	, char			*out
	, size_t		*outlen
	);

/*
 * Finishes the output started by earlier base64_stream_encode() calls, adding
 * the required end-of-stream markers if appropriate. `*outlen` receives the
 * number of new bytes written at `out`, which will quite often be zero.
 */
void BASE64_EXPORT base64_stream_encode_final
	( struct base64_state	*state
	, char			*out
	, size_t		*outlen
	);

/*
 * One-shot decoder for a buffer of known length. The result is written to
 * `out` without a trailing zero, and its length in bytes is stored in
 * `*outlen`. The caller allocates `out`; it must be at least 3/4 the size of
 * the input. `flags` is described above; pass 0 for default operation.
 */
int BASE64_EXPORT base64_decode
	( const char	*src
	, size_t	 srclen
	, char		*out
	, size_t	*outlen
	, int		 flags
	);

/*
 * Prepares `state` for a series of base64_stream_decode() calls. `flags` is
 * described above; pass 0 for default operation.
 */
void BASE64_EXPORT base64_stream_decode_init
	( struct base64_state	*state
	, int			 flags
	);

/*
 * Decodes `srclen` bytes at `src` into `out`. The caller must provide a large
 * enough output buffer: at least 3/4 the size of the input buffer, plus some
 * margin. `*outlen` is set to zero when the function starts and receives the
 * number of new bytes written. The output is not zero-terminated.
 *
 * Returns  1 if all is well,
 *          0 if a decoding error was found, such as an invalid character,
 *         -1 if the chosen codec is not included in the current build (the
 *            test harness uses this to see whether a codec can be tested).
 */
int BASE64_EXPORT base64_stream_decode
	( struct base64_state	*state
	, const char		*src
	, size_t		 srclen
	, char			*out
	, size_t		*outlen
	);

#ifdef __cplusplus
}
#endif

#endif /* LIBBASE64_H */
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#include <stdint.h>
|
2
|
+
#include <stddef.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
|
5
|
+
#include "../../../include/libbase64.h"
|
6
|
+
#include "../../tables/tables.h"
|
7
|
+
#include "../../codecs.h"
|
8
|
+
#include "config.h"
|
9
|
+
#include "../../env.h"
|
10
|
+
|
11
|
+
#if HAVE_AVX
|
12
|
+
#include <immintrin.h>
|
13
|
+
|
14
|
+
#include "../ssse3/dec_reshuffle.c"
|
15
|
+
#include "../ssse3/dec_loop.c"
|
16
|
+
#include "../ssse3/enc_translate.c"
|
17
|
+
#include "../ssse3/enc_reshuffle.c"
|
18
|
+
#include "../ssse3/enc_loop.c"
|
19
|
+
|
20
|
+
#endif // HAVE_AVX
|
21
|
+
|
22
|
+
BASE64_ENC_FUNCTION(avx)
|
23
|
+
{
|
24
|
+
#if HAVE_AVX
|
25
|
+
#include "../generic/enc_head.c"
|
26
|
+
enc_loop_ssse3(&s, &slen, &o, &olen);
|
27
|
+
#include "../generic/enc_tail.c"
|
28
|
+
#else
|
29
|
+
BASE64_ENC_STUB
|
30
|
+
#endif
|
31
|
+
}
|
32
|
+
|
33
|
+
BASE64_DEC_FUNCTION(avx)
|
34
|
+
{
|
35
|
+
#if HAVE_AVX
|
36
|
+
#include "../generic/dec_head.c"
|
37
|
+
dec_loop_ssse3(&s, &slen, &o, &olen);
|
38
|
+
#include "../generic/dec_tail.c"
|
39
|
+
#else
|
40
|
+
BASE64_DEC_STUB
|
41
|
+
#endif
|
42
|
+
}
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#include <stdint.h>
|
2
|
+
#include <stddef.h>
|
3
|
+
#include <stdlib.h>
|
4
|
+
|
5
|
+
#include "../../../include/libbase64.h"
|
6
|
+
#include "../../tables/tables.h"
|
7
|
+
#include "../../codecs.h"
|
8
|
+
#include "config.h"
|
9
|
+
#include "../../env.h"
|
10
|
+
|
11
|
+
#if HAVE_AVX2
|
12
|
+
#include <immintrin.h>
|
13
|
+
|
14
|
+
#include "dec_reshuffle.c"
|
15
|
+
#include "dec_loop.c"
|
16
|
+
#include "enc_translate.c"
|
17
|
+
#include "enc_reshuffle.c"
|
18
|
+
#include "enc_loop.c"
|
19
|
+
|
20
|
+
#endif // HAVE_AVX2
|
21
|
+
|
22
|
+
BASE64_ENC_FUNCTION(avx2)
|
23
|
+
{
|
24
|
+
#if HAVE_AVX2
|
25
|
+
#include "../generic/enc_head.c"
|
26
|
+
enc_loop_avx2(&s, &slen, &o, &olen);
|
27
|
+
#include "../generic/enc_tail.c"
|
28
|
+
#else
|
29
|
+
BASE64_ENC_STUB
|
30
|
+
#endif
|
31
|
+
}
|
32
|
+
|
33
|
+
BASE64_DEC_FUNCTION(avx2)
|
34
|
+
{
|
35
|
+
#if HAVE_AVX2
|
36
|
+
#include "../generic/dec_head.c"
|
37
|
+
dec_loop_avx2(&s, &slen, &o, &olen);
|
38
|
+
#include "../generic/dec_tail.c"
|
39
|
+
#else
|
40
|
+
BASE64_DEC_STUB
|
41
|
+
#endif
|
42
|
+
}
|
@@ -0,0 +1,110 @@
|
|
1
|
+
static inline int
|
2
|
+
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
|
3
|
+
{
|
4
|
+
const __m256i lut_lo = _mm256_setr_epi8(
|
5
|
+
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
|
6
|
+
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
|
7
|
+
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
|
8
|
+
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
|
9
|
+
|
10
|
+
const __m256i lut_hi = _mm256_setr_epi8(
|
11
|
+
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
|
12
|
+
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
|
13
|
+
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
|
14
|
+
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
|
15
|
+
|
16
|
+
const __m256i lut_roll = _mm256_setr_epi8(
|
17
|
+
0, 16, 19, 4, -65, -65, -71, -71,
|
18
|
+
0, 0, 0, 0, 0, 0, 0, 0,
|
19
|
+
0, 16, 19, 4, -65, -65, -71, -71,
|
20
|
+
0, 0, 0, 0, 0, 0, 0, 0);
|
21
|
+
|
22
|
+
const __m256i mask_2F = _mm256_set1_epi8(0x2F);
|
23
|
+
|
24
|
+
// Load input:
|
25
|
+
__m256i str = _mm256_loadu_si256((__m256i *) *s);
|
26
|
+
|
27
|
+
// See the SSSE3 decoder for an explanation of the algorithm.
|
28
|
+
const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
|
29
|
+
const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
|
30
|
+
const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
|
31
|
+
const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
|
32
|
+
|
33
|
+
if (!_mm256_testz_si256(lo, hi)) {
|
34
|
+
return 0;
|
35
|
+
}
|
36
|
+
|
37
|
+
const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
|
38
|
+
const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
|
39
|
+
|
40
|
+
// Now simply add the delta values to the input:
|
41
|
+
str = _mm256_add_epi8(str, roll);
|
42
|
+
|
43
|
+
// Reshuffle the input to packed 12-byte output format:
|
44
|
+
str = dec_reshuffle(str);
|
45
|
+
|
46
|
+
// Store the output:
|
47
|
+
_mm256_storeu_si256((__m256i *) *o, str);
|
48
|
+
|
49
|
+
*s += 32;
|
50
|
+
*o += 24;
|
51
|
+
*rounds -= 1;
|
52
|
+
|
53
|
+
return 1;
|
54
|
+
}
|
55
|
+
|
56
|
+
// Bulk AVX2 decoder: consumes as many whole 32-character blocks from *s as
// is safe, writing 24 bytes per block to *o, and updates *slen / *olen to
// match. Stops early at the first block the inner routine rejects; that
// block and everything after it are left for the caller.
static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// 45 = one 32-byte round plus the 13-byte reserve explained below.
	if (*slen < 45) {
		return;
	}

	// Each round stores 32 bytes but only 24 of them are payload; the
	// other 8 are zero bytes written past the output. To keep that
	// overhang covered, always hold back at least 13 input bytes (11 data
	// bytes and up to two end-of-string markers).
	size_t pending = (*slen - 13) / 32;

	// Account for all planned rounds up front; corrected after the loop.
	*slen -= pending * 32;	// 32 bytes consumed per round
	*olen += pending * 24;	// 24 bytes produced per round

	while (pending > 0) {
		int ok;

		if (pending >= 8) {
			ok = dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending);
		}
		else if (pending >= 4) {
			ok = dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending);
		}
		else if (pending >= 2) {
			ok = dec_loop_avx2_inner(s, o, &pending)
			  && dec_loop_avx2_inner(s, o, &pending);
		}
		else {
			// Exactly one round left; its outcome ends the loop
			// either way.
			dec_loop_avx2_inner(s, o, &pending);
			break;
		}

		// A rejected block leaves `pending` non-zero; stop there.
		if (!ok) {
			break;
		}
	}

	// Give back the rounds that were planned but not carried out:
	*slen += pending * 32;
	*olen -= pending * 24;
}
|
@@ -0,0 +1,34 @@
|
|
1
|
+
// Packs 32 decoded 6-bit values (one per byte) into 24 contiguous bytes in
// the low part of the result.
//
// Bit layout of one 32-bit group, most significant byte first; upper case
// letters are the most significant bits of a value, lower case the least
// significant ones. The same pattern repeats for every group:
//
//   input            00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
//   after maddubs    0000bbbb CCcccccc 0000AAAA AAaaBBBB
//   after madd       00000000 AAAAAAaa BBBBbbbb CCcccccc
static inline __m256i
dec_reshuffle (const __m256i in)
{
	// Multipliers for the two merge steps:
	const __m256i merge_pairs = _mm256_set1_epi32(0x01400140);
	const __m256i merge_quads = _mm256_set1_epi32(0x00011000);

	// Byte order that drops the empty top byte of each group and packs
	// the remaining 12 bytes at the bottom of each 128-bit lane:
	const __m256i pack_bytes = _mm256_setr_epi8(
		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);

	// Dword order that closes the gap between the two lanes:
	const __m256i pack_lanes = _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1);

	// Step 1: fuse neighbouring bytes into 12-bit values.
	const __m256i twelve_bit = _mm256_maddubs_epi16(in, merge_pairs);

	// Step 2: fuse neighbouring 12-bit values into 24-bit values.
	__m256i packed = _mm256_madd_epi16(twelve_bit, merge_quads);

	// Step 3: pack the bytes together within each lane.
	packed = _mm256_shuffle_epi8(packed, pack_bytes);

	// Step 4: pack the two lanes together.
	return _mm256_permutevar8x32_epi32(packed, pack_lanes);
}
|
@@ -0,0 +1,89 @@
|
|
1
|
+
static inline void
|
2
|
+
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
|
3
|
+
{
|
4
|
+
// First load is done at s - 0 to not get a segfault:
|
5
|
+
__m256i src = _mm256_loadu_si256((__m256i *) *s);
|
6
|
+
|
7
|
+
// Shift by 4 bytes, as required by enc_reshuffle:
|
8
|
+
src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
|
9
|
+
|
10
|
+
// Reshuffle, translate, store:
|
11
|
+
src = enc_reshuffle(src);
|
12
|
+
src = enc_translate(src);
|
13
|
+
_mm256_storeu_si256((__m256i *) *o, src);
|
14
|
+
|
15
|
+
// Subsequent loads will be done at s - 4, set pointer for next round:
|
16
|
+
*s += 20;
|
17
|
+
*o += 32;
|
18
|
+
}
|
19
|
+
|
20
|
+
static inline void
|
21
|
+
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
|
22
|
+
{
|
23
|
+
// Load input:
|
24
|
+
__m256i src = _mm256_loadu_si256((__m256i *) *s);
|
25
|
+
|
26
|
+
// Reshuffle, translate, store:
|
27
|
+
src = enc_reshuffle(src);
|
28
|
+
src = enc_translate(src);
|
29
|
+
_mm256_storeu_si256((__m256i *) *o, src);
|
30
|
+
|
31
|
+
*s += 24;
|
32
|
+
*o += 32;
|
33
|
+
}
|
34
|
+
|
35
|
+
// Bulk AVX2 encoder: encodes as many whole 24-byte groups from *s as is
// safe, writing 32 characters per group to *o, and updates *slen / *olen to
// match. Inputs shorter than 32 bytes are left entirely to the caller.
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Not enough data for even a single 32-byte load:
	if (*slen < 32) {
		return;
	}

	// Every round encodes 24 bytes but loads 32 of them at an offset of
	// -4. Holding back at least 4 input bytes guarantees that the last
	// load stays inside the input buffer.
	size_t todo = (*slen - 4) / 24;

	*slen -= todo * 24;	// 24 bytes consumed per round
	*olen += todo * 32;	// 32 bytes produced per round

	// The very first round must not read in front of the buffer, so it
	// has its own variant that loads without the offset:
	enc_loop_avx2_inner_first(s, o);
	todo -= 1;

	// Remaining rounds, unrolled in chunks of 8, 4, 2 and 1:
	while (todo > 0) {
		if (todo >= 8) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			todo -= 8;
		}
		else if (todo >= 4) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			todo -= 4;
		}
		else if (todo >= 2) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			todo -= 2;
		}
		else {
			enc_loop_avx2_inner(s, o);
			todo -= 1;
		}
	}

	// Undo the -4 load offset so *s points at the next unencoded byte:
	*s += 4;
}
|
@@ -0,0 +1,83 @@
|
|
1
|
+
// Spreads 24 input bytes over 32 bytes, one 6-bit value per byte. This is
// the SSSE3 reshuffling algorithm carried over to AVX2. To work efficiently
// within the two 128-bit lanes it expects the input shifted by 4 bytes.
//
// Input, bytes MSB to LSB:
//   0 0 0 0 x w v u t s r q p o n m
//   l k j i h g f e d c b a 0 0 0 0
//
// Bit layout of one 32-bit group (input bytes a, b, c), most significant
// byte first; upper case letters are the most significant bits of a byte,
// lower case the least significant ones. Every group follows this pattern:
//
//   after byte shuffle   b c a b   (whole bytes)
//   t0 (mask)            0000bbbb CC000000 AAAAAA00 00000000
//   t1 (mulhi)           00000000 00bbbbCC 00000000 00AAAAAA
//   t2 (mask)            00000000 00cccccc 000000aa BBBB0000
//   t3 (mullo)           00cccccc 00000000 00aaBBBB 00000000
//   result (t1 | t3)     00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
static inline __m256i
enc_reshuffle (const __m256i input)
{
	// Byte selector: every output group of 4 bytes draws on 3 input
	// bytes, with the middle one appearing twice.
	const __m256i spread = _mm256_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1,

		14, 15, 13, 14,
		11, 12, 10, 11,
		 8,  9,  7,  8,
		 5,  6,  4,  5);

	// Bit masks and per-16-bit multipliers for the two value pairs:
	const __m256i mask_first  = _mm256_set1_epi32(0x0FC0FC00);
	const __m256i mul_first   = _mm256_set1_epi32(0x04000040);
	const __m256i mask_second = _mm256_set1_epi32(0x003F03F0);
	const __m256i mul_second  = _mm256_set1_epi32(0x01000010);

	const __m256i in = _mm256_shuffle_epi8(input, spread);

	// First and third 6-bit value of each group, brought down into the
	// low byte of each 16-bit half by taking the high half of a multiply:
	const __m256i t0 = _mm256_and_si256(in, mask_first);
	const __m256i t1 = _mm256_mulhi_epu16(t0, mul_first);

	// Second and fourth 6-bit value of each group, brought up into the
	// high byte of each 16-bit half by taking the low half of a multiply:
	const __m256i t2 = _mm256_and_si256(in, mask_second);
	const __m256i t3 = _mm256_mullo_epi16(t2, mul_second);

	// The two halves occupy disjoint bytes, so OR merges them:
	return _mm256_or_si256(t1, t3);
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
// Maps 32 six-bit values (0..63, one per byte) to their Base64 characters.
//
// The alphabet consists of five ranges; each needs a constant added to the
// value, and that constant is found in a small table by index:
//
//   #  From       To          Add  Index    Characters
//   0  [0..25]    [65..90]    +65  0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
//   1  [26..51]   [97..122]   +71  1        abcdefghijklmnopqrstuvwxyz
//   2  [52..61]   [48..57]     -4  [2..11]  0123456789
//   3  [62]       [43]        -19  12       +
//   4  [63]       [47]        -16  13       /
static inline __m256i
enc_translate (const __m256i in)
{
	// The amount to add for each table index, listed once per 128-bit
	// lane:
	const __m256i offsets = _mm256_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// Saturating subtract gives a first guess at the index: correct for
	// range #0, one too low for every other range.
	const __m256i rough_idx = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// 0xFF (-1) for values in ranges #1..#4, 0x00 for range #0:
	const __m256i above_25 = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));

	// Subtracting -1 adds the missing 1 exactly where it is needed:
	const __m256i idx = _mm256_sub_epi8(rough_idx, above_25);

	// Look up the offset for each value and add it:
	return _mm256_add_epi8(in, _mm256_shuffle_epi8(offsets, idx));
}
|