ob64 0.4.0 → 0.5.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ob64/version.rb +1 -1
- data/ob64.gemspec +2 -0
- data/vendor/libbase64/.gitignore +12 -0
- data/vendor/libbase64/.travis.yml +71 -0
- data/vendor/libbase64/CMakeLists.txt +264 -0
- data/vendor/libbase64/LICENSE +28 -0
- data/vendor/libbase64/Makefile +93 -0
- data/vendor/libbase64/README.md +474 -0
- data/vendor/libbase64/base64-benchmarks.png +0 -0
- data/vendor/libbase64/bin/base64.c +132 -0
- data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
- data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
- data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
- data/vendor/libbase64/cmake/config.h.in +25 -0
- data/vendor/libbase64/cmake/test-arch.c +35 -0
- data/vendor/libbase64/include/libbase64.h +145 -0
- data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
- data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
- data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
- data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
- data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
- data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
- data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
- data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
- data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
- data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
- data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
- data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
- data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
- data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
- data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
- data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
- data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
- data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
- data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
- data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
- data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
- data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
- data/vendor/libbase64/lib/codec_choose.c +281 -0
- data/vendor/libbase64/lib/codecs.h +65 -0
- data/vendor/libbase64/lib/env.h +67 -0
- data/vendor/libbase64/lib/exports.txt +7 -0
- data/vendor/libbase64/lib/lib.c +164 -0
- data/vendor/libbase64/lib/lib_openmp.c +149 -0
- data/vendor/libbase64/lib/tables/.gitignore +1 -0
- data/vendor/libbase64/lib/tables/Makefile +17 -0
- data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
- data/vendor/libbase64/lib/tables/table_generator.c +184 -0
- data/vendor/libbase64/lib/tables/tables.c +40 -0
- data/vendor/libbase64/lib/tables/tables.h +23 -0
- metadata +64 -4
data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c
@@ -0,0 +1,54 @@
+static inline uint8x16x4_t
+enc_reshuffle (const uint8x16x3_t in)
+{
+	uint8x16x4_t out;
+
+#if defined(__GNUC__) || defined(__clang__)
+
+	// GCC and Clang support the following inline assembly syntax. This
+	// inline assembly implements the exact same algorithm as the
+	// intrinsics further down, but benchmarks show that the inline
+	// assembly easily beats the intrinsics. Perhaps this is because the
+	// inline assembly is well pipelined to avoid data dependencies.
+
+	__asm__ (
+		"vshr.u8 %q[o0], %q[i0], #2 \n\t"
+		"vshr.u8 %q[o1], %q[i1], #2 \n\t"
+		"vshr.u8 %q[o2], %q[i2], #4 \n\t"
+		"vsli.8  %q[o1], %q[i0], #6 \n\t"
+		"vsli.8  %q[o2], %q[i1], #4 \n\t"
+		"vshl.u8 %q[o3], %q[i2], #2 \n\t"
+
+		"vshr.u8 %q[o1], %q[o1], #2 \n\t"
+		"vshr.u8 %q[o2], %q[o2], #2 \n\t"
+		"vshr.u8 %q[o3], %q[o3], #2 \n\t"
+
+		// Outputs:
+		: [o0] "=&w" (out.val[0]),
+		  [o1] "=&w" (out.val[1]),
+		  [o2] "=&w" (out.val[2]),
+		  [o3] "=&w" (out.val[3])
+
+		// Inputs:
+		: [i0] "w" (in.val[0]),
+		  [i1] "w" (in.val[1]),
+		  [i2] "w" (in.val[2])
+	);
+#else
+	// Divide bits of three input bytes over four output bytes. All output
+	// bytes except the first one are shifted over two bits to the left:
+	out.val[0] = vshrq_n_u8(in.val[0], 2);
+	out.val[1] = vshrq_n_u8(in.val[1], 2);
+	out.val[2] = vshrq_n_u8(in.val[2], 4);
+	out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
+	out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
+	out.val[3] = vshlq_n_u8(in.val[2], 2);
+
+	// Clear the top two bits by shifting the output back to the right:
+	out.val[1] = vshrq_n_u8(out.val[1], 2);
+	out.val[2] = vshrq_n_u8(out.val[2], 2);
+	out.val[3] = vshrq_n_u8(out.val[3], 2);
+#endif
+
+	return out;
+}
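For reference, the reshuffle above is the vector form of the usual scalar split of three bytes into four 6-bit values. A minimal scalar sketch (the helper name is hypothetical, not part of the gem); note that the NEON version works on deinterleaved lanes, so val[0] holds every third input byte:

    #include <stdint.h>

    // Scalar equivalent of enc_reshuffle for one 3-byte group:
    static void reshuffle_scalar(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t out[4])
    {
        out[0] = b0 >> 2;                        // top 6 bits of b0
        out[1] = ((b0 & 0x03) << 4) | (b1 >> 4); // low 2 of b0 + top 4 of b1
        out[2] = ((b1 & 0x0F) << 2) | (b2 >> 6); // low 4 of b1 + top 2 of b2
        out[3] = b2 & 0x3F;                      // low 6 bits of b2
    }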
data/vendor/libbase64/lib/arch/neon32/enc_translate.c
@@ -0,0 +1,57 @@
+static inline uint8x16x4_t
+enc_translate (const uint8x16x4_t in)
+{
+	// A lookup table containing the absolute offsets for all ranges:
+	const uint8x16_t lut = {
+		 65U,  71U, 252U, 252U,
+		252U, 252U, 252U, 252U,
+		252U, 252U, 252U, 252U,
+		237U, 240U,   0U,   0U
+	};
+
+	const uint8x16_t offset = vdupq_n_u8(51);
+
+	uint8x16x4_t indices, mask, delta, out;
+
+	// Translate values 0..63 to the Base64 alphabet. There are five sets:
+	// #  From      To         Abs    Index    Characters
+	// 0  [0..25]   [65..90]   +65    0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+	// 1  [26..51]  [97..122]  +71    1        abcdefghijklmnopqrstuvwxyz
+	// 2  [52..61]  [48..57]   -4     [2..11]  0123456789
+	// 3  [62]      [43]       -19    12       +
+	// 4  [63]      [47]       -16    13       /
+
+	// Create LUT indices from input:
+	// the index for range #0 is right, others are 1 less than expected:
+	indices.val[0] = vqsubq_u8(in.val[0], offset);
+	indices.val[1] = vqsubq_u8(in.val[1], offset);
+	indices.val[2] = vqsubq_u8(in.val[2], offset);
+	indices.val[3] = vqsubq_u8(in.val[3], offset);
+
+	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+	mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
+	mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
+	mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
+	mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));
+
+	// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
+	// now correct:
+	indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
+	indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
+	indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
+	indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);
+
+	// Lookup delta values:
+	delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
+	delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
+	delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
+	delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);
+
+	// Add delta values:
+	out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
+	out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
+	out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
+	out.val[3] = vaddq_u8(in.val[3], delta.val[3]);
+
+	return out;
+}
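The offset table stores each range's delta modulo 256: 252 = -4, 237 = -19, 240 = -16. In scalar form the same translation is a plain range check; a sketch with a hypothetical helper name:

    #include <stdint.h>

    // Scalar equivalent of enc_translate for one 6-bit value v:
    static uint8_t translate_scalar(uint8_t v)
    {
        if (v < 26)  return v + 65; // 0..25  -> 'A'..'Z'
        if (v < 52)  return v + 71; // 26..51 -> 'a'..'z'
        if (v < 62)  return v - 4;  // 52..61 -> '0'..'9'
        return (v == 62) ? 43 : 47; // 62 -> '+', 63 -> '/'
    }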
data/vendor/libbase64/lib/arch/neon64/codec.c
@@ -0,0 +1,70 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#ifdef __aarch64__
+#  if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64
+#    define BASE64_USE_NEON64
+#  endif
+#endif
+
+#ifdef BASE64_USE_NEON64
+#include <arm_neon.h>
+
+static inline uint8x16x4_t
+load_64byte_table (const uint8_t *p)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+	// As of October 2016, GCC does not support the 'vld1q_u8_x4()' intrinsic.
+	uint8x16x4_t ret;
+	ret.val[0] = vld1q_u8(p +  0);
+	ret.val[1] = vld1q_u8(p + 16);
+	ret.val[2] = vld1q_u8(p + 32);
+	ret.val[3] = vld1q_u8(p + 48);
+	return ret;
+#else
+	return vld1q_u8_x4(p);
+#endif
+}
+
+#include "../generic/32/dec_loop.c"
+#include "../generic/64/enc_loop.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_loop.c"
+
+#endif // BASE64_USE_NEON64
+
+// Stride size is so large on these NEON 64-bit functions
+// (48 bytes encode, 64 bytes decode) that we inline the
+// uint64 codec to stay performant on smaller inputs.
+
+BASE64_ENC_FUNCTION(neon64)
+{
+#ifdef BASE64_USE_NEON64
+	#include "../generic/enc_head.c"
+	enc_loop_neon64(&s, &slen, &o, &olen);
+	enc_loop_generic_64(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(neon64)
+{
+#ifdef BASE64_USE_NEON64
+	#include "../generic/dec_head.c"
+	dec_loop_neon64(&s, &slen, &o, &olen);
+	dec_loop_generic_32(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB
+#endif
+}
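These per-arch entry points sit behind libbase64's one-shot public API. A usage sketch, assuming the declarations and runtime codec detection described in include/libbase64.h (flags = 0 lets detection pick the best codec, e.g. neon64 on AArch64):

    #include <stdio.h>
    #include "libbase64.h"

    int main(void)
    {
        char out[16];
        size_t outlen;

        // One-shot encode; the codec is chosen at runtime when flags == 0.
        base64_encode("Hello", 5, out, &outlen, 0);
        printf("%.*s\n", (int) outlen, out); // prints "SGVsbG8="
        return 0;
    }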
data/vendor/libbase64/lib/arch/neon64/dec_loop.c
@@ -0,0 +1,129 @@
+// The input consists of five valid character sets in the Base64 alphabet,
+// which we need to map back to the 6-bit values they represent.
+// There are three ranges, two singles, and then there's the rest.
+//
+// #   From       To        LUT  Characters
+// 1   [0..42]    [255]     #1   invalid input
+// 2   [43]       [62]      #1   +
+// 3   [44..46]   [255]     #1   invalid input
+// 4   [47]       [63]      #1   /
+// 5   [48..57]   [52..61]  #1   0..9
+// 6   [58..63]   [255]     #1   invalid input
+// 7   [64]       [255]     #2   invalid input
+// 8   [65..90]   [0..25]   #2   A..Z
+// 9   [91..96]   [255]     #2   invalid input
+// 10  [97..122]  [26..51]  #2   a..z
+// 11  [123..126] [255]     #2   invalid input
+// (12) Everything else => invalid input
+
+// The first LUT will use the VTBL instruction (out of range indices are set to
+// 0 in destination).
+static const uint8_t dec_lut1[] = {
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,
+	 52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
+};
+
+// The second LUT will use the VTBX instruction (out of range indices will be
+// unchanged in destination). Input [64..126] will be mapped to index [1..63]
+// in this LUT. Index 0 means that value comes from LUT #1.
+static const uint8_t dec_lut2[] = {
+	  0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
+	 14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
+	255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
+	 40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
+};
+
+// All input values in range for the first look-up will be 0U in the second
+// look-up result. All input values out of range for the first look-up will be
+// 0U in the first look-up result. Thus, the two results can be ORed without
+// conflicts.
+//
+// Invalid characters that are in the valid range for either look-up will be
+// set to 255U in the combined result. Other invalid characters will just be
+// passed through with the second look-up result (using the VTBX instruction).
+// Since the second LUT is 64 bytes, those passed-through values are guaranteed
+// to have a value greater than 63U. Therefore, valid characters will be mapped
+// to the valid [0..63] range and all invalid characters will be mapped to
+// values greater than 63.
+
+static inline void
+dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 64) {
+		return;
+	}
+
+	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+	// extra trailing zero bytes are written, so it is not necessary to
+	// reserve extra input bytes:
+	size_t rounds = *slen / 64;
+
+	*slen -= rounds * 64;	// 64 bytes consumed per round
+	*olen += rounds * 48;	// 48 bytes produced per round
+
+	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
+	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);
+
+	do {
+		const uint8x16_t offset = vdupq_n_u8(63U);
+		uint8x16x4_t dec1, dec2;
+		uint8x16x3_t dec;
+
+		// Load 64 bytes and deinterleave:
+		uint8x16x4_t str = vld4q_u8((uint8_t *) *s);
+
+		// Get indices for second LUT:
+		dec2.val[0] = vqsubq_u8(str.val[0], offset);
+		dec2.val[1] = vqsubq_u8(str.val[1], offset);
+		dec2.val[2] = vqsubq_u8(str.val[2], offset);
+		dec2.val[3] = vqsubq_u8(str.val[3], offset);
+
+		// Get values from first LUT:
+		dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
+		dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
+		dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
+		dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);
+
+		// Get values from second LUT:
+		dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
+		dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
+		dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
+		dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);
+
+		// Get final values:
+		str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
+		str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
+		str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
+		str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);
+
+		// Check for invalid input, any value larger than 63:
+		const uint8x16_t classified
+			= vcgtq_u8(str.val[0], vdupq_n_u8(63))
+			| vcgtq_u8(str.val[1], vdupq_n_u8(63))
+			| vcgtq_u8(str.val[2], vdupq_n_u8(63))
+			| vcgtq_u8(str.val[3], vdupq_n_u8(63));
+
+		// Check that all bits are zero:
+		if (vmaxvq_u8(classified) != 0U) {
+			break;
+		}
+
+		// Compress four bytes into three:
+		dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
+		dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
+		dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];
+
+		// Interleave and store decoded result:
+		vst3q_u8((uint8_t *) *o, dec);
+
+		*s += 64;
+		*o += 48;
+
+	} while (--rounds > 0);
+
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 64;
+	*olen -= rounds * 48;
+}
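The two-LUT trick reads naturally in scalar terms: VTBL yields 0 for out-of-range indices, while VTBX leaves the destination byte unchanged. A per-byte model of the classification above (hypothetical helper name):

    #include <stdint.h>

    // Scalar model of the NEON classification; result > 63 means invalid.
    static uint8_t decode_classify(uint8_t c)
    {
        uint8_t v1 = (c < 64) ? dec_lut1[c] : 0;        // VTBL: out of range -> 0
        uint8_t i2 = (c > 63) ? (uint8_t) (c - 63) : 0; // vqsubq_u8 saturates at 0
        uint8_t v2 = (i2 < 64) ? dec_lut2[i2] : i2;     // VTBX: out of range -> unchanged
        return v1 | v2;                                 // e.g. 'A' (65): 0 | dec_lut2[2] == 0
    }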
data/vendor/libbase64/lib/arch/neon64/enc_loop.c
@@ -0,0 +1,66 @@
+static inline void
+enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t *tbl_enc)
+{
+	// Load 48 bytes and deinterleave:
+	uint8x16x3_t src = vld3q_u8(*s);
+
+	// Divide bits of three input bytes over four output bytes:
+	uint8x16x4_t out = enc_reshuffle(src);
+
+	// The bits have now been shifted to the right locations;
+	// translate their values 0..63 to the Base64 alphabet.
+	// Use a 64-byte table lookup:
+	out.val[0] = vqtbl4q_u8(*tbl_enc, out.val[0]);
+	out.val[1] = vqtbl4q_u8(*tbl_enc, out.val[1]);
+	out.val[2] = vqtbl4q_u8(*tbl_enc, out.val[2]);
+	out.val[3] = vqtbl4q_u8(*tbl_enc, out.val[3]);
+
+	// Interleave and store output:
+	vst4q_u8(*o, out);
+
+	*s += 48;
+	*o += 64;
+}
+
+static inline void
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t rounds = *slen / 48;
+
+	*slen -= rounds * 48;	// 48 bytes consumed per round
+	*olen += rounds * 64;	// 64 bytes produced per round
+
+	// Load the encoding table:
+	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
+
+	while (rounds > 0) {
+		if (rounds >= 8) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 8;
+			continue;
+		}
+		if (rounds >= 4) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 4;
+			continue;
+		}
+		if (rounds >= 2) {
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			enc_loop_neon64_inner(s, o, &tbl_enc);
+			rounds -= 2;
+			continue;
+		}
+		enc_loop_neon64_inner(s, o, &tbl_enc);
+		break;
+	}
+}
data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c
@@ -0,0 +1,54 @@
+static inline uint8x16x4_t
+enc_reshuffle (const uint8x16x3_t in)
+{
+	uint8x16x4_t out;
+
+#if defined(__GNUC__) || defined(__clang__)
+
+	// GCC and Clang support the following inline assembly syntax. This
+	// inline assembly implements the exact same algorithm as the
+	// intrinsics further down, but benchmarks show that the inline
+	// assembly easily beats the intrinsics. Perhaps this is because the
+	// inline assembly is well pipelined to avoid data dependencies.
+
+	__asm__ (
+		"ushr %[o0].16b, %[i0].16b, #2 \n\t"
+		"ushr %[o1].16b, %[i1].16b, #2 \n\t"
+		"ushr %[o2].16b, %[i2].16b, #4 \n\t"
+		"sli  %[o1].16b, %[i0].16b, #6 \n\t"
+		"sli  %[o2].16b, %[i1].16b, #4 \n\t"
+		"shl  %[o3].16b, %[i2].16b, #2 \n\t"
+
+		"ushr %[o1].16b, %[o1].16b, #2 \n\t"
+		"ushr %[o2].16b, %[o2].16b, #2 \n\t"
+		"ushr %[o3].16b, %[o3].16b, #2 \n\t"
+
+		// Outputs:
+		: [o0] "=&w" (out.val[0]),
+		  [o1] "=&w" (out.val[1]),
+		  [o2] "=&w" (out.val[2]),
+		  [o3] "=&w" (out.val[3])
+
+		// Inputs:
+		: [i0] "w" (in.val[0]),
+		  [i1] "w" (in.val[1]),
+		  [i2] "w" (in.val[2])
+	);
+#else
+	// Divide bits of three input bytes over four output bytes. All output
+	// bytes except the first one are shifted over two bits to the left:
+	out.val[0] = vshrq_n_u8(in.val[0], 2);
+	out.val[1] = vshrq_n_u8(in.val[1], 2);
+	out.val[2] = vshrq_n_u8(in.val[2], 4);
+	out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
+	out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
+	out.val[3] = vshlq_n_u8(in.val[2], 2);
+
+	// Clear the top two bits by shifting the output back to the right:
+	out.val[1] = vshrq_n_u8(out.val[1], 2);
+	out.val[2] = vshrq_n_u8(out.val[2], 2);
+	out.val[3] = vshrq_n_u8(out.val[3], 2);
+#endif
+
+	return out;
+}
data/vendor/libbase64/lib/arch/sse41/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSE41
+#include <smmintrin.h>
+
+#include "../ssse3/dec_reshuffle.c"
+#include "../ssse3/dec_loop.c"
+#include "../ssse3/enc_translate.c"
+#include "../ssse3/enc_reshuffle.c"
+#include "../ssse3/enc_loop.c"
+
+#endif // HAVE_SSE41
+
+BASE64_ENC_FUNCTION(sse41)
+{
+#if HAVE_SSE41
+	#include "../generic/enc_head.c"
+	enc_loop_ssse3(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(sse41)
+{
+#if HAVE_SSE41
+	#include "../generic/dec_head.c"
+	dec_loop_ssse3(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB
+#endif
+}
data/vendor/libbase64/lib/arch/sse42/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSE42
+#include <nmmintrin.h>
+
+#include "../ssse3/dec_reshuffle.c"
+#include "../ssse3/dec_loop.c"
+#include "../ssse3/enc_translate.c"
+#include "../ssse3/enc_reshuffle.c"
+#include "../ssse3/enc_loop.c"
+
+#endif // HAVE_SSE42
+
+BASE64_ENC_FUNCTION(sse42)
+{
+#if HAVE_SSE42
+	#include "../generic/enc_head.c"
+	enc_loop_ssse3(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(sse42)
+{
+#if HAVE_SSE42
+	#include "../generic/dec_head.c"
+	dec_loop_ssse3(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB
+#endif
+}
data/vendor/libbase64/lib/arch/ssse3/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSSE3
+#include <tmmintrin.h>
+
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_translate.c"
+#include "enc_loop.c"
+
+#endif // HAVE_SSSE3
+
+BASE64_ENC_FUNCTION(ssse3)
+{
+#if HAVE_SSSE3
+	#include "../generic/enc_head.c"
+	enc_loop_ssse3(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(ssse3)
+{
+#if HAVE_SSSE3
+	#include "../generic/dec_head.c"
+	dec_loop_ssse3(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	BASE64_DEC_STUB
+#endif
+}