ob64 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ob64/version.rb +1 -1
- data/ob64.gemspec +2 -0
- data/vendor/libbase64/.gitignore +12 -0
- data/vendor/libbase64/.travis.yml +71 -0
- data/vendor/libbase64/CMakeLists.txt +264 -0
- data/vendor/libbase64/LICENSE +28 -0
- data/vendor/libbase64/Makefile +93 -0
- data/vendor/libbase64/README.md +474 -0
- data/vendor/libbase64/base64-benchmarks.png +0 -0
- data/vendor/libbase64/bin/base64.c +132 -0
- data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
- data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
- data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
- data/vendor/libbase64/cmake/config.h.in +25 -0
- data/vendor/libbase64/cmake/test-arch.c +35 -0
- data/vendor/libbase64/include/libbase64.h +145 -0
- data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
- data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
- data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
- data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
- data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
- data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
- data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
- data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
- data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
- data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
- data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
- data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
- data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
- data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
- data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
- data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
- data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
- data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
- data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
- data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
- data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
- data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
- data/vendor/libbase64/lib/codec_choose.c +281 -0
- data/vendor/libbase64/lib/codecs.h +65 -0
- data/vendor/libbase64/lib/env.h +67 -0
- data/vendor/libbase64/lib/exports.txt +7 -0
- data/vendor/libbase64/lib/lib.c +164 -0
- data/vendor/libbase64/lib/lib_openmp.c +149 -0
- data/vendor/libbase64/lib/tables/.gitignore +1 -0
- data/vendor/libbase64/lib/tables/Makefile +17 -0
- data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
- data/vendor/libbase64/lib/tables/table_generator.c +184 -0
- data/vendor/libbase64/lib/tables/tables.c +40 -0
- data/vendor/libbase64/lib/tables/tables.h +23 -0
- metadata +64 -4
data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c
@@ -0,0 +1,54 @@
+static inline uint8x16x4_t
+enc_reshuffle (const uint8x16x3_t in)
+{
+    uint8x16x4_t out;
+
+#if defined(__GNUC__) || defined(__clang__)
+
+    // GCC and Clang support the following inline assembly syntax. This
+    // inline assembly implements the exact same algorithm as the
+    // intrinsics further down, but benchmarks show that the inline
+    // assembly easily beats the intrinsics. Perhaps this is because the
+    // inline assembly is well pipelined to avoid data dependencies.
+
+    __asm__ (
+        "vshr.u8 %q[o0], %q[i0], #2 \n\t"
+        "vshr.u8 %q[o1], %q[i1], #2 \n\t"
+        "vshr.u8 %q[o2], %q[i2], #4 \n\t"
+        "vsli.8 %q[o1], %q[i0], #6 \n\t"
+        "vsli.8 %q[o2], %q[i1], #4 \n\t"
+        "vshl.u8 %q[o3], %q[i2], #2 \n\t"
+
+        "vshr.u8 %q[o1], %q[o1], #2 \n\t"
+        "vshr.u8 %q[o2], %q[o2], #2 \n\t"
+        "vshr.u8 %q[o3], %q[o3], #2 \n\t"
+
+        // Outputs:
+        : [o0] "=&w" (out.val[0]),
+          [o1] "=&w" (out.val[1]),
+          [o2] "=&w" (out.val[2]),
+          [o3] "=&w" (out.val[3])
+
+        // Inputs:
+        : [i0] "w" (in.val[0]),
+          [i1] "w" (in.val[1]),
+          [i2] "w" (in.val[2])
+    );
+#else
+    // Divide bits of three input bytes over four output bytes. All output
+    // bytes except the first one are shifted over two bits to the left:
+    out.val[0] = vshrq_n_u8(in.val[0], 2);
+    out.val[1] = vshrq_n_u8(in.val[1], 2);
+    out.val[2] = vshrq_n_u8(in.val[2], 4);
+    out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
+    out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
+    out.val[3] = vshlq_n_u8(in.val[2], 2);
+
+    // Clear the top two bits by shifting the output back to the right:
+    out.val[1] = vshrq_n_u8(out.val[1], 2);
+    out.val[2] = vshrq_n_u8(out.val[2], 2);
+    out.val[3] = vshrq_n_u8(out.val[3], 2);
+#endif
+
+    return out;
+}
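The reshuffle above splits every 3 input bytes into 4 six-bit values, sixteen lanes at a time. As a reference point, here is a minimal scalar sketch of the same bit arithmetic (illustration only, not part of the gem; the helper name is made up):

```c
#include <stdint.h>
#include <stdio.h>

// Scalar equivalent of the reshuffle step: split 3 bytes into 4 six-bit
// values (0..63), most-significant group first. (Hypothetical helper.)
static void reshuffle_scalar(const uint8_t in[3], uint8_t out[4])
{
    out[0] = in[0] >> 2;                           // top 6 bits of byte 0
    out[1] = ((in[0] & 0x03) << 4) | (in[1] >> 4); // low 2 of byte 0, top 4 of byte 1
    out[2] = ((in[1] & 0x0F) << 2) | (in[2] >> 6); // low 4 of byte 1, top 2 of byte 2
    out[3] = in[2] & 0x3F;                         // low 6 bits of byte 2
}

int main(void)
{
    const uint8_t in[3] = { 'M', 'a', 'n' };       // classic example input
    uint8_t out[4];
    reshuffle_scalar(in, out);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); // 19 22 5 46
    return 0;
}
```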
data/vendor/libbase64/lib/arch/neon32/enc_translate.c
@@ -0,0 +1,57 @@
+static inline uint8x16x4_t
+enc_translate (const uint8x16x4_t in)
+{
+    // A lookup table containing the absolute offsets for all ranges:
+    const uint8x16_t lut = {
+         65U,  71U, 252U, 252U,
+        252U, 252U, 252U, 252U,
+        252U, 252U, 252U, 252U,
+        237U, 240U,   0U,   0U
+    };
+
+    const uint8x16_t offset = vdupq_n_u8(51);
+
+    uint8x16x4_t indices, mask, delta, out;
+
+    // Translate values 0..63 to the Base64 alphabet. There are five sets:
+    // #  From      To         Abs    Index  Characters
+    // 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
+    // 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
+    // 2  [52..61]  [48..57]    -4  [2..11]  0123456789
+    // 3  [62]      [43]       -19       12  +
+    // 4  [63]      [47]       -16       13  /
+
+    // Create LUT indices from input:
+    // the index for range #0 is right, others are 1 less than expected:
+    indices.val[0] = vqsubq_u8(in.val[0], offset);
+    indices.val[1] = vqsubq_u8(in.val[1], offset);
+    indices.val[2] = vqsubq_u8(in.val[2], offset);
+    indices.val[3] = vqsubq_u8(in.val[3], offset);
+
+    // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+    mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
+    mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
+    mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
+    mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));
+
+    // Subtract -1, so add 1 to indices for range #[1..4], All indices are
+    // now correct:
+    indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
+    indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
+    indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
+    indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);
+
+    // Lookup delta values:
+    delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
+    delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
+    delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
+    delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);
+
+    // Add delta values:
+    out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
+    out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
+    out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
+    out.val[3] = vaddq_u8(in.val[3], delta.val[3]);
+
+    return out;
+}
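The translation step above avoids a full 64-byte alphabet lookup by adding a per-range offset instead. A scalar sketch of the same five-range scheme from the comment table (illustration only, not part of the gem):

```c
#include <stdint.h>
#include <stdio.h>

// Scalar equivalent of the offset-based translation: map a 6-bit value
// (0..63) to its Base64 character by adding a per-range offset.
static uint8_t translate_scalar(uint8_t v)
{
    if (v < 26)  return v + 65; // 'A'..'Z'
    if (v < 52)  return v + 71; // 'a'..'z'
    if (v < 62)  return v - 4;  // '0'..'9'
    if (v == 62) return v - 19; // '+'
    return v - 16;              // '/'
}

int main(void)
{
    const uint8_t vals[4] = { 19, 22, 5, 46 };
    for (int i = 0; i < 4; i++)
        putchar(translate_scalar(vals[i]));
    putchar('\n');              // prints "TWFu"
    return 0;
}
```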
data/vendor/libbase64/lib/arch/neon64/codec.c
@@ -0,0 +1,70 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#ifdef __aarch64__
+# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64
+# define BASE64_USE_NEON64
+# endif
+#endif
+
+#ifdef BASE64_USE_NEON64
+#include <arm_neon.h>
+
+static inline uint8x16x4_t
+load_64byte_table (const uint8_t *p)
+{
+#if defined(__GNUC__) && !defined(__clang__)
+    // As of October 2016, GCC does not support the 'vld1q_u8_x4()' intrinsic.
+    uint8x16x4_t ret;
+    ret.val[0] = vld1q_u8(p + 0);
+    ret.val[1] = vld1q_u8(p + 16);
+    ret.val[2] = vld1q_u8(p + 32);
+    ret.val[3] = vld1q_u8(p + 48);
+    return ret;
+#else
+    return vld1q_u8_x4(p);
+#endif
+}
+
+#include "../generic/32/dec_loop.c"
+#include "../generic/64/enc_loop.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_loop.c"
+
+#endif // BASE64_USE_NEON64
+
+// Stride size is so large on these NEON 64-bit functions
+// (48 bytes encode, 64 bytes decode) that we inline the
+// uint64 codec to stay performant on smaller inputs.
+
+BASE64_ENC_FUNCTION(neon64)
+{
+#ifdef BASE64_USE_NEON64
+    #include "../generic/enc_head.c"
+    enc_loop_neon64(&s, &slen, &o, &olen);
+    enc_loop_generic_64(&s, &slen, &o, &olen);
+    #include "../generic/enc_tail.c"
+#else
+    BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(neon64)
+{
+#ifdef BASE64_USE_NEON64
+    #include "../generic/dec_head.c"
+    dec_loop_neon64(&s, &slen, &o, &olen);
+    dec_loop_generic_32(&s, &slen, &o, &olen);
+    #include "../generic/dec_tail.c"
+#else
+    BASE64_DEC_STUB
+#endif
+}
data/vendor/libbase64/lib/arch/neon64/dec_loop.c
@@ -0,0 +1,129 @@
+// The input consists of five valid character sets in the Base64 alphabet,
+// which we need to map back to the 6-bit values they represent.
+// There are three ranges, two singles, and then there's the rest.
+//
+// #   From       To        LUT  Characters
+// 1   [0..42]    [255]      #1  invalid input
+// 2   [43]       [62]       #1  +
+// 3   [44..46]   [255]      #1  invalid input
+// 4   [47]       [63]       #1  /
+// 5   [48..57]   [52..61]   #1  0..9
+// 6   [58..63]   [255]      #1  invalid input
+// 7   [64]       [255]      #2  invalid input
+// 8   [65..90]   [0..25]    #2  A..Z
+// 9   [91..96]   [255]      #2  invalid input
+// 10  [97..122]  [26..51]   #2  a..z
+// 11  [123..126] [255]      #2  invalid input
+// (12) Everything else => invalid input
+
+// The first LUT will use the VTBL instruction (out of range indices are set to
+// 0 in destination).
+static const uint8_t dec_lut1[] = {
+    255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+    255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+    255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,
+     52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
+};
+
+// The second LUT will use the VTBX instruction (out of range indices will be
+// unchanged in destination). Input [64..126] will be mapped to index [1..63]
+// in this LUT. Index 0 means that value comes from LUT #1.
+static const uint8_t dec_lut2[] = {
+      0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
+     14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
+    255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
+     40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
+};
+
+// All input values in range for the first look-up will be 0U in the second
+// look-up result. All input values out of range for the first look-up will be
+// 0U in the first look-up result. Thus, the two results can be ORed without
+// conflicts.
+//
+// Invalid characters that are in the valid range for either look-up will be
+// set to 255U in the combined result. Other invalid characters will just be
+// passed through with the second look-up result (using the VTBX instruction).
+// Since the second LUT is 64 bytes, those passed-through values are guaranteed
+// to have a value greater than 63U. Therefore, valid characters will be mapped
+// to the valid [0..63] range and all invalid characters will be mapped to
+// values greater than 63.
+
+static inline void
+dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+    if (*slen < 64) {
+        return;
+    }
+
+    // Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+    // extra trailing zero bytes are written, so it is not necessary to
+    // reserve extra input bytes:
+    size_t rounds = *slen / 64;
+
+    *slen -= rounds * 64;   // 64 bytes consumed per round
+    *olen += rounds * 48;   // 48 bytes produced per round
+
+    const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
+    const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);
+
+    do {
+        const uint8x16_t offset = vdupq_n_u8(63U);
+        uint8x16x4_t dec1, dec2;
+        uint8x16x3_t dec;
+
+        // Load 64 bytes and deinterleave:
+        uint8x16x4_t str = vld4q_u8((uint8_t *) *s);
+
+        // Get indices for second LUT:
+        dec2.val[0] = vqsubq_u8(str.val[0], offset);
+        dec2.val[1] = vqsubq_u8(str.val[1], offset);
+        dec2.val[2] = vqsubq_u8(str.val[2], offset);
+        dec2.val[3] = vqsubq_u8(str.val[3], offset);
+
+        // Get values from first LUT:
+        dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
+        dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
+        dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
+        dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);
+
+        // Get values from second LUT:
+        dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
+        dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
+        dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
+        dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);
+
+        // Get final values:
+        str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
+        str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
+        str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
+        str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);
+
+        // Check for invalid input, any value larger than 63:
+        const uint8x16_t classified
+            = vcgtq_u8(str.val[0], vdupq_n_u8(63))
+            | vcgtq_u8(str.val[1], vdupq_n_u8(63))
+            | vcgtq_u8(str.val[2], vdupq_n_u8(63))
+            | vcgtq_u8(str.val[3], vdupq_n_u8(63));
+
+        // Check that all bits are zero:
+        if (vmaxvq_u8(classified) != 0U) {
+            break;
+        }
+
+        // Compress four bytes into three:
+        dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
+        dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
+        dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];
+
+        // Interleave and store decoded result:
+        vst3q_u8((uint8_t *) *o, dec);
+
+        *s += 64;
+        *o += 48;
+
+    } while (--rounds > 0);
+
+    // Adjust for any rounds that were skipped:
+    *slen += rounds * 64;
+    *olen -= rounds * 48;
+}
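The tail of the decode loop packs four 6-bit values back into three bytes, and the vmaxvq_u8 check above bails out as soon as any lane decoded to a value above 63. A scalar sketch of that last step (illustration only; the helper name is hypothetical):

```c
#include <stdint.h>
#include <stdio.h>

// Scalar equivalent of the "compress four bytes into three" step. Each input
// element holds a decoded 6-bit value; anything above 63 marks invalid input.
static int pack_scalar(const uint8_t in[4], uint8_t out[3])
{
    if ((in[0] | in[1] | in[2] | in[3]) > 63)
        return 0;                                    // invalid input
    out[0] = (uint8_t)((in[0] << 2) | (in[1] >> 4));
    out[1] = (uint8_t)((in[1] << 4) | (in[2] >> 2));
    out[2] = (uint8_t)((in[2] << 6) | in[3]);
    return 1;
}

int main(void)
{
    const uint8_t in[4] = { 19, 22, 5, 46 };         // "TWFu" after the table lookups
    uint8_t out[3];
    if (pack_scalar(in, out))
        printf("%c%c%c\n", out[0], out[1], out[2]);  // prints "Man"
    return 0;
}
```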
data/vendor/libbase64/lib/arch/neon64/enc_loop.c
@@ -0,0 +1,66 @@
+static inline void
+enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t *tbl_enc)
+{
+    // Load 48 bytes and deinterleave:
+    uint8x16x3_t src = vld3q_u8(*s);
+
+    // Divide bits of three input bytes over four output bytes:
+    uint8x16x4_t out = enc_reshuffle(src);
+
+    // The bits have now been shifted to the right locations;
+    // translate their values 0..63 to the Base64 alphabet.
+    // Use a 64-byte table lookup:
+    out.val[0] = vqtbl4q_u8(*tbl_enc, out.val[0]);
+    out.val[1] = vqtbl4q_u8(*tbl_enc, out.val[1]);
+    out.val[2] = vqtbl4q_u8(*tbl_enc, out.val[2]);
+    out.val[3] = vqtbl4q_u8(*tbl_enc, out.val[3]);
+
+    // Interleave and store output:
+    vst4q_u8(*o, out);
+
+    *s += 48;
+    *o += 64;
+}
+
+static inline void
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+    size_t rounds = *slen / 48;
+
+    *slen -= rounds * 48;   // 48 bytes consumed per round
+    *olen += rounds * 64;   // 64 bytes produced per round
+
+    // Load the encoding table:
+    const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
+
+    while (rounds > 0) {
+        if (rounds >= 8) {
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            rounds -= 8;
+            continue;
+        }
+        if (rounds >= 4) {
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            rounds -= 4;
+            continue;
+        }
+        if (rounds >= 2) {
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            enc_loop_neon64_inner(s, o, &tbl_enc);
+            rounds -= 2;
+            continue;
+        }
+        enc_loop_neon64_inner(s, o, &tbl_enc);
+        break;
+    }
+}
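The outer encode loop is unrolled in an 8/4/2/1 cascade so that most rounds pay the loop-control overhead only once per batch of eight. Stripped of the NEON specifics, the control pattern is the following generic sketch (`tick` merely stands in for `enc_loop_neon64_inner`; not part of the gem):

```c
#include <stddef.h>
#include <stdio.h>

// Generic sketch of the 8/4/2/1 unrolling cascade: invoke `body` exactly
// `rounds` times while testing the counter as rarely as possible.
static void run_unrolled(size_t rounds, void (*body)(void))
{
    while (rounds > 0) {
        if (rounds >= 8) {
            body(); body(); body(); body();
            body(); body(); body(); body();
            rounds -= 8;
            continue;
        }
        if (rounds >= 4) {
            body(); body(); body(); body();
            rounds -= 4;
            continue;
        }
        if (rounds >= 2) {
            body(); body();
            rounds -= 2;
            continue;
        }
        body();
        break;
    }
}

static void tick(void) { putchar('.'); }

int main(void)
{
    run_unrolled(11, tick); // prints 11 dots: one batch of 8, one of 2, one single
    putchar('\n');
    return 0;
}
```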
data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c
@@ -0,0 +1,54 @@
+static inline uint8x16x4_t
+enc_reshuffle (const uint8x16x3_t in)
+{
+    uint8x16x4_t out;
+
+#if defined(__GNUC__) || defined(__clang__)
+
+    // GCC and Clang support the following inline assembly syntax. This
+    // inline assembly implements the exact same algorithm as the
+    // intrinsics further down, but benchmarks show that the inline
+    // assembly easily beats the intrinsics. Perhaps this is because the
+    // inline assembly is well pipelined to avoid data dependencies.
+
+    __asm__ (
+        "ushr %[o0].16b, %[i0].16b, #2 \n\t"
+        "ushr %[o1].16b, %[i1].16b, #2 \n\t"
+        "ushr %[o2].16b, %[i2].16b, #4 \n\t"
+        "sli %[o1].16b, %[i0].16b, #6 \n\t"
+        "sli %[o2].16b, %[i1].16b, #4 \n\t"
+        "shl %[o3].16b, %[i2].16b, #2 \n\t"
+
+        "ushr %[o1].16b, %[o1].16b, #2 \n\t"
+        "ushr %[o2].16b, %[o2].16b, #2 \n\t"
+        "ushr %[o3].16b, %[o3].16b, #2 \n\t"
+
+        // Outputs:
+        : [o0] "=&w" (out.val[0]),
+          [o1] "=&w" (out.val[1]),
+          [o2] "=&w" (out.val[2]),
+          [o3] "=&w" (out.val[3])
+
+        // Inputs:
+        : [i0] "w" (in.val[0]),
+          [i1] "w" (in.val[1]),
+          [i2] "w" (in.val[2])
+    );
+#else
+    // Divide bits of three input bytes over four output bytes. All output
+    // bytes except the first one are shifted over two bits to the left:
+    out.val[0] = vshrq_n_u8(in.val[0], 2);
+    out.val[1] = vshrq_n_u8(in.val[1], 2);
+    out.val[2] = vshrq_n_u8(in.val[2], 4);
+    out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
+    out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
+    out.val[3] = vshlq_n_u8(in.val[2], 2);
+
+    // Clear the top two bits by shifting the output back to the right:
+    out.val[1] = vshrq_n_u8(out.val[1], 2);
+    out.val[2] = vshrq_n_u8(out.val[2], 2);
+    out.val[3] = vshrq_n_u8(out.val[3], 2);
+#endif
+
+    return out;
+}
data/vendor/libbase64/lib/arch/sse41/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSE41
+#include <smmintrin.h>
+
+#include "../ssse3/dec_reshuffle.c"
+#include "../ssse3/dec_loop.c"
+#include "../ssse3/enc_translate.c"
+#include "../ssse3/enc_reshuffle.c"
+#include "../ssse3/enc_loop.c"
+
+#endif // HAVE_SSE41
+
+BASE64_ENC_FUNCTION(sse41)
+{
+#if HAVE_SSE41
+    #include "../generic/enc_head.c"
+    enc_loop_ssse3(&s, &slen, &o, &olen);
+    #include "../generic/enc_tail.c"
+#else
+    BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(sse41)
+{
+#if HAVE_SSE41
+    #include "../generic/dec_head.c"
+    dec_loop_ssse3(&s, &slen, &o, &olen);
+    #include "../generic/dec_tail.c"
+#else
+    BASE64_DEC_STUB
+#endif
+}
data/vendor/libbase64/lib/arch/sse42/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSE42
+#include <nmmintrin.h>
+
+#include "../ssse3/dec_reshuffle.c"
+#include "../ssse3/dec_loop.c"
+#include "../ssse3/enc_translate.c"
+#include "../ssse3/enc_reshuffle.c"
+#include "../ssse3/enc_loop.c"
+
+#endif // HAVE_SSE42
+
+BASE64_ENC_FUNCTION(sse42)
+{
+#if HAVE_SSE42
+    #include "../generic/enc_head.c"
+    enc_loop_ssse3(&s, &slen, &o, &olen);
+    #include "../generic/enc_tail.c"
+#else
+    BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(sse42)
+{
+#if HAVE_SSE42
+    #include "../generic/dec_head.c"
+    dec_loop_ssse3(&s, &slen, &o, &olen);
+    #include "../generic/dec_tail.c"
+#else
+    BASE64_DEC_STUB
+#endif
+}
data/vendor/libbase64/lib/arch/ssse3/codec.c
@@ -0,0 +1,42 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "../../../include/libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSSE3
+#include <tmmintrin.h>
+
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_translate.c"
+#include "enc_loop.c"
+
+#endif // HAVE_SSSE3
+
+BASE64_ENC_FUNCTION(ssse3)
+{
+#if HAVE_SSSE3
+    #include "../generic/enc_head.c"
+    enc_loop_ssse3(&s, &slen, &o, &olen);
+    #include "../generic/enc_tail.c"
+#else
+    BASE64_ENC_STUB
+#endif
+}
+
+BASE64_DEC_FUNCTION(ssse3)
+{
+#if HAVE_SSSE3
+    #include "../generic/dec_head.c"
+    dec_loop_ssse3(&s, &slen, &o, &olen);
+    #include "../generic/dec_tail.c"
+#else
+    BASE64_DEC_STUB
+#endif
+}
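All of these per-architecture codec files sit behind the public entry points declared in data/vendor/libbase64/include/libbase64.h. As a rough usage sketch, assuming the upstream aklomp/libbase64 signatures (base64_encode/base64_decode taking a flags argument, with flags = 0 letting the library pick the fastest codec detected at runtime), calling the vendored library directly from C might look like this:

```c
#include <stdio.h>
#include <string.h>
#include "libbase64.h" // data/vendor/libbase64/include/libbase64.h

int main(void)
{
    const char *src = "Man";
    char enc[16], dec[16];
    size_t enclen = 0, declen = 0;

    // flags = 0: let the library choose the best available codec.
    base64_encode(src, strlen(src), enc, &enclen, 0);
    printf("%.*s\n", (int) enclen, enc);             // expected: TWFu

    // base64_decode is assumed to return nonzero on success, 0 on invalid input.
    if (base64_decode(enc, enclen, dec, &declen, 0))
        printf("%.*s\n", (int) declen, dec);         // expected: Man

    return 0;
}
```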