ob64 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ob64/version.rb +1 -1
- data/ob64.gemspec +2 -0
- data/vendor/libbase64/.gitignore +12 -0
- data/vendor/libbase64/.travis.yml +71 -0
- data/vendor/libbase64/CMakeLists.txt +264 -0
- data/vendor/libbase64/LICENSE +28 -0
- data/vendor/libbase64/Makefile +93 -0
- data/vendor/libbase64/README.md +474 -0
- data/vendor/libbase64/base64-benchmarks.png +0 -0
- data/vendor/libbase64/bin/base64.c +132 -0
- data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
- data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
- data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
- data/vendor/libbase64/cmake/config.h.in +25 -0
- data/vendor/libbase64/cmake/test-arch.c +35 -0
- data/vendor/libbase64/include/libbase64.h +145 -0
- data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
- data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
- data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
- data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
- data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
- data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
- data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
- data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
- data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
- data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
- data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
- data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
- data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
- data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
- data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
- data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
- data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
- data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
- data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
- data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
- data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
- data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
- data/vendor/libbase64/lib/codec_choose.c +281 -0
- data/vendor/libbase64/lib/codecs.h +65 -0
- data/vendor/libbase64/lib/env.h +67 -0
- data/vendor/libbase64/lib/exports.txt +7 -0
- data/vendor/libbase64/lib/lib.c +164 -0
- data/vendor/libbase64/lib/lib_openmp.c +149 -0
- data/vendor/libbase64/lib/tables/.gitignore +1 -0
- data/vendor/libbase64/lib/tables/Makefile +17 -0
- data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
- data/vendor/libbase64/lib/tables/table_generator.c +184 -0
- data/vendor/libbase64/lib/tables/tables.c +40 -0
- data/vendor/libbase64/lib/tables/tables.h +23 -0
- metadata +64 -4
@@ -0,0 +1,86 @@
|
|
1
|
+
static inline int
|
2
|
+
dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
|
3
|
+
{
|
4
|
+
const uint32_t str
|
5
|
+
= base64_table_dec_32bit_d0[(*s)[0]]
|
6
|
+
| base64_table_dec_32bit_d1[(*s)[1]]
|
7
|
+
| base64_table_dec_32bit_d2[(*s)[2]]
|
8
|
+
| base64_table_dec_32bit_d3[(*s)[3]];
|
9
|
+
|
10
|
+
#if BASE64_LITTLE_ENDIAN
|
11
|
+
|
12
|
+
// LUTs for little-endian set MSB in case of invalid character:
|
13
|
+
if (str & UINT32_C(0x80000000)) {
|
14
|
+
return 0;
|
15
|
+
}
|
16
|
+
#else
|
17
|
+
// LUTs for big-endian set LSB in case of invalid character:
|
18
|
+
if (str & UINT32_C(1)) {
|
19
|
+
return 0;
|
20
|
+
}
|
21
|
+
#endif
|
22
|
+
// Store the output:
|
23
|
+
memcpy(*o, &str, sizeof (str));
|
24
|
+
|
25
|
+
*s += 4;
|
26
|
+
*o += 3;
|
27
|
+
*rounds -= 1;
|
28
|
+
|
29
|
+
return 1;
|
30
|
+
}
|
31
|
+
|
32
|
+
// Bulk scalar decoder loop: consumes the input in blocks of 4 base64
// characters, producing 3 decoded bytes per block via
// dec_loop_generic_32_inner(). Lengths are debited/credited up front
// and rolled back for any rounds not completed, so the bytewise tail
// code sees accurate remaining counts. Stops early on invalid input.
static inline void
dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    if (*slen < 8) {
        return;
    }

    // Process blocks of 4 bytes per round. Because one extra zero byte is
    // written after the output, ensure that there will be at least 4 bytes
    // of input data left to cover the gap. (Two data bytes and up to two
    // end-of-string markers.)
    size_t rounds = (*slen - 4) / 4;

    *slen -= rounds * 4;    // 4 bytes consumed per round
    *olen += rounds * 3;    // 3 bytes produced per round

    do {
        // Manual unrolling by 8/4/2/1. Each inner call returns 0 on
        // invalid input, short-circuiting the && chain; the loop then
        // breaks with `rounds` reflecting the unconsumed blocks.
        if (rounds >= 8) {
            if (dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds)) {
                continue;
            }
            break;
        }
        if (rounds >= 4) {
            if (dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds)) {
                continue;
            }
            break;
        }
        if (rounds >= 2) {
            if (dec_loop_generic_32_inner(s, o, &rounds) &&
                dec_loop_generic_32_inner(s, o, &rounds)) {
                continue;
            }
            break;
        }
        dec_loop_generic_32_inner(s, o, &rounds);
        break;

    } while (rounds > 0);

    // Adjust for any rounds that were skipped:
    *slen += rounds * 4;
    *olen -= rounds * 3;
}
|
@@ -0,0 +1,73 @@
|
|
1
|
+
static inline void
|
2
|
+
enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
|
3
|
+
{
|
4
|
+
uint32_t src;
|
5
|
+
|
6
|
+
// Load input:
|
7
|
+
memcpy(&src, *s, sizeof (src));
|
8
|
+
|
9
|
+
// Reorder to 32-bit big-endian, if not already in that format. The
|
10
|
+
// workset must be in big-endian, otherwise the shifted bits do not
|
11
|
+
// carry over properly among adjacent bytes:
|
12
|
+
src = BASE64_HTOBE32(src);
|
13
|
+
|
14
|
+
// Two indices for the 12-bit lookup table:
|
15
|
+
const size_t index0 = (src >> 20) & 0xFFFU;
|
16
|
+
const size_t index1 = (src >> 8) & 0xFFFU;
|
17
|
+
|
18
|
+
// Table lookup and store:
|
19
|
+
memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
|
20
|
+
memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
|
21
|
+
|
22
|
+
*s += 3;
|
23
|
+
*o += 4;
|
24
|
+
}
|
25
|
+
|
26
|
+
// Bulk scalar encoder loop (32-bit word size): consumes the input in
// blocks of 3 bytes, producing 4 output characters per block via
// enc_loop_generic_32_inner(). Manually unrolled by 8/4/2/1 rounds.
// The leftover bytes are handled by the bytewise tail code.
static inline void
enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    if (*slen < 4) {
        return;
    }

    // Process blocks of 3 bytes at a time. Because blocks are loaded 4
    // bytes at a time, ensure that there will be at least one remaining
    // byte after the last round, so that the final read will not pass
    // beyond the bounds of the input buffer:
    size_t rounds = (*slen - 1) / 3;

    *slen -= rounds * 3;    // 3 bytes consumed per round
    *olen += rounds * 4;    // 4 bytes produced per round

    do {
        if (rounds >= 8) {
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            rounds -= 8;
            continue;
        }
        if (rounds >= 4) {
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            rounds -= 4;
            continue;
        }
        if (rounds >= 2) {
            enc_loop_generic_32_inner(s, o);
            enc_loop_generic_32_inner(s, o);
            rounds -= 2;
            continue;
        }
        enc_loop_generic_32_inner(s, o);
        break;

    } while (rounds > 0);
}
|
@@ -0,0 +1,77 @@
|
|
1
|
+
static inline void
|
2
|
+
enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
|
3
|
+
{
|
4
|
+
uint64_t src;
|
5
|
+
|
6
|
+
// Load input:
|
7
|
+
memcpy(&src, *s, sizeof (src));
|
8
|
+
|
9
|
+
// Reorder to 64-bit big-endian, if not already in that format. The
|
10
|
+
// workset must be in big-endian, otherwise the shifted bits do not
|
11
|
+
// carry over properly among adjacent bytes:
|
12
|
+
src = BASE64_HTOBE64(src);
|
13
|
+
|
14
|
+
// Four indices for the 12-bit lookup table:
|
15
|
+
const size_t index0 = (src >> 52) & 0xFFFU;
|
16
|
+
const size_t index1 = (src >> 40) & 0xFFFU;
|
17
|
+
const size_t index2 = (src >> 28) & 0xFFFU;
|
18
|
+
const size_t index3 = (src >> 16) & 0xFFFU;
|
19
|
+
|
20
|
+
// Table lookup and store:
|
21
|
+
memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
|
22
|
+
memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
|
23
|
+
memcpy(*o + 4, base64_table_enc_12bit + index2, 2);
|
24
|
+
memcpy(*o + 6, base64_table_enc_12bit + index3, 2);
|
25
|
+
|
26
|
+
*s += 6;
|
27
|
+
*o += 8;
|
28
|
+
}
|
29
|
+
|
30
|
+
// Bulk scalar encoder loop (64-bit word size): consumes the input in
// blocks of 6 bytes, producing 8 output characters per block via
// enc_loop_generic_64_inner(). Manually unrolled by 8/4/2/1 rounds.
// The leftover bytes are handled by the bytewise tail code.
static inline void
enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    if (*slen < 8) {
        return;
    }

    // Process blocks of 6 bytes at a time. Because blocks are loaded 8
    // bytes at a time, ensure that there will be at least 2 remaining
    // bytes after the last round, so that the final read will not pass
    // beyond the bounds of the input buffer:
    size_t rounds = (*slen - 2) / 6;

    *slen -= rounds * 6;    // 6 bytes consumed per round
    *olen += rounds * 8;    // 8 bytes produced per round

    do {
        if (rounds >= 8) {
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            rounds -= 8;
            continue;
        }
        if (rounds >= 4) {
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            rounds -= 4;
            continue;
        }
        if (rounds >= 2) {
            enc_loop_generic_64_inner(s, o);
            enc_loop_generic_64_inner(s, o);
            rounds -= 2;
            continue;
        }
        enc_loop_generic_64_inner(s, o);
        break;

    } while (rounds > 0);
}
|
@@ -0,0 +1,39 @@
|
|
1
|
+
// Generic (plain scalar) codec: selects the word-sized bulk loops for
// this platform and defines the `plain` encoder/decoder entry points.
// The entry-point signatures come from the BASE64_*_FUNCTION macros in
// codecs.h; the head/tail fragments are textually #included so their
// local variables (s, slen, o, olen, st, ...) are shared with the bulk
// loop calls in between.
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#include "../../../include/libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"

// Pick the encoder loop matching the native word size:
#if BASE64_WORDSIZE == 32
# include "32/enc_loop.c"
#elif BASE64_WORDSIZE == 64
# include "64/enc_loop.c"
#endif

// The decoder's bulk loop is 32-bit on all supported word sizes:
#if BASE64_WORDSIZE >= 32
# include "32/dec_loop.c"
#endif

BASE64_ENC_FUNCTION(plain)
{
#include "enc_head.c"
#if BASE64_WORDSIZE == 32
	enc_loop_generic_32(&s, &slen, &o, &olen);
#elif BASE64_WORDSIZE == 64
	enc_loop_generic_64(&s, &slen, &o, &olen);
#endif
#include "enc_tail.c"
}

BASE64_DEC_FUNCTION(plain)
{
#include "dec_head.c"
#if BASE64_WORDSIZE >= 32
	dec_loop_generic_32(&s, &slen, &o, &olen);
#endif
#include "dec_tail.c"
}
|
@@ -0,0 +1,37 @@
|
|
1
|
+
// Decoder prologue fragment, textually #included at the top of each
// BASE64_DEC_FUNCTION body. Sets up local working copies of the stream
// state, short-circuits if a previous call already hit EOF or an
// invalid character, and opens the Duff's-device switch that
// dec_tail.c closes. Expects `src`, `srclen`, `out`, `outlen` and
// `state` to be in scope from the enclosing function macro.
int ret = 0;
const uint8_t *s = (const uint8_t *) src;
uint8_t *o = (uint8_t *) out;
uint8_t q;	// decoded value of current char: 0..63 data, 254 = '=', 255 = invalid

// Use local temporaries to avoid cache thrashing:
size_t olen = 0;
size_t slen = srclen;
struct base64_state st;
st.eof = state->eof;
st.bytes = state->bytes;
st.carry = state->carry;

// If we previously saw an EOF or an invalid character, bail out:
if (st.eof) {
    *outlen = 0;
    ret = 0;
    // If there was a trailing '=' to check, check it:
    if (slen && (st.eof == BASE64_AEOF)) {
        state->bytes = 0;
        state->eof = BASE64_EOF;
        // Valid only if the '=' is the single remaining input character:
        ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
    }
    return ret;
}

// Turn four 6-bit numbers into three bytes:
// out[0] = 11111122
// out[1] = 22223333
// out[2] = 33444444

// Duff's device again:
switch (st.bytes)
{
    for (;;)
    {
    case 0:
|
@@ -0,0 +1,91 @@
|
|
1
|
+
// Decoder epilogue fragment, textually #included inside the switch
// statement opened by dec_head.c. Decodes one input character per
// case, with padding ('=', table value 254) and invalid-character
// (255) handling; on exit, writes the working state back to *state.
        if (slen-- == 0) {
            ret = 1;
            break;
        }
        if ((q = base64_table_dec_8bit[*s++]) >= 254) {
            st.eof = BASE64_EOF;
            // Treat character '=' as invalid for byte 0:
            break;
        }
        st.carry = q << 2;
        st.bytes++;

        // Deliberate fallthrough:
        BASE64_FALLTHROUGH

    case 1:
        if (slen-- == 0) {
            ret = 1;
            break;
        }
        if ((q = base64_table_dec_8bit[*s++]) >= 254) {
            st.eof = BASE64_EOF;
            // Treat character '=' as invalid for byte 1:
            break;
        }
        *o++ = st.carry | (q >> 4);
        st.carry = q << 4;
        st.bytes++;
        olen++;

        // Deliberate fallthrough:
        BASE64_FALLTHROUGH

    case 2:
        if (slen-- == 0) {
            ret = 1;
            break;
        }
        if ((q = base64_table_dec_8bit[*s++]) >= 254) {
            st.bytes++;
            // When q == 254, the input char is '='.
            // Check if next byte is also '=':
            if (q == 254) {
                if (slen-- != 0) {
                    st.bytes = 0;
                    // EOF:
                    st.eof = BASE64_EOF;
                    q = base64_table_dec_8bit[*s++];
                    ret = ((q == 254) && (slen == 0)) ? 1 : 0;
                    break;
                }
                else {
                    // Almost EOF: a lone trailing '=' may be completed
                    // by a second '=' in the next call.
                    st.eof = BASE64_AEOF;
                    ret = 1;
                    break;
                }
            }
            // If we get here, there was an error:
            break;
        }
        *o++ = st.carry | (q >> 2);
        st.carry = q << 6;
        st.bytes++;
        olen++;

        // Deliberate fallthrough:
        BASE64_FALLTHROUGH

    case 3:
        if (slen-- == 0) {
            ret = 1;
            break;
        }
        if ((q = base64_table_dec_8bit[*s++]) >= 254) {
            st.bytes = 0;
            st.eof = BASE64_EOF;
            // When q == 254, the input char is '='. Return 1 and EOF.
            // When q == 255, the input char is invalid. Return 0 and EOF.
            ret = ((q == 254) && (slen == 0)) ? 1 : 0;
            break;
        }
        *o++ = st.carry | q;
        st.carry = 0;
        st.bytes = 0;
        olen++;
    }
}

// Write the working state and output length back to the caller:
state->eof = st.eof;
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = olen;
return ret;
@@ -0,0 +1,24 @@
|
|
1
|
+
// Encoder prologue fragment, textually #included at the top of each
// BASE64_ENC_FUNCTION body. Sets up local working copies of the stream
// state and opens the Duff's-device switch that enc_tail.c closes.
// Expects `src`, `srclen`, `out`, `outlen` and `state` to be in scope
// from the enclosing function macro.

// Assume that *out is large enough to contain the output.
// Theoretically it should be 4/3 the length of src.
const uint8_t *s = (const uint8_t *) src;
uint8_t *o = (uint8_t *) out;

// Use local temporaries to avoid cache thrashing:
size_t olen = 0;
size_t slen = srclen;
struct base64_state st;
st.bytes = state->bytes;
st.carry = state->carry;

// Turn three bytes into four 6-bit numbers:
// in[0] = 00111111
// in[1] = 00112222
// in[2] = 00222233
// in[3] = 00333333

// Duff's device, a for() loop inside a switch() statement. Legal!
switch (st.bytes)
{
    for (;;)
    {
    case 0:
|
@@ -0,0 +1,34 @@
|
|
1
|
+
// Encoder epilogue fragment, textually #included inside the switch
// statement opened by enc_head.c. Encodes the remaining input one byte
// per case; `st.carry` holds the leftover bits that contribute to the
// next output character. On exit, writes the working state back to
// *state.
        if (slen-- == 0) {
            break;
        }
        *o++ = base64_table_enc_6bit[*s >> 2];
        st.carry = (*s++ << 4) & 0x30;
        st.bytes++;
        olen += 1;

        // Deliberate fallthrough:
        BASE64_FALLTHROUGH

    case 1:
        if (slen-- == 0) {
            break;
        }
        *o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];
        st.carry = (*s++ << 2) & 0x3C;
        st.bytes++;
        olen += 1;

        // Deliberate fallthrough:
        BASE64_FALLTHROUGH

    case 2:
        if (slen-- == 0) {
            break;
        }
        *o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];
        *o++ = base64_table_enc_6bit[*s++ & 0x3F];
        st.bytes = 0;
        olen += 2;
    }
}
// Write the working state and output length back to the caller:
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = olen;
|
@@ -0,0 +1,72 @@
|
|
1
|
+
// NEON 32-bit (ARMv7) codec: defines the `neon32` encoder/decoder
// entry points. When the build does not target 32-bit ARM with NEON,
// the entry points compile to stubs via BASE64_ENC_STUB/BASE64_DEC_STUB.
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#include "../../../include/libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"

// Enable the NEON32 paths only on 32-bit ARM with NEON available at
// compile time AND enabled in the build configuration:
#ifdef __arm__
# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
# define BASE64_USE_NEON32
# endif
#endif

#ifdef BASE64_USE_NEON32
#include <arm_neon.h>

// NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
// the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups.
// NOTE(review): this defines a function with the ACLE intrinsic's name;
// assumes ARMv7 toolchains do not already provide it — confirm per compiler.
static inline uint8x16_t
vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
{
	uint8x8x2_t lut2;
	uint8x8x2_t result;

	lut2.val[0] = vget_low_u8(lut);
	lut2.val[1] = vget_high_u8(lut);

	result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices));
	result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices));

	return vcombine_u8(result.val[0], result.val[1]);
}

#include "../generic/32/dec_loop.c"
#include "../generic/32/enc_loop.c"
#include "dec_loop.c"
#include "enc_reshuffle.c"
#include "enc_translate.c"
#include "enc_loop.c"

#endif	// BASE64_USE_NEON32

// Stride size is so large on these NEON 32-bit functions
// (48 bytes encode, 32 bytes decode) that we inline the
// uint32 codec to stay performant on smaller inputs.

BASE64_ENC_FUNCTION(neon32)
{
#ifdef BASE64_USE_NEON32
	#include "../generic/enc_head.c"
	enc_loop_neon32(&s, &slen, &o, &olen);
	enc_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/enc_tail.c"
#else
	BASE64_ENC_STUB
#endif
}

BASE64_DEC_FUNCTION(neon32)
{
#ifdef BASE64_USE_NEON32
	#include "../generic/dec_head.c"
	dec_loop_neon32(&s, &slen, &o, &olen);
	dec_loop_generic_32(&s, &slen, &o, &olen);
	#include "../generic/dec_tail.c"
#else
	BASE64_DEC_STUB
#endif
}
|
@@ -0,0 +1,106 @@
|
|
1
|
+
// Returns nonzero iff any byte in the 16-byte vector is nonzero.
// vqmovn_u64 narrows each 64-bit half with saturation, so any set bit
// survives into the 64-bit result, which is then spilled to memory for
// a scalar comparison.
static inline int
is_nonzero (const uint8x16_t v)
{
	uint64_t u64;
	const uint64x2_t v64 = vreinterpretq_u64_u8(v);
	const uint32x2_t v32 = vqmovn_u64(v64);

	vst1_u64(&u64, vreinterpret_u64_u32(v32));
	return u64 != 0;
}
|
11
|
+
|
12
|
+
// Map each classified input byte to the delta that, when added to the
// raw base64 character, yields its 6-bit value. The negative entries
// are the offsets for the alphabetic ranges ('A'-'Z' is -65, 'a'-'z'
// is -71). NEON32 vtbl1_u8 looks up 8 lanes at a time, so the two
// halves of the vector are handled separately and recombined.
static inline uint8x16_t
delta_lookup (const uint8x16_t v)
{
	const uint8x8_t lut = {
		0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
	};

	return vcombine_u8(
		vtbl1_u8(lut, vget_low_u8(v)),
		vtbl1_u8(lut, vget_high_u8(v)));
}
|
23
|
+
|
24
|
+
// Decode one 16-byte lane in place: classify each character via
// high/low-nibble lookup tables, add the per-class delta to turn the
// ASCII character into its 6-bit value, and return a mask that is
// nonzero in every lane holding an invalid character.
static inline uint8x16_t
dec_loop_neon32_lane (uint8x16_t *lane)
{
	// See the SSSE3 decoder for an explanation of the algorithm.
	const uint8x16_t lut_lo = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
	};

	const uint8x16_t lut_hi = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
	};

	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);

	// Split each byte into its nibbles and flag the '/' character (0x2F),
	// which shares a high nibble with other classes:
	const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
	const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
	const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F);

	// 128-bit table lookups (via the vqtbl1q_u8 shim on NEON32):
	const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
	const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);

	// Now simply add the delta values to the input:
	*lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));

	// Return the validity mask:
	return vandq_u8(lo, hi);
}
|
54
|
+
|
55
|
+
// Bulk NEON32 decoder: processes 64 input bytes (producing 48 output
// bytes) per round using deinterleaved 4-plane loads. On the first
// round containing an invalid character, breaks out and rolls back the
// length accounting so the scalar code can report the error precisely.
static inline void
dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 64) {
		return;
	}

	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
	// extra trailing zero bytes are written, so it is not necessary to
	// reserve extra input bytes:
	size_t rounds = *slen / 64;

	*slen -= rounds * 64;	// 64 bytes consumed per round
	*olen += rounds * 48;	// 48 bytes produced per round

	do {
		uint8x16x3_t dec;

		// Load 64 bytes and deinterleave:
		uint8x16x4_t str = vld4q_u8(*s);

		// Decode each lane, collect a mask of invalid inputs:
		// (the `|` operator on vector types is a GCC/Clang extension)
		const uint8x16_t classified
			= dec_loop_neon32_lane(&str.val[0])
			| dec_loop_neon32_lane(&str.val[1])
			| dec_loop_neon32_lane(&str.val[2])
			| dec_loop_neon32_lane(&str.val[3]);

		// Check for invalid input: if any of the delta values are
		// zero, fall back on bytewise code to do error checking and
		// reporting:
		if (is_nonzero(classified)) {
			break;
		}

		// Compress four bytes into three:
		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);

		// Interleave and store decoded result:
		vst3q_u8(*o, dec);

		*s += 64;
		*o += 48;

	} while (--rounds > 0);

	// Adjust for any rounds that were skipped:
	*slen += rounds * 64;
	*olen -= rounds * 48;
}
|
@@ -0,0 +1,58 @@
|
|
1
|
+
// Encode one 48-byte input block into 64 output characters using
// deinterleaved 3-plane loads, the SIMD reshuffle/translate helpers,
// and a 4-plane interleaved store.
static inline void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
	// Load 48 bytes and deinterleave:
	uint8x16x3_t src = vld3q_u8(*s);

	// Reshuffle:
	uint8x16x4_t out = enc_reshuffle(src);

	// Translate reshuffled bytes to the Base64 alphabet:
	out = enc_translate(out);

	// Interleave and store output:
	vst4q_u8(*o, out);

	*s += 48;
	*o += 64;
}
|
19
|
+
|
20
|
+
// Bulk NEON32 encoder loop: consumes 48 input bytes (producing 64
// output characters) per round, manually unrolled by 8/4/2/1 rounds.
// Leftover bytes are handled by the inlined generic/scalar code in the
// caller. Note: no minimum-length guard is needed here because the
// stride check is implicit in `rounds` being zero for short inputs.
static inline void
enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t rounds = *slen / 48;

	*slen -= rounds * 48;	// 48 bytes consumed per round
	*olen += rounds * 64;	// 64 bytes produced per round

	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_neon32_inner(s, o);
			enc_loop_neon32_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_neon32_inner(s, o);
		break;
	}
}
|