ob64 0.4.0 → 0.5.0

Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/lib/ob64/version.rb +1 -1
  5. data/ob64.gemspec +2 -0
  6. data/vendor/libbase64/.gitignore +12 -0
  7. data/vendor/libbase64/.travis.yml +71 -0
  8. data/vendor/libbase64/CMakeLists.txt +264 -0
  9. data/vendor/libbase64/LICENSE +28 -0
  10. data/vendor/libbase64/Makefile +93 -0
  11. data/vendor/libbase64/README.md +474 -0
  12. data/vendor/libbase64/base64-benchmarks.png +0 -0
  13. data/vendor/libbase64/bin/base64.c +132 -0
  14. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  15. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  16. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  17. data/vendor/libbase64/cmake/config.h.in +25 -0
  18. data/vendor/libbase64/cmake/test-arch.c +35 -0
  19. data/vendor/libbase64/include/libbase64.h +145 -0
  20. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  21. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  22. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  23. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  24. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  25. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  26. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  27. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  28. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  29. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  30. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  31. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  32. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  33. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  34. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  35. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  36. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  37. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  38. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  39. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  40. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  41. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  42. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  43. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  44. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  45. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  46. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  47. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  48. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  49. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  50. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  51. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  52. data/vendor/libbase64/lib/codec_choose.c +281 -0
  53. data/vendor/libbase64/lib/codecs.h +65 -0
  54. data/vendor/libbase64/lib/env.h +67 -0
  55. data/vendor/libbase64/lib/exports.txt +7 -0
  56. data/vendor/libbase64/lib/lib.c +164 -0
  57. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  58. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  59. data/vendor/libbase64/lib/tables/Makefile +17 -0
  60. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  61. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  62. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  63. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  64. data/vendor/libbase64/lib/tables/tables.c +40 -0
  65. data/vendor/libbase64/lib/tables/tables.h +23 -0
  66. metadata +64 -4
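
The bulk of this release vendors aklomp's libbase64 (the new data/vendor/libbase64 tree). For orientation, here is a minimal usage sketch of the vendored C API declared in include/libbase64.h; the one-shot entry points and the flags value follow the upstream libbase64 README and are assumptions about this vendored copy, not part of ob64's Ruby surface:

// Hedged sketch: one-shot encode/decode with the vendored libbase64,
// assuming the conventional aklomp/libbase64 signatures. flags = 0
// lets the library pick the best codec for the current CPU.
#include <stdio.h>
#include <string.h>
#include "libbase64.h"

int main(void)
{
    const char *msg = "Hello, ob64!";
    char enc[64], dec[64];
    size_t enclen, declen;

    // Encode: the output length is written to enclen; no NUL terminator.
    base64_encode(msg, strlen(msg), enc, &enclen, 0);
    printf("%.*s\n", (int) enclen, enc);

    // Decode returns 1 on success, 0 on invalid input.
    if (base64_decode(enc, enclen, dec, &declen, 0)) {
        printf("%.*s\n", (int) declen, dec);
    }
    return 0;
}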
data/vendor/libbase64/lib/arch/generic/32/dec_loop.c
@@ -0,0 +1,86 @@
+ static inline int
+ dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+ {
+     const uint32_t str
+         = base64_table_dec_32bit_d0[(*s)[0]]
+         | base64_table_dec_32bit_d1[(*s)[1]]
+         | base64_table_dec_32bit_d2[(*s)[2]]
+         | base64_table_dec_32bit_d3[(*s)[3]];
+
+ #if BASE64_LITTLE_ENDIAN
+
+     // LUTs for little-endian set MSB in case of invalid character:
+     if (str & UINT32_C(0x80000000)) {
+         return 0;
+     }
+ #else
+     // LUTs for big-endian set LSB in case of invalid character:
+     if (str & UINT32_C(1)) {
+         return 0;
+     }
+ #endif
+     // Store the output:
+     memcpy(*o, &str, sizeof (str));
+
+     *s += 4;
+     *o += 3;
+     *rounds -= 1;
+
+     return 1;
+ }
+
+ static inline void
+ dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 8) {
+         return;
+     }
+
+     // Process blocks of 4 bytes per round. Because one extra zero byte is
+     // written after the output, ensure that there will be at least 4 bytes
+     // of input data left to cover the gap. (Two data bytes and up to two
+     // end-of-string markers.)
+     size_t rounds = (*slen - 4) / 4;
+
+     *slen -= rounds * 4;   // 4 bytes consumed per round
+     *olen += rounds * 3;   // 3 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         if (rounds >= 4) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         if (rounds >= 2) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         dec_loop_generic_32_inner(s, o, &rounds);
+         break;
+
+     } while (rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 4;
+     *olen -= rounds * 3;
+ }
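
The four d0..d3 tables above fold the per-character 6-bit values and the endian byte shuffle into a single OR. A plain-arithmetic sketch of what one round computes, using a naive alphabet search in place of the precomputed tables (the decode4 helper is hypothetical, for illustration only):

// Sketch: decode one 4-character group into 3 bytes, the scalar
// equivalent of dec_loop_generic_32_inner without the LUTs.
#include <stdio.h>
#include <string.h>

static const char alpha[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static int decode4(const char in[4], unsigned char out[3])
{
    unsigned int v = 0;
    for (int i = 0; i < 4; i++) {
        const char *p = (in[i] != '\0') ? strchr(alpha, in[i]) : NULL;
        if (p == NULL) {
            return 0;   // invalid character, like the MSB/LSB flag above
        }
        v = (v << 6) | (unsigned int) (p - alpha);
    }
    out[0] = (unsigned char) (v >> 16);
    out[1] = (unsigned char) (v >> 8);
    out[2] = (unsigned char) v;
    return 1;
}

int main(void)
{
    unsigned char out[3];
    if (decode4("TWFu", out)) {
        printf("%.3s\n", out);   // prints "Man"
    }
    return 0;
}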
data/vendor/libbase64/lib/arch/generic/32/enc_loop.c
@@ -0,0 +1,73 @@
+ static inline void
+ enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
+ {
+     uint32_t src;
+
+     // Load input:
+     memcpy(&src, *s, sizeof (src));
+
+     // Reorder to 32-bit big-endian, if not already in that format. The
+     // workset must be in big-endian, otherwise the shifted bits do not
+     // carry over properly among adjacent bytes:
+     src = BASE64_HTOBE32(src);
+
+     // Two indices for the 12-bit lookup table:
+     const size_t index0 = (src >> 20) & 0xFFFU;
+     const size_t index1 = (src >> 8) & 0xFFFU;
+
+     // Table lookup and store:
+     memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
+     memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
+
+     *s += 3;
+     *o += 4;
+ }
+
+ static inline void
+ enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 4) {
+         return;
+     }
+
+     // Process blocks of 3 bytes at a time. Because blocks are loaded 4
+     // bytes at a time, ensure that there will be at least one remaining
+     // byte after the last round, so that the final read will not pass
+     // beyond the bounds of the input buffer:
+     size_t rounds = (*slen - 1) / 3;
+
+     *slen -= rounds * 3;   // 3 bytes consumed per round
+     *olen += rounds * 4;   // 4 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_generic_32_inner(s, o);
+         break;
+
+     } while (rounds > 0);
+ }
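
The 12-bit table trades memory (4096 two-byte entries) for emitting two output characters per lookup. A sketch of the equivalent computation with the plain 6-bit alphabet; the enc12 helper is hypothetical and shows what one base64_table_enc_12bit entry would hold (the real code loads 4 bytes and uses only the top 24 bits; this sketch works on the 24-bit group directly):

// Sketch: two 12-bit lookups == four 6-bit lookups.
#include <stdio.h>

static const char alpha[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static void enc12(unsigned int index, char out[2])
{
    out[0] = alpha[(index >> 6) & 0x3F];   // upper 6 bits of the 12
    out[1] = alpha[index & 0x3F];          // lower 6 bits of the 12
}

int main(void)
{
    // Encode the 3-byte group "Man" (0x4D 0x61 0x6E), 24 bits total:
    unsigned int src = (0x4Du << 16) | (0x61u << 8) | 0x6Eu;
    char out[5] = {0};

    enc12((src >> 12) & 0xFFFu, out + 0);   // top 12 bits -> 2 chars
    enc12(src & 0xFFFu, out + 2);           // low 12 bits -> 2 chars

    printf("%s\n", out);   // prints "TWFu"
    return 0;
}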
data/vendor/libbase64/lib/arch/generic/64/enc_loop.c
@@ -0,0 +1,77 @@
+ static inline void
+ enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
+ {
+     uint64_t src;
+
+     // Load input:
+     memcpy(&src, *s, sizeof (src));
+
+     // Reorder to 64-bit big-endian, if not already in that format. The
+     // workset must be in big-endian, otherwise the shifted bits do not
+     // carry over properly among adjacent bytes:
+     src = BASE64_HTOBE64(src);
+
+     // Four indices for the 12-bit lookup table:
+     const size_t index0 = (src >> 52) & 0xFFFU;
+     const size_t index1 = (src >> 40) & 0xFFFU;
+     const size_t index2 = (src >> 28) & 0xFFFU;
+     const size_t index3 = (src >> 16) & 0xFFFU;
+
+     // Table lookup and store:
+     memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
+     memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
+     memcpy(*o + 4, base64_table_enc_12bit + index2, 2);
+     memcpy(*o + 6, base64_table_enc_12bit + index3, 2);
+
+     *s += 6;
+     *o += 8;
+ }
+
+ static inline void
+ enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 8) {
+         return;
+     }
+
+     // Process blocks of 6 bytes at a time. Because blocks are loaded 8
+     // bytes at a time, ensure that there will be at least 2 remaining
+     // bytes after the last round, so that the final read will not pass
+     // beyond the bounds of the input buffer:
+     size_t rounds = (*slen - 2) / 6;
+
+     *slen -= rounds * 6;   // 6 bytes consumed per round
+     *olen += rounds * 8;   // 8 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_generic_64_inner(s, o);
+         break;
+
+     } while (rounds > 0);
+ }
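
The `(*slen - 2) / 6` bound guarantees the final 8-byte load stays inside the buffer. A quick sketch that checks that invariant over a range of lengths (illustration only, not part of the library):

// Verify: rounds = (slen - 2) / 6 leaves >= 2 bytes after the last
// 8-byte read in enc_loop_generic_64.
#include <assert.h>
#include <stdio.h>

int main(void)
{
    for (size_t slen = 8; slen <= 1000; slen++) {
        size_t rounds = (slen - 2) / 6;
        // The last round reads 8 bytes starting at offset
        // (rounds - 1) * 6, so the read ends at (rounds - 1) * 6 + 8:
        assert((rounds - 1) * 6 + 8 <= slen);
        // And at least 2 bytes remain for the tail:
        assert(slen - rounds * 6 >= 2);
    }
    puts("bounds hold for slen in [8, 1000]");
    return 0;
}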
data/vendor/libbase64/lib/arch/generic/codec.c
@@ -0,0 +1,39 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <string.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if BASE64_WORDSIZE == 32
+ # include "32/enc_loop.c"
+ #elif BASE64_WORDSIZE == 64
+ # include "64/enc_loop.c"
+ #endif
+
+ #if BASE64_WORDSIZE >= 32
+ # include "32/dec_loop.c"
+ #endif
+
+ BASE64_ENC_FUNCTION(plain)
+ {
+ #include "enc_head.c"
+ #if BASE64_WORDSIZE == 32
+     enc_loop_generic_32(&s, &slen, &o, &olen);
+ #elif BASE64_WORDSIZE == 64
+     enc_loop_generic_64(&s, &slen, &o, &olen);
+ #endif
+ #include "enc_tail.c"
+ }
+
+ BASE64_DEC_FUNCTION(plain)
+ {
+ #include "dec_head.c"
+ #if BASE64_WORDSIZE >= 32
+     dec_loop_generic_32(&s, &slen, &o, &olen);
+ #endif
+ #include "dec_tail.c"
+ }
data/vendor/libbase64/lib/arch/generic/dec_head.c
@@ -0,0 +1,37 @@
+ int ret = 0;
+ const uint8_t *s = (const uint8_t *) src;
+ uint8_t *o = (uint8_t *) out;
+ uint8_t q;
+
+ // Use local temporaries to avoid cache thrashing:
+ size_t olen = 0;
+ size_t slen = srclen;
+ struct base64_state st;
+ st.eof = state->eof;
+ st.bytes = state->bytes;
+ st.carry = state->carry;
+
+ // If we previously saw an EOF or an invalid character, bail out:
+ if (st.eof) {
+     *outlen = 0;
+     ret = 0;
+     // If there was a trailing '=' to check, check it:
+     if (slen && (st.eof == BASE64_AEOF)) {
+         state->bytes = 0;
+         state->eof = BASE64_EOF;
+         ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
+     }
+     return ret;
+ }
+
+ // Turn four 6-bit numbers into three bytes:
+ // out[0] = 11111122
+ // out[1] = 22223333
+ // out[2] = 33444444
+
+ // Duff's device again:
+ switch (st.bytes)
+ {
+     for (;;)
+     {
+     case 0:
data/vendor/libbase64/lib/arch/generic/dec_tail.c
@@ -0,0 +1,91 @@
+     if (slen-- == 0) {
+         ret = 1;
+         break;
+     }
+     if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+         st.eof = BASE64_EOF;
+         // Treat character '=' as invalid for byte 0:
+         break;
+     }
+     st.carry = q << 2;
+     st.bytes++;
+
+     // Deliberate fallthrough:
+     BASE64_FALLTHROUGH
+
+     case 1: if (slen-- == 0) {
+         ret = 1;
+         break;
+     }
+     if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+         st.eof = BASE64_EOF;
+         // Treat character '=' as invalid for byte 1:
+         break;
+     }
+     *o++ = st.carry | (q >> 4);
+     st.carry = q << 4;
+     st.bytes++;
+     olen++;
+
+     // Deliberate fallthrough:
+     BASE64_FALLTHROUGH
+
+     case 2: if (slen-- == 0) {
+         ret = 1;
+         break;
+     }
+     if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+         st.bytes++;
+         // When q == 254, the input char is '='.
+         // Check if next byte is also '=':
+         if (q == 254) {
+             if (slen-- != 0) {
+                 st.bytes = 0;
+                 // EOF:
+                 st.eof = BASE64_EOF;
+                 q = base64_table_dec_8bit[*s++];
+                 ret = ((q == 254) && (slen == 0)) ? 1 : 0;
+                 break;
+             }
+             else {
+                 // Almost EOF
+                 st.eof = BASE64_AEOF;
+                 ret = 1;
+                 break;
+             }
+         }
+         // If we get here, there was an error:
+         break;
+     }
+     *o++ = st.carry | (q >> 2);
+     st.carry = q << 6;
+     st.bytes++;
+     olen++;
+
+     // Deliberate fallthrough:
+     BASE64_FALLTHROUGH
+
+     case 3: if (slen-- == 0) {
+         ret = 1;
+         break;
+     }
+     if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+         st.bytes = 0;
+         st.eof = BASE64_EOF;
+         // When q == 254, the input char is '='. Return 1 and EOF.
+         // When q == 255, the input char is invalid. Return 0 and EOF.
+         ret = ((q == 254) && (slen == 0)) ? 1 : 0;
+         break;
+     }
+     *o++ = st.carry | q;
+     st.carry = 0;
+     st.bytes = 0;
+     olen++;
+     }
+ }
+
+ state->eof = st.eof;
+ state->bytes = st.bytes;
+ state->carry = st.carry;
+ *outlen = olen;
+ return ret;
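
The head/tail fragments above rely on Duff's device: the switch jumps into the middle of the for (;;), so a streaming call resumes exactly where the previous buffer ended. A stripped-down sketch of the same pattern on a made-up 2-state task (counting byte pairs across buffers); everything here is hypothetical illustration, not library code:

// Duff's-device resume pattern: cases inside an infinite loop, with the
// phase carried in a state struct between calls.
#include <stdio.h>

struct pair_state { int phase; };

static void count_pairs(struct pair_state *st, const char *buf, size_t len,
                        size_t *pairs)
{
    switch (st->phase) {
    for (;;) {
    case 0:
        if (len-- == 0) break;   // breaks out of the for loop
        buf++;                   // consume one input byte
        st->phase = 1;
        // Deliberate fallthrough:
    case 1:
        if (len-- == 0) break;
        buf++;
        st->phase = 0;
        (*pairs)++;              // a full pair was seen
    }
    }
}

int main(void)
{
    struct pair_state st = { 0 };
    size_t pairs = 0;
    count_pairs(&st, "abc", 3, &pairs);   // ends mid-pair, phase == 1
    count_pairs(&st, "de", 2, &pairs);    // resumes at case 1
    printf("%zu\n", pairs);               // prints 2
    return 0;
}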
data/vendor/libbase64/lib/arch/generic/enc_head.c
@@ -0,0 +1,24 @@
+ // Assume that *out is large enough to contain the output.
+ // Theoretically it should be 4/3 the length of src.
+ const uint8_t *s = (const uint8_t *) src;
+ uint8_t *o = (uint8_t *) out;
+
+ // Use local temporaries to avoid cache thrashing:
+ size_t olen = 0;
+ size_t slen = srclen;
+ struct base64_state st;
+ st.bytes = state->bytes;
+ st.carry = state->carry;
+
+ // Turn three bytes into four 6-bit numbers:
+ // in[0] = 00111111
+ // in[1] = 00112222
+ // in[2] = 00222233
+ // in[3] = 00333333
+
+ // Duff's device, a for() loop inside a switch() statement. Legal!
+ switch (st.bytes)
+ {
+     for (;;)
+     {
+     case 0:
data/vendor/libbase64/lib/arch/generic/enc_tail.c
@@ -0,0 +1,34 @@
+     if (slen-- == 0) {
+         break;
+     }
+     *o++ = base64_table_enc_6bit[*s >> 2];
+     st.carry = (*s++ << 4) & 0x30;
+     st.bytes++;
+     olen += 1;
+
+     // Deliberate fallthrough:
+     BASE64_FALLTHROUGH
+
+     case 1: if (slen-- == 0) {
+         break;
+     }
+     *o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];
+     st.carry = (*s++ << 2) & 0x3C;
+     st.bytes++;
+     olen += 1;
+
+     // Deliberate fallthrough:
+     BASE64_FALLTHROUGH
+
+     case 2: if (slen-- == 0) {
+         break;
+     }
+     *o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];
+     *o++ = base64_table_enc_6bit[*s++ & 0x3F];
+     st.bytes = 0;
+     olen += 2;
+     }
+ }
+ state->bytes = st.bytes;
+ state->carry = st.carry;
+ *outlen = olen;
data/vendor/libbase64/lib/arch/neon32/codec.c
@@ -0,0 +1,72 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <string.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #ifdef __arm__
+ # if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
+ #  define BASE64_USE_NEON32
+ # endif
+ #endif
+
+ #ifdef BASE64_USE_NEON32
+ #include <arm_neon.h>
+
+ static inline uint8x16_t
+ vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
+ {
+     // NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
+     // the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups.
+     uint8x8x2_t lut2;
+     uint8x8x2_t result;
+
+     lut2.val[0] = vget_low_u8(lut);
+     lut2.val[1] = vget_high_u8(lut);
+
+     result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices));
+     result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices));
+
+     return vcombine_u8(result.val[0], result.val[1]);
+ }
+
+ #include "../generic/32/dec_loop.c"
+ #include "../generic/32/enc_loop.c"
+ #include "dec_loop.c"
+ #include "enc_reshuffle.c"
+ #include "enc_translate.c"
+ #include "enc_loop.c"
+
+ #endif // BASE64_USE_NEON32
+
+ // Stride size is so large on these NEON 32-bit functions
+ // (48 bytes encode, 32 bytes decode) that we inline the
+ // uint32 codec to stay performant on smaller inputs.
+
+ BASE64_ENC_FUNCTION(neon32)
+ {
+ #ifdef BASE64_USE_NEON32
+     #include "../generic/enc_head.c"
+     enc_loop_neon32(&s, &slen, &o, &olen);
+     enc_loop_generic_32(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(neon32)
+ {
+ #ifdef BASE64_USE_NEON32
+     #include "../generic/dec_head.c"
+     dec_loop_neon32(&s, &slen, &o, &olen);
+     dec_loop_generic_32(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }
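
The emulation above reproduces the NEON64 instruction's semantics on NEON32. A portable scalar model of those semantics (out-of-range indices produce zero), useful as a mental reference or host-side test oracle; this sketch is illustration, not library code:

// Scalar model of vqtbl1q_u8: out[i] = lut[idx[i]] when idx[i] < 16,
// else 0 -- the same behavior vtbl2_u8 gives the emulation above.
#include <stdint.h>
#include <stdio.h>

static void tbl1q_model(const uint8_t lut[16], const uint8_t idx[16],
                        uint8_t out[16])
{
    for (int i = 0; i < 16; i++) {
        out[i] = (idx[i] < 16) ? lut[idx[i]] : 0;
    }
}

int main(void)
{
    const uint8_t lut[16] = {
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
    };
    const uint8_t idx[16] = {
        0, 15, 16, 255, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
    };
    uint8_t out[16];

    tbl1q_model(lut, idx, out);
    printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]); // 10 25 0 0
    return 0;
}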
data/vendor/libbase64/lib/arch/neon32/dec_loop.c
@@ -0,0 +1,106 @@
+ static inline int
+ is_nonzero (const uint8x16_t v)
+ {
+     uint64_t u64;
+     const uint64x2_t v64 = vreinterpretq_u64_u8(v);
+     const uint32x2_t v32 = vqmovn_u64(v64);
+
+     vst1_u64(&u64, vreinterpret_u64_u32(v32));
+     return u64 != 0;
+ }
+
+ static inline uint8x16_t
+ delta_lookup (const uint8x16_t v)
+ {
+     const uint8x8_t lut = {
+         0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
+     };
+
+     return vcombine_u8(
+         vtbl1_u8(lut, vget_low_u8(v)),
+         vtbl1_u8(lut, vget_high_u8(v)));
+ }
+
+ static inline uint8x16_t
+ dec_loop_neon32_lane (uint8x16_t *lane)
+ {
+     // See the SSSE3 decoder for an explanation of the algorithm.
+     const uint8x16_t lut_lo = {
+         0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+         0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
+     };
+
+     const uint8x16_t lut_hi = {
+         0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+     };
+
+     const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
+     const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
+
+     const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
+     const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
+     const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F);
+
+     const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
+     const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
+
+     // Now simply add the delta values to the input:
+     *lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));
+
+     // Return the validity mask:
+     return vandq_u8(lo, hi);
+ }
+
+ static inline void
+ dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 64) {
+         return;
+     }
+
+     // Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+     // extra trailing zero bytes are written, so it is not necessary to
+     // reserve extra input bytes:
+     size_t rounds = *slen / 64;
+
+     *slen -= rounds * 64;   // 64 bytes consumed per round
+     *olen += rounds * 48;   // 48 bytes produced per round
+
+     do {
+         uint8x16x3_t dec;
+
+         // Load 64 bytes and deinterleave:
+         uint8x16x4_t str = vld4q_u8(*s);
+
+         // Decode each lane, collect a mask of invalid inputs:
+         const uint8x16_t classified
+             = dec_loop_neon32_lane(&str.val[0])
+             | dec_loop_neon32_lane(&str.val[1])
+             | dec_loop_neon32_lane(&str.val[2])
+             | dec_loop_neon32_lane(&str.val[3]);
+
+         // Check for invalid input: if any of the delta values are
+         // zero, fall back on bytewise code to do error checking and
+         // reporting:
+         if (is_nonzero(classified)) {
+             break;
+         }
+
+         // Compress four bytes into three:
+         dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+         dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+         dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+         // Interleave and store decoded result:
+         vst3q_u8(*o, dec);
+
+         *s += 64;
+         *o += 48;
+
+     } while (--rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 64;
+     *olen -= rounds * 48;
+ }
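
The lut_lo/lut_hi classification can be reasoned about in scalar form: a byte is a valid base64 character exactly when the AND of its low-nibble class and high-nibble class is zero. A portable sketch using the same tables as the NEON code above (illustration only):

// Scalar model of the per-lane validity mask: nonzero AND of the two
// nibble classes flags an invalid character.
#include <stdint.h>
#include <stdio.h>

static const uint8_t lut_lo[16] = {
    0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
    0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
};

static const uint8_t lut_hi[16] = {
    0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
    0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
};

static int is_valid_b64(uint8_t c)
{
    return (lut_lo[c & 0x0F] & lut_hi[c >> 4]) == 0;
}

int main(void)
{
    printf("%d %d %d %d %d\n",
           is_valid_b64('A'),    // 1
           is_valid_b64('z'),    // 1
           is_valid_b64('+'),    // 1
           is_valid_b64('='),    // 0 (padding is handled in the tail)
           is_valid_b64(' '));   // 0
    return 0;
}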
data/vendor/libbase64/lib/arch/neon32/enc_loop.c
@@ -0,0 +1,58 @@
+ static inline void
+ enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
+ {
+     // Load 48 bytes and deinterleave:
+     uint8x16x3_t src = vld3q_u8(*s);
+
+     // Reshuffle:
+     uint8x16x4_t out = enc_reshuffle(src);
+
+     // Translate reshuffled bytes to the Base64 alphabet:
+     out = enc_translate(out);
+
+     // Interleave and store output:
+     vst4q_u8(*o, out);
+
+     *s += 48;
+     *o += 64;
+ }
+
+ static inline void
+ enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     size_t rounds = *slen / 48;
+
+     *slen -= rounds * 48;   // 48 bytes consumed per round
+     *olen += rounds * 64;   // 64 bytes produced per round
+
+     while (rounds > 0) {
+         if (rounds >= 8) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_neon32_inner(s, o);
+         break;
+     }
+ }