ob64 0.1.0 → 0.5.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in the public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +20 -4
  3. data/.gitignore +2 -0
  4. data/CHANGELOG.md +18 -1
  5. data/{LICENSE.txt → LICENSE} +1 -1
  6. data/README.md +34 -2
  7. data/benchmark.rb +42 -3
  8. data/ext/ob64/ob64_ext.c +5 -3
  9. data/lib/ob64/core_ext.rb +2 -0
  10. data/lib/ob64/version.rb +1 -1
  11. data/lib/ob64.rb +52 -0
  12. data/ob64.gemspec +12 -6
  13. data/vendor/libbase64/.gitignore +12 -0
  14. data/vendor/libbase64/.travis.yml +71 -0
  15. data/vendor/libbase64/CMakeLists.txt +264 -0
  16. data/vendor/libbase64/LICENSE +28 -0
  17. data/vendor/libbase64/Makefile +93 -0
  18. data/vendor/libbase64/README.md +474 -0
  19. data/vendor/libbase64/base64-benchmarks.png +0 -0
  20. data/vendor/libbase64/bin/base64.c +132 -0
  21. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  22. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  23. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  24. data/vendor/libbase64/cmake/config.h.in +25 -0
  25. data/vendor/libbase64/cmake/test-arch.c +35 -0
  26. data/vendor/libbase64/include/libbase64.h +145 -0
  27. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  28. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  29. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  30. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  31. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  32. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  33. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  34. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  35. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  36. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  37. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  38. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  39. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  40. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  41. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  42. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  43. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  44. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  45. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  46. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  47. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  48. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  49. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  50. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  51. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  52. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  53. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  54. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  55. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  56. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  57. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  58. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  59. data/vendor/libbase64/lib/codec_choose.c +281 -0
  60. data/vendor/libbase64/lib/codecs.h +65 -0
  61. data/vendor/libbase64/lib/env.h +67 -0
  62. data/vendor/libbase64/lib/exports.txt +7 -0
  63. data/vendor/libbase64/lib/lib.c +164 -0
  64. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  65. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  66. data/vendor/libbase64/lib/tables/Makefile +17 -0
  67. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  68. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  69. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  70. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  71. data/vendor/libbase64/lib/tables/tables.c +40 -0
  72. data/vendor/libbase64/lib/tables/tables.h +23 -0
  73. metadata +67 -6
  74. data/.byebug_history +0 -72
  75. data/.envrc +0 -1
data/vendor/libbase64/lib/arch/generic/32/dec_loop.c
@@ -0,0 +1,86 @@
+ static inline int
+ dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+ {
+     const uint32_t str
+         = base64_table_dec_32bit_d0[(*s)[0]]
+         | base64_table_dec_32bit_d1[(*s)[1]]
+         | base64_table_dec_32bit_d2[(*s)[2]]
+         | base64_table_dec_32bit_d3[(*s)[3]];
+
+ #if BASE64_LITTLE_ENDIAN
+
+     // LUTs for little-endian set MSB in case of invalid character:
+     if (str & UINT32_C(0x80000000)) {
+         return 0;
+     }
+ #else
+     // LUTs for big-endian set LSB in case of invalid character:
+     if (str & UINT32_C(1)) {
+         return 0;
+     }
+ #endif
+     // Store the output:
+     memcpy(*o, &str, sizeof (str));
+
+     *s += 4;
+     *o += 3;
+     *rounds -= 1;
+
+     return 1;
+ }
+
+ static inline void
+ dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 8) {
+         return;
+     }
+
+     // Process blocks of 4 bytes per round. Because one extra zero byte is
+     // written after the output, ensure that there will be at least 4 bytes
+     // of input data left to cover the gap. (Two data bytes and up to two
+     // end-of-string markers.)
+     size_t rounds = (*slen - 4) / 4;
+
+     *slen -= rounds * 4;   // 4 bytes consumed per round
+     *olen += rounds * 3;   // 3 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         if (rounds >= 4) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         if (rounds >= 2) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         dec_loop_generic_32_inner(s, o, &rounds);
+         break;
+
+     } while (rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 4;
+     *olen -= rounds * 3;
+ }
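For orientation, here is a minimal scalar sketch of what the table-driven inner round above computes; it is not part of the vendored source, and the helper names (sextet, decode_group) are illustrative only. The four base64_table_dec_32bit_d* tables hold these 6-bit values pre-shifted into position, so a single OR assembles the decoded bytes and one bit test (MSB on little-endian, LSB on big-endian) rejects any invalid character.

#include <stdint.h>
#include <stdio.h>

// Map one Base64 character to its 6-bit value; -1 for anything else.
static int sextet(uint8_t c)
{
    if (c >= 'A' && c <= 'Z') return c - 'A';
    if (c >= 'a' && c <= 'z') return c - 'a' + 26;
    if (c >= '0' && c <= '9') return c - '0' + 52;
    if (c == '+') return 62;
    if (c == '/') return 63;
    return -1;
}

// Decode one 4-character group into 3 bytes, the slow way.
static int decode_group(const uint8_t in[4], uint8_t out[3])
{
    int v0 = sextet(in[0]), v1 = sextet(in[1]),
        v2 = sextet(in[2]), v3 = sextet(in[3]);
    if ((v0 | v1 | v2 | v3) < 0) {
        return 0;   // one test catches any invalid character
    }
    uint32_t v = ((uint32_t) v0 << 18) | ((uint32_t) v1 << 12)
               | ((uint32_t) v2 << 6) | (uint32_t) v3;
    out[0] = (uint8_t) (v >> 16);
    out[1] = (uint8_t) (v >> 8);
    out[2] = (uint8_t) v;
    return 1;
}

int main(void)
{
    uint8_t out[4] = {0};
    if (decode_group((const uint8_t *) "TWFu", out)) {
        printf("%s\n", (const char *) out);   // prints "Man"
    }
    return 0;
}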
data/vendor/libbase64/lib/arch/generic/32/enc_loop.c
@@ -0,0 +1,73 @@
+ static inline void
+ enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
+ {
+     uint32_t src;
+
+     // Load input:
+     memcpy(&src, *s, sizeof (src));
+
+     // Reorder to 32-bit big-endian, if not already in that format. The
+     // workset must be in big-endian, otherwise the shifted bits do not
+     // carry over properly among adjacent bytes:
+     src = BASE64_HTOBE32(src);
+
+     // Two indices for the 12-bit lookup table:
+     const size_t index0 = (src >> 20) & 0xFFFU;
+     const size_t index1 = (src >> 8) & 0xFFFU;
+
+     // Table lookup and store:
+     memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
+     memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
+
+     *s += 3;
+     *o += 4;
+ }
+
+ static inline void
+ enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 4) {
+         return;
+     }
+
+     // Process blocks of 3 bytes at a time. Because blocks are loaded 4
+     // bytes at a time, ensure that there will be at least one remaining
+     // byte after the last round, so that the final read will not pass
+     // beyond the bounds of the input buffer:
+     size_t rounds = (*slen - 1) / 3;
+
+     *slen -= rounds * 3;   // 3 bytes consumed per round
+     *olen += rounds * 4;   // 4 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_generic_32_inner(s, o);
+         break;
+
+     } while (rounds > 0);
+ }
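As a hedged illustration of the 12-bit lookup used above (not vendored code; the helper names are hypothetical): each entry of base64_table_enc_12bit stores the two ASCII output characters for one 12-bit slice of the big-endian word, so a single 2-byte table read replaces two separate 6-bit alphabet lookups. The sketch below spells out the same slicing with an explicit alphabet.

#include <stdint.h>
#include <stdio.h>

static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// What one entry of the 12-bit table precomputes: the two output characters
// for a 12-bit slice of the input word.
static void enc_12bit(unsigned index, char out[2])
{
    out[0] = alphabet[(index >> 6) & 0x3F];
    out[1] = alphabet[index & 0x3F];
}

int main(void)
{
    // Encode the 3-byte block "Man" exactly as enc_loop_generic_32_inner
    // slices it: load as big-endian, take bits 31..20 and 19..8.
    const uint8_t src[3] = { 'M', 'a', 'n' };
    const uint32_t be = ((uint32_t) src[0] << 24)
                      | ((uint32_t) src[1] << 16)
                      | ((uint32_t) src[2] << 8);
    char out[5] = {0};
    enc_12bit((be >> 20) & 0xFFF, out + 0);
    enc_12bit((be >> 8)  & 0xFFF, out + 2);
    printf("%s\n", out);   // prints "TWFu"
    return 0;
}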
data/vendor/libbase64/lib/arch/generic/64/enc_loop.c
@@ -0,0 +1,77 @@
+ static inline void
+ enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
+ {
+     uint64_t src;
+
+     // Load input:
+     memcpy(&src, *s, sizeof (src));
+
+     // Reorder to 64-bit big-endian, if not already in that format. The
+     // workset must be in big-endian, otherwise the shifted bits do not
+     // carry over properly among adjacent bytes:
+     src = BASE64_HTOBE64(src);
+
+     // Four indices for the 12-bit lookup table:
+     const size_t index0 = (src >> 52) & 0xFFFU;
+     const size_t index1 = (src >> 40) & 0xFFFU;
+     const size_t index2 = (src >> 28) & 0xFFFU;
+     const size_t index3 = (src >> 16) & 0xFFFU;
+
+     // Table lookup and store:
+     memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
+     memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
+     memcpy(*o + 4, base64_table_enc_12bit + index2, 2);
+     memcpy(*o + 6, base64_table_enc_12bit + index3, 2);
+
+     *s += 6;
+     *o += 8;
+ }
+
+ static inline void
+ enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 8) {
+         return;
+     }
+
+     // Process blocks of 6 bytes at a time. Because blocks are loaded 8
+     // bytes at a time, ensure that there will be at least 2 remaining
+     // bytes after the last round, so that the final read will not pass
+     // beyond the bounds of the input buffer:
+     size_t rounds = (*slen - 2) / 6;
+
+     *slen -= rounds * 6;   // 6 bytes consumed per round
+     *olen += rounds * 8;   // 8 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_generic_64_inner(s, o);
+         break;
+
+     } while (rounds > 0);
+ }
data/vendor/libbase64/lib/arch/generic/codec.c
@@ -0,0 +1,39 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <string.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if BASE64_WORDSIZE == 32
+ # include "32/enc_loop.c"
+ #elif BASE64_WORDSIZE == 64
+ # include "64/enc_loop.c"
+ #endif
+
+ #if BASE64_WORDSIZE >= 32
+ # include "32/dec_loop.c"
+ #endif
+
+ BASE64_ENC_FUNCTION(plain)
+ {
+     #include "enc_head.c"
+ #if BASE64_WORDSIZE == 32
+     enc_loop_generic_32(&s, &slen, &o, &olen);
+ #elif BASE64_WORDSIZE == 64
+     enc_loop_generic_64(&s, &slen, &o, &olen);
+ #endif
+     #include "enc_tail.c"
+ }
+
+ BASE64_DEC_FUNCTION(plain)
+ {
+     #include "dec_head.c"
+ #if BASE64_WORDSIZE >= 32
+     dec_loop_generic_32(&s, &slen, &o, &olen);
+ #endif
+     #include "dec_tail.c"
+ }
data/vendor/libbase64/lib/arch/generic/dec_head.c
@@ -0,0 +1,37 @@
+ int ret = 0;
+ const uint8_t *s = (const uint8_t *) src;
+ uint8_t *o = (uint8_t *) out;
+ uint8_t q;
+
+ // Use local temporaries to avoid cache thrashing:
+ size_t olen = 0;
+ size_t slen = srclen;
+ struct base64_state st;
+ st.eof = state->eof;
+ st.bytes = state->bytes;
+ st.carry = state->carry;
+
+ // If we previously saw an EOF or an invalid character, bail out:
+ if (st.eof) {
+     *outlen = 0;
+     ret = 0;
+     // If there was a trailing '=' to check, check it:
+     if (slen && (st.eof == BASE64_AEOF)) {
+         state->bytes = 0;
+         state->eof = BASE64_EOF;
+         ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
+     }
+     return ret;
+ }
+
+ // Turn four 6-bit numbers into three bytes:
+ // out[0] = 11111122
+ // out[1] = 22223333
+ // out[2] = 33444444
+
+ // Duff's device again:
+ switch (st.bytes)
+ {
+     for (;;)
+     {
+     case 0:
data/vendor/libbase64/lib/arch/generic/dec_tail.c
@@ -0,0 +1,91 @@
+ if (slen-- == 0) {
+     ret = 1;
+     break;
+ }
+ if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+     st.eof = BASE64_EOF;
+     // Treat character '=' as invalid for byte 0:
+     break;
+ }
+ st.carry = q << 2;
+ st.bytes++;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 1: if (slen-- == 0) {
+     ret = 1;
+     break;
+ }
+ if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+     st.eof = BASE64_EOF;
+     // Treat character '=' as invalid for byte 1:
+     break;
+ }
+ *o++ = st.carry | (q >> 4);
+ st.carry = q << 4;
+ st.bytes++;
+ olen++;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 2: if (slen-- == 0) {
+     ret = 1;
+     break;
+ }
+ if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+     st.bytes++;
+     // When q == 254, the input char is '='.
+     // Check if next byte is also '=':
+     if (q == 254) {
+         if (slen-- != 0) {
+             st.bytes = 0;
+             // EOF:
+             st.eof = BASE64_EOF;
+             q = base64_table_dec_8bit[*s++];
+             ret = ((q == 254) && (slen == 0)) ? 1 : 0;
+             break;
+         }
+         else {
+             // Almost EOF
+             st.eof = BASE64_AEOF;
+             ret = 1;
+             break;
+         }
+     }
+     // If we get here, there was an error:
+     break;
+ }
+ *o++ = st.carry | (q >> 2);
+ st.carry = q << 6;
+ st.bytes++;
+ olen++;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 3: if (slen-- == 0) {
+     ret = 1;
+     break;
+ }
+ if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+     st.bytes = 0;
+     st.eof = BASE64_EOF;
+     // When q == 254, the input char is '='. Return 1 and EOF.
+     // When q == 255, the input char is invalid. Return 0 and EOF.
+     ret = ((q == 254) && (slen == 0)) ? 1 : 0;
+     break;
+ }
+ *o++ = st.carry | q;
+ st.carry = 0;
+ st.bytes = 0;
+ olen++;
+ }
+ }
+
+ state->eof = st.eof;
+ state->bytes = st.bytes;
+ state->carry = st.carry;
+ *outlen = olen;
+ return ret;
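The `>= 254` tests above rely on how the scalar decode table encodes its sentinels. The stand-alone sketch below (not the vendored table; the helper name is hypothetical) spells out the convention stated by the comments: valid characters map to their 6-bit value, '=' maps to 254, and anything else to 255, so one comparison rejects both cases and the exact value then distinguishes padding from garbage.

#include <stdint.h>
#include <stdio.h>

// Illustrative equivalent of base64_table_dec_8bit's value scheme.
static uint8_t dec_8bit(uint8_t c)
{
    if (c >= 'A' && c <= 'Z') return (uint8_t) (c - 'A');
    if (c >= 'a' && c <= 'z') return (uint8_t) (c - 'a' + 26);
    if (c >= '0' && c <= '9') return (uint8_t) (c - '0' + 52);
    if (c == '+') return 62;
    if (c == '/') return 63;
    if (c == '=') return 254;   // padding: legal only at the end of input
    return 255;                 // invalid character
}

int main(void)
{
    printf("%d %d %d\n", dec_8bit('M'), dec_8bit('='), dec_8bit('!'));
    // prints "12 254 255"
    return 0;
}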
data/vendor/libbase64/lib/arch/generic/enc_head.c
@@ -0,0 +1,24 @@
+ // Assume that *out is large enough to contain the output.
+ // Theoretically it should be 4/3 the length of src.
+ const uint8_t *s = (const uint8_t *) src;
+ uint8_t *o = (uint8_t *) out;
+
+ // Use local temporaries to avoid cache thrashing:
+ size_t olen = 0;
+ size_t slen = srclen;
+ struct base64_state st;
+ st.bytes = state->bytes;
+ st.carry = state->carry;
+
+ // Turn three bytes into four 6-bit numbers:
+ // in[0] = 00111111
+ // in[1] = 00112222
+ // in[2] = 00222233
+ // in[3] = 00333333
+
+ // Duff's device, a for() loop inside a switch() statement. Legal!
+ switch (st.bytes)
+ {
+     for (;;)
+     {
+     case 0:
data/vendor/libbase64/lib/arch/generic/enc_tail.c
@@ -0,0 +1,34 @@
+ if (slen-- == 0) {
+     break;
+ }
+ *o++ = base64_table_enc_6bit[*s >> 2];
+ st.carry = (*s++ << 4) & 0x30;
+ st.bytes++;
+ olen += 1;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 1: if (slen-- == 0) {
+     break;
+ }
+ *o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];
+ st.carry = (*s++ << 2) & 0x3C;
+ st.bytes++;
+ olen += 1;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 2: if (slen-- == 0) {
+     break;
+ }
+ *o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];
+ *o++ = base64_table_enc_6bit[*s++ & 0x3F];
+ st.bytes = 0;
+ olen += 2;
+ }
+ }
+ state->bytes = st.bytes;
+ state->carry = st.carry;
+ *outlen = olen;
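enc_head.c and enc_tail.c together form one Duff's device: the switch on st.bytes jumps into the middle of the for() loop, so a 3-byte group that was interrupted at the end of the previous buffer is resumed exactly where it stopped. Below is a minimal stand-alone sketch of the same control-flow shape; the names and the toy payload are illustrative, nothing here is taken from the diff.

#include <stdio.h>

static void spell_phases(char *dst, size_t nsteps, size_t phase)
{
    size_t written = 0;

    // Duff's device: the for() loop sits inside the switch(), and the case
    // labels let the first iteration start partway through the 3-step group,
    // just as st.bytes selects the resume point in enc_head.c/enc_tail.c.
    switch (phase % 3)
    {
        for (;;)
        {
        case 0:
            if (nsteps-- == 0) break;
            dst[written++] = '0';
            // fallthrough
        case 1:
            if (nsteps-- == 0) break;
            dst[written++] = '1';
            // fallthrough
        case 2:
            if (nsteps-- == 0) break;
            dst[written++] = '2';
        }
    }
    dst[written] = '\0';
}

int main(void)
{
    char buf[16];

    spell_phases(buf, 5, 0);
    printf("%s\n", buf);   // prints "01201"

    spell_phases(buf, 5, 1);
    printf("%s\n", buf);   // prints "12012": the first step was skipped
    return 0;
}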
data/vendor/libbase64/lib/arch/neon32/codec.c
@@ -0,0 +1,72 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <string.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #ifdef __arm__
+ # if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
+ # define BASE64_USE_NEON32
+ # endif
+ #endif
+
+ #ifdef BASE64_USE_NEON32
+ #include <arm_neon.h>
+
+ static inline uint8x16_t
+ vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
+ {
+     // NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
+     // the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups.
+     uint8x8x2_t lut2;
+     uint8x8x2_t result;
+
+     lut2.val[0] = vget_low_u8(lut);
+     lut2.val[1] = vget_high_u8(lut);
+
+     result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices));
+     result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices));
+
+     return vcombine_u8(result.val[0], result.val[1]);
+ }
+
+ #include "../generic/32/dec_loop.c"
+ #include "../generic/32/enc_loop.c"
+ #include "dec_loop.c"
+ #include "enc_reshuffle.c"
+ #include "enc_translate.c"
+ #include "enc_loop.c"
+
+ #endif // BASE64_USE_NEON32
+
+ // Stride size is so large on these NEON 32-bit functions
+ // (48 bytes encode, 32 bytes decode) that we inline the
+ // uint32 codec to stay performant on smaller inputs.
+
+ BASE64_ENC_FUNCTION(neon32)
+ {
+ #ifdef BASE64_USE_NEON32
+     #include "../generic/enc_head.c"
+     enc_loop_neon32(&s, &slen, &o, &olen);
+     enc_loop_generic_32(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(neon32)
+ {
+ #ifdef BASE64_USE_NEON32
+     #include "../generic/dec_head.c"
+     dec_loop_neon32(&s, &slen, &o, &olen);
+     dec_loop_generic_32(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }
data/vendor/libbase64/lib/arch/neon32/dec_loop.c
@@ -0,0 +1,106 @@
+ static inline int
+ is_nonzero (const uint8x16_t v)
+ {
+     uint64_t u64;
+     const uint64x2_t v64 = vreinterpretq_u64_u8(v);
+     const uint32x2_t v32 = vqmovn_u64(v64);
+
+     vst1_u64(&u64, vreinterpret_u64_u32(v32));
+     return u64 != 0;
+ }
+
+ static inline uint8x16_t
+ delta_lookup (const uint8x16_t v)
+ {
+     const uint8x8_t lut = {
+         0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
+     };
+
+     return vcombine_u8(
+         vtbl1_u8(lut, vget_low_u8(v)),
+         vtbl1_u8(lut, vget_high_u8(v)));
+ }
+
+ static inline uint8x16_t
+ dec_loop_neon32_lane (uint8x16_t *lane)
+ {
+     // See the SSSE3 decoder for an explanation of the algorithm.
+     const uint8x16_t lut_lo = {
+         0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+         0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
+     };
+
+     const uint8x16_t lut_hi = {
+         0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+     };
+
+     const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
+     const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
+
+     const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
+     const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
+     const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F);
+
+     const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
+     const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
+
+     // Now simply add the delta values to the input:
+     *lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));
+
+     // Return the validity mask:
+     return vandq_u8(lo, hi);
+ }
+
+ static inline void
+ dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 64) {
+         return;
+     }
+
+     // Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+     // extra trailing zero bytes are written, so it is not necessary to
+     // reserve extra input bytes:
+     size_t rounds = *slen / 64;
+
+     *slen -= rounds * 64;   // 64 bytes consumed per round
+     *olen += rounds * 48;   // 48 bytes produced per round
+
+     do {
+         uint8x16x3_t dec;
+
+         // Load 64 bytes and deinterleave:
+         uint8x16x4_t str = vld4q_u8(*s);
+
+         // Decode each lane, collect a mask of invalid inputs:
+         const uint8x16_t classified
+             = dec_loop_neon32_lane(&str.val[0])
+             | dec_loop_neon32_lane(&str.val[1])
+             | dec_loop_neon32_lane(&str.val[2])
+             | dec_loop_neon32_lane(&str.val[3]);
+
+         // Check for invalid input: if any of the delta values are
+         // zero, fall back on bytewise code to do error checking and
+         // reporting:
+         if (is_nonzero(classified)) {
+             break;
+         }
+
+         // Compress four bytes into three:
+         dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+         dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+         dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+         // Interleave and store decoded result:
+         vst3q_u8(*o, dec);
+
+         *s += 64;
+         *o += 48;
+
+     } while (--rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 64;
+     *olen -= rounds * 48;
+ }
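A hedged scalar rendering of the delta_lookup translation used above (not vendored code; the names are hypothetical): each byte is moved into the 0..63 range by adding a delta selected from its high nibble, with '/' (0x2F) nudged into its own slot by the eq_2F adjustment. Validity is checked separately via the lut_lo/lut_hi nibble masks, so this sketch only covers the translation of bytes already known to be valid.

#include <stdint.h>
#include <stdio.h>

// Delta per high-nibble group, matching the 8-entry LUT above:
static const int8_t delta[8] = { 0, 16, 19, 4, -65, -65, -71, -71 };

// Translate one (assumed valid) Base64 character to its 6-bit value:
static uint8_t translate(uint8_t c)
{
    // '/' shares high nibble 2 with '+'; shifting it to group 1 gives it
    // the +16 delta (0x2F + 16 == 63) while '+' keeps +19 (0x2B + 19 == 62).
    int idx = (c >> 4) - (c == 0x2F ? 1 : 0);
    return (uint8_t) (c + delta[idx]);
}

int main(void)
{
    // 'M' -> 12, 'a' -> 26, '0' -> 52, '+' -> 62, '/' -> 63
    printf("%d %d %d %d %d\n",
           translate('M'), translate('a'), translate('0'),
           translate('+'), translate('/'));
    return 0;
}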
data/vendor/libbase64/lib/arch/neon32/enc_loop.c
@@ -0,0 +1,58 @@
+ static inline void
+ enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
+ {
+     // Load 48 bytes and deinterleave:
+     uint8x16x3_t src = vld3q_u8(*s);
+
+     // Reshuffle:
+     uint8x16x4_t out = enc_reshuffle(src);
+
+     // Translate reshuffled bytes to the Base64 alphabet:
+     out = enc_translate(out);
+
+     // Interleave and store output:
+     vst4q_u8(*o, out);
+
+     *s += 48;
+     *o += 64;
+ }
+
+ static inline void
+ enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     size_t rounds = *slen / 48;
+
+     *slen -= rounds * 48;   // 48 bytes consumed per round
+     *olen += rounds * 64;   // 64 bytes produced per round
+
+     while (rounds > 0) {
+         if (rounds >= 8) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_neon32_inner(s, o);
+         break;
+     }
+ }
+ }