ob64 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/lib/ob64/version.rb +1 -1
  5. data/ob64.gemspec +2 -0
  6. data/vendor/libbase64/.gitignore +12 -0
  7. data/vendor/libbase64/.travis.yml +71 -0
  8. data/vendor/libbase64/CMakeLists.txt +264 -0
  9. data/vendor/libbase64/LICENSE +28 -0
  10. data/vendor/libbase64/Makefile +93 -0
  11. data/vendor/libbase64/README.md +474 -0
  12. data/vendor/libbase64/base64-benchmarks.png +0 -0
  13. data/vendor/libbase64/bin/base64.c +132 -0
  14. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  15. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  16. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  17. data/vendor/libbase64/cmake/config.h.in +25 -0
  18. data/vendor/libbase64/cmake/test-arch.c +35 -0
  19. data/vendor/libbase64/include/libbase64.h +145 -0
  20. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  21. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  22. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  23. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  24. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  25. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  26. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  27. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  28. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  29. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  30. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  31. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  32. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  33. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  34. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  35. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  36. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  37. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  38. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  39. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  40. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  41. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  42. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  43. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  44. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  45. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  46. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  47. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  48. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  49. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  50. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  51. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  52. data/vendor/libbase64/lib/codec_choose.c +281 -0
  53. data/vendor/libbase64/lib/codecs.h +65 -0
  54. data/vendor/libbase64/lib/env.h +67 -0
  55. data/vendor/libbase64/lib/exports.txt +7 -0
  56. data/vendor/libbase64/lib/lib.c +164 -0
  57. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  58. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  59. data/vendor/libbase64/lib/tables/Makefile +17 -0
  60. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  61. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  62. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  63. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  64. data/vendor/libbase64/lib/tables/tables.c +40 -0
  65. data/vendor/libbase64/lib/tables/tables.h +23 -0
  66. metadata +64 -4
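
Most of this release vendors the libbase64 C library (the data/vendor/libbase64 tree above), which ob64 builds against for SIMD-accelerated Base64. For orientation, here is a minimal sketch of how the library's two public entry points are typically called; the signatures are assumed from the vendored include/libbase64.h (not reproduced in this diff), so treat it as illustrative rather than authoritative:

```c
#include <stdio.h>
#include <string.h>
#include "libbase64.h"  // vendored header; include path assumed for illustration

int main(void)
{
    const char *msg = "Hello, world";
    char enc[64], dec[64];
    size_t enclen, declen;

    // Encode: writes at most ((srclen + 2) / 3) * 4 bytes, no NUL terminator.
    // flags = 0 lets the library pick the fastest codec for the running CPU.
    base64_encode(msg, strlen(msg), enc, &enclen, 0);
    printf("%.*s\n", (int) enclen, enc);  // prints SGVsbG8sIHdvcmxk

    // Decode: returns 1 on success, 0 when the input is not valid Base64.
    if (base64_decode(enc, enclen, dec, &declen, 0)) {
        printf("%.*s\n", (int) declen, dec);
    }
    return 0;
}
```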
data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c
@@ -0,0 +1,54 @@
+ static inline uint8x16x4_t
+ enc_reshuffle (const uint8x16x3_t in)
+ {
+     uint8x16x4_t out;
+
+ #if defined(__GNUC__) || defined(__clang__)
+
+     // GCC and Clang support the following inline assembly syntax. This
+     // inline assembly implements the exact same algorithm as the
+     // intrinsics further down, but benchmarks show that the inline
+     // assembly easily beats the intrinsics. Perhaps this is because the
+     // inline assembly is well pipelined to avoid data dependencies.
+
+     __asm__ (
+         "vshr.u8 %q[o0], %q[i0], #2 \n\t"
+         "vshr.u8 %q[o1], %q[i1], #2 \n\t"
+         "vshr.u8 %q[o2], %q[i2], #4 \n\t"
+         "vsli.8  %q[o1], %q[i0], #6 \n\t"
+         "vsli.8  %q[o2], %q[i1], #4 \n\t"
+         "vshl.u8 %q[o3], %q[i2], #2 \n\t"
+
+         "vshr.u8 %q[o1], %q[o1], #2 \n\t"
+         "vshr.u8 %q[o2], %q[o2], #2 \n\t"
+         "vshr.u8 %q[o3], %q[o3], #2 \n\t"
+
+         // Outputs:
+         : [o0] "=&w" (out.val[0]),
+           [o1] "=&w" (out.val[1]),
+           [o2] "=&w" (out.val[2]),
+           [o3] "=&w" (out.val[3])
+
+         // Inputs:
+         : [i0] "w" (in.val[0]),
+           [i1] "w" (in.val[1]),
+           [i2] "w" (in.val[2])
+     );
+ #else
+     // Divide bits of three input bytes over four output bytes. All output
+     // bytes except the first one are shifted over two bits to the left:
+     out.val[0] = vshrq_n_u8(in.val[0], 2);
+     out.val[1] = vshrq_n_u8(in.val[1], 2);
+     out.val[2] = vshrq_n_u8(in.val[2], 4);
+     out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
+     out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
+     out.val[3] = vshlq_n_u8(in.val[2], 2);
+
+     // Clear the top two bits by shifting the output back to the right:
+     out.val[1] = vshrq_n_u8(out.val[1], 2);
+     out.val[2] = vshrq_n_u8(out.val[2], 2);
+     out.val[3] = vshrq_n_u8(out.val[3], 2);
+ #endif
+
+     return out;
+ }
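
The reshuffle above is the vectorized form of the standard scalar split of three input bytes into four 6-bit values. A plain-C sketch of one 3-byte group, for readers following the bit layout (illustration only, not part of the diff):

```c
#include <stdint.h>

// Scalar equivalent of enc_reshuffle for a single 3-byte group: each output
// byte ends up holding one 6-bit value, ready for alphabet translation.
static void reshuffle_scalar(const uint8_t in[3], uint8_t out[4])
{
    out[0] =   in[0] >> 2;                           // top 6 bits of byte 0
    out[1] = ((in[0] & 0x03) << 4) | (in[1] >> 4);   // low 2 of byte 0, top 4 of byte 1
    out[2] = ((in[1] & 0x0F) << 2) | (in[2] >> 6);   // low 4 of byte 1, top 2 of byte 2
    out[3] =   in[2] & 0x3F;                         // low 6 bits of byte 2
}
```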
data/vendor/libbase64/lib/arch/neon32/enc_translate.c
@@ -0,0 +1,57 @@
+ static inline uint8x16x4_t
+ enc_translate (const uint8x16x4_t in)
+ {
+     // A lookup table containing the absolute offsets for all ranges:
+     const uint8x16_t lut = {
+          65U,  71U, 252U, 252U,
+         252U, 252U, 252U, 252U,
+         252U, 252U, 252U, 252U,
+         237U, 240U,   0U,   0U
+     };
+
+     const uint8x16_t offset = vdupq_n_u8(51);
+
+     uint8x16x4_t indices, mask, delta, out;
+
+     // Translate values 0..63 to the Base64 alphabet. There are five sets:
+     // #  From      To         Abs  Index    Characters
+     // 0  [0..25]   [65..90]   +65  0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+     // 1  [26..51]  [97..122]  +71  1        abcdefghijklmnopqrstuvwxyz
+     // 2  [52..61]  [48..57]    -4  [2..11]  0123456789
+     // 3  [62]      [43]       -19  12       +
+     // 4  [63]      [47]       -16  13       /
+
+     // Create LUT indices from input:
+     // the index for range #0 is right, others are 1 less than expected:
+     indices.val[0] = vqsubq_u8(in.val[0], offset);
+     indices.val[1] = vqsubq_u8(in.val[1], offset);
+     indices.val[2] = vqsubq_u8(in.val[2], offset);
+     indices.val[3] = vqsubq_u8(in.val[3], offset);
+
+     // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+     mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
+     mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
+     mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
+     mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));
+
+     // Subtract -1, so add 1 to indices for range #[1..4], All indices are
+     // now correct:
+     indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
+     indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
+     indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
+     indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);
+
+     // Lookup delta values:
+     delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
+     delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
+     delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
+     delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);
+
+     // Add delta values:
+     out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
+     out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
+     out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
+     out.val[3] = vaddq_u8(in.val[3], delta.val[3]);
+
+     return out;
+ }
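
The offset LUT above implements the five-range mapping described in the comment table. In scalar form the same translation of a 6-bit value to its Base64 character looks like this (a restatement of the table, not code from the package):

```c
#include <stdint.h>

// Scalar equivalent of enc_translate: map one 6-bit value 0..63 to ASCII.
static uint8_t translate_scalar(uint8_t v)
{
    if (v < 26) return v + 65;      // range 0: 0..25  -> 'A'..'Z' (+65)
    if (v < 52) return v + 71;      // range 1: 26..51 -> 'a'..'z' (+71)
    if (v < 62) return v - 4;       // range 2: 52..61 -> '0'..'9' (-4)
    return (v == 62) ? 43 : 47;     // ranges 3/4: 62 -> '+' (-19), 63 -> '/' (-16)
}
```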
data/vendor/libbase64/lib/arch/neon64/codec.c
@@ -0,0 +1,70 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <string.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #ifdef __aarch64__
+ # if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64
+ # define BASE64_USE_NEON64
+ # endif
+ #endif
+
+ #ifdef BASE64_USE_NEON64
+ #include <arm_neon.h>
+
+ static inline uint8x16x4_t
+ load_64byte_table (const uint8_t *p)
+ {
+ #if defined(__GNUC__) && !defined(__clang__)
+     // As of October 2016, GCC does not support the 'vld1q_u8_x4()' intrinsic.
+     uint8x16x4_t ret;
+     ret.val[0] = vld1q_u8(p + 0);
+     ret.val[1] = vld1q_u8(p + 16);
+     ret.val[2] = vld1q_u8(p + 32);
+     ret.val[3] = vld1q_u8(p + 48);
+     return ret;
+ #else
+     return vld1q_u8_x4(p);
+ #endif
+ }
+
+ #include "../generic/32/dec_loop.c"
+ #include "../generic/64/enc_loop.c"
+ #include "dec_loop.c"
+ #include "enc_reshuffle.c"
+ #include "enc_loop.c"
+
+ #endif // BASE64_USE_NEON64
+
+ // Stride size is so large on these NEON 64-bit functions
+ // (48 bytes encode, 64 bytes decode) that we inline the
+ // uint64 codec to stay performant on smaller inputs.
+
+ BASE64_ENC_FUNCTION(neon64)
+ {
+ #ifdef BASE64_USE_NEON64
+     #include "../generic/enc_head.c"
+     enc_loop_neon64(&s, &slen, &o, &olen);
+     enc_loop_generic_64(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(neon64)
+ {
+ #ifdef BASE64_USE_NEON64
+     #include "../generic/dec_head.c"
+     dec_loop_neon64(&s, &slen, &o, &olen);
+     dec_loop_generic_32(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }
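
The stride comment above means the NEON loops only ever consume whole 48-byte (encode) or 64-byte (decode) blocks; whatever is left over falls through to the inlined generic codec and the shared head/tail fragments. A tiny sketch of that block arithmetic for the encoder, matching the `rounds = *slen / 48` logic in enc_loop.c further down (illustrative only):

```c
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    size_t slen = 1000;                     // arbitrary input length
    size_t rounds = slen / 48;              // full 48-byte blocks taken by the NEON loop
    size_t leftover = slen - rounds * 48;   // remainder handled by the generic loop + tail
    size_t produced = rounds * 64;          // the NEON loop emits 64 output bytes per block

    printf("NEON blocks: %zu (%zu bytes out), leftover: %zu bytes\n",
           rounds, produced, leftover);
    return 0;
}
```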
data/vendor/libbase64/lib/arch/neon64/dec_loop.c
@@ -0,0 +1,129 @@
+ // The input consists of five valid character sets in the Base64 alphabet,
+ // which we need to map back to the 6-bit values they represent.
+ // There are three ranges, two singles, and then there's the rest.
+ //
+ // #   From        To         LUT  Characters
+ // 1   [0..42]     [255]      #1   invalid input
+ // 2   [43]        [62]       #1   +
+ // 3   [44..46]    [255]      #1   invalid input
+ // 4   [47]        [63]       #1   /
+ // 5   [48..57]    [52..61]   #1   0..9
+ // 6   [58..63]    [255]      #1   invalid input
+ // 7   [64]        [255]      #2   invalid input
+ // 8   [65..90]    [0..25]    #2   A..Z
+ // 9   [91..96]    [255]      #2   invalid input
+ // 10  [97..122]   [26..51]   #2   a..z
+ // 11  [123..126]  [255]      #2   invalid input
+ // (12) Everything else => invalid input
+
+ // The first LUT will use the VTBL instruction (out of range indices are set to
+ // 0 in destination).
+ static const uint8_t dec_lut1[] = {
+     255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+     255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+     255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,
+      52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
+ };
+
+ // The second LUT will use the VTBX instruction (out of range indices will be
+ // unchanged in destination). Input [64..126] will be mapped to index [1..63]
+ // in this LUT. Index 0 means that value comes from LUT #1.
+ static const uint8_t dec_lut2[] = {
+       0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
+      14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
+     255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
+      40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
+ };
+
+ // All input values in range for the first look-up will be 0U in the second
+ // look-up result. All input values out of range for the first look-up will be
+ // 0U in the first look-up result. Thus, the two results can be ORed without
+ // conflicts.
+ //
+ // Invalid characters that are in the valid range for either look-up will be
+ // set to 255U in the combined result. Other invalid characters will just be
+ // passed through with the second look-up result (using the VTBX instruction).
+ // Since the second LUT is 64 bytes, those passed-through values are guaranteed
+ // to have a value greater than 63U. Therefore, valid characters will be mapped
+ // to the valid [0..63] range and all invalid characters will be mapped to
+ // values greater than 63.
+
+ static inline void
+ dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 64) {
+         return;
+     }
+
+     // Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+     // extra trailing zero bytes are written, so it is not necessary to
+     // reserve extra input bytes:
+     size_t rounds = *slen / 64;
+
+     *slen -= rounds * 64; // 64 bytes consumed per round
+     *olen += rounds * 48; // 48 bytes produced per round
+
+     const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
+     const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);
+
+     do {
+         const uint8x16_t offset = vdupq_n_u8(63U);
+         uint8x16x4_t dec1, dec2;
+         uint8x16x3_t dec;
+
+         // Load 64 bytes and deinterleave:
+         uint8x16x4_t str = vld4q_u8((uint8_t *) *s);
+
+         // Get indices for second LUT:
+         dec2.val[0] = vqsubq_u8(str.val[0], offset);
+         dec2.val[1] = vqsubq_u8(str.val[1], offset);
+         dec2.val[2] = vqsubq_u8(str.val[2], offset);
+         dec2.val[3] = vqsubq_u8(str.val[3], offset);
+
+         // Get values from first LUT:
+         dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
+         dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
+         dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
+         dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);
+
+         // Get values from second LUT:
+         dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
+         dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
+         dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
+         dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);
+
+         // Get final values:
+         str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
+         str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
+         str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
+         str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);
+
+         // Check for invalid input, any value larger than 63:
+         const uint8x16_t classified
+             = vcgtq_u8(str.val[0], vdupq_n_u8(63))
+             | vcgtq_u8(str.val[1], vdupq_n_u8(63))
+             | vcgtq_u8(str.val[2], vdupq_n_u8(63))
+             | vcgtq_u8(str.val[3], vdupq_n_u8(63));
+
+         // Check that all bits are zero:
+         if (vmaxvq_u8(classified) != 0U) {
+             break;
+         }
+
+         // Compress four bytes into three:
+         dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
+         dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
+         dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];
+
+         // Interleave and store decoded result:
+         vst3q_u8((uint8_t *) *o, dec);
+
+         *s += 64;
+         *o += 48;
+
+     } while (--rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 64;
+     *olen -= rounds * 48;
+ }
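
The final "compress four bytes into three" step is the inverse of the encoder's reshuffle. In scalar form, one group of four 6-bit values v[0]..v[3] packs back into three output bytes like this (a sketch for illustration, not part of the diff):

```c
#include <stdint.h>

// Scalar inverse of the reshuffle: pack four 6-bit values into three bytes,
// mirroring the vshlq/vshrq combination used in the NEON loop above.
static void compress_scalar(const uint8_t v[4], uint8_t out[3])
{
    out[0] = (uint8_t)((v[0] << 2) | (v[1] >> 4));   // all 6 bits of v0, top 2 of v1
    out[1] = (uint8_t)((v[1] << 4) | (v[2] >> 2));   // low 4 of v1, top 4 of v2
    out[2] = (uint8_t)((v[2] << 6) |  v[3]);         // low 2 of v2, all 6 of v3
}
```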
data/vendor/libbase64/lib/arch/neon64/enc_loop.c
@@ -0,0 +1,66 @@
+ static inline void
+ enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t *tbl_enc)
+ {
+     // Load 48 bytes and deinterleave:
+     uint8x16x3_t src = vld3q_u8(*s);
+
+     // Divide bits of three input bytes over four output bytes:
+     uint8x16x4_t out = enc_reshuffle(src);
+
+     // The bits have now been shifted to the right locations;
+     // translate their values 0..63 to the Base64 alphabet.
+     // Use a 64-byte table lookup:
+     out.val[0] = vqtbl4q_u8(*tbl_enc, out.val[0]);
+     out.val[1] = vqtbl4q_u8(*tbl_enc, out.val[1]);
+     out.val[2] = vqtbl4q_u8(*tbl_enc, out.val[2]);
+     out.val[3] = vqtbl4q_u8(*tbl_enc, out.val[3]);
+
+     // Interleave and store output:
+     vst4q_u8(*o, out);
+
+     *s += 48;
+     *o += 64;
+ }
+
+ static inline void
+ enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     size_t rounds = *slen / 48;
+
+     *slen -= rounds * 48; // 48 bytes consumed per round
+     *olen += rounds * 64; // 64 bytes produced per round
+
+     // Load the encoding table:
+     const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
+
+     while (rounds > 0) {
+         if (rounds >= 8) {
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_neon64_inner(s, o, &tbl_enc);
+         break;
+     }
+ }
data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c
@@ -0,0 +1,54 @@
+ static inline uint8x16x4_t
+ enc_reshuffle (const uint8x16x3_t in)
+ {
+     uint8x16x4_t out;
+
+ #if defined(__GNUC__) || defined(__clang__)
+
+     // GCC and Clang support the following inline assembly syntax. This
+     // inline assembly implements the exact same algorithm as the
+     // intrinsics further down, but benchmarks show that the inline
+     // assembly easily beats the intrinsics. Perhaps this is because the
+     // inline assembly is well pipelined to avoid data dependencies.
+
+     __asm__ (
+         "ushr %[o0].16b, %[i0].16b, #2 \n\t"
+         "ushr %[o1].16b, %[i1].16b, #2 \n\t"
+         "ushr %[o2].16b, %[i2].16b, #4 \n\t"
+         "sli  %[o1].16b, %[i0].16b, #6 \n\t"
+         "sli  %[o2].16b, %[i1].16b, #4 \n\t"
+         "shl  %[o3].16b, %[i2].16b, #2 \n\t"
+
+         "ushr %[o1].16b, %[o1].16b, #2 \n\t"
+         "ushr %[o2].16b, %[o2].16b, #2 \n\t"
+         "ushr %[o3].16b, %[o3].16b, #2 \n\t"
+
+         // Outputs:
+         : [o0] "=&w" (out.val[0]),
+           [o1] "=&w" (out.val[1]),
+           [o2] "=&w" (out.val[2]),
+           [o3] "=&w" (out.val[3])
+
+         // Inputs:
+         : [i0] "w" (in.val[0]),
+           [i1] "w" (in.val[1]),
+           [i2] "w" (in.val[2])
+     );
+ #else
+     // Divide bits of three input bytes over four output bytes. All output
+     // bytes except the first one are shifted over two bits to the left:
+     out.val[0] = vshrq_n_u8(in.val[0], 2);
+     out.val[1] = vshrq_n_u8(in.val[1], 2);
+     out.val[2] = vshrq_n_u8(in.val[2], 4);
+     out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
+     out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
+     out.val[3] = vshlq_n_u8(in.val[2], 2);
+
+     // Clear the top two bits by shifting the output back to the right:
+     out.val[1] = vshrq_n_u8(out.val[1], 2);
+     out.val[2] = vshrq_n_u8(out.val[2], 2);
+     out.val[3] = vshrq_n_u8(out.val[3], 2);
+ #endif
+
+     return out;
+ }
data/vendor/libbase64/lib/arch/sse41/codec.c
@@ -0,0 +1,42 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if HAVE_SSE41
+ #include <smmintrin.h>
+
+ #include "../ssse3/dec_reshuffle.c"
+ #include "../ssse3/dec_loop.c"
+ #include "../ssse3/enc_translate.c"
+ #include "../ssse3/enc_reshuffle.c"
+ #include "../ssse3/enc_loop.c"
+
+ #endif // HAVE_SSE41
+
+ BASE64_ENC_FUNCTION(sse41)
+ {
+ #if HAVE_SSE41
+     #include "../generic/enc_head.c"
+     enc_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(sse41)
+ {
+ #if HAVE_SSE41
+     #include "../generic/dec_head.c"
+     dec_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }
data/vendor/libbase64/lib/arch/sse42/codec.c
@@ -0,0 +1,42 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if HAVE_SSE42
+ #include <nmmintrin.h>
+
+ #include "../ssse3/dec_reshuffle.c"
+ #include "../ssse3/dec_loop.c"
+ #include "../ssse3/enc_translate.c"
+ #include "../ssse3/enc_reshuffle.c"
+ #include "../ssse3/enc_loop.c"
+
+ #endif // HAVE_SSE42
+
+ BASE64_ENC_FUNCTION(sse42)
+ {
+ #if HAVE_SSE42
+     #include "../generic/enc_head.c"
+     enc_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(sse42)
+ {
+ #if HAVE_SSE42
+     #include "../generic/dec_head.c"
+     dec_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }
data/vendor/libbase64/lib/arch/ssse3/codec.c
@@ -0,0 +1,42 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if HAVE_SSSE3
+ #include <tmmintrin.h>
+
+ #include "dec_reshuffle.c"
+ #include "dec_loop.c"
+ #include "enc_reshuffle.c"
+ #include "enc_translate.c"
+ #include "enc_loop.c"
+
+ #endif // HAVE_SSSE3
+
+ BASE64_ENC_FUNCTION(ssse3)
+ {
+ #if HAVE_SSSE3
+     #include "../generic/enc_head.c"
+     enc_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(ssse3)
+ {
+ #if HAVE_SSSE3
+     #include "../generic/dec_head.c"
+     dec_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }
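
The sse41 and sse42 codecs above reuse the SSSE3 loop bodies unchanged; they are compiled as separate objects so the runtime chooser in lib/codec_choose.c can register one entry point per instruction set. libbase64's public header also documents BASE64_FORCE_* flags for pinning a specific codec, which is mainly useful for benchmarking. A hedged sketch, with the flag name assumed from that header rather than shown in this diff:

```c
#include <string.h>
#include "libbase64.h"  // vendored header; BASE64_FORCE_SSSE3 assumed to be defined there

int main(void)
{
    char out[64];
    size_t outlen;

    // A BASE64_FORCE_* flag pins the codec instead of letting codec_choose.c
    // probe the CPU; passing 0 means "pick the best available codec".
    base64_encode("ob64", 4, out, &outlen, BASE64_FORCE_SSSE3);
    return 0;
}
```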