ob64 0.4.0 → 0.5.0

Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/lib/ob64/version.rb +1 -1
  5. data/ob64.gemspec +2 -0
  6. data/vendor/libbase64/.gitignore +12 -0
  7. data/vendor/libbase64/.travis.yml +71 -0
  8. data/vendor/libbase64/CMakeLists.txt +264 -0
  9. data/vendor/libbase64/LICENSE +28 -0
  10. data/vendor/libbase64/Makefile +93 -0
  11. data/vendor/libbase64/README.md +474 -0
  12. data/vendor/libbase64/base64-benchmarks.png +0 -0
  13. data/vendor/libbase64/bin/base64.c +132 -0
  14. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  15. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  16. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  17. data/vendor/libbase64/cmake/config.h.in +25 -0
  18. data/vendor/libbase64/cmake/test-arch.c +35 -0
  19. data/vendor/libbase64/include/libbase64.h +145 -0
  20. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  21. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  22. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  23. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  24. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  25. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  26. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  27. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  28. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  29. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  30. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  31. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  32. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  33. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  34. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  35. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  36. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  37. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  38. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  39. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  40. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  41. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  42. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  43. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  44. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  45. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  46. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  47. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  48. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  49. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  50. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  51. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  52. data/vendor/libbase64/lib/codec_choose.c +281 -0
  53. data/vendor/libbase64/lib/codecs.h +65 -0
  54. data/vendor/libbase64/lib/env.h +67 -0
  55. data/vendor/libbase64/lib/exports.txt +7 -0
  56. data/vendor/libbase64/lib/lib.c +164 -0
  57. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  58. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  59. data/vendor/libbase64/lib/tables/Makefile +17 -0
  60. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  61. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  62. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  63. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  64. data/vendor/libbase64/lib/tables/tables.c +40 -0
  65. data/vendor/libbase64/lib/tables/tables.h +23 -0
  66. metadata +64 -4

data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c
@@ -0,0 +1,54 @@
+ static inline uint8x16x4_t
+ enc_reshuffle (const uint8x16x3_t in)
+ {
+     uint8x16x4_t out;
+
+ #if defined(__GNUC__) || defined(__clang__)
+
+     // GCC and Clang support the following inline assembly syntax. This
+     // inline assembly implements the exact same algorithm as the
+     // intrinsics further down, but benchmarks show that the inline
+     // assembly easily beats the intrinsics. Perhaps this is because the
+     // inline assembly is well pipelined to avoid data dependencies.
+
+     __asm__ (
+         "vshr.u8 %q[o0], %q[i0], #2  \n\t"
+         "vshr.u8 %q[o1], %q[i1], #2  \n\t"
+         "vshr.u8 %q[o2], %q[i2], #4  \n\t"
+         "vsli.8  %q[o1], %q[i0], #6  \n\t"
+         "vsli.8  %q[o2], %q[i1], #4  \n\t"
+         "vshl.u8 %q[o3], %q[i2], #2  \n\t"
+
+         "vshr.u8 %q[o1], %q[o1], #2  \n\t"
+         "vshr.u8 %q[o2], %q[o2], #2  \n\t"
+         "vshr.u8 %q[o3], %q[o3], #2  \n\t"
+
+         // Outputs:
+         : [o0] "=&w" (out.val[0]),
+           [o1] "=&w" (out.val[1]),
+           [o2] "=&w" (out.val[2]),
+           [o3] "=&w" (out.val[3])
+
+         // Inputs:
+         : [i0] "w" (in.val[0]),
+           [i1] "w" (in.val[1]),
+           [i2] "w" (in.val[2])
+     );
+ #else
+     // Divide bits of three input bytes over four output bytes. All output
+     // bytes except the first one are shifted over two bits to the left:
+     out.val[0] = vshrq_n_u8(in.val[0], 2);
+     out.val[1] = vshrq_n_u8(in.val[1], 2);
+     out.val[2] = vshrq_n_u8(in.val[2], 4);
+     out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
+     out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
+     out.val[3] = vshlq_n_u8(in.val[2], 2);
+
+     // Clear the top two bits by shifting the output back to the right:
+     out.val[1] = vshrq_n_u8(out.val[1], 2);
+     out.val[2] = vshrq_n_u8(out.val[2], 2);
+     out.val[3] = vshrq_n_u8(out.val[3], 2);
+ #endif
+
+     return out;
+ }
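
For orientation, the reshuffle above is the vectorized form of the usual scalar bit split: each 3-byte group is spread over the low 6 bits of four output bytes, 16 groups at a time. A minimal scalar sketch (illustrative only; the helper name split_24bit_group is not part of libbase64):

    #include <stdint.h>

    // Spread the 24 bits of three input bytes over the low 6 bits of four
    // output bytes -- the scalar equivalent of one lane of enc_reshuffle().
    static void split_24bit_group(const uint8_t in[3], uint8_t out[4])
    {
        out[0] = in[0] >> 2;                           // top 6 bits of byte 0
        out[1] = ((in[0] & 0x03) << 4) | (in[1] >> 4); // low 2 of byte 0, top 4 of byte 1
        out[2] = ((in[1] & 0x0F) << 2) | (in[2] >> 6); // low 4 of byte 1, top 2 of byte 2
        out[3] = in[2] & 0x3F;                         // low 6 bits of byte 2
    }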

data/vendor/libbase64/lib/arch/neon32/enc_translate.c
@@ -0,0 +1,57 @@
+ static inline uint8x16x4_t
+ enc_translate (const uint8x16x4_t in)
+ {
+     // A lookup table containing the absolute offsets for all ranges:
+     const uint8x16_t lut = {
+          65U,  71U, 252U, 252U,
+         252U, 252U, 252U, 252U,
+         252U, 252U, 252U, 252U,
+         237U, 240U,   0U,   0U
+     };
+
+     const uint8x16_t offset = vdupq_n_u8(51);
+
+     uint8x16x4_t indices, mask, delta, out;
+
+     // Translate values 0..63 to the Base64 alphabet. There are five sets:
+     // #  From       To         Abs  Index    Characters
+     // 0  [0..25]    [65..90]   +65  0        ABCDEFGHIJKLMNOPQRSTUVWXYZ
+     // 1  [26..51]   [97..122]  +71  1        abcdefghijklmnopqrstuvwxyz
+     // 2  [52..61]   [48..57]    -4  [2..11]  0123456789
+     // 3  [62]       [43]       -19  12       +
+     // 4  [63]       [47]       -16  13       /
+
+     // Create LUT indices from the input:
+     // the index for range #0 is right, the others are 1 less than expected:
+     indices.val[0] = vqsubq_u8(in.val[0], offset);
+     indices.val[1] = vqsubq_u8(in.val[1], offset);
+     indices.val[2] = vqsubq_u8(in.val[2], offset);
+     indices.val[3] = vqsubq_u8(in.val[3], offset);
+
+     // mask is 0xFF (-1) for ranges #[1..4] and 0x00 for range #0:
+     mask.val[0] = vcgtq_u8(in.val[0], vdupq_n_u8(25));
+     mask.val[1] = vcgtq_u8(in.val[1], vdupq_n_u8(25));
+     mask.val[2] = vcgtq_u8(in.val[2], vdupq_n_u8(25));
+     mask.val[3] = vcgtq_u8(in.val[3], vdupq_n_u8(25));
+
+     // Subtract -1, i.e. add 1, to the indices for ranges #[1..4]. All
+     // indices are now correct:
+     indices.val[0] = vsubq_u8(indices.val[0], mask.val[0]);
+     indices.val[1] = vsubq_u8(indices.val[1], mask.val[1]);
+     indices.val[2] = vsubq_u8(indices.val[2], mask.val[2]);
+     indices.val[3] = vsubq_u8(indices.val[3], mask.val[3]);
+
+     // Lookup delta values:
+     delta.val[0] = vqtbl1q_u8(lut, indices.val[0]);
+     delta.val[1] = vqtbl1q_u8(lut, indices.val[1]);
+     delta.val[2] = vqtbl1q_u8(lut, indices.val[2]);
+     delta.val[3] = vqtbl1q_u8(lut, indices.val[3]);
+
+     // Add delta values:
+     out.val[0] = vaddq_u8(in.val[0], delta.val[0]);
+     out.val[1] = vaddq_u8(in.val[1], delta.val[1]);
+     out.val[2] = vaddq_u8(in.val[2], delta.val[2]);
+     out.val[3] = vaddq_u8(in.val[3], delta.val[3]);
+
+     return out;
+ }
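
The LUT entries above are just the five per-range deltas from the comment table (252, 237 and 240 are -4, -19 and -16 as unsigned bytes), and the saturating subtract plus compare picks the right index per byte. A minimal scalar sketch of the same translation (illustrative only; translate_6bit is not part of libbase64):

    #include <stdint.h>

    // Map a 6-bit value to its Base64 character by adding the per-range delta,
    // mirroring the LUT-based delta lookup in enc_translate().
    static uint8_t translate_6bit(uint8_t v)
    {
        if (v <= 25) return v + 65;  // 'A'..'Z'  (+65)
        if (v <= 51) return v + 71;  // 'a'..'z'  (+71)
        if (v <= 61) return v - 4;   // '0'..'9'  (-4)
        if (v == 62) return v - 19;  // '+'       (-19)
        return v - 16;               // '/'       (-16), v == 63
    }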

data/vendor/libbase64/lib/arch/neon64/codec.c
@@ -0,0 +1,70 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <string.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #ifdef __aarch64__
+ #  if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON64
+ #    define BASE64_USE_NEON64
+ #  endif
+ #endif
+
+ #ifdef BASE64_USE_NEON64
+ #include <arm_neon.h>
+
+ static inline uint8x16x4_t
+ load_64byte_table (const uint8_t *p)
+ {
+ #if defined(__GNUC__) && !defined(__clang__)
+     // As of October 2016, GCC does not support the 'vld1q_u8_x4()' intrinsic.
+     uint8x16x4_t ret;
+     ret.val[0] = vld1q_u8(p +  0);
+     ret.val[1] = vld1q_u8(p + 16);
+     ret.val[2] = vld1q_u8(p + 32);
+     ret.val[3] = vld1q_u8(p + 48);
+     return ret;
+ #else
+     return vld1q_u8_x4(p);
+ #endif
+ }
+
+ #include "../generic/32/dec_loop.c"
+ #include "../generic/64/enc_loop.c"
+ #include "dec_loop.c"
+ #include "enc_reshuffle.c"
+ #include "enc_loop.c"
+
+ #endif  // BASE64_USE_NEON64
+
+ // Stride size is so large on these NEON 64-bit functions
+ // (48 bytes encode, 64 bytes decode) that we inline the
+ // uint64 codec to stay performant on smaller inputs.
+
+ BASE64_ENC_FUNCTION(neon64)
+ {
+ #ifdef BASE64_USE_NEON64
+     #include "../generic/enc_head.c"
+     enc_loop_neon64(&s, &slen, &o, &olen);
+     enc_loop_generic_64(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(neon64)
+ {
+ #ifdef BASE64_USE_NEON64
+     #include "../generic/dec_head.c"
+     dec_loop_neon64(&s, &slen, &o, &olen);
+     dec_loop_generic_32(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }
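
The vendored lib/codec_choose.c selects one of these BASE64_ENC_FUNCTION/BASE64_DEC_FUNCTION implementations at runtime. A minimal usage sketch of the one-shot C API, assuming the base64_encode(src, srclen, out, outlen, flags) signature declared in the vendored include/libbase64.h; passing flags = 0 lets the library pick the best available codec (NEON64 on aarch64):

    #include <stdio.h>
    #include <string.h>
    #include "libbase64.h"   // assumes the vendored include/ directory is on the include path

    int main(void)
    {
        const char *src = "Hello, world!";
        char out[64];        // at least 4/3 of the input length, rounded up to a multiple of 4
        size_t outlen = 0;

        base64_encode(src, strlen(src), out, &outlen, 0);
        printf("%.*s\n", (int) outlen, out);   // prints SGVsbG8sIHdvcmxkIQ==
        return 0;
    }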

data/vendor/libbase64/lib/arch/neon64/dec_loop.c
@@ -0,0 +1,129 @@
+ // The input consists of five valid character sets in the Base64 alphabet,
+ // which we need to map back to the 6-bit values they represent.
+ // There are three ranges, two singles, and then there's the rest.
+ //
+ // #   From        To         LUT  Characters
+ // 1   [0..42]     [255]      #1   invalid input
+ // 2   [43]        [62]       #1   +
+ // 3   [44..46]    [255]      #1   invalid input
+ // 4   [47]        [63]       #1   /
+ // 5   [48..57]    [52..61]   #1   0..9
+ // 6   [58..63]    [255]      #1   invalid input
+ // 7   [64]        [255]      #2   invalid input
+ // 8   [65..90]    [0..25]    #2   A..Z
+ // 9   [91..96]    [255]      #2   invalid input
+ // 10  [97..122]   [26..51]   #2   a..z
+ // 11  [123..126]  [255]      #2   invalid input
+ // (12) Everything else => invalid input
+
+ // The first LUT will use the VTBL instruction (out of range indices are set to
+ // 0 in destination).
+ static const uint8_t dec_lut1[] = {
+     255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+     255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
+     255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,
+      52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,
+ };
+
+ // The second LUT will use the VTBX instruction (out of range indices will be
+ // unchanged in destination). Input [64..126] will be mapped to index [1..63]
+ // in this LUT. Index 0 means that value comes from LUT #1.
+ static const uint8_t dec_lut2[] = {
+       0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,
+      14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,
+     255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,
+      40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,
+ };
+
+ // All input values in range for the first look-up will be 0U in the second
+ // look-up result. All input values out of range for the first look-up will be
+ // 0U in the first look-up result. Thus, the two results can be ORed without
+ // conflicts.
+ //
+ // Invalid characters that are in the valid range for either look-up will be
+ // set to 255U in the combined result. Other invalid characters will just be
+ // passed through with the second look-up result (using the VTBX instruction).
+ // Since the second LUT is 64 bytes, those passed-through values are guaranteed
+ // to have a value greater than 63U. Therefore, valid characters will be mapped
+ // to the valid [0..63] range and all invalid characters will be mapped to
+ // values greater than 63.
+
+ static inline void
+ dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 64) {
+         return;
+     }
+
+     // Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+     // extra trailing zero bytes are written, so it is not necessary to
+     // reserve extra input bytes:
+     size_t rounds = *slen / 64;
+
+     *slen -= rounds * 64;   // 64 bytes consumed per round
+     *olen += rounds * 48;   // 48 bytes produced per round
+
+     const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
+     const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);
+
+     do {
+         const uint8x16_t offset = vdupq_n_u8(63U);
+         uint8x16x4_t dec1, dec2;
+         uint8x16x3_t dec;
+
+         // Load 64 bytes and deinterleave:
+         uint8x16x4_t str = vld4q_u8((uint8_t *) *s);
+
+         // Get indices for second LUT:
+         dec2.val[0] = vqsubq_u8(str.val[0], offset);
+         dec2.val[1] = vqsubq_u8(str.val[1], offset);
+         dec2.val[2] = vqsubq_u8(str.val[2], offset);
+         dec2.val[3] = vqsubq_u8(str.val[3], offset);
+
+         // Get values from first LUT:
+         dec1.val[0] = vqtbl4q_u8(tbl_dec1, str.val[0]);
+         dec1.val[1] = vqtbl4q_u8(tbl_dec1, str.val[1]);
+         dec1.val[2] = vqtbl4q_u8(tbl_dec1, str.val[2]);
+         dec1.val[3] = vqtbl4q_u8(tbl_dec1, str.val[3]);
+
+         // Get values from second LUT:
+         dec2.val[0] = vqtbx4q_u8(dec2.val[0], tbl_dec2, dec2.val[0]);
+         dec2.val[1] = vqtbx4q_u8(dec2.val[1], tbl_dec2, dec2.val[1]);
+         dec2.val[2] = vqtbx4q_u8(dec2.val[2], tbl_dec2, dec2.val[2]);
+         dec2.val[3] = vqtbx4q_u8(dec2.val[3], tbl_dec2, dec2.val[3]);
+
+         // Get final values:
+         str.val[0] = vorrq_u8(dec1.val[0], dec2.val[0]);
+         str.val[1] = vorrq_u8(dec1.val[1], dec2.val[1]);
+         str.val[2] = vorrq_u8(dec1.val[2], dec2.val[2]);
+         str.val[3] = vorrq_u8(dec1.val[3], dec2.val[3]);
+
+         // Check for invalid input, any value larger than 63:
+         const uint8x16_t classified
+             = vcgtq_u8(str.val[0], vdupq_n_u8(63))
+             | vcgtq_u8(str.val[1], vdupq_n_u8(63))
+             | vcgtq_u8(str.val[2], vdupq_n_u8(63))
+             | vcgtq_u8(str.val[3], vdupq_n_u8(63));
+
+         // Check that all bits are zero:
+         if (vmaxvq_u8(classified) != 0U) {
+             break;
+         }
+
+         // Compress four bytes into three:
+         dec.val[0] = vshlq_n_u8(str.val[0], 2) | vshrq_n_u8(str.val[1], 4);
+         dec.val[1] = vshlq_n_u8(str.val[1], 4) | vshrq_n_u8(str.val[2], 2);
+         dec.val[2] = vshlq_n_u8(str.val[2], 6) | str.val[3];
+
+         // Interleave and store decoded result:
+         vst3q_u8((uint8_t *) *o, dec);
+
+         *s += 64;
+         *o += 48;
+
+     } while (--rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 64;
+     *olen -= rounds * 48;
+ }
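
The invalid-input handling above boils down to: look each byte up in both tables, OR the results, and reject the whole 64-byte block if any combined value exceeds 63. A scalar sketch of that per-byte logic (illustrative only; decode_byte is not part of libbase64 and reuses the dec_lut1/dec_lut2 arrays defined above):

    #include <stdint.h>

    // Scalar analogue of the two-table decode: dec_lut1 covers bytes 0..63,
    // dec_lut2 covers bytes 64..127 via the saturating "subtract 63" index
    // (index 0 falls through to table 1). Anything above 63 means invalid input.
    static int decode_byte(uint8_t c)
    {
        uint8_t idx2 = (c > 63) ? (uint8_t) (c - 63) : 0; // like vqsubq_u8(c, 63)
        uint8_t v1 = (c < 64) ? dec_lut1[c] : 0;          // VTBL: out-of-range -> 0
        uint8_t v2 = (idx2 < 64) ? dec_lut2[idx2] : idx2; // VTBX: out-of-range -> unchanged
        uint8_t v = (uint8_t) (v1 | v2);
        return (v > 63) ? -1 : v;                         // -1 signals invalid input
    }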

data/vendor/libbase64/lib/arch/neon64/enc_loop.c
@@ -0,0 +1,66 @@
+ static inline void
+ enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t *tbl_enc)
+ {
+     // Load 48 bytes and deinterleave:
+     uint8x16x3_t src = vld3q_u8(*s);
+
+     // Divide bits of three input bytes over four output bytes:
+     uint8x16x4_t out = enc_reshuffle(src);
+
+     // The bits have now been shifted to the right locations;
+     // translate their values 0..63 to the Base64 alphabet.
+     // Use a 64-byte table lookup:
+     out.val[0] = vqtbl4q_u8(*tbl_enc, out.val[0]);
+     out.val[1] = vqtbl4q_u8(*tbl_enc, out.val[1]);
+     out.val[2] = vqtbl4q_u8(*tbl_enc, out.val[2]);
+     out.val[3] = vqtbl4q_u8(*tbl_enc, out.val[3]);
+
+     // Interleave and store output:
+     vst4q_u8(*o, out);
+
+     *s += 48;
+     *o += 64;
+ }
+
+ static inline void
+ enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     size_t rounds = *slen / 48;
+
+     *slen -= rounds * 48;   // 48 bytes consumed per round
+     *olen += rounds * 64;   // 64 bytes produced per round
+
+     // Load the encoding table:
+     const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
+
+     while (rounds > 0) {
+         if (rounds >= 8) {
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             enc_loop_neon64_inner(s, o, &tbl_enc);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_neon64_inner(s, o, &tbl_enc);
+         break;
+     }
+ }
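
As a worked example of the stride arithmetic: for a 1000-byte input, rounds = 1000 / 48 = 20, so this loop consumes 960 input bytes and emits 20 * 64 = 1280 output bytes; the remaining 40 bytes fall through to enc_loop_generic_64 and the generic tail included from codec.c above.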

data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c
@@ -0,0 +1,54 @@
+ static inline uint8x16x4_t
+ enc_reshuffle (const uint8x16x3_t in)
+ {
+     uint8x16x4_t out;
+
+ #if defined(__GNUC__) || defined(__clang__)
+
+     // GCC and Clang support the following inline assembly syntax. This
+     // inline assembly implements the exact same algorithm as the
+     // intrinsics further down, but benchmarks show that the inline
+     // assembly easily beats the intrinsics. Perhaps this is because the
+     // inline assembly is well pipelined to avoid data dependencies.
+
+     __asm__ (
+         "ushr %[o0].16b, %[i0].16b, #2  \n\t"
+         "ushr %[o1].16b, %[i1].16b, #2  \n\t"
+         "ushr %[o2].16b, %[i2].16b, #4  \n\t"
+         "sli  %[o1].16b, %[i0].16b, #6  \n\t"
+         "sli  %[o2].16b, %[i1].16b, #4  \n\t"
+         "shl  %[o3].16b, %[i2].16b, #2  \n\t"
+
+         "ushr %[o1].16b, %[o1].16b, #2  \n\t"
+         "ushr %[o2].16b, %[o2].16b, #2  \n\t"
+         "ushr %[o3].16b, %[o3].16b, #2  \n\t"
+
+         // Outputs:
+         : [o0] "=&w" (out.val[0]),
+           [o1] "=&w" (out.val[1]),
+           [o2] "=&w" (out.val[2]),
+           [o3] "=&w" (out.val[3])
+
+         // Inputs:
+         : [i0] "w" (in.val[0]),
+           [i1] "w" (in.val[1]),
+           [i2] "w" (in.val[2])
+     );
+ #else
+     // Divide bits of three input bytes over four output bytes. All output
+     // bytes except the first one are shifted over two bits to the left:
+     out.val[0] = vshrq_n_u8(in.val[0], 2);
+     out.val[1] = vshrq_n_u8(in.val[1], 2);
+     out.val[2] = vshrq_n_u8(in.val[2], 4);
+     out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
+     out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
+     out.val[3] = vshlq_n_u8(in.val[2], 2);
+
+     // Clear the top two bits by shifting the output back to the right:
+     out.val[1] = vshrq_n_u8(out.val[1], 2);
+     out.val[2] = vshrq_n_u8(out.val[2], 2);
+     out.val[3] = vshrq_n_u8(out.val[3], 2);
+ #endif
+
+     return out;
+ }

data/vendor/libbase64/lib/arch/sse41/codec.c
@@ -0,0 +1,42 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if HAVE_SSE41
+ #include <smmintrin.h>
+
+ #include "../ssse3/dec_reshuffle.c"
+ #include "../ssse3/dec_loop.c"
+ #include "../ssse3/enc_translate.c"
+ #include "../ssse3/enc_reshuffle.c"
+ #include "../ssse3/enc_loop.c"
+
+ #endif  // HAVE_SSE41
+
+ BASE64_ENC_FUNCTION(sse41)
+ {
+ #if HAVE_SSE41
+     #include "../generic/enc_head.c"
+     enc_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(sse41)
+ {
+ #if HAVE_SSE41
+     #include "../generic/dec_head.c"
+     dec_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }

data/vendor/libbase64/lib/arch/sse42/codec.c
@@ -0,0 +1,42 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if HAVE_SSE42
+ #include <nmmintrin.h>
+
+ #include "../ssse3/dec_reshuffle.c"
+ #include "../ssse3/dec_loop.c"
+ #include "../ssse3/enc_translate.c"
+ #include "../ssse3/enc_reshuffle.c"
+ #include "../ssse3/enc_loop.c"
+
+ #endif  // HAVE_SSE42
+
+ BASE64_ENC_FUNCTION(sse42)
+ {
+ #if HAVE_SSE42
+     #include "../generic/enc_head.c"
+     enc_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(sse42)
+ {
+ #if HAVE_SSE42
+     #include "../generic/dec_head.c"
+     dec_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }

data/vendor/libbase64/lib/arch/ssse3/codec.c
@@ -0,0 +1,42 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if HAVE_SSSE3
+ #include <tmmintrin.h>
+
+ #include "dec_reshuffle.c"
+ #include "dec_loop.c"
+ #include "enc_reshuffle.c"
+ #include "enc_translate.c"
+ #include "enc_loop.c"
+
+ #endif  // HAVE_SSSE3
+
+ BASE64_ENC_FUNCTION(ssse3)
+ {
+ #if HAVE_SSSE3
+     #include "../generic/enc_head.c"
+     enc_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(ssse3)
+ {
+ #if HAVE_SSSE3
+     #include "../generic/dec_head.c"
+     dec_loop_ssse3(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }