ob64 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/lib/ob64/version.rb +1 -1
  5. data/ob64.gemspec +2 -0
  6. data/vendor/libbase64/.gitignore +12 -0
  7. data/vendor/libbase64/.travis.yml +71 -0
  8. data/vendor/libbase64/CMakeLists.txt +264 -0
  9. data/vendor/libbase64/LICENSE +28 -0
  10. data/vendor/libbase64/Makefile +93 -0
  11. data/vendor/libbase64/README.md +474 -0
  12. data/vendor/libbase64/base64-benchmarks.png +0 -0
  13. data/vendor/libbase64/bin/base64.c +132 -0
  14. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  15. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  16. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  17. data/vendor/libbase64/cmake/config.h.in +25 -0
  18. data/vendor/libbase64/cmake/test-arch.c +35 -0
  19. data/vendor/libbase64/include/libbase64.h +145 -0
  20. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  21. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  22. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  23. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  24. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  25. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  26. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  27. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  28. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  29. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  30. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  31. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  32. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  33. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  34. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  35. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  36. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  37. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  38. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  39. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  40. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  41. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  42. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  43. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  44. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  45. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  46. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  47. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  48. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  49. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  50. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  51. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  52. data/vendor/libbase64/lib/codec_choose.c +281 -0
  53. data/vendor/libbase64/lib/codecs.h +65 -0
  54. data/vendor/libbase64/lib/env.h +67 -0
  55. data/vendor/libbase64/lib/exports.txt +7 -0
  56. data/vendor/libbase64/lib/lib.c +164 -0
  57. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  58. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  59. data/vendor/libbase64/lib/tables/Makefile +17 -0
  60. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  61. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  62. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  63. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  64. data/vendor/libbase64/lib/tables/tables.c +40 -0
  65. data/vendor/libbase64/lib/tables/tables.h +23 -0
  66. metadata +64 -4
@@ -0,0 +1,25 @@
1
+ #ifndef BASE64_CONFIG_H
2
+ #define BASE64_CONFIG_H
3
+
4
+ #cmakedefine01 BASE64_WITH_SSSE3
5
+ #define HAVE_SSSE3 BASE64_WITH_SSSE3
6
+
7
+ #cmakedefine01 BASE64_WITH_SSE41
8
+ #define HAVE_SSE41 BASE64_WITH_SSE41
9
+
10
+ #cmakedefine01 BASE64_WITH_SSE42
11
+ #define HAVE_SSE42 BASE64_WITH_SSE42
12
+
13
+ #cmakedefine01 BASE64_WITH_AVX
14
+ #define HAVE_AVX BASE64_WITH_AVX
15
+
16
+ #cmakedefine01 BASE64_WITH_AVX2
17
+ #define HAVE_AVX2 BASE64_WITH_AVX2
18
+
19
+ #cmakedefine01 BASE64_WITH_NEON32
20
+ #define HAVE_NEON32 BASE64_WITH_NEON32
21
+
22
+ #cmakedefine01 BASE64_WITH_NEON64
23
+ #define HAVE_NEON64 BASE64_WITH_NEON64
24
+
25
+ #endif // BASE64_CONFIG_H
@@ -0,0 +1,35 @@
1
+ // Written in 2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
2
+ //
3
+ // To the extent possible under law, the author(s) have dedicated all
4
+ // copyright and related and neighboring rights to this software to the
5
+ // public domain worldwide. This software is distributed without any warranty.
6
+ //
7
+ // You should have received a copy of the CC0 Public Domain Dedication
8
+ // along with this software. If not, see
9
+ //
10
+ // http://creativecommons.org/publicdomain/zero/1.0/
11
+ //
12
+ ////////////////////////////////////////////////////////////////////////////////
13
+
14
+ // ARM 64-Bit
15
+ #if defined(__aarch64__)
16
+ #error ##arch=arm64##
17
+
18
+ // ARM 32-Bit
19
+ #elif defined(__arm__) \
20
+ || defined(_M_ARM)
21
+ #error ##arch=arm##
22
+
23
+ // x86 64-Bit
24
+ #elif defined(__x86_64__) \
25
+ || defined(_M_X64)
26
+ #error ##arch=x64##
27
+
28
+ // x86 32-Bit
29
+ #elif defined(__i386__) \
30
+ || defined(_M_IX86)
31
+ #error ##arch=x86##
32
+
33
+ #else
34
+ #error ##arch=unknown##
35
+ #endif
@@ -0,0 +1,145 @@
1
+ #ifndef LIBBASE64_H
2
+ #define LIBBASE64_H
3
+
4
+ #include <stddef.h> /* size_t */
5
+
6
+
7
+ #if defined(_WIN32) || defined(__CYGWIN__)
8
+ #define BASE64_SYMBOL_IMPORT __declspec(dllimport)
9
+ #define BASE64_SYMBOL_EXPORT __declspec(dllexport)
10
+ #define BASE64_SYMBOL_PRIVATE
11
+
12
+ #elif __GNUC__ >= 4
13
+ #define BASE64_SYMBOL_IMPORT __attribute__ ((visibility ("default")))
14
+ #define BASE64_SYMBOL_EXPORT __attribute__ ((visibility ("default")))
15
+ #define BASE64_SYMBOL_PRIVATE __attribute__ ((visibility ("hidden")))
16
+
17
+ #else
18
+ #define BASE64_SYMBOL_IMPORT
19
+ #define BASE64_SYMBOL_EXPORT
20
+ #define BASE64_SYMBOL_PRIVATE
21
+ #endif
22
+
23
+ #if defined(BASE64_STATIC_DEFINE)
24
+ #define BASE64_EXPORT
25
+ #define BASE64_NO_EXPORT
26
+
27
+ #else
28
+ #if defined(BASE64_EXPORTS) // defined if we are building the shared library
29
+ #define BASE64_EXPORT BASE64_SYMBOL_EXPORT
30
+
31
+ #else
32
+ #define BASE64_EXPORT BASE64_SYMBOL_IMPORT
33
+ #endif
34
+
35
+ #define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
36
+ #endif
37
+
38
+
39
+ #ifdef __cplusplus
40
+ extern "C" {
41
+ #endif
42
+
43
+ /* These are the flags that can be passed in the `flags` argument. The values
44
+ * below force the use of a given codec, even if that codec is a no-op in the
45
+ * current build. Used in testing. Set to 0 for the default behavior, which is
46
+ * runtime feature detection on x86, a compile-time fixed codec on ARM, and
47
+ * the plain codec on other platforms: */
48
+ #define BASE64_FORCE_AVX2 (1 << 0)
49
+ #define BASE64_FORCE_NEON32 (1 << 1)
50
+ #define BASE64_FORCE_NEON64 (1 << 2)
51
+ #define BASE64_FORCE_PLAIN (1 << 3)
52
+ #define BASE64_FORCE_SSSE3 (1 << 4)
53
+ #define BASE64_FORCE_SSE41 (1 << 5)
54
+ #define BASE64_FORCE_SSE42 (1 << 6)
55
+ #define BASE64_FORCE_AVX (1 << 7)
56
+
57
+ struct base64_state {
58
+ int eof;
59
+ int bytes;
60
+ int flags;
61
+ unsigned char carry;
62
+ };
63
+
64
+ /* Wrapper function to encode a plain string of given length. Output is written
65
+ * to *out without trailing zero. Output length in bytes is written to *outlen.
66
+ * The buffer in `out` has been allocated by the caller and is at least 4/3 the
67
+ * size of the input. See above for `flags`; set to 0 for default operation: */
68
+ void BASE64_EXPORT base64_encode
69
+ ( const char *src
70
+ , size_t srclen
71
+ , char *out
72
+ , size_t *outlen
73
+ , int flags
74
+ ) ;
75
+
76
+ /* Call this before calling base64_stream_encode() to init the state. See above
77
+ * for `flags`; set to 0 for default operation: */
78
+ void BASE64_EXPORT base64_stream_encode_init
79
+ ( struct base64_state *state
80
+ , int flags
81
+ ) ;
82
+
83
+ /* Encodes the block of data of given length at `src`, into the buffer at
84
+ * `out`. Caller is responsible for allocating a large enough out-buffer; it
85
+ * must be at least 4/3 the size of the in-buffer, but take some margin. Places
86
+ * the number of new bytes written into `outlen` (which is set to zero when the
87
+ * function starts). Does not zero-terminate or finalize the output. */
88
+ void BASE64_EXPORT base64_stream_encode
89
+ ( struct base64_state *state
90
+ , const char *src
91
+ , size_t srclen
92
+ , char *out
93
+ , size_t *outlen
94
+ ) ;
95
+
96
+ /* Finalizes the output begun by previous calls to `base64_stream_encode()`.
97
+ * Adds the required end-of-stream markers if appropriate. `outlen` is modified
98
+ * and will contain the number of new bytes written at `out` (which will quite
99
+ * often be zero). */
100
+ void BASE64_EXPORT base64_stream_encode_final
101
+ ( struct base64_state *state
102
+ , char *out
103
+ , size_t *outlen
104
+ ) ;
105
+
106
+ /* Wrapper function to decode a plain string of given length. Output is written
107
+ * to *out without trailing zero. Output length in bytes is written to *outlen.
108
+ * The buffer in `out` has been allocated by the caller and is at least 3/4 the
109
+ * size of the input. See above for `flags`, set to 0 for default operation: */
110
+ int BASE64_EXPORT base64_decode
111
+ ( const char *src
112
+ , size_t srclen
113
+ , char *out
114
+ , size_t *outlen
115
+ , int flags
116
+ ) ;
117
+
118
+ /* Call this before calling base64_stream_decode() to init the state. See above
119
+ * for `flags`; set to 0 for default operation: */
120
+ void BASE64_EXPORT base64_stream_decode_init
121
+ ( struct base64_state *state
122
+ , int flags
123
+ ) ;
124
+
125
+ /* Decodes the block of data of given length at `src`, into the buffer at
126
+ * `out`. Caller is responsible for allocating a large enough out-buffer; it
127
+ * must be at least 3/4 the size of the in-buffer, but take some margin. Places
128
+ * the number of new bytes written into `outlen` (which is set to zero when the
129
+ * function starts). Does not zero-terminate the output. Returns 1 if all is
130
+ * well, and 0 if a decoding error was found, such as an invalid character.
131
+ * Returns -1 if the chosen codec is not included in the current build. Used by
132
+ * the test harness to check whether a codec is available for testing. */
133
+ int BASE64_EXPORT base64_stream_decode
134
+ ( struct base64_state *state
135
+ , const char *src
136
+ , size_t srclen
137
+ , char *out
138
+ , size_t *outlen
139
+ ) ;
140
+
141
+ #ifdef __cplusplus
142
+ }
143
+ #endif
144
+
145
+ #endif /* LIBBASE64_H */
@@ -0,0 +1,42 @@
1
+ #include <stdint.h>
2
+ #include <stddef.h>
3
+ #include <stdlib.h>
4
+
5
+ #include "../../../include/libbase64.h"
6
+ #include "../../tables/tables.h"
7
+ #include "../../codecs.h"
8
+ #include "config.h"
9
+ #include "../../env.h"
10
+
11
+ #if HAVE_AVX
12
+ #include <immintrin.h>
13
+
14
+ #include "../ssse3/dec_reshuffle.c"
15
+ #include "../ssse3/dec_loop.c"
16
+ #include "../ssse3/enc_translate.c"
17
+ #include "../ssse3/enc_reshuffle.c"
18
+ #include "../ssse3/enc_loop.c"
19
+
20
+ #endif // HAVE_AVX
21
+
22
+ BASE64_ENC_FUNCTION(avx)
23
+ {
24
+ #if HAVE_AVX
25
+ #include "../generic/enc_head.c"
26
+ enc_loop_ssse3(&s, &slen, &o, &olen);
27
+ #include "../generic/enc_tail.c"
28
+ #else
29
+ BASE64_ENC_STUB
30
+ #endif
31
+ }
32
+
33
+ BASE64_DEC_FUNCTION(avx)
34
+ {
35
+ #if HAVE_AVX
36
+ #include "../generic/dec_head.c"
37
+ dec_loop_ssse3(&s, &slen, &o, &olen);
38
+ #include "../generic/dec_tail.c"
39
+ #else
40
+ BASE64_DEC_STUB
41
+ #endif
42
+ }
@@ -0,0 +1,42 @@
1
+ #include <stdint.h>
2
+ #include <stddef.h>
3
+ #include <stdlib.h>
4
+
5
+ #include "../../../include/libbase64.h"
6
+ #include "../../tables/tables.h"
7
+ #include "../../codecs.h"
8
+ #include "config.h"
9
+ #include "../../env.h"
10
+
11
+ #if HAVE_AVX2
12
+ #include <immintrin.h>
13
+
14
+ #include "dec_reshuffle.c"
15
+ #include "dec_loop.c"
16
+ #include "enc_translate.c"
17
+ #include "enc_reshuffle.c"
18
+ #include "enc_loop.c"
19
+
20
+ #endif // HAVE_AVX2
21
+
22
+ BASE64_ENC_FUNCTION(avx2)
23
+ {
24
+ #if HAVE_AVX2
25
+ #include "../generic/enc_head.c"
26
+ enc_loop_avx2(&s, &slen, &o, &olen);
27
+ #include "../generic/enc_tail.c"
28
+ #else
29
+ BASE64_ENC_STUB
30
+ #endif
31
+ }
32
+
33
+ BASE64_DEC_FUNCTION(avx2)
34
+ {
35
+ #if HAVE_AVX2
36
+ #include "../generic/dec_head.c"
37
+ dec_loop_avx2(&s, &slen, &o, &olen);
38
+ #include "../generic/dec_tail.c"
39
+ #else
40
+ BASE64_DEC_STUB
41
+ #endif
42
+ }
@@ -0,0 +1,110 @@
1
+ static inline int
2
+ dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
3
+ {
4
+ const __m256i lut_lo = _mm256_setr_epi8(
5
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
6
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
7
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
8
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
9
+
10
+ const __m256i lut_hi = _mm256_setr_epi8(
11
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
12
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
13
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
14
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
15
+
16
+ const __m256i lut_roll = _mm256_setr_epi8(
17
+ 0, 16, 19, 4, -65, -65, -71, -71,
18
+ 0, 0, 0, 0, 0, 0, 0, 0,
19
+ 0, 16, 19, 4, -65, -65, -71, -71,
20
+ 0, 0, 0, 0, 0, 0, 0, 0);
21
+
22
+ const __m256i mask_2F = _mm256_set1_epi8(0x2F);
23
+
24
+ // Load input:
25
+ __m256i str = _mm256_loadu_si256((__m256i *) *s);
26
+
27
+ // See the SSSE3 decoder for an explanation of the algorithm.
28
+ const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
29
+ const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
30
+ const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
31
+ const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
32
+
33
+ if (!_mm256_testz_si256(lo, hi)) {
34
+ return 0;
35
+ }
36
+
37
+ const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
38
+ const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
39
+
40
+ // Now simply add the delta values to the input:
41
+ str = _mm256_add_epi8(str, roll);
42
+
43
+ // Reshuffle the input to packed 12-byte output format:
44
+ str = dec_reshuffle(str);
45
+
46
+ // Store the output:
47
+ _mm256_storeu_si256((__m256i *) *o, str);
48
+
49
+ *s += 32;
50
+ *o += 24;
51
+ *rounds -= 1;
52
+
53
+ return 1;
54
+ }
55
+
56
+ static inline void
57
+ dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
58
+ {
59
+ if (*slen < 45) {
60
+ return;
61
+ }
62
+
63
+ // Process blocks of 32 bytes per round. Because 8 extra zero bytes are
64
+ // written after the output, ensure that there will be at least 13
65
+ // bytes of input data left to cover the gap. (11 data bytes and up to
66
+ // two end-of-string markers.)
67
+ size_t rounds = (*slen - 13) / 32;
68
+
69
+ *slen -= rounds * 32; // 32 bytes consumed per round
70
+ *olen += rounds * 24; // 24 bytes produced per round
71
+
72
+ do {
73
+ if (rounds >= 8) {
74
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
75
+ dec_loop_avx2_inner(s, o, &rounds) &&
76
+ dec_loop_avx2_inner(s, o, &rounds) &&
77
+ dec_loop_avx2_inner(s, o, &rounds) &&
78
+ dec_loop_avx2_inner(s, o, &rounds) &&
79
+ dec_loop_avx2_inner(s, o, &rounds) &&
80
+ dec_loop_avx2_inner(s, o, &rounds) &&
81
+ dec_loop_avx2_inner(s, o, &rounds)) {
82
+ continue;
83
+ }
84
+ break;
85
+ }
86
+ if (rounds >= 4) {
87
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
88
+ dec_loop_avx2_inner(s, o, &rounds) &&
89
+ dec_loop_avx2_inner(s, o, &rounds) &&
90
+ dec_loop_avx2_inner(s, o, &rounds)) {
91
+ continue;
92
+ }
93
+ break;
94
+ }
95
+ if (rounds >= 2) {
96
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
97
+ dec_loop_avx2_inner(s, o, &rounds)) {
98
+ continue;
99
+ }
100
+ break;
101
+ }
102
+ dec_loop_avx2_inner(s, o, &rounds);
103
+ break;
104
+
105
+ } while (rounds > 0);
106
+
107
+ // Adjust for any rounds that were skipped:
108
+ *slen += rounds * 32;
109
+ *olen -= rounds * 24;
110
+ }
@@ -0,0 +1,34 @@
1
+ static inline __m256i
2
+ dec_reshuffle (const __m256i in)
3
+ {
4
+ // in, lower lane, bits, upper case are most significant bits, lower
5
+ // case are least significant bits:
6
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
7
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
8
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
9
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
10
+
11
+ const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
12
+ // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
13
+ // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
14
+ // 0000eeee FFffffff 0000DDDD DDddEEEE
15
+ // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
16
+
17
+ __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
18
+ // 00000000 JJJJJJjj KKKKkkkk LLllllll
19
+ // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
20
+ // 00000000 DDDDDDdd EEEEeeee FFffffff
21
+ // 00000000 AAAAAAaa BBBBbbbb CCcccccc
22
+
23
+ // Pack bytes together in each lane:
24
+ out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
25
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
26
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
27
+ // 00000000 00000000 00000000 00000000
28
+ // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
29
+ // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
30
+ // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
31
+
32
+ // Pack lanes:
33
+ return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
34
+ }
@@ -0,0 +1,89 @@
1
+ static inline void
2
+ enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
3
+ {
4
+ // First load is done at s - 0 to not get a segfault:
5
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
6
+
7
+ // Shift by 4 bytes, as required by enc_reshuffle:
8
+ src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
9
+
10
+ // Reshuffle, translate, store:
11
+ src = enc_reshuffle(src);
12
+ src = enc_translate(src);
13
+ _mm256_storeu_si256((__m256i *) *o, src);
14
+
15
+ // Subsequent loads will be done at s - 4, set pointer for next round:
16
+ *s += 20;
17
+ *o += 32;
18
+ }
19
+
20
+ static inline void
21
+ enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
22
+ {
23
+ // Load input:
24
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
25
+
26
+ // Reshuffle, translate, store:
27
+ src = enc_reshuffle(src);
28
+ src = enc_translate(src);
29
+ _mm256_storeu_si256((__m256i *) *o, src);
30
+
31
+ *s += 24;
32
+ *o += 32;
33
+ }
34
+
35
+ static inline void
36
+ enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
37
+ {
38
+ if (*slen < 32) {
39
+ return;
40
+ }
41
+
42
+ // Process blocks of 24 bytes at a time. Because blocks are loaded 32
43
+ // bytes at a time at an offset of -4, ensure that there will be at least
44
+ // 4 remaining bytes after the last round, so that the final read will
45
+ // not pass beyond the bounds of the input buffer:
46
+ size_t rounds = (*slen - 4) / 24;
47
+
48
+ *slen -= rounds * 24; // 24 bytes consumed per round
49
+ *olen += rounds * 32; // 32 bytes produced per round
50
+
51
+ // The first loop iteration requires special handling to ensure that
52
+ // the read, which is done at an offset, does not underflow the buffer:
53
+ enc_loop_avx2_inner_first(s, o);
54
+ rounds--;
55
+
56
+ while (rounds > 0) {
57
+ if (rounds >= 8) {
58
+ enc_loop_avx2_inner(s, o);
59
+ enc_loop_avx2_inner(s, o);
60
+ enc_loop_avx2_inner(s, o);
61
+ enc_loop_avx2_inner(s, o);
62
+ enc_loop_avx2_inner(s, o);
63
+ enc_loop_avx2_inner(s, o);
64
+ enc_loop_avx2_inner(s, o);
65
+ enc_loop_avx2_inner(s, o);
66
+ rounds -= 8;
67
+ continue;
68
+ }
69
+ if (rounds >= 4) {
70
+ enc_loop_avx2_inner(s, o);
71
+ enc_loop_avx2_inner(s, o);
72
+ enc_loop_avx2_inner(s, o);
73
+ enc_loop_avx2_inner(s, o);
74
+ rounds -= 4;
75
+ continue;
76
+ }
77
+ if (rounds >= 2) {
78
+ enc_loop_avx2_inner(s, o);
79
+ enc_loop_avx2_inner(s, o);
80
+ rounds -= 2;
81
+ continue;
82
+ }
83
+ enc_loop_avx2_inner(s, o);
84
+ break;
85
+ }
86
+
87
+ // Add the offset back:
88
+ *s += 4;
89
+ }
@@ -0,0 +1,83 @@
1
+ static inline __m256i
2
+ enc_reshuffle (const __m256i input)
3
+ {
4
+ // Translation of the SSSE3 reshuffling algorithm to AVX2. This one
5
+ // works with shifted (4 bytes) input in order to be able to work
6
+ // efficiently in the two 128-bit lanes.
7
+
8
+ // Input, bytes MSB to LSB:
9
+ // 0 0 0 0 x w v u t s r q p o n m
10
+ // l k j i h g f e d c b a 0 0 0 0
11
+
12
+ const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
13
+ 10, 11, 9, 10,
14
+ 7, 8, 6, 7,
15
+ 4, 5, 3, 4,
16
+ 1, 2, 0, 1,
17
+
18
+ 14, 15, 13, 14,
19
+ 11, 12, 10, 11,
20
+ 8, 9, 7, 8,
21
+ 5, 6, 4, 5));
22
+ // in, bytes MSB to LSB:
23
+ // w x v w
24
+ // t u s t
25
+ // q r p q
26
+ // n o m n
27
+ // k l j k
28
+ // h i g h
29
+ // e f d e
30
+ // b c a b
31
+
32
+ const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
33
+ // bits, upper case are most significant bits, lower case are least
34
+ // significant bits.
35
+ // 0000wwww XX000000 VVVVVV00 00000000
36
+ // 0000tttt UU000000 SSSSSS00 00000000
37
+ // 0000qqqq RR000000 PPPPPP00 00000000
38
+ // 0000nnnn OO000000 MMMMMM00 00000000
39
+ // 0000kkkk LL000000 JJJJJJ00 00000000
40
+ // 0000hhhh II000000 GGGGGG00 00000000
41
+ // 0000eeee FF000000 DDDDDD00 00000000
42
+ // 0000bbbb CC000000 AAAAAA00 00000000
43
+
44
+ const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
45
+ // 00000000 00wwwwXX 00000000 00VVVVVV
46
+ // 00000000 00ttttUU 00000000 00SSSSSS
47
+ // 00000000 00qqqqRR 00000000 00PPPPPP
48
+ // 00000000 00nnnnOO 00000000 00MMMMMM
49
+ // 00000000 00kkkkLL 00000000 00JJJJJJ
50
+ // 00000000 00hhhhII 00000000 00GGGGGG
51
+ // 00000000 00eeeeFF 00000000 00DDDDDD
52
+ // 00000000 00bbbbCC 00000000 00AAAAAA
53
+
54
+ const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
55
+ // 00000000 00xxxxxx 000000vv WWWW0000
56
+ // 00000000 00uuuuuu 000000ss TTTT0000
57
+ // 00000000 00rrrrrr 000000pp QQQQ0000
58
+ // 00000000 00oooooo 000000mm NNNN0000
59
+ // 00000000 00llllll 000000jj KKKK0000
60
+ // 00000000 00iiiiii 000000gg HHHH0000
61
+ // 00000000 00ffffff 000000dd EEEE0000
62
+ // 00000000 00cccccc 000000aa BBBB0000
63
+
64
+ const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
65
+ // 00xxxxxx 00000000 00vvWWWW 00000000
66
+ // 00uuuuuu 00000000 00ssTTTT 00000000
67
+ // 00rrrrrr 00000000 00ppQQQQ 00000000
68
+ // 00oooooo 00000000 00mmNNNN 00000000
69
+ // 00llllll 00000000 00jjKKKK 00000000
70
+ // 00iiiiii 00000000 00ggHHHH 00000000
71
+ // 00ffffff 00000000 00ddEEEE 00000000
72
+ // 00cccccc 00000000 00aaBBBB 00000000
73
+
74
+ return _mm256_or_si256(t1, t3);
75
+ // 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
76
+ // 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
77
+ // 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
78
+ // 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
79
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
80
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
81
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
82
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
83
+ }
@@ -0,0 +1,30 @@
1
+ static inline __m256i
2
+ enc_translate (const __m256i in)
3
+ {
4
+ // A lookup table containing the absolute offsets for all ranges:
5
+ const __m256i lut = _mm256_setr_epi8(
6
+ 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
7
+ 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
8
+
9
+ // Translate values 0..63 to the Base64 alphabet. There are five sets:
10
+ // # From To Abs Index Characters
11
+ // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
12
+ // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
13
+ // 2 [52..61] [48..57] -4 [2..11] 0123456789
14
+ // 3 [62] [43] -19 12 +
15
+ // 4 [63] [47] -16 13 /
16
+
17
+ // Create LUT indices from the input. The index for range #0 is right,
18
+ // others are 1 less than expected:
19
+ __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
20
+
21
+ // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
22
+ const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
23
+
24
+ // Subtract -1, so add 1 to indices for range #[1..4]. All indices are
25
+ // now correct:
26
+ indices = _mm256_sub_epi8(indices, mask);
27
+
28
+ // Add offsets to input values:
29
+ return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
30
+ }