ob64 0.4.0 → 0.5.0

Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/lib/ob64/version.rb +1 -1
  5. data/ob64.gemspec +2 -0
  6. data/vendor/libbase64/.gitignore +12 -0
  7. data/vendor/libbase64/.travis.yml +71 -0
  8. data/vendor/libbase64/CMakeLists.txt +264 -0
  9. data/vendor/libbase64/LICENSE +28 -0
  10. data/vendor/libbase64/Makefile +93 -0
  11. data/vendor/libbase64/README.md +474 -0
  12. data/vendor/libbase64/base64-benchmarks.png +0 -0
  13. data/vendor/libbase64/bin/base64.c +132 -0
  14. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  15. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  16. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  17. data/vendor/libbase64/cmake/config.h.in +25 -0
  18. data/vendor/libbase64/cmake/test-arch.c +35 -0
  19. data/vendor/libbase64/include/libbase64.h +145 -0
  20. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  21. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  22. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  23. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  24. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  25. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  26. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  27. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  28. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  29. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  30. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  31. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  32. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  33. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  34. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  35. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  36. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  37. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  38. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  39. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  40. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  41. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  42. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  43. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  44. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  45. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  46. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  47. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  48. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  49. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  50. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  51. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  52. data/vendor/libbase64/lib/codec_choose.c +281 -0
  53. data/vendor/libbase64/lib/codecs.h +65 -0
  54. data/vendor/libbase64/lib/env.h +67 -0
  55. data/vendor/libbase64/lib/exports.txt +7 -0
  56. data/vendor/libbase64/lib/lib.c +164 -0
  57. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  58. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  59. data/vendor/libbase64/lib/tables/Makefile +17 -0
  60. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  61. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  62. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  63. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  64. data/vendor/libbase64/lib/tables/tables.c +40 -0
  65. data/vendor/libbase64/lib/tables/tables.h +23 -0
  66. metadata +64 -4
data/vendor/libbase64/cmake/config.h.in
@@ -0,0 +1,25 @@
+ #ifndef BASE64_CONFIG_H
+ #define BASE64_CONFIG_H
+
+ #cmakedefine01 BASE64_WITH_SSSE3
+ #define HAVE_SSSE3 BASE64_WITH_SSSE3
+
+ #cmakedefine01 BASE64_WITH_SSE41
+ #define HAVE_SSE41 BASE64_WITH_SSE41
+
+ #cmakedefine01 BASE64_WITH_SSE42
+ #define HAVE_SSE42 BASE64_WITH_SSE42
+
+ #cmakedefine01 BASE64_WITH_AVX
+ #define HAVE_AVX BASE64_WITH_AVX
+
+ #cmakedefine01 BASE64_WITH_AVX2
+ #define HAVE_AVX2 BASE64_WITH_AVX2
+
+ #cmakedefine01 BASE64_WITH_NEON32
+ #define HAVE_NEON32 BASE64_WITH_NEON32
+
+ #cmakedefine01 BASE64_WITH_NEON64
+ #define HAVE_NEON64 BASE64_WITH_NEON64
+
+ #endif // BASE64_CONFIG_H
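
A note on the template syntax: CMake's configure_file() replaces each "#cmakedefine01 VAR" line with "#define VAR 0" or "#define VAR 1", so every HAVE_* macro is always defined and can safely be tested with "#if". A hypothetical configured result, assuming a build with SSSE3 enabled and NEON32 disabled, would contain lines such as:

/* excerpt of a generated config.h (hypothetical configuration) */
#define BASE64_WITH_SSSE3 1
#define HAVE_SSSE3 BASE64_WITH_SSSE3

#define BASE64_WITH_NEON32 0
#define HAVE_NEON32 BASE64_WITH_NEON32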
data/vendor/libbase64/cmake/test-arch.c
@@ -0,0 +1,35 @@
+ // Written in 2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
+ //
+ // To the extent possible under law, the author(s) have dedicated all
+ // copyright and related and neighboring rights to this software to the
+ // public domain worldwide. This software is distributed without any warranty.
+ //
+ // You should have received a copy of the CC0 Public Domain Dedication
+ // along with this software. If not, see
+ //
+ // http://creativecommons.org/publicdomain/zero/1.0/
+ //
+ ////////////////////////////////////////////////////////////////////////////////
+
+ // ARM 64-Bit
+ #if defined(__aarch64__)
+ #error ##arch=arm64##
+
+ // ARM 32-Bit
+ #elif defined(__arm__) \
+ || defined(_M_ARM)
+ #error ##arch=arm##
+
+ // x86 64-Bit
+ #elif defined(__x86_64__) \
+ || defined(_M_X64)
+ #error ##arch=x64##
+
+ // x86 32-Bit
+ #elif defined(__i386__) \
+ || defined(_M_IX86)
+ #error ##arch=x86##
+
+ #else
+ #error ##arch=unknown##
+ #endif
data/vendor/libbase64/include/libbase64.h
@@ -0,0 +1,145 @@
+ #ifndef LIBBASE64_H
+ #define LIBBASE64_H
+
+ #include <stddef.h> /* size_t */
+
+
+ #if defined(_WIN32) || defined(__CYGWIN__)
+ #define BASE64_SYMBOL_IMPORT __declspec(dllimport)
+ #define BASE64_SYMBOL_EXPORT __declspec(dllexport)
+ #define BASE64_SYMBOL_PRIVATE
+
+ #elif __GNUC__ >= 4
+ #define BASE64_SYMBOL_IMPORT __attribute__ ((visibility ("default")))
+ #define BASE64_SYMBOL_EXPORT __attribute__ ((visibility ("default")))
+ #define BASE64_SYMBOL_PRIVATE __attribute__ ((visibility ("hidden")))
+
+ #else
+ #define BASE64_SYMBOL_IMPORT
+ #define BASE64_SYMBOL_EXPORT
+ #define BASE64_SYMBOL_PRIVATE
+ #endif
+
+ #if defined(BASE64_STATIC_DEFINE)
+ #define BASE64_EXPORT
+ #define BASE64_NO_EXPORT
+
+ #else
+ #if defined(BASE64_EXPORTS) // defined if we are building the shared library
+ #define BASE64_EXPORT BASE64_SYMBOL_EXPORT
+
+ #else
+ #define BASE64_EXPORT BASE64_SYMBOL_IMPORT
+ #endif
+
+ #define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
+ #endif
+
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ /* These are the flags that can be passed in the `flags` argument. The values
+ * below force the use of a given codec, even if that codec is a no-op in the
+ * current build. Used in testing. Set to 0 for the default behavior, which is
+ * runtime feature detection on x86, a compile-time fixed codec on ARM, and
+ * the plain codec on other platforms: */
+ #define BASE64_FORCE_AVX2 (1 << 0)
+ #define BASE64_FORCE_NEON32 (1 << 1)
+ #define BASE64_FORCE_NEON64 (1 << 2)
+ #define BASE64_FORCE_PLAIN (1 << 3)
+ #define BASE64_FORCE_SSSE3 (1 << 4)
+ #define BASE64_FORCE_SSE41 (1 << 5)
+ #define BASE64_FORCE_SSE42 (1 << 6)
+ #define BASE64_FORCE_AVX (1 << 7)
+
+ struct base64_state {
+ int eof;
+ int bytes;
+ int flags;
+ unsigned char carry;
+ };
+
+ /* Wrapper function to encode a plain string of given length. Output is written
+ * to *out without trailing zero. Output length in bytes is written to *outlen.
+ * The buffer in `out` has been allocated by the caller and is at least 4/3 the
+ * size of the input. See above for `flags`; set to 0 for default operation: */
+ void BASE64_EXPORT base64_encode
+ ( const char *src
+ , size_t srclen
+ , char *out
+ , size_t *outlen
+ , int flags
+ ) ;
+
+ /* Call this before calling base64_stream_encode() to init the state. See above
+ * for `flags`; set to 0 for default operation: */
+ void BASE64_EXPORT base64_stream_encode_init
+ ( struct base64_state *state
+ , int flags
+ ) ;
+
+ /* Encodes the block of data of given length at `src`, into the buffer at
+ * `out`. Caller is responsible for allocating a large enough out-buffer; it
+ * must be at least 4/3 the size of the in-buffer, but take some margin. Places
+ * the number of new bytes written into `outlen` (which is set to zero when the
+ * function starts). Does not zero-terminate or finalize the output. */
+ void BASE64_EXPORT base64_stream_encode
+ ( struct base64_state *state
+ , const char *src
+ , size_t srclen
+ , char *out
+ , size_t *outlen
+ ) ;
+
+ /* Finalizes the output begun by previous calls to `base64_stream_encode()`.
+ * Adds the required end-of-stream markers if appropriate. `outlen` is modified
+ * and will contain the number of new bytes written at `out` (which will quite
+ * often be zero). */
+ void BASE64_EXPORT base64_stream_encode_final
+ ( struct base64_state *state
+ , char *out
+ , size_t *outlen
+ ) ;
+
+ /* Wrapper function to decode a plain string of given length. Output is written
+ * to *out without trailing zero. Output length in bytes is written to *outlen.
+ * The buffer in `out` has been allocated by the caller and is at least 3/4 the
+ * size of the input. See above for `flags`; set to 0 for default operation: */
+ int BASE64_EXPORT base64_decode
+ ( const char *src
+ , size_t srclen
+ , char *out
+ , size_t *outlen
+ , int flags
+ ) ;
+
+ /* Call this before calling base64_stream_decode() to init the state. See above
+ * for `flags`; set to 0 for default operation: */
+ void BASE64_EXPORT base64_stream_decode_init
+ ( struct base64_state *state
+ , int flags
+ ) ;
+
+ /* Decodes the block of data of given length at `src`, into the buffer at
+ * `out`. Caller is responsible for allocating a large enough out-buffer; it
+ * must be at least 3/4 the size of the in-buffer, but take some margin. Places
+ * the number of new bytes written into `outlen` (which is set to zero when the
+ * function starts). Does not zero-terminate the output. Returns 1 if all is
+ * well, and 0 if a decoding error was found, such as an invalid character.
+ * Returns -1 if the chosen codec is not included in the current build. Used by
+ * the test harness to check whether a codec is available for testing. */
+ int BASE64_EXPORT base64_stream_decode
+ ( struct base64_state *state
+ , const char *src
+ , size_t srclen
+ , char *out
+ , size_t *outlen
+ ) ;
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif /* LIBBASE64_H */
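
The header above is the entire public API that the gem now vendors. A minimal sketch of the one-shot calls, assuming libbase64.h is on the include path and buffer sizes chosen per the 4/3 and 3/4 rules documented in the comments:

#include <stdio.h>
#include <string.h>
#include "libbase64.h"

int main(void)
{
	const char *msg = "Hello, world!";
	char enc[64], dec[64];   // comfortably above 4/3 and 3/4 of the input sizes
	size_t enclen, declen;

	// flags = 0: pick the best codec available at runtime.
	base64_encode(msg, strlen(msg), enc, &enclen, 0);
	printf("%.*s\n", (int) enclen, enc);

	// base64_decode returns 1 on success, 0 on invalid input.
	if (base64_decode(enc, enclen, dec, &declen, 0) != 1) {
		fprintf(stderr, "decode error\n");
		return 1;
	}
	printf("%.*s\n", (int) declen, dec);
	return 0;
}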
data/vendor/libbase64/lib/arch/avx/codec.c
@@ -0,0 +1,42 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if HAVE_AVX
+ #include <immintrin.h>
+
+ #include "../ssse3/dec_reshuffle.c"
+ #include "../ssse3/dec_loop.c"
+ #include "../ssse3/enc_translate.c"
+ #include "../ssse3/enc_reshuffle.c"
+ #include "../ssse3/enc_loop.c"
+
+ #endif // HAVE_AVX
+
+ BASE64_ENC_FUNCTION(avx)
+ {
+ #if HAVE_AVX
+ #include "../generic/enc_head.c"
+ enc_loop_ssse3(&s, &slen, &o, &olen);
+ #include "../generic/enc_tail.c"
+ #else
+ BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(avx)
+ {
+ #if HAVE_AVX
+ #include "../generic/dec_head.c"
+ dec_loop_ssse3(&s, &slen, &o, &olen);
+ #include "../generic/dec_tail.c"
+ #else
+ BASE64_DEC_STUB
+ #endif
+ }
data/vendor/libbase64/lib/arch/avx2/codec.c
@@ -0,0 +1,42 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdlib.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if HAVE_AVX2
+ #include <immintrin.h>
+
+ #include "dec_reshuffle.c"
+ #include "dec_loop.c"
+ #include "enc_translate.c"
+ #include "enc_reshuffle.c"
+ #include "enc_loop.c"
+
+ #endif // HAVE_AVX2
+
+ BASE64_ENC_FUNCTION(avx2)
+ {
+ #if HAVE_AVX2
+ #include "../generic/enc_head.c"
+ enc_loop_avx2(&s, &slen, &o, &olen);
+ #include "../generic/enc_tail.c"
+ #else
+ BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(avx2)
+ {
+ #if HAVE_AVX2
+ #include "../generic/dec_head.c"
+ dec_loop_avx2(&s, &slen, &o, &olen);
+ #include "../generic/dec_tail.c"
+ #else
+ BASE64_DEC_STUB
+ #endif
+ }
data/vendor/libbase64/lib/arch/avx2/dec_loop.c
@@ -0,0 +1,110 @@
+ static inline int
+ dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+ {
+ const __m256i lut_lo = _mm256_setr_epi8(
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+ const __m256i lut_hi = _mm256_setr_epi8(
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+ const __m256i lut_roll = _mm256_setr_epi8(
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+
+ const __m256i mask_2F = _mm256_set1_epi8(0x2F);
+
+ // Load input:
+ __m256i str = _mm256_loadu_si256((__m256i *) *s);
+
+ // See the SSSE3 decoder for an explanation of the algorithm.
+ const __m256i hi_nibbles = _mm256_and_si256(_mm256_srli_epi32(str, 4), mask_2F);
+ const __m256i lo_nibbles = _mm256_and_si256(str, mask_2F);
+ const __m256i hi = _mm256_shuffle_epi8(lut_hi, hi_nibbles);
+ const __m256i lo = _mm256_shuffle_epi8(lut_lo, lo_nibbles);
+
+ if (!_mm256_testz_si256(lo, hi)) {
+ return 0;
+ }
+
+ const __m256i eq_2F = _mm256_cmpeq_epi8(str, mask_2F);
+ const __m256i roll = _mm256_shuffle_epi8(lut_roll, _mm256_add_epi8(eq_2F, hi_nibbles));
+
+ // Now simply add the delta values to the input:
+ str = _mm256_add_epi8(str, roll);
+
+ // Reshuffle the input to packed 12-byte output format:
+ str = dec_reshuffle(str);
+
+ // Store the output:
+ _mm256_storeu_si256((__m256i *) *o, str);
+
+ *s += 32;
+ *o += 24;
+ *rounds -= 1;
+
+ return 1;
+ }
+
+ static inline void
+ dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+ if (*slen < 45) {
+ return;
+ }
+
+ // Process blocks of 32 bytes per round. Because 8 extra zero bytes are
+ // written after the output, ensure that there will be at least 13
+ // bytes of input data left to cover the gap. (11 data bytes and up to
+ // two end-of-string markers.)
+ size_t rounds = (*slen - 13) / 32;
+
+ *slen -= rounds * 32; // 32 bytes consumed per round
+ *olen += rounds * 24; // 24 bytes produced per round
+
+ do {
+ if (rounds >= 8) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 4) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 2) {
+ if (dec_loop_avx2_inner(s, o, &rounds) &&
+ dec_loop_avx2_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ dec_loop_avx2_inner(s, o, &rounds);
+ break;
+
+ } while (rounds > 0);
+
+ // Adjust for any rounds that were skipped:
+ *slen += rounds * 32;
+ *olen -= rounds * 24;
+ }
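
The 13-byte margin in "rounds = (*slen - 13) / 32" is worth unpacking, since each 32-byte store emits only 24 valid bytes and therefore leaves 8 junk bytes past the end of the decoded data. A worked check of the comment's claim (reviewer arithmetic, not library code):

// Take *slen = 77: rounds = (77 - 13) / 32 = 2. The two rounds consume
// 64 input bytes and produce 48 valid output bytes, but the second
// 32-byte store touches output bytes [24, 56), i.e. 8 bytes past the 48.
// The 13 input bytes that remain contain at least 11 base64 data
// characters (at most two may be '=' markers), and 11 characters decode
// to floor(11 * 6 / 8) = 8 bytes: exactly enough for the tail decoder
// to overwrite the 8-byte gap.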
data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c
@@ -0,0 +1,34 @@
+ static inline __m256i
+ dec_reshuffle (const __m256i in)
+ {
+ // in, lower lane, bits, upper case are most significant bits, lower
+ // case are least significant bits:
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+ const __m256i merge_ab_and_bc = _mm256_maddubs_epi16(in, _mm256_set1_epi32(0x01400140));
+ // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+ // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+ // 0000eeee FFffffff 0000DDDD DDddEEEE
+ // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+ __m256i out = _mm256_madd_epi16(merge_ab_and_bc, _mm256_set1_epi32(0x00011000));
+ // 00000000 JJJJJJjj KKKKkkkk LLllllll
+ // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+ // 00000000 DDDDDDdd EEEEeeee FFffffff
+ // 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+ // Pack bytes together in each lane:
+ out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+ 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1));
+ // 00000000 00000000 00000000 00000000
+ // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+ // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+ // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+ // Pack lanes:
+ return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1));
+ }
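
Stripped of the SIMD plumbing, the transform above is the standard base64 bit packing, done on eight 32-bit words at once. A scalar restatement (a hypothetical helper, not part of the library):

#include <stdint.h>

// Pack four decoded 6-bit values (one per input byte, each 00xxxxxx)
// into three output bytes -- per word, this is what dec_reshuffle does.
static void dec_reshuffle_scalar (const uint8_t in[4], uint8_t out[3])
{
	const uint32_t triple = ((uint32_t) in[0] << 18)
	                      | ((uint32_t) in[1] << 12)
	                      | ((uint32_t) in[2] <<  6)
	                      |  (uint32_t) in[3];

	out[0] = (uint8_t) (triple >> 16);
	out[1] = (uint8_t) (triple >>  8);
	out[2] = (uint8_t)  triple;
}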
data/vendor/libbase64/lib/arch/avx2/enc_loop.c
@@ -0,0 +1,89 @@
+ static inline void
+ enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+ {
+ // First load is done at s - 0 to not get a segfault:
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+ // Shift by 4 bytes, as required by enc_reshuffle:
+ src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
+
+ // Reshuffle, translate, store:
+ src = enc_reshuffle(src);
+ src = enc_translate(src);
+ _mm256_storeu_si256((__m256i *) *o, src);
+
+ // Subsequent loads will be done at s - 4, set pointer for next round:
+ *s += 20;
+ *o += 32;
+ }
+
+ static inline void
+ enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+ {
+ // Load input:
+ __m256i src = _mm256_loadu_si256((__m256i *) *s);
+
+ // Reshuffle, translate, store:
+ src = enc_reshuffle(src);
+ src = enc_translate(src);
+ _mm256_storeu_si256((__m256i *) *o, src);
+
+ *s += 24;
+ *o += 32;
+ }
+
+ static inline void
+ enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+ if (*slen < 32) {
+ return;
+ }
+
+ // Process blocks of 24 bytes at a time. Because blocks are loaded 32
+ // bytes at a time at an offset of -4, ensure that there will be at
+ // least 4 remaining bytes after the last round, so that the final read
+ // will not pass beyond the bounds of the input buffer:
+ size_t rounds = (*slen - 4) / 24;
+
+ *slen -= rounds * 24; // 24 bytes consumed per round
+ *olen += rounds * 32; // 32 bytes produced per round
+
+ // The first loop iteration requires special handling to ensure that
+ // the read, which is done at an offset, does not underflow the buffer:
+ enc_loop_avx2_inner_first(s, o);
+ rounds--;
+
+ while (rounds > 0) {
+ if (rounds >= 8) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 8;
+ continue;
+ }
+ if (rounds >= 4) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 4;
+ continue;
+ }
+ if (rounds >= 2) {
+ enc_loop_avx2_inner(s, o);
+ enc_loop_avx2_inner(s, o);
+ rounds -= 2;
+ continue;
+ }
+ enc_loop_avx2_inner(s, o);
+ break;
+ }
+
+ // Add the offset back:
+ *s += 4;
+ }
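
The "- 4" in "rounds = (*slen - 4) / 24" can be checked by hand; a worked example, assuming a 100-byte input (reviewer arithmetic, not library code):

// *slen = 100: rounds = (100 - 4) / 24 = 4.
// Round 1 (inner_first) loads 32 bytes at offset 0, then sets s += 20.
// Rounds 2-4 load 32 bytes at offsets 20, 44 and 68; the final read
// covers [68, 100) and ends exactly at the end of the buffer.
// Consumed: 4 * 24 = 96 bytes (after the closing *s += 4), leaving the
// 4-byte margin that keeps every 32-byte load in bounds.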
data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c
@@ -0,0 +1,83 @@
+ static inline __m256i
+ enc_reshuffle (const __m256i input)
+ {
+ // Translation of the SSSE3 reshuffling algorithm to AVX2. This one
+ // works with shifted (4 bytes) input in order to be able to work
+ // efficiently in the two 128-bit lanes.
+
+ // Input, bytes MSB to LSB:
+ // 0 0 0 0 x w v u t s r q p o n m
+ // l k j i h g f e d c b a 0 0 0 0
+
+ const __m256i in = _mm256_shuffle_epi8(input, _mm256_set_epi8(
+ 10, 11, 9, 10,
+ 7, 8, 6, 7,
+ 4, 5, 3, 4,
+ 1, 2, 0, 1,
+
+ 14, 15, 13, 14,
+ 11, 12, 10, 11,
+ 8, 9, 7, 8,
+ 5, 6, 4, 5));
+ // in, bytes MSB to LSB:
+ // w x v w
+ // t u s t
+ // q r p q
+ // n o m n
+ // k l j k
+ // h i g h
+ // e f d e
+ // b c a b
+
+ const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0FC0FC00));
+ // bits, upper case are most significant bits, lower case are least
+ // significant bits.
+ // 0000wwww XX000000 VVVVVV00 00000000
+ // 0000tttt UU000000 SSSSSS00 00000000
+ // 0000qqqq RR000000 PPPPPP00 00000000
+ // 0000nnnn OO000000 MMMMMM00 00000000
+ // 0000kkkk LL000000 JJJJJJ00 00000000
+ // 0000hhhh II000000 GGGGGG00 00000000
+ // 0000eeee FF000000 DDDDDD00 00000000
+ // 0000bbbb CC000000 AAAAAA00 00000000
+
+ const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040));
+ // 00000000 00wwwwXX 00000000 00VVVVVV
+ // 00000000 00ttttUU 00000000 00SSSSSS
+ // 00000000 00qqqqRR 00000000 00PPPPPP
+ // 00000000 00nnnnOO 00000000 00MMMMMM
+ // 00000000 00kkkkLL 00000000 00JJJJJJ
+ // 00000000 00hhhhII 00000000 00GGGGGG
+ // 00000000 00eeeeFF 00000000 00DDDDDD
+ // 00000000 00bbbbCC 00000000 00AAAAAA
+
+ const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003F03F0));
+ // 00000000 00xxxxxx 000000vv WWWW0000
+ // 00000000 00uuuuuu 000000ss TTTT0000
+ // 00000000 00rrrrrr 000000pp QQQQ0000
+ // 00000000 00oooooo 000000mm NNNN0000
+ // 00000000 00llllll 000000jj KKKK0000
+ // 00000000 00iiiiii 000000gg HHHH0000
+ // 00000000 00ffffff 000000dd EEEE0000
+ // 00000000 00cccccc 000000aa BBBB0000
+
+ const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010));
+ // 00xxxxxx 00000000 00vvWWWW 00000000
+ // 00uuuuuu 00000000 00ssTTTT 00000000
+ // 00rrrrrr 00000000 00ppQQQQ 00000000
+ // 00oooooo 00000000 00mmNNNN 00000000
+ // 00llllll 00000000 00jjKKKK 00000000
+ // 00iiiiii 00000000 00ggHHHH 00000000
+ // 00ffffff 00000000 00ddEEEE 00000000
+ // 00cccccc 00000000 00aaBBBB 00000000
+
+ return _mm256_or_si256(t1, t3);
+ // 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
+ // 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
+ // 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
+ // 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+ }
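
As on the decode side, the masks and multiplies above implement the textbook 3-byte-to-4-index expansion, eight groups at a time. A scalar restatement (a hypothetical helper, not part of the library):

#include <stdint.h>

// Split three input bytes into four 6-bit values, one per output byte
// (still indices 0..63, not yet ASCII) -- per group, this is what
// enc_reshuffle computes.
static void enc_reshuffle_scalar (const uint8_t in[3], uint8_t out[4])
{
	out[0] = in[0] >> 2;
	out[1] = (uint8_t) (((in[0] & 0x03) << 4) | (in[1] >> 4));
	out[2] = (uint8_t) (((in[1] & 0x0F) << 2) | (in[2] >> 6));
	out[3] = in[2] & 0x3F;
}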
data/vendor/libbase64/lib/arch/avx2/enc_translate.c
@@ -0,0 +1,30 @@
+ static inline __m256i
+ enc_translate (const __m256i in)
+ {
+ // A lookup table containing the absolute offsets for all ranges:
+ const __m256i lut = _mm256_setr_epi8(
+ 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
+ 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+ // Translate values 0..63 to the Base64 alphabet. There are five sets:
+ // # From To Abs Index Characters
+ // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
+ // 2 [52..61] [48..57] -4 [2..11] 0123456789
+ // 3 [62] [43] -19 12 +
+ // 4 [63] [47] -16 13 /
+
+ // Create LUT indices from the input. The index for range #0 is right,
+ // others are 1 less than expected:
+ __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));
+
+ // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+ const __m256i mask = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));
+
+ // Subtract -1, so add 1 to indices for range #[1..4]. All indices are
+ // now correct:
+ indices = _mm256_sub_epi8(indices, mask);
+
+ // Add offsets to input values:
+ return _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));
+ }
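
The five-range table in the comments maps directly onto a branchy scalar version, a useful mental model for what the saturating-subtract and compare trick computes (a hypothetical helper, same offsets as the lut above):

#include <stdint.h>

// Map a 6-bit value 0..63 to its Base64 ASCII character:
static uint8_t enc_translate_scalar (uint8_t v)
{
	if (v <= 25) return (uint8_t) (v + 65); // 'A'..'Z'
	if (v <= 51) return (uint8_t) (v + 71); // 'a'..'z'
	if (v <= 61) return (uint8_t) (v - 4);  // '0'..'9'
	if (v == 62) return (uint8_t) (v - 19); // '+'
	return (uint8_t) (v - 16);              // '/'
}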