ob64 0.4.0 → 0.5.0

Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/lib/ob64/version.rb +1 -1
  5. data/ob64.gemspec +2 -0
  6. data/vendor/libbase64/.gitignore +12 -0
  7. data/vendor/libbase64/.travis.yml +71 -0
  8. data/vendor/libbase64/CMakeLists.txt +264 -0
  9. data/vendor/libbase64/LICENSE +28 -0
  10. data/vendor/libbase64/Makefile +93 -0
  11. data/vendor/libbase64/README.md +474 -0
  12. data/vendor/libbase64/base64-benchmarks.png +0 -0
  13. data/vendor/libbase64/bin/base64.c +132 -0
  14. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  15. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  16. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  17. data/vendor/libbase64/cmake/config.h.in +25 -0
  18. data/vendor/libbase64/cmake/test-arch.c +35 -0
  19. data/vendor/libbase64/include/libbase64.h +145 -0
  20. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  21. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  22. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  23. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  24. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  25. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  26. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  27. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  28. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  29. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  30. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  31. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  32. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  33. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  34. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  35. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  36. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  37. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  38. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  39. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  40. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  41. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  42. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  43. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  44. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  45. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  46. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  47. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  48. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  49. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  50. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  51. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  52. data/vendor/libbase64/lib/codec_choose.c +281 -0
  53. data/vendor/libbase64/lib/codecs.h +65 -0
  54. data/vendor/libbase64/lib/env.h +67 -0
  55. data/vendor/libbase64/lib/exports.txt +7 -0
  56. data/vendor/libbase64/lib/lib.c +164 -0
  57. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  58. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  59. data/vendor/libbase64/lib/tables/Makefile +17 -0
  60. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  61. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  62. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  63. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  64. data/vendor/libbase64/lib/tables/tables.c +40 -0
  65. data/vendor/libbase64/lib/tables/tables.h +23 -0
  66. metadata +64 -4
data/vendor/libbase64/lib/arch/ssse3/dec_loop.c
@@ -0,0 +1,173 @@
+ // The input consists of six character sets in the Base64 alphabet, which we
+ // need to map back to the 6-bit values they represent. There are three ranges,
+ // two singles, and then there's the rest.
+ //
+ // # From To Add Characters
+ // 1 [43] [62] +19 +
+ // 2 [47] [63] +16 /
+ // 3 [48..57] [52..61] +4 0..9
+ // 4 [65..90] [0..25] -65 A..Z
+ // 5 [97..122] [26..51] -71 a..z
+ // (6) Everything else => invalid input
+ //
+ // We will use lookup tables for character validation and offset computation.
+ // Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
+ // allows to mask with 0x2F instead of 0x0F and thus save one constant
+ // declaration (register and/or memory access).
+ //
+ // For offsets:
+ // Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
+ // 0000 = garbage
+ // 0001 = /
+ // 0010 = +
+ // 0011 = 0-9
+ // 0100 = A-Z
+ // 0101 = A-Z
+ // 0110 = a-z
+ // 0111 = a-z
+ // 1000 >= garbage
+ //
+ // For validation, here's the table.
+ // A character is valid if and only if the AND of the 2 lookups equals 0:
+ //
+ // hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
+ // LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
+ //
+ // 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
+ // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ //
+ // 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
+ // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ //
+ // 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
+ // andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
+ //
+ // 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
+ //
+ // 0100 0x04 char @ A B C D E F G H I J K L M N O
+ // andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+ //
+ // 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
+ // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
+ //
+ // 0110 0x04 char ` a b c d e f g h i j k l m n o
+ // andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+ // 0111 0x08 char p q r s t u v w x y z { | } ~
+ // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
+ //
+ // 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+
+ static inline int
+ dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+ {
+     const __m128i lut_lo = _mm_setr_epi8(
+         0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+         0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+     const __m128i lut_hi = _mm_setr_epi8(
+         0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+     const __m128i lut_roll = _mm_setr_epi8(
+         0, 16, 19, 4, -65, -65, -71, -71,
+         0, 0, 0, 0, 0, 0, 0, 0);
+
+     const __m128i mask_2F = _mm_set1_epi8(0x2F);
+
+     // Load input:
+     __m128i str = _mm_loadu_si128((__m128i *) *s);
+
+     // Table lookups:
+     const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
+     const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
+     const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
+     const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);
+
+     // Check for invalid input: if any "and" values from lo and hi are not
+     // zero, fall back on bytewise code to do error checking and reporting:
+     if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
+         return 0;
+     }
+
+     const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
+     const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
+
+     // Now simply add the delta values to the input:
+     str = _mm_add_epi8(str, roll);
+
+     // Reshuffle the input to packed 12-byte output format:
+     str = dec_reshuffle(str);
+
+     // Store the output:
+     _mm_storeu_si128((__m128i *) *o, str);
+
+     *s += 16;
+     *o += 12;
+     *rounds -= 1;
+
+     return 1;
+ }
+
+ static inline void
+ dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 24) {
+         return;
+     }
+
+     // Process blocks of 16 bytes per round. Because 4 extra zero bytes are
+     // written after the output, ensure that there will be at least 8 bytes
+     // of input data left to cover the gap. (6 data bytes and up to two
+     // end-of-string markers.)
+     size_t rounds = (*slen - 8) / 16;
+
+     *slen -= rounds * 16;   // 16 bytes consumed per round
+     *olen += rounds * 12;   // 12 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             if (dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         if (rounds >= 4) {
+             if (dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         if (rounds >= 2) {
+             if (dec_loop_ssse3_inner(s, o, &rounds) &&
+                 dec_loop_ssse3_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         dec_loop_ssse3_inner(s, o, &rounds);
+         break;
+
+     } while (rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 16;
+     *olen -= rounds * 12;
+ }
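For reference, the range arithmetic described in the comment block above reduces to the following scalar decoder for a single byte (an illustrative sketch, not part of the diff; the SSSE3 code above reaches the same offsets through the lut_hi/lut_lo/lut_roll tables):

    #include <stdint.h>

    // Scalar equivalent of the offset table: map one Base64 character to
    // its 6-bit value, or -1 for invalid input.
    static int decode_byte(uint8_t c)
    {
        if (c == '+')             return c + 19;  // 43  -> 62
        if (c == '/')             return c + 16;  // 47  -> 63
        if (c >= '0' && c <= '9') return c + 4;   // 48..57  -> 52..61
        if (c >= 'A' && c <= 'Z') return c - 65;  // 65..90  -> 0..25
        if (c >= 'a' && c <= 'z') return c - 71;  // 97..122 -> 26..51
        return -1;                                // everything else is invalid
    }
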
data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c
@@ -0,0 +1,33 @@
+ static inline __m128i
+ dec_reshuffle (const __m128i in)
+ {
+     // in, bits, upper case are most significant bits, lower case are least significant bits
+     // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+     // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+     // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+     // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+     const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
+     // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+     // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+     // 0000eeee FFffffff 0000DDDD DDddEEEE
+     // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+     const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
+     // 00000000 JJJJJJjj KKKKkkkk LLllllll
+     // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+     // 00000000 DDDDDDdd EEEEeeee FFffffff
+     // 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+     // Pack bytes together:
+     return _mm_shuffle_epi8(out, _mm_setr_epi8(
+         2, 1, 0,
+         6, 5, 4,
+         10, 9, 8,
+         14, 13, 12,
+         -1, -1, -1, -1));
+     // 00000000 00000000 00000000 00000000
+     // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+     // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+     // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+ }
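Per 32-bit lane, the two multiply-add steps above compute the same packing as this scalar sketch (a hypothetical helper, shown only to make the bit diagrams concrete):

    #include <stdint.h>

    // Pack four decoded 6-bit values (v[0] first) into three output bytes,
    // mirroring what dec_reshuffle does for each group of four input bytes.
    static void pack_6x4_to_8x3(const uint8_t v[4], uint8_t out[3])
    {
        out[0] = (uint8_t) ((v[0] << 2) | (v[1] >> 4));
        out[1] = (uint8_t) ((v[1] << 4) | (v[2] >> 2));
        out[2] = (uint8_t) ((v[2] << 6) |  v[3]);
    }
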
data/vendor/libbase64/lib/arch/ssse3/enc_loop.c
@@ -0,0 +1,67 @@
+ static inline void
+ enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
+ {
+     // Load input:
+     __m128i str = _mm_loadu_si128((__m128i *) *s);
+
+     // Reshuffle:
+     str = enc_reshuffle(str);
+
+     // Translate reshuffled bytes to the Base64 alphabet:
+     str = enc_translate(str);
+
+     // Store:
+     _mm_storeu_si128((__m128i *) *o, str);
+
+     *s += 12;
+     *o += 16;
+ }
+
+ static inline void
+ enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 16) {
+         return;
+     }
+
+     // Process blocks of 12 bytes at a time. Because blocks are loaded 16
+     // bytes at a time, ensure that there will be at least 4 remaining
+     // bytes after the last round, so that the final read will not pass
+     // beyond the bounds of the input buffer:
+     size_t rounds = (*slen - 4) / 12;
+
+     *slen -= rounds * 12;   // 12 bytes consumed per round
+     *olen += rounds * 16;   // 16 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_ssse3_inner(s, o);
+             enc_loop_ssse3_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_ssse3_inner(s, o);
+         break;
+
+     } while (rounds > 0);
+ }
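The (*slen - 4) / 12 bound above can be sanity-checked with a small example (illustrative only): with 28 input bytes there are two rounds, and the second 16-byte load starts at offset 12, so it ends exactly at the input boundary.

    #include <assert.h>
    #include <stddef.h>

    static void check_enc_loop_bound(void)
    {
        size_t slen = 28;
        size_t rounds = (slen - 4) / 12;                // == 2
        size_t last_load_end = (rounds - 1) * 12 + 16;  // == 28
        assert(last_load_end <= slen);                  // final 16-byte read stays in bounds
    }
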
data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c
@@ -0,0 +1,48 @@
+ static inline __m128i
+ enc_reshuffle (__m128i in)
+ {
+     // Input, bytes MSB to LSB:
+     // 0 0 0 0 l k j i h g f e d c b a
+
+     in = _mm_shuffle_epi8(in, _mm_set_epi8(
+         10, 11, 9, 10,
+         7, 8, 6, 7,
+         4, 5, 3, 4,
+         1, 2, 0, 1));
+     // in, bytes MSB to LSB:
+     // k l j k
+     // h i g h
+     // e f d e
+     // b c a b
+
+     const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00));
+     // bits, upper case are most significant bits, lower case are least significant bits
+     // 0000kkkk LL000000 JJJJJJ00 00000000
+     // 0000hhhh II000000 GGGGGG00 00000000
+     // 0000eeee FF000000 DDDDDD00 00000000
+     // 0000bbbb CC000000 AAAAAA00 00000000
+
+     const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
+     // 00000000 00kkkkLL 00000000 00JJJJJJ
+     // 00000000 00hhhhII 00000000 00GGGGGG
+     // 00000000 00eeeeFF 00000000 00DDDDDD
+     // 00000000 00bbbbCC 00000000 00AAAAAA
+
+     const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0));
+     // 00000000 00llllll 000000jj KKKK0000
+     // 00000000 00iiiiii 000000gg HHHH0000
+     // 00000000 00ffffff 000000dd EEEE0000
+     // 00000000 00cccccc 000000aa BBBB0000
+
+     const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
+     // 00llllll 00000000 00jjKKKK 00000000
+     // 00iiiiii 00000000 00ggHHHH 00000000
+     // 00ffffff 00000000 00ddEEEE 00000000
+     // 00cccccc 00000000 00aaBBBB 00000000
+
+     return _mm_or_si128(t1, t3);
+     // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+     // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+     // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+     // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+ }
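The reshuffle above is the vector form of splitting each group of three input bytes into four 6-bit indices; a scalar sketch of the same arithmetic (hypothetical helper, for illustration only):

    #include <stdint.h>

    // Split three input bytes into four 6-bit values, most significant bits
    // of in[0] first -- the scalar counterpart of enc_reshuffle's output.
    static void split_8x3_to_6x4(const uint8_t in[3], uint8_t out[4])
    {
        out[0] = in[0] >> 2;
        out[1] = (uint8_t) (((in[0] & 0x03) << 4) | (in[1] >> 4));
        out[2] = (uint8_t) (((in[1] & 0x0F) << 2) | (in[2] >> 6));
        out[3] = in[2] & 0x3F;
    }
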
data/vendor/libbase64/lib/arch/ssse3/enc_translate.c
@@ -0,0 +1,33 @@
+ static inline __m128i
+ enc_translate (const __m128i in)
+ {
+     // A lookup table containing the absolute offsets for all ranges:
+     const __m128i lut = _mm_setr_epi8(
+         65, 71, -4, -4,
+         -4, -4, -4, -4,
+         -4, -4, -4, -4,
+         -19, -16, 0, 0
+     );
+
+     // Translate values 0..63 to the Base64 alphabet. There are five sets:
+     // # From To Abs Index Characters
+     // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
+     // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
+     // 2 [52..61] [48..57] -4 [2..11] 0123456789
+     // 3 [62] [43] -19 12 +
+     // 4 [63] [47] -16 13 /
+
+     // Create LUT indices from the input. The index for range #0 is right,
+     // others are 1 less than expected:
+     __m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51));
+
+     // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+     __m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));
+
+     // Subtract -1, so add 1 to indices for range #[1..4]. All indices are
+     // now correct:
+     indices = _mm_sub_epi8(indices, mask);
+
+     // Add offsets to input values:
+     return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices));
+ }
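The five ranges in the table above correspond to this scalar translation (a sketch only; the SIMD code reaches the same result by indexing the lut offsets):

    #include <stdint.h>

    // Translate one 6-bit value (0..63) to its Base64 character.
    static uint8_t translate6(uint8_t v)
    {
        if (v <= 25) return (uint8_t) (v + 65);  // A..Z
        if (v <= 51) return (uint8_t) (v + 71);  // a..z
        if (v <= 61) return (uint8_t) (v - 4);   // 0..9
        if (v == 62) return '+';                 // 62 - 19 == 43
        return '/';                              // 63 - 16 == 47
    }
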
data/vendor/libbase64/lib/codec_choose.c
@@ -0,0 +1,281 @@
+ #include <stdbool.h>
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdint.h>
+
+ #include "../include/libbase64.h"
+ #include "codecs.h"
+ #include "config.h"
+ #include "env.h"
+
+ #if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
+ #define BASE64_X86
+ #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
+ #define BASE64_X86_SIMD
+ #endif
+ #endif
+
+ #ifdef BASE64_X86
+ #ifdef _MSC_VER
+ #include <intrin.h>
+ #define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+ { \
+     int info[4]; \
+     __cpuidex(info, __level, __count); \
+     __eax = info[0]; \
+     __ebx = info[1]; \
+     __ecx = info[2]; \
+     __edx = info[3]; \
+ }
+ #define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+     __cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
+ #else
+ #include <cpuid.h>
+ #if HAVE_AVX2 || HAVE_AVX
+ #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
+ static inline uint64_t _xgetbv (uint32_t index)
+ {
+     uint32_t eax, edx;
+     __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+     return ((uint64_t)edx << 32) | eax;
+ }
+ #else
+ #error "Platform not supported"
+ #endif
+ #endif
+ #endif
+
+ #ifndef bit_AVX2
+ #define bit_AVX2 (1 << 5)
+ #endif
+ #ifndef bit_SSSE3
+ #define bit_SSSE3 (1 << 9)
+ #endif
+ #ifndef bit_SSE41
+ #define bit_SSE41 (1 << 19)
+ #endif
+ #ifndef bit_SSE42
+ #define bit_SSE42 (1 << 20)
+ #endif
+ #ifndef bit_AVX
+ #define bit_AVX (1 << 28)
+ #endif
+
+ #define bit_XSAVE_XRSTORE (1 << 27)
+
+ #ifndef _XCR_XFEATURE_ENABLED_MASK
+ #define _XCR_XFEATURE_ENABLED_MASK 0
+ #endif
+
+ #define _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS 0x6
+ #endif
+
+ // Function declarations:
+ #define BASE64_CODEC_FUNCS(arch) \
+     BASE64_ENC_FUNCTION(arch); \
+     BASE64_DEC_FUNCTION(arch); \
+
+ BASE64_CODEC_FUNCS(avx2)
+ BASE64_CODEC_FUNCS(neon32)
+ BASE64_CODEC_FUNCS(neon64)
+ BASE64_CODEC_FUNCS(plain)
+ BASE64_CODEC_FUNCS(ssse3)
+ BASE64_CODEC_FUNCS(sse41)
+ BASE64_CODEC_FUNCS(sse42)
+ BASE64_CODEC_FUNCS(avx)
+
+ static bool
+ codec_choose_forced (struct codec *codec, int flags)
+ {
+     // If the user wants to use a certain codec,
+     // always allow it, even if the codec is a no-op.
+     // For testing purposes.
+
+     if (!(flags & 0xFF)) {
+         return false;
+     }
+     if (flags & BASE64_FORCE_AVX2) {
+         codec->enc = base64_stream_encode_avx2;
+         codec->dec = base64_stream_decode_avx2;
+         return true;
+     }
+     if (flags & BASE64_FORCE_NEON32) {
+         codec->enc = base64_stream_encode_neon32;
+         codec->dec = base64_stream_decode_neon32;
+         return true;
+     }
+     if (flags & BASE64_FORCE_NEON64) {
+         codec->enc = base64_stream_encode_neon64;
+         codec->dec = base64_stream_decode_neon64;
+         return true;
+     }
+     if (flags & BASE64_FORCE_PLAIN) {
+         codec->enc = base64_stream_encode_plain;
+         codec->dec = base64_stream_decode_plain;
+         return true;
+     }
+     if (flags & BASE64_FORCE_SSSE3) {
+         codec->enc = base64_stream_encode_ssse3;
+         codec->dec = base64_stream_decode_ssse3;
+         return true;
+     }
+     if (flags & BASE64_FORCE_SSE41) {
+         codec->enc = base64_stream_encode_sse41;
+         codec->dec = base64_stream_decode_sse41;
+         return true;
+     }
+     if (flags & BASE64_FORCE_SSE42) {
+         codec->enc = base64_stream_encode_sse42;
+         codec->dec = base64_stream_decode_sse42;
+         return true;
+     }
+     if (flags & BASE64_FORCE_AVX) {
+         codec->enc = base64_stream_encode_avx;
+         codec->dec = base64_stream_decode_avx;
+         return true;
+     }
+     return false;
+ }
+
+ static bool
+ codec_choose_arm (struct codec *codec)
+ {
+ #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && ((defined(__aarch64__) && HAVE_NEON64) || HAVE_NEON32)
+
+     // Unfortunately there is no portable way to check for NEON
+     // support at runtime from userland in the same way that x86
+     // has cpuid, so just stick to the compile-time configuration:
+
+ #if defined(__aarch64__) && HAVE_NEON64
+     codec->enc = base64_stream_encode_neon64;
+     codec->dec = base64_stream_decode_neon64;
+ #else
+     codec->enc = base64_stream_encode_neon32;
+     codec->dec = base64_stream_decode_neon32;
+ #endif
+
+     return true;
+
+ #else
+     (void)codec;
+     return false;
+ #endif
+ }
+
+ static bool
+ codec_choose_x86 (struct codec *codec)
+ {
+ #ifdef BASE64_X86_SIMD
+
+     unsigned int eax, ebx = 0, ecx = 0, edx;
+     unsigned int max_level;
+
+ #ifdef _MSC_VER
+     int info[4];
+     __cpuidex(info, 0, 0);
+     max_level = info[0];
+ #else
+     max_level = __get_cpuid_max(0, NULL);
+ #endif
+
+ #if HAVE_AVX2 || HAVE_AVX
+     // Check for AVX/AVX2 support:
+     // Checking for AVX requires 3 things:
+     // 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
+     //    (allowing saving YMM registers on context switch)
+     // 2) CPUID indicates support for AVX
+     // 3) XGETBV indicates the AVX registers will be saved and restored on
+     //    context switch
+     //
+     // Note that XGETBV is only available on 686 or later CPUs, so the
+     // instruction needs to be conditionally run.
+     if (max_level >= 1) {
+         __cpuid_count(1, 0, eax, ebx, ecx, edx);
+         if (ecx & bit_XSAVE_XRSTORE) {
+             uint64_t xcr_mask;
+             xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+             if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) {
+ #if HAVE_AVX2
+                 if (max_level >= 7) {
+                     __cpuid_count(7, 0, eax, ebx, ecx, edx);
+                     if (ebx & bit_AVX2) {
+                         codec->enc = base64_stream_encode_avx2;
+                         codec->dec = base64_stream_decode_avx2;
+                         return true;
+                     }
+                 }
+ #endif
+ #if HAVE_AVX
+                 __cpuid_count(1, 0, eax, ebx, ecx, edx);
+                 if (ecx & bit_AVX) {
+                     codec->enc = base64_stream_encode_avx;
+                     codec->dec = base64_stream_decode_avx;
+                     return true;
+                 }
+ #endif
+             }
+         }
+     }
+ #endif
+
+ #if HAVE_SSE42
+     // Check for SSE42 support:
+     if (max_level >= 1) {
+         __cpuid(1, eax, ebx, ecx, edx);
+         if (ecx & bit_SSE42) {
+             codec->enc = base64_stream_encode_sse42;
+             codec->dec = base64_stream_decode_sse42;
+             return true;
+         }
+     }
+ #endif
+
+ #if HAVE_SSE41
+     // Check for SSE41 support:
+     if (max_level >= 1) {
+         __cpuid(1, eax, ebx, ecx, edx);
+         if (ecx & bit_SSE41) {
+             codec->enc = base64_stream_encode_sse41;
+             codec->dec = base64_stream_decode_sse41;
+             return true;
+         }
+     }
+ #endif
+
+ #if HAVE_SSSE3
+     // Check for SSSE3 support:
+     if (max_level >= 1) {
+         __cpuid(1, eax, ebx, ecx, edx);
+         if (ecx & bit_SSSE3) {
+             codec->enc = base64_stream_encode_ssse3;
+             codec->dec = base64_stream_decode_ssse3;
+             return true;
+         }
+     }
+ #endif
+
+ #else
+     (void)codec;
+ #endif
+
+     return false;
+ }
+
+ void
+ codec_choose (struct codec *codec, int flags)
+ {
+     // User forced a codec:
+     if (codec_choose_forced(codec, flags)) {
+         return;
+     }
+
+     // Runtime feature detection:
+     if (codec_choose_arm(codec)) {
+         return;
+     }
+     if (codec_choose_x86(codec)) {
+         return;
+     }
+     codec->enc = base64_stream_encode_plain;
+     codec->dec = base64_stream_decode_plain;
+ }
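For context, the forced-codec path above is normally exercised through the public API by passing one of the BASE64_FORCE_* flags; a minimal sketch, assuming the base64_encode() declaration and flag constants from the vendored include/libbase64.h:

    #include <stdio.h>
    #include <string.h>
    #include "libbase64.h"

    int main(void)
    {
        char out[16];
        size_t outlen;

        // A BASE64_FORCE_* flag makes codec_choose_forced() pick that codec
        // instead of the one found by runtime feature detection.
        base64_encode("hello", 5, out, &outlen, BASE64_FORCE_PLAIN);
        printf("%.*s\n", (int) outlen, out);   // prints "aGVsbG8="
        return 0;
    }
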
data/vendor/libbase64/lib/codecs.h
@@ -0,0 +1,65 @@
+ #include <stdint.h>
+ #include <stddef.h>
+
+ #include "../include/libbase64.h"
+ #include "config.h"
+
+ // Function parameters for encoding functions:
+ #define BASE64_ENC_PARAMS \
+     ( struct base64_state *state \
+     , const char *src \
+     , size_t srclen \
+     , char *out \
+     , size_t *outlen \
+     )
+
+ // Function parameters for decoding functions:
+ #define BASE64_DEC_PARAMS \
+     ( struct base64_state *state \
+     , const char *src \
+     , size_t srclen \
+     , char *out \
+     , size_t *outlen \
+     )
+
+ // Function signature for encoding functions:
+ #define BASE64_ENC_FUNCTION(arch) \
+     void \
+     base64_stream_encode_ ## arch \
+     BASE64_ENC_PARAMS
+
+ // Function signature for decoding functions:
+ #define BASE64_DEC_FUNCTION(arch) \
+     int \
+     base64_stream_decode_ ## arch \
+     BASE64_DEC_PARAMS
+
+ // Cast away unused variable, silence compiler:
+ #define UNUSED(x) ((void)(x))
+
+ // Stub function when encoder arch unsupported:
+ #define BASE64_ENC_STUB \
+     UNUSED(state); \
+     UNUSED(src); \
+     UNUSED(srclen); \
+     UNUSED(out); \
+     \
+     *outlen = 0;
+
+ // Stub function when decoder arch unsupported:
+ #define BASE64_DEC_STUB \
+     UNUSED(state); \
+     UNUSED(src); \
+     UNUSED(srclen); \
+     UNUSED(out); \
+     UNUSED(outlen); \
+     \
+     return -1;
+
+ struct codec
+ {
+     void (* enc) BASE64_ENC_PARAMS;
+     int  (* dec) BASE64_DEC_PARAMS;
+ };
+
+ extern void codec_choose (struct codec *, int flags);
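These macros are consumed by the per-architecture codec.c files elsewhere in this changeset; a sketch of the stub pattern for an architecture that is not compiled in (illustrative, assuming the pattern used by the lib/arch/*/codec.c files):

    // When HAVE_NEON64 is not defined, the neon64 codec compiles to stubs:
    BASE64_ENC_FUNCTION(neon64)
    {
        BASE64_ENC_STUB
    }

    BASE64_DEC_FUNCTION(neon64)
    {
        BASE64_DEC_STUB
    }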