ob64 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ob64/version.rb +1 -1
- data/ob64.gemspec +2 -0
- data/vendor/libbase64/.gitignore +12 -0
- data/vendor/libbase64/.travis.yml +71 -0
- data/vendor/libbase64/CMakeLists.txt +264 -0
- data/vendor/libbase64/LICENSE +28 -0
- data/vendor/libbase64/Makefile +93 -0
- data/vendor/libbase64/README.md +474 -0
- data/vendor/libbase64/base64-benchmarks.png +0 -0
- data/vendor/libbase64/bin/base64.c +132 -0
- data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
- data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
- data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
- data/vendor/libbase64/cmake/config.h.in +25 -0
- data/vendor/libbase64/cmake/test-arch.c +35 -0
- data/vendor/libbase64/include/libbase64.h +145 -0
- data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
- data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
- data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
- data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
- data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
- data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
- data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
- data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
- data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
- data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
- data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
- data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
- data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
- data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
- data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
- data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
- data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
- data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
- data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
- data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
- data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
- data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
- data/vendor/libbase64/lib/codec_choose.c +281 -0
- data/vendor/libbase64/lib/codecs.h +65 -0
- data/vendor/libbase64/lib/env.h +67 -0
- data/vendor/libbase64/lib/exports.txt +7 -0
- data/vendor/libbase64/lib/lib.c +164 -0
- data/vendor/libbase64/lib/lib_openmp.c +149 -0
- data/vendor/libbase64/lib/tables/.gitignore +1 -0
- data/vendor/libbase64/lib/tables/Makefile +17 -0
- data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
- data/vendor/libbase64/lib/tables/table_generator.c +184 -0
- data/vendor/libbase64/lib/tables/tables.c +40 -0
- data/vendor/libbase64/lib/tables/tables.h +23 -0
- metadata +64 -4
@@ -0,0 +1,173 @@
|
|
1
|
+
// The input consists of six character sets in the Base64 alphabet, which we
|
2
|
+
// need to map back to the 6-bit values they represent. There are three ranges,
|
3
|
+
// two singles, and then there's the rest.
|
4
|
+
//
|
5
|
+
// # From To Add Characters
|
6
|
+
// 1 [43] [62] +19 +
|
7
|
+
// 2 [47] [63] +16 /
|
8
|
+
// 3 [48..57] [52..61] +4 0..9
|
9
|
+
// 4 [65..90] [0..25] -65 A..Z
|
10
|
+
// 5 [97..122] [26..51] -71 a..z
|
11
|
+
// (6) Everything else => invalid input
|
12
|
+
//
|
13
|
+
// We will use lookup tables for character validation and offset computation.
|
14
|
+
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
|
15
|
+
// allows to mask with 0x2F instead of 0x0F and thus save one constant
|
16
|
+
// declaration (register and/or memory access).
|
17
|
+
//
|
18
|
+
// For offsets:
|
19
|
+
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
|
20
|
+
// 0000 = garbage
|
21
|
+
// 0001 = /
|
22
|
+
// 0010 = +
|
23
|
+
// 0011 = 0-9
|
24
|
+
// 0100 = A-Z
|
25
|
+
// 0101 = A-Z
|
26
|
+
// 0110 = a-z
|
27
|
+
// 0111 = a-z
|
28
|
+
// 1000 >= garbage
|
29
|
+
//
|
30
|
+
// For validation, here's the table.
|
31
|
+
// A character is valid if and only if the AND of the 2 lookups equals 0:
|
32
|
+
//
|
33
|
+
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
|
34
|
+
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
|
35
|
+
//
|
36
|
+
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
|
37
|
+
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
38
|
+
//
|
39
|
+
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
|
40
|
+
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
41
|
+
//
|
42
|
+
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
|
43
|
+
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
|
44
|
+
//
|
45
|
+
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
|
46
|
+
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
|
47
|
+
//
|
48
|
+
// 0100 0x04 char @ A B C D E F G H I J K L M N O
|
49
|
+
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
|
50
|
+
//
|
51
|
+
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
|
52
|
+
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
|
53
|
+
//
|
54
|
+
// 0110 0x04 char ` a b c d e f g h i j k l m n o
|
55
|
+
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
|
56
|
+
// 0111 0x08 char p q r s t u v w x y z { | } ~
|
57
|
+
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
|
58
|
+
//
|
59
|
+
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
60
|
+
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
61
|
+
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
62
|
+
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
63
|
+
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
64
|
+
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
65
|
+
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
66
|
+
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
67
|
+
|
68
|
+
static inline int
|
69
|
+
dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
|
70
|
+
{
|
71
|
+
const __m128i lut_lo = _mm_setr_epi8(
|
72
|
+
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
|
73
|
+
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
|
74
|
+
|
75
|
+
const __m128i lut_hi = _mm_setr_epi8(
|
76
|
+
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
|
77
|
+
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
|
78
|
+
|
79
|
+
const __m128i lut_roll = _mm_setr_epi8(
|
80
|
+
0, 16, 19, 4, -65, -65, -71, -71,
|
81
|
+
0, 0, 0, 0, 0, 0, 0, 0);
|
82
|
+
|
83
|
+
const __m128i mask_2F = _mm_set1_epi8(0x2F);
|
84
|
+
|
85
|
+
// Load input:
|
86
|
+
__m128i str = _mm_loadu_si128((__m128i *) *s);
|
87
|
+
|
88
|
+
// Table lookups:
|
89
|
+
const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
|
90
|
+
const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
|
91
|
+
const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
|
92
|
+
const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);
|
93
|
+
|
94
|
+
// Check for invalid input: if any "and" values from lo and hi are not
|
95
|
+
// zero, fall back on bytewise code to do error checking and reporting:
|
96
|
+
if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
|
97
|
+
return 0;
|
98
|
+
}
|
99
|
+
|
100
|
+
const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
|
101
|
+
const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
|
102
|
+
|
103
|
+
// Now simply add the delta values to the input:
|
104
|
+
str = _mm_add_epi8(str, roll);
|
105
|
+
|
106
|
+
// Reshuffle the input to packed 12-byte output format:
|
107
|
+
str = dec_reshuffle(str);
|
108
|
+
|
109
|
+
// Store the output:
|
110
|
+
_mm_storeu_si128((__m128i *) *o, str);
|
111
|
+
|
112
|
+
*s += 16;
|
113
|
+
*o += 12;
|
114
|
+
*rounds -= 1;
|
115
|
+
|
116
|
+
return 1;
|
117
|
+
}
|
118
|
+
|
119
|
+
// Bulk SSSE3 decode driver. Consumes as many whole 16-byte rounds as it
// safely can, updating the stream pointers and lengths; the remainder
// (including any round aborted on invalid input) is left for the
// generic bytewise decoder.
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 24) {
		return;
	}

	// Process blocks of 16 bytes per round. Because the inner routine
	// writes 4 zero bytes past its real output, keep at least 8 bytes of
	// input in reserve to cover the gap (6 data bytes and up to two
	// end-of-string markers):
	size_t rounds = (*slen - 8) / 16;

	*slen -= rounds * 16;	// 16 bytes consumed per round
	*olen += rounds * 12;	// 12 bytes produced per round

	// Duff-style dispatch: take rounds in batches of 8, 4, 2 and 1.
	// A batch is abandoned as soon as the inner routine reports invalid
	// input, in which case we stop decoding altogether.
	do {
		if (rounds >= 8) {
			size_t i;
			for (i = 0; i < 8; i++) {
				if (!dec_loop_ssse3_inner(s, o, &rounds)) {
					break;
				}
			}
			if (i == 8) {
				continue;
			}
			break;
		}
		if (rounds >= 4) {
			size_t i;
			for (i = 0; i < 4; i++) {
				if (!dec_loop_ssse3_inner(s, o, &rounds)) {
					break;
				}
			}
			if (i == 4) {
				continue;
			}
			break;
		}
		if (rounds >= 2) {
			size_t i;
			for (i = 0; i < 2; i++) {
				if (!dec_loop_ssse3_inner(s, o, &rounds)) {
					break;
				}
			}
			if (i == 2) {
				continue;
			}
			break;
		}
		dec_loop_ssse3_inner(s, o, &rounds);
		break;

	} while (rounds > 0);

	// Give back the input/output reserved for rounds that did not run:
	*slen += rounds * 16;
	*olen -= rounds * 12;
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
// Repack a vector of sixteen 6-bit values (one per byte, 00xxxxxx) into
// the 12-byte decoded form. The last 4 bytes of the result are zero.
// Bit diagrams: upper case = most significant bits, lower case = least.
static inline __m128i
dec_reshuffle (const __m128i in)
{
	// Input, per 32-bit lane:
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ

	// Multiply-add adjacent unsigned byte pairs (low byte * 0x40 + high
	// byte) to merge each pair of 6-bit fields into a 16-bit lane:
	const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK

	// Multiply-add adjacent 16-bit pairs (low word * 0x1000 + high word)
	// to merge the two 12-bit halves into one 24-bit value per lane:
	const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
	// 00000000 JJJJJJjj KKKKkkkk LLllllll

	// Move the three significant bytes of each lane (in big-endian
	// order) down to the first 12 byte positions; -1 indices zero the
	// trailing 4 bytes:
	return _mm_shuffle_epi8(out, _mm_setr_epi8(
		 2,  1,  0,
		 6,  5,  4,
		10,  9,  8,
		14, 13, 12,
		-1, -1, -1, -1));
}
|
@@ -0,0 +1,67 @@
|
|
1
|
+
static inline void
|
2
|
+
enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
|
3
|
+
{
|
4
|
+
// Load input:
|
5
|
+
__m128i str = _mm_loadu_si128((__m128i *) *s);
|
6
|
+
|
7
|
+
// Reshuffle:
|
8
|
+
str = enc_reshuffle(str);
|
9
|
+
|
10
|
+
// Translate reshuffled bytes to the Base64 alphabet:
|
11
|
+
str = enc_translate(str);
|
12
|
+
|
13
|
+
// Store:
|
14
|
+
_mm_storeu_si128((__m128i *) *o, str);
|
15
|
+
|
16
|
+
*s += 12;
|
17
|
+
*o += 16;
|
18
|
+
}
|
19
|
+
|
20
|
+
// Bulk SSSE3 encode driver. Consumes as many whole 12-byte rounds as it
// safely can; the tail is left for the generic bytewise encoder.
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 16) {
		return;
	}

	// Process 12 input bytes per round. Loads are 16 bytes wide, so
	// leave at least 4 trailing input bytes unprocessed to keep the
	// final load inside the bounds of the input buffer:
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12;	// 12 bytes consumed per round
	*olen += rounds * 16;	// 16 bytes produced per round

	// Take the rounds in shrinking batches of 8, 4, 2 and 1:
	do {
		if (rounds >= 8) {
			for (int i = 0; i < 8; i++) {
				enc_loop_ssse3_inner(s, o);
			}
			rounds -= 8;
			continue;
		}
		if (rounds >= 4) {
			for (int i = 0; i < 4; i++) {
				enc_loop_ssse3_inner(s, o);
			}
			rounds -= 4;
			continue;
		}
		if (rounds >= 2) {
			enc_loop_ssse3_inner(s, o);
			enc_loop_ssse3_inner(s, o);
			rounds -= 2;
			continue;
		}
		enc_loop_ssse3_inner(s, o);
		break;

	} while (rounds > 0);
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
// Spread 12 input bytes into sixteen 6-bit fields, one per output byte
// (each byte becomes 00xxxxxx), ready for alphabet translation.
// Bit diagrams: upper case = most significant bits, lower case = least.
static inline __m128i
enc_reshuffle (__m128i in)
{
	// Input, bytes MSB to LSB:
	// 0 0 0 0 l k j i h g f e d c b a

	// Duplicate and reorder so that every 32-bit lane holds the three
	// source bytes it needs (_mm_set_epi8 lists bytes MSB first):
	in = _mm_shuffle_epi8(in, _mm_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1));
	// Lanes, bytes MSB to LSB:
	// k l j k | h i g h | e f d e | b c a b

	// Isolate the bits of the two "outer" 6-bit fields:
	const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00));
	// 0000kkkk LL000000 JJJJJJ00 00000000

	// Shift them into position with an unsigned 16-bit multiply-high:
	const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
	// 00000000 00kkkkLL 00000000 00JJJJJJ

	// Isolate the bits of the two "inner" 6-bit fields:
	const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0));
	// 00000000 00llllll 000000jj KKKK0000

	// Shift them into position with a 16-bit multiply-low:
	const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
	// 00llllll 00000000 00jjKKKK 00000000

	// Combine: every byte now carries one 6-bit value:
	return _mm_or_si128(t1, t3);
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
// Translate sixteen 6-bit values (0..63, one per byte) into their
// Base64 alphabet characters. There are five value ranges:
//
//  #  From       To          Abs    LUT index  Characters
//  0  [0..25]    [65..90]    +65    0          A..Z
//  1  [26..51]   [97..122]   +71    1          a..z
//  2  [52..61]   [48..57]    -4     [2..11]    0..9
//  3  [62]       [43]        -19    12         +
//  4  [63]       [47]        -16    13         /
static inline __m128i
enc_translate (const __m128i in)
{
	// Absolute offsets for all ranges, indexed as in the table above:
	const __m128i lut = _mm_setr_epi8(
		 65,  71,  -4,  -4,
		 -4,  -4,  -4,  -4,
		 -4,  -4,  -4,  -4,
		-19, -16,   0,   0);

	// Saturating subtraction yields the right index for range #0 but
	// one less than needed for ranges #1..4:
	__m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51));

	// mask is 0xFF (-1) for ranges #1..4 and 0x00 for range #0:
	__m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));

	// Subtracting -1 bumps the indices for ranges #1..4 into place:
	indices = _mm_sub_epi8(indices, mask);

	// Look up each offset and add it to the corresponding input value:
	return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices));
}
|
@@ -0,0 +1,281 @@
|
|
1
|
+
#include <stdbool.h>
|
2
|
+
#include <stdint.h>
|
3
|
+
#include <stddef.h>
|
4
|
+
#include <stdint.h>
|
5
|
+
|
6
|
+
#include "../include/libbase64.h"
|
7
|
+
#include "codecs.h"
|
8
|
+
#include "config.h"
|
9
|
+
#include "env.h"
|
10
|
+
|
11
|
+
#if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
|
12
|
+
#define BASE64_X86
|
13
|
+
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
|
14
|
+
#define BASE64_X86_SIMD
|
15
|
+
#endif
|
16
|
+
#endif
|
17
|
+
|
18
|
+
#ifdef BASE64_X86
|
19
|
+
#ifdef _MSC_VER
|
20
|
+
#include <intrin.h>
|
21
|
+
#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
|
22
|
+
{ \
|
23
|
+
int info[4]; \
|
24
|
+
__cpuidex(info, __level, __count); \
|
25
|
+
__eax = info[0]; \
|
26
|
+
__ebx = info[1]; \
|
27
|
+
__ecx = info[2]; \
|
28
|
+
__edx = info[3]; \
|
29
|
+
}
|
30
|
+
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
|
31
|
+
__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
|
32
|
+
#else
|
33
|
+
#include <cpuid.h>
|
34
|
+
#if HAVE_AVX2 || HAVE_AVX
|
35
|
+
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
|
36
|
+
static inline uint64_t _xgetbv (uint32_t index)
|
37
|
+
{
|
38
|
+
uint32_t eax, edx;
|
39
|
+
__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
|
40
|
+
return ((uint64_t)edx << 32) | eax;
|
41
|
+
}
|
42
|
+
#else
|
43
|
+
#error "Platform not supported"
|
44
|
+
#endif
|
45
|
+
#endif
|
46
|
+
#endif
|
47
|
+
|
48
|
+
#ifndef bit_AVX2
|
49
|
+
#define bit_AVX2 (1 << 5)
|
50
|
+
#endif
|
51
|
+
#ifndef bit_SSSE3
|
52
|
+
#define bit_SSSE3 (1 << 9)
|
53
|
+
#endif
|
54
|
+
#ifndef bit_SSE41
|
55
|
+
#define bit_SSE41 (1 << 19)
|
56
|
+
#endif
|
57
|
+
#ifndef bit_SSE42
|
58
|
+
#define bit_SSE42 (1 << 20)
|
59
|
+
#endif
|
60
|
+
#ifndef bit_AVX
|
61
|
+
#define bit_AVX (1 << 28)
|
62
|
+
#endif
|
63
|
+
|
64
|
+
#define bit_XSAVE_XRSTORE (1 << 27)
|
65
|
+
|
66
|
+
#ifndef _XCR_XFEATURE_ENABLED_MASK
|
67
|
+
#define _XCR_XFEATURE_ENABLED_MASK 0
|
68
|
+
#endif
|
69
|
+
|
70
|
+
#define _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS 0x6
|
71
|
+
#endif
|
72
|
+
|
73
|
+
// Function declarations:
|
74
|
+
#define BASE64_CODEC_FUNCS(arch) \
|
75
|
+
BASE64_ENC_FUNCTION(arch); \
|
76
|
+
BASE64_DEC_FUNCTION(arch); \
|
77
|
+
|
78
|
+
BASE64_CODEC_FUNCS(avx2)
|
79
|
+
BASE64_CODEC_FUNCS(neon32)
|
80
|
+
BASE64_CODEC_FUNCS(neon64)
|
81
|
+
BASE64_CODEC_FUNCS(plain)
|
82
|
+
BASE64_CODEC_FUNCS(ssse3)
|
83
|
+
BASE64_CODEC_FUNCS(sse41)
|
84
|
+
BASE64_CODEC_FUNCS(sse42)
|
85
|
+
BASE64_CODEC_FUNCS(avx)
|
86
|
+
|
87
|
+
static bool
|
88
|
+
codec_choose_forced (struct codec *codec, int flags)
|
89
|
+
{
|
90
|
+
// If the user wants to use a certain codec,
|
91
|
+
// always allow it, even if the codec is a no-op.
|
92
|
+
// For testing purposes.
|
93
|
+
|
94
|
+
if (!(flags & 0xFF)) {
|
95
|
+
return false;
|
96
|
+
}
|
97
|
+
if (flags & BASE64_FORCE_AVX2) {
|
98
|
+
codec->enc = base64_stream_encode_avx2;
|
99
|
+
codec->dec = base64_stream_decode_avx2;
|
100
|
+
return true;
|
101
|
+
}
|
102
|
+
if (flags & BASE64_FORCE_NEON32) {
|
103
|
+
codec->enc = base64_stream_encode_neon32;
|
104
|
+
codec->dec = base64_stream_decode_neon32;
|
105
|
+
return true;
|
106
|
+
}
|
107
|
+
if (flags & BASE64_FORCE_NEON64) {
|
108
|
+
codec->enc = base64_stream_encode_neon64;
|
109
|
+
codec->dec = base64_stream_decode_neon64;
|
110
|
+
return true;
|
111
|
+
}
|
112
|
+
if (flags & BASE64_FORCE_PLAIN) {
|
113
|
+
codec->enc = base64_stream_encode_plain;
|
114
|
+
codec->dec = base64_stream_decode_plain;
|
115
|
+
return true;
|
116
|
+
}
|
117
|
+
if (flags & BASE64_FORCE_SSSE3) {
|
118
|
+
codec->enc = base64_stream_encode_ssse3;
|
119
|
+
codec->dec = base64_stream_decode_ssse3;
|
120
|
+
return true;
|
121
|
+
}
|
122
|
+
if (flags & BASE64_FORCE_SSE41) {
|
123
|
+
codec->enc = base64_stream_encode_sse41;
|
124
|
+
codec->dec = base64_stream_decode_sse41;
|
125
|
+
return true;
|
126
|
+
}
|
127
|
+
if (flags & BASE64_FORCE_SSE42) {
|
128
|
+
codec->enc = base64_stream_encode_sse42;
|
129
|
+
codec->dec = base64_stream_decode_sse42;
|
130
|
+
return true;
|
131
|
+
}
|
132
|
+
if (flags & BASE64_FORCE_AVX) {
|
133
|
+
codec->enc = base64_stream_encode_avx;
|
134
|
+
codec->dec = base64_stream_decode_avx;
|
135
|
+
return true;
|
136
|
+
}
|
137
|
+
return false;
|
138
|
+
}
|
139
|
+
|
140
|
+
// Select a NEON codec on ARM. Returns true when a codec was installed.
static bool
codec_choose_arm (struct codec *codec)
{
#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && ((defined(__aarch64__) && HAVE_NEON64) || HAVE_NEON32)

	// There is no portable userland equivalent of x86 cpuid for probing
	// NEON support at runtime, so rely on the compile-time
	// configuration instead:

	#if defined(__aarch64__) && HAVE_NEON64
	codec->enc = base64_stream_encode_neon64;
	codec->dec = base64_stream_decode_neon64;
	#else
	codec->enc = base64_stream_encode_neon32;
	codec->dec = base64_stream_decode_neon32;
	#endif

	return true;

#else
	(void)codec;
	return false;
#endif
}
|
164
|
+
|
165
|
+
// Select the best available x86 SIMD codec by probing CPU features at
// runtime with CPUID (and XGETBV for AVX). Tries AVX2, AVX, SSE4.2,
// SSE4.1 and SSSE3 in that order; returns true when one was installed.
static bool
codec_choose_x86 (struct codec *codec)
{
#ifdef BASE64_X86_SIMD

	unsigned int eax, ebx = 0, ecx = 0, edx;
	unsigned int max_level;

	#ifdef _MSC_VER
	int info[4];
	__cpuidex(info, 0, 0);
	max_level = info[0];
	#else
	max_level = __get_cpuid_max(0, NULL);
	#endif

	#if HAVE_AVX2 || HAVE_AVX
	// Using AVX safely requires three checks:
	// 1) CPUID says the OS uses the XSAVE/XRSTORE instructions
	//    (so YMM registers are saved across context switches)
	// 2) CPUID reports AVX support
	// 3) XGETBV confirms the OS actually enabled XMM+YMM state saving
	//
	// XGETBV itself only exists on 686 or later CPUs, so it must be run
	// conditionally, after the XSAVE bit has been checked:
	if (max_level >= 1) {
		__cpuid_count(1, 0, eax, ebx, ecx, edx);
		if (ecx & bit_XSAVE_XRSTORE) {
			uint64_t xcr_mask;
			xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
			if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) {
				#if HAVE_AVX2
				if (max_level >= 7) {
					__cpuid_count(7, 0, eax, ebx, ecx, edx);
					if (ebx & bit_AVX2) {
						codec->enc = base64_stream_encode_avx2;
						codec->dec = base64_stream_decode_avx2;
						return true;
					}
				}
				#endif
				#if HAVE_AVX
				__cpuid_count(1, 0, eax, ebx, ecx, edx);
				if (ecx & bit_AVX) {
					codec->enc = base64_stream_encode_avx;
					codec->dec = base64_stream_decode_avx;
					return true;
				}
				#endif
			}
		}
	}
	#endif

	#if HAVE_SSE42
	// Check for SSE42 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSE42) {
			codec->enc = base64_stream_encode_sse42;
			codec->dec = base64_stream_decode_sse42;
			return true;
		}
	}
	#endif

	#if HAVE_SSE41
	// Check for SSE41 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSE41) {
			codec->enc = base64_stream_encode_sse41;
			codec->dec = base64_stream_decode_sse41;
			return true;
		}
	}
	#endif

	#if HAVE_SSSE3
	// Check for SSSE3 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSSE3) {
			codec->enc = base64_stream_encode_ssse3;
			codec->dec = base64_stream_decode_ssse3;
			return true;
		}
	}
	#endif

#else
	(void)codec;
#endif

	return false;
}
|
263
|
+
|
264
|
+
void
|
265
|
+
codec_choose (struct codec *codec, int flags)
|
266
|
+
{
|
267
|
+
// User forced a codec:
|
268
|
+
if (codec_choose_forced(codec, flags)) {
|
269
|
+
return;
|
270
|
+
}
|
271
|
+
|
272
|
+
// Runtime feature detection:
|
273
|
+
if (codec_choose_arm(codec)) {
|
274
|
+
return;
|
275
|
+
}
|
276
|
+
if (codec_choose_x86(codec)) {
|
277
|
+
return;
|
278
|
+
}
|
279
|
+
codec->enc = base64_stream_encode_plain;
|
280
|
+
codec->dec = base64_stream_decode_plain;
|
281
|
+
}
|
@@ -0,0 +1,65 @@
|
|
1
|
+
#include <stdint.h>
|
2
|
+
#include <stddef.h>
|
3
|
+
|
4
|
+
#include "../include/libbase64.h"
|
5
|
+
#include "config.h"
|
6
|
+
|
7
|
+
// Function parameters for encoding functions:
|
8
|
+
#define BASE64_ENC_PARAMS \
|
9
|
+
( struct base64_state *state \
|
10
|
+
, const char *src \
|
11
|
+
, size_t srclen \
|
12
|
+
, char *out \
|
13
|
+
, size_t *outlen \
|
14
|
+
)
|
15
|
+
|
16
|
+
// Function parameters for decoding functions:
|
17
|
+
#define BASE64_DEC_PARAMS \
|
18
|
+
( struct base64_state *state \
|
19
|
+
, const char *src \
|
20
|
+
, size_t srclen \
|
21
|
+
, char *out \
|
22
|
+
, size_t *outlen \
|
23
|
+
)
|
24
|
+
|
25
|
+
// Function signature for encoding functions:
|
26
|
+
#define BASE64_ENC_FUNCTION(arch) \
|
27
|
+
void \
|
28
|
+
base64_stream_encode_ ## arch \
|
29
|
+
BASE64_ENC_PARAMS
|
30
|
+
|
31
|
+
// Function signature for decoding functions:
|
32
|
+
#define BASE64_DEC_FUNCTION(arch) \
|
33
|
+
int \
|
34
|
+
base64_stream_decode_ ## arch \
|
35
|
+
BASE64_DEC_PARAMS
|
36
|
+
|
37
|
+
// Cast away unused variable, silence compiler:
|
38
|
+
#define UNUSED(x) ((void)(x))
|
39
|
+
|
40
|
+
// Stub function when encoder arch unsupported:
|
41
|
+
#define BASE64_ENC_STUB \
|
42
|
+
UNUSED(state); \
|
43
|
+
UNUSED(src); \
|
44
|
+
UNUSED(srclen); \
|
45
|
+
UNUSED(out); \
|
46
|
+
\
|
47
|
+
*outlen = 0;
|
48
|
+
|
49
|
+
// Stub function when decoder arch unsupported:
|
50
|
+
#define BASE64_DEC_STUB \
|
51
|
+
UNUSED(state); \
|
52
|
+
UNUSED(src); \
|
53
|
+
UNUSED(srclen); \
|
54
|
+
UNUSED(out); \
|
55
|
+
UNUSED(outlen); \
|
56
|
+
\
|
57
|
+
return -1;
|
58
|
+
|
59
|
+
struct codec
|
60
|
+
{
|
61
|
+
void (* enc) BASE64_ENC_PARAMS;
|
62
|
+
int (* dec) BASE64_DEC_PARAMS;
|
63
|
+
};
|
64
|
+
|
65
|
+
extern void codec_choose (struct codec *, int flags);
|