ob64 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/lib/ob64/version.rb +1 -1
- data/ob64.gemspec +2 -0
- data/vendor/libbase64/.gitignore +12 -0
- data/vendor/libbase64/.travis.yml +71 -0
- data/vendor/libbase64/CMakeLists.txt +264 -0
- data/vendor/libbase64/LICENSE +28 -0
- data/vendor/libbase64/Makefile +93 -0
- data/vendor/libbase64/README.md +474 -0
- data/vendor/libbase64/base64-benchmarks.png +0 -0
- data/vendor/libbase64/bin/base64.c +132 -0
- data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
- data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
- data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
- data/vendor/libbase64/cmake/config.h.in +25 -0
- data/vendor/libbase64/cmake/test-arch.c +35 -0
- data/vendor/libbase64/include/libbase64.h +145 -0
- data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
- data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
- data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
- data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
- data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
- data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
- data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
- data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
- data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
- data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
- data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
- data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
- data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
- data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
- data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
- data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
- data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
- data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
- data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
- data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
- data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
- data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
- data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
- data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
- data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
- data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
- data/vendor/libbase64/lib/codec_choose.c +281 -0
- data/vendor/libbase64/lib/codecs.h +65 -0
- data/vendor/libbase64/lib/env.h +67 -0
- data/vendor/libbase64/lib/exports.txt +7 -0
- data/vendor/libbase64/lib/lib.c +164 -0
- data/vendor/libbase64/lib/lib_openmp.c +149 -0
- data/vendor/libbase64/lib/tables/.gitignore +1 -0
- data/vendor/libbase64/lib/tables/Makefile +17 -0
- data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
- data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
- data/vendor/libbase64/lib/tables/table_generator.c +184 -0
- data/vendor/libbase64/lib/tables/tables.c +40 -0
- data/vendor/libbase64/lib/tables/tables.h +23 -0
- metadata +64 -4
@@ -0,0 +1,173 @@
|
|
1
|
+
// The input consists of six character sets in the Base64 alphabet, which we
|
2
|
+
// need to map back to the 6-bit values they represent. There are three ranges,
|
3
|
+
// two singles, and then there's the rest.
|
4
|
+
//
|
5
|
+
// # From To Add Characters
|
6
|
+
// 1 [43] [62] +19 +
|
7
|
+
// 2 [47] [63] +16 /
|
8
|
+
// 3 [48..57] [52..61] +4 0..9
|
9
|
+
// 4 [65..90] [0..25] -65 A..Z
|
10
|
+
// 5 [97..122] [26..51] -71 a..z
|
11
|
+
// (6) Everything else => invalid input
|
12
|
+
//
|
13
|
+
// We will use lookup tables for character validation and offset computation.
|
14
|
+
// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
|
15
|
+
// allows to mask with 0x2F instead of 0x0F and thus save one constant
|
16
|
+
// declaration (register and/or memory access).
|
17
|
+
//
|
18
|
+
// For offsets:
|
19
|
+
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
|
20
|
+
// 0000 = garbage
|
21
|
+
// 0001 = /
|
22
|
+
// 0010 = +
|
23
|
+
// 0011 = 0-9
|
24
|
+
// 0100 = A-Z
|
25
|
+
// 0101 = A-Z
|
26
|
+
// 0110 = a-z
|
27
|
+
// 0111 = a-z
|
28
|
+
// 1000 >= garbage
|
29
|
+
//
|
30
|
+
// For validation, here's the table.
|
31
|
+
// A character is valid if and only if the AND of the 2 lookups equals 0:
|
32
|
+
//
|
33
|
+
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
|
34
|
+
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
|
35
|
+
//
|
36
|
+
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
|
37
|
+
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
38
|
+
//
|
39
|
+
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
|
40
|
+
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
41
|
+
//
|
42
|
+
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
|
43
|
+
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
|
44
|
+
//
|
45
|
+
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
|
46
|
+
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
|
47
|
+
//
|
48
|
+
// 0100 0x04 char @ A B C D E F G H I J K L M N O
|
49
|
+
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
|
50
|
+
//
|
51
|
+
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
|
52
|
+
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
|
53
|
+
//
|
54
|
+
// 0110 0x04 char ` a b c d e f g h i j k l m n o
|
55
|
+
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
|
56
|
+
// 0111 0x08 char p q r s t u v w x y z { | } ~
|
57
|
+
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
|
58
|
+
//
|
59
|
+
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
60
|
+
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
61
|
+
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
62
|
+
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
63
|
+
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
64
|
+
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
65
|
+
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
66
|
+
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
|
67
|
+
|
68
|
+
// Decode one block of 16 Base64 characters into 12 output bytes using
// SSSE3 shuffles. Returns 1 on success, after advancing *s by 16, *o by
// 12 and decrementing *rounds. Returns 0 (with nothing consumed) when
// the block contains a byte outside the Base64 alphabet, so the caller
// can fall back to bytewise decoding for error reporting.
// NOTE: writes a full 16-byte vector, i.e. 4 zero bytes past the 12
// bytes of real output — the caller must guarantee that slack exists.
static inline int
dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Validation LUT indexed by the low nibble (see the bit tables in
	// the header comment): a character is valid iff lut_lo[lo] AND
	// lut_hi[hi] is zero.
	const __m128i lut_lo = _mm_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	// Validation LUT indexed by the high nibble:
	const __m128i lut_hi = _mm_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

	// Per-range deltas that map ASCII back to 6-bit values; indexed by
	// the perfect hash ((src >> 4) & 0x2F) + (src == '/' ? 0xFF : 0):
	const __m128i lut_roll = _mm_setr_epi8(
		0, 16, 19, 4, -65, -65, -71, -71,
		0, 0, 0, 0, 0, 0, 0, 0);

	// 0x2F doubles as the nibble mask and the '/' comparand; see the
	// header comment for why 0x2F works where 0x0F would be expected.
	const __m128i mask_2F = _mm_set1_epi8(0x2F);

	// Load input:
	__m128i str = _mm_loadu_si128((__m128i *) *s);

	// Table lookups:
	const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
	const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
	const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
	const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);

	// Check for invalid input: if any "and" values from lo and hi are not
	// zero, fall back on bytewise code to do error checking and reporting:
	if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
		return 0;
	}

	// eq_2F is 0xFF for '/' bytes; adding it to the high nibble realizes
	// the perfect hash described above:
	const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
	const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));

	// Now simply add the delta values to the input:
	str = _mm_add_epi8(str, roll);

	// Reshuffle the input to packed 12-byte output format:
	str = dec_reshuffle(str);

	// Store the output:
	_mm_storeu_si128((__m128i *) *o, str);

	*s += 16;
	*o += 12;
	*rounds -= 1;

	return 1;
}
|
118
|
+
|
119
|
+
// Bulk-decode as much of the input as possible in vectorized 16-byte
// rounds, updating the caller's source/destination pointers and length
// counters. Stops early (leaving the remainder for the scalar decoder)
// when an invalid character is encountered or too little input remains.
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	// Below the 24-byte floor there is not even one safe round:
	if (*slen < 24) {
		return;
	}

	// Each round consumes 16 input bytes but stores 4 zero bytes past
	// its 12 real output bytes. Keeping 8 bytes of input in reserve
	// (6 data bytes plus up to two end-of-string markers) guarantees
	// the scalar tail overwrites that zero-padded gap.
	size_t todo = (*slen - 8) / 16;

	// Account for every planned round up front; rounds abandoned due to
	// invalid input are credited back after the loop.
	*slen -= todo * 16;	// 16 bytes consumed per round
	*olen += todo * 12;	// 12 bytes produced per round

	// Manually unrolled dispatch: take rounds in batches of 8, 4, 2,
	// then singly. Any failed inner call aborts the whole loop.
	do {
		if (todo >= 8) {
			if (dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo)) {
				continue;
			}
			break;
		}
		if (todo >= 4) {
			if (dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo)) {
				continue;
			}
			break;
		}
		if (todo >= 2) {
			if (dec_loop_ssse3_inner(s, o, &todo) &&
			    dec_loop_ssse3_inner(s, o, &todo)) {
				continue;
			}
			break;
		}
		dec_loop_ssse3_inner(s, o, &todo);
		break;

	} while (todo > 0);

	// Credit back the rounds that never ran:
	*slen += todo * 16;
	*olen -= todo * 12;
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
// Repack sixteen decoded 6-bit values (one per byte) into the twelve
// data bytes they represent; the top four bytes of the result are zero.
static inline __m128i
dec_reshuffle (const __m128i in)
{
	// in, bits, upper case are most significant bits, lower case are least significant bits
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA

	// Multiply-add adjacent unsigned bytes: fuses each 6+6 bit pair into
	// a 12-bit value within a 16-bit lane (multipliers 0x40 and 0x01):
	const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
	// 0000eeee FFffffff 0000DDDD DDddEEEE
	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB

	// Multiply-add adjacent 16-bit lanes: fuses each 12+12 bit pair into
	// a 24-bit value within a 32-bit lane (multipliers 0x1000 and 0x0001):
	const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
	// 00000000 JJJJJJjj KKKKkkkk LLllllll
	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
	// 00000000 DDDDDDdd EEEEeeee FFffffff
	// 00000000 AAAAAAaa BBBBbbbb CCcccccc

	// Pack bytes together (indices of -1 zero the corresponding byte):
	return _mm_shuffle_epi8(out, _mm_setr_epi8(
		 2,  1,  0,
		 6,  5,  4,
		10,  9,  8,
		14, 13, 12,
		-1, -1, -1, -1));
	// 00000000 00000000 00000000 00000000
	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
}
|
@@ -0,0 +1,67 @@
|
|
1
|
+
static inline void
|
2
|
+
enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
|
3
|
+
{
|
4
|
+
// Load input:
|
5
|
+
__m128i str = _mm_loadu_si128((__m128i *) *s);
|
6
|
+
|
7
|
+
// Reshuffle:
|
8
|
+
str = enc_reshuffle(str);
|
9
|
+
|
10
|
+
// Translate reshuffled bytes to the Base64 alphabet:
|
11
|
+
str = enc_translate(str);
|
12
|
+
|
13
|
+
// Store:
|
14
|
+
_mm_storeu_si128((__m128i *) *o, str);
|
15
|
+
|
16
|
+
*s += 12;
|
17
|
+
*o += 16;
|
18
|
+
}
|
19
|
+
|
20
|
+
// Bulk-encode as much input as possible in vectorized 12-byte rounds,
// updating the caller's source/destination pointers and length counters.
// The remainder (always at least 4 bytes) is left for the scalar tail.
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 16) {
		return;
	}

	// Each round consumes 12 bytes but loads a full 16-byte vector, so
	// keep at least 4 bytes after the final round to ensure the last
	// load never reads past the end of the input buffer:
	size_t rounds = (*slen - 4) / 12;

	*slen -= rounds * 12;	// 12 bytes consumed per round
	*olen += rounds * 16;	// 16 bytes produced per round

	// Manually unrolled dispatch: drain eight rounds at a time, then
	// mop up with batches of four, two and one.
	while (rounds >= 8) {
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		rounds -= 8;
	}
	if (rounds >= 4) {
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		rounds -= 4;
	}
	if (rounds >= 2) {
		enc_loop_ssse3_inner(s, o);
		enc_loop_ssse3_inner(s, o);
		rounds -= 2;
	}
	if (rounds > 0) {
		enc_loop_ssse3_inner(s, o);
	}
}
|
@@ -0,0 +1,48 @@
|
|
1
|
+
// Spread twelve input bytes across sixteen output bytes so that each
// output byte holds one 6-bit group in its low bits (ready for
// enc_translate). The top four input bytes are ignored.
static inline __m128i
enc_reshuffle (__m128i in)
{
	// Input, bytes MSB to LSB:
	// 0 0 0 0 l k j i h g f e d c b a

	// Duplicate bytes so every 32-bit lane holds the three source bytes
	// (with one repeated) that produce its four 6-bit groups:
	in = _mm_shuffle_epi8(in, _mm_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1));
	// in, bytes MSB to LSB:
	// k l j k
	// h i g h
	// e f d e
	// b c a b

	const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00));
	// bits, upper case are most significant bits, lower case are least significant bits
	// 0000kkkk LL000000 JJJJJJ00 00000000
	// 0000hhhh II000000 GGGGGG00 00000000
	// 0000eeee FF000000 DDDDDD00 00000000
	// 0000bbbb CC000000 AAAAAA00 00000000

	// Unsigned 16-bit multiply-high acts as a per-lane right shift by
	// 10 (x 0x0040) and 6 (x 0x0400) respectively:
	const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
	// 00000000 00kkkkLL 00000000 00JJJJJJ
	// 00000000 00hhhhII 00000000 00GGGGGG
	// 00000000 00eeeeFF 00000000 00DDDDDD
	// 00000000 00bbbbCC 00000000 00AAAAAA

	const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0));
	// 00000000 00llllll 000000jj KKKK0000
	// 00000000 00iiiiii 000000gg HHHH0000
	// 00000000 00ffffff 000000dd EEEE0000
	// 00000000 00cccccc 000000aa BBBB0000

	// Low 16-bit multiply acts as a per-lane left shift by 8 (x 0x0100)
	// and 4 (x 0x0010) respectively:
	const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
	// 00llllll 00000000 00jjKKKK 00000000
	// 00iiiiii 00000000 00ggHHHH 00000000
	// 00ffffff 00000000 00ddEEEE 00000000
	// 00cccccc 00000000 00aaBBBB 00000000

	// The two halves are disjoint, so OR combines them:
	return _mm_or_si128(t1, t3);
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
// Map sixteen 6-bit values (0..63, one per byte) to their Base64 ASCII
// characters by adding a per-range offset looked up via pshufb.
static inline __m128i
enc_translate (const __m128i in)
{
	// A lookup table containing the absolute offsets for all ranges:
	const __m128i lut = _mm_setr_epi8(
		 65,  71, -4, -4,
		 -4,  -4, -4, -4,
		 -4,  -4, -4, -4,
		-19, -16,  0,  0
	);

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Index  Characters
	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
	// 3  [62]      [43]       -19       12  +
	// 4  [63]      [47]       -16       13  /

	// Create LUT indices from the input. Saturating subtraction clamps
	// range #0 to index 0; every other range lands 1 below its index:
	__m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51));

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	__m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));

	// Subtract -1, so add 1 to indices for range #[1..4]. All indices are
	// now correct:
	indices = _mm_sub_epi8(indices, mask);

	// Add offsets to input values:
	return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices));
}
|
@@ -0,0 +1,281 @@
|
|
1
|
+
// Runtime codec selection: headers, CPU-feature plumbing and forward
// declarations for every per-architecture encoder/decoder pair.
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "../include/libbase64.h"
#include "codecs.h"
#include "config.h"
#include "env.h"

// Compile-time detection of the x86 family. BASE64_X86_SIMD additionally
// requires that at least one SIMD codec was enabled in the build config:
#if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
#define BASE64_X86
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
#define BASE64_X86_SIMD
#endif
#endif

#ifdef BASE64_X86
#ifdef _MSC_VER
#include <intrin.h>
// MSVC lacks GCC's __cpuid_count/__cpuid macros; emulate them on top of
// __cpuidex so the detection code below stays toolchain-agnostic:
#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
{ \
	int info[4]; \
	__cpuidex(info, __level, __count); \
	__eax = info[0]; \
	__ebx = info[1]; \
	__ecx = info[2]; \
	__edx = info[3]; \
}
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
	__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
#else
#include <cpuid.h>
#if HAVE_AVX2 || HAVE_AVX
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
// Read an extended control register. Needed to verify that the OS saves
// and restores YMM state on context switch before AVX/AVX2 may be used:
static inline uint64_t _xgetbv (uint32_t index)
{
	uint32_t eax, edx;
	__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
	return ((uint64_t)edx << 32) | eax;
}
#else
#error "Platform not supported"
#endif
#endif
#endif

// CPUID feature bits, provided here for toolchains whose <cpuid.h>
// does not define them:
#ifndef bit_AVX2
#define bit_AVX2 (1 << 5)
#endif
#ifndef bit_SSSE3
#define bit_SSSE3 (1 << 9)
#endif
#ifndef bit_SSE41
#define bit_SSE41 (1 << 19)
#endif
#ifndef bit_SSE42
#define bit_SSE42 (1 << 20)
#endif
#ifndef bit_AVX
#define bit_AVX (1 << 28)
#endif

// CPUID.1:ECX bit indicating OS use of XSAVE/XRSTOR (OSXSAVE):
#define bit_XSAVE_XRSTORE (1 << 27)

#ifndef _XCR_XFEATURE_ENABLED_MASK
#define _XCR_XFEATURE_ENABLED_MASK 0
#endif

// XCR0 bits 1 and 2: XMM and YMM state saved/restored by the OS:
#define _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS 0x6
#endif

// Function declarations:
#define BASE64_CODEC_FUNCS(arch) \
	BASE64_ENC_FUNCTION(arch); \
	BASE64_DEC_FUNCTION(arch);

BASE64_CODEC_FUNCS(avx2)
BASE64_CODEC_FUNCS(neon32)
BASE64_CODEC_FUNCS(neon64)
BASE64_CODEC_FUNCS(plain)
BASE64_CODEC_FUNCS(ssse3)
BASE64_CODEC_FUNCS(sse41)
BASE64_CODEC_FUNCS(sse42)
BASE64_CODEC_FUNCS(avx)
|
86
|
+
|
87
|
+
static bool
|
88
|
+
codec_choose_forced (struct codec *codec, int flags)
|
89
|
+
{
|
90
|
+
// If the user wants to use a certain codec,
|
91
|
+
// always allow it, even if the codec is a no-op.
|
92
|
+
// For testing purposes.
|
93
|
+
|
94
|
+
if (!(flags & 0xFF)) {
|
95
|
+
return false;
|
96
|
+
}
|
97
|
+
if (flags & BASE64_FORCE_AVX2) {
|
98
|
+
codec->enc = base64_stream_encode_avx2;
|
99
|
+
codec->dec = base64_stream_decode_avx2;
|
100
|
+
return true;
|
101
|
+
}
|
102
|
+
if (flags & BASE64_FORCE_NEON32) {
|
103
|
+
codec->enc = base64_stream_encode_neon32;
|
104
|
+
codec->dec = base64_stream_decode_neon32;
|
105
|
+
return true;
|
106
|
+
}
|
107
|
+
if (flags & BASE64_FORCE_NEON64) {
|
108
|
+
codec->enc = base64_stream_encode_neon64;
|
109
|
+
codec->dec = base64_stream_decode_neon64;
|
110
|
+
return true;
|
111
|
+
}
|
112
|
+
if (flags & BASE64_FORCE_PLAIN) {
|
113
|
+
codec->enc = base64_stream_encode_plain;
|
114
|
+
codec->dec = base64_stream_decode_plain;
|
115
|
+
return true;
|
116
|
+
}
|
117
|
+
if (flags & BASE64_FORCE_SSSE3) {
|
118
|
+
codec->enc = base64_stream_encode_ssse3;
|
119
|
+
codec->dec = base64_stream_decode_ssse3;
|
120
|
+
return true;
|
121
|
+
}
|
122
|
+
if (flags & BASE64_FORCE_SSE41) {
|
123
|
+
codec->enc = base64_stream_encode_sse41;
|
124
|
+
codec->dec = base64_stream_decode_sse41;
|
125
|
+
return true;
|
126
|
+
}
|
127
|
+
if (flags & BASE64_FORCE_SSE42) {
|
128
|
+
codec->enc = base64_stream_encode_sse42;
|
129
|
+
codec->dec = base64_stream_decode_sse42;
|
130
|
+
return true;
|
131
|
+
}
|
132
|
+
if (flags & BASE64_FORCE_AVX) {
|
133
|
+
codec->enc = base64_stream_encode_avx;
|
134
|
+
codec->dec = base64_stream_decode_avx;
|
135
|
+
return true;
|
136
|
+
}
|
137
|
+
return false;
|
138
|
+
}
|
139
|
+
|
140
|
+
// Install the NEON codec when the build targets ARM with NEON enabled.
// Returns true when a codec was installed. Selection is purely
// compile-time (see comment below).
static bool
codec_choose_arm (struct codec *codec)
{
#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && ((defined(__aarch64__) && HAVE_NEON64) || HAVE_NEON32)

	// Unfortunately there is no portable way to check for NEON
	// support at runtime from userland in the same way that x86
	// has cpuid, so just stick to the compile-time configuration:

	#if defined(__aarch64__) && HAVE_NEON64
	// 64-bit ARM gets the NEON64 codec:
	codec->enc = base64_stream_encode_neon64;
	codec->dec = base64_stream_decode_neon64;
	#else
	// Otherwise fall back to the NEON32 codec:
	codec->enc = base64_stream_encode_neon32;
	codec->dec = base64_stream_decode_neon32;
	#endif

	return true;

#else
	// Not an ARM/NEON build; silence the unused-parameter warning:
	(void)codec;
	return false;
#endif
}
|
164
|
+
|
165
|
+
// Probe the CPU with cpuid (and xgetbv, for AVX) and install the best
// x86 SIMD codec that is both compiled in (HAVE_*) and supported by the
// hardware and OS. Probes in descending preference: AVX2, AVX, SSE4.2,
// SSE4.1, SSSE3. Returns true when a codec was installed.
static bool
codec_choose_x86 (struct codec *codec)
{
#ifdef BASE64_X86_SIMD

	// ebx/ecx are pre-zeroed to silence maybe-uninitialized warnings on
	// paths where some cpuid leaves are compiled out:
	unsigned int eax, ebx = 0, ecx = 0, edx;
	unsigned int max_level;

	#ifdef _MSC_VER
	int info[4];
	__cpuidex(info, 0, 0);
	max_level = info[0];
	#else
	max_level = __get_cpuid_max(0, NULL);
	#endif

	#if HAVE_AVX2 || HAVE_AVX
	// Check for AVX/AVX2 support:
	// Checking for AVX requires 3 things:
	// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
	//    (allowing saving YMM registers on context switch)
	// 2) CPUID indicates support for AVX
	// 3) XGETBV indicates the AVX registers will be saved and restored on
	//    context switch
	//
	// Note that XGETBV is only available on 686 or later CPUs, so the
	// instruction needs to be conditionally run.
	if (max_level >= 1) {
		__cpuid_count(1, 0, eax, ebx, ecx, edx);
		if (ecx & bit_XSAVE_XRSTORE) {
			uint64_t xcr_mask;
			xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
			if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) {
				#if HAVE_AVX2
				// AVX2 support lives in leaf 7, EBX bit 5:
				if (max_level >= 7) {
					__cpuid_count(7, 0, eax, ebx, ecx, edx);
					if (ebx & bit_AVX2) {
						codec->enc = base64_stream_encode_avx2;
						codec->dec = base64_stream_decode_avx2;
						return true;
					}
				}
				#endif
				#if HAVE_AVX
				// Re-read leaf 1 (the AVX2 probe above may
				// have clobbered ecx) and check AVX:
				__cpuid_count(1, 0, eax, ebx, ecx, edx);
				if (ecx & bit_AVX) {
					codec->enc = base64_stream_encode_avx;
					codec->dec = base64_stream_decode_avx;
					return true;
				}
				#endif
			}
		}
	}
	#endif

	#if HAVE_SSE42
	// Check for SSE42 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSE42) {
			codec->enc = base64_stream_encode_sse42;
			codec->dec = base64_stream_decode_sse42;
			return true;
		}
	}
	#endif

	#if HAVE_SSE41
	// Check for SSE41 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSE41) {
			codec->enc = base64_stream_encode_sse41;
			codec->dec = base64_stream_decode_sse41;
			return true;
		}
	}
	#endif

	#if HAVE_SSSE3
	// Check for SSSE3 support:
	if (max_level >= 1) {
		__cpuid(1, eax, ebx, ecx, edx);
		if (ecx & bit_SSSE3) {
			codec->enc = base64_stream_encode_ssse3;
			codec->dec = base64_stream_decode_ssse3;
			return true;
		}
	}
	#endif

#else
	// No x86 SIMD codecs compiled in; silence the unused parameter:
	(void)codec;
#endif

	return false;
}
|
263
|
+
|
264
|
+
void
|
265
|
+
codec_choose (struct codec *codec, int flags)
|
266
|
+
{
|
267
|
+
// User forced a codec:
|
268
|
+
if (codec_choose_forced(codec, flags)) {
|
269
|
+
return;
|
270
|
+
}
|
271
|
+
|
272
|
+
// Runtime feature detection:
|
273
|
+
if (codec_choose_arm(codec)) {
|
274
|
+
return;
|
275
|
+
}
|
276
|
+
if (codec_choose_x86(codec)) {
|
277
|
+
return;
|
278
|
+
}
|
279
|
+
codec->enc = base64_stream_encode_plain;
|
280
|
+
codec->dec = base64_stream_decode_plain;
|
281
|
+
}
|
@@ -0,0 +1,65 @@
|
|
1
|
+
// Shared declarations for the per-architecture codec implementations:
// parameter lists, function signatures, stub bodies for disabled
// architectures, and the codec dispatch struct.
#include <stdint.h>
#include <stddef.h>

#include "../include/libbase64.h"
#include "config.h"

// Function parameters for encoding functions:
#define BASE64_ENC_PARAMS \
	( struct base64_state *state \
	, const char *src \
	, size_t srclen \
	, char *out \
	, size_t *outlen \
	)

// Function parameters for decoding functions:
#define BASE64_DEC_PARAMS \
	( struct base64_state *state \
	, const char *src \
	, size_t srclen \
	, char *out \
	, size_t *outlen \
	)

// Function signature for encoding functions:
#define BASE64_ENC_FUNCTION(arch) \
	void \
	base64_stream_encode_ ## arch \
	BASE64_ENC_PARAMS

// Function signature for decoding functions (returns nonzero on
// success, zero/negative on failure — see libbase64.h):
#define BASE64_DEC_FUNCTION(arch) \
	int \
	base64_stream_decode_ ## arch \
	BASE64_DEC_PARAMS

// Cast away unused variable, silence compiler:
#define UNUSED(x) ((void)(x))

// Stub function when encoder arch unsupported: produces no output.
#define BASE64_ENC_STUB \
	UNUSED(state); \
	UNUSED(src); \
	UNUSED(srclen); \
	UNUSED(out); \
	\
	*outlen = 0;

// Stub function when decoder arch unsupported: reports failure (-1).
#define BASE64_DEC_STUB \
	UNUSED(state); \
	UNUSED(src); \
	UNUSED(srclen); \
	UNUSED(out); \
	UNUSED(outlen); \
	\
	return -1;

// Dispatch table holding the selected encoder/decoder pair:
struct codec
{
	void (* enc) BASE64_ENC_PARAMS;
	int (* dec) BASE64_DEC_PARAMS;
};

// Fill `codec` with the best implementation for `flags` and this CPU;
// defined in codec_choose.c:
extern void codec_choose (struct codec *, int flags);
|