ob64 0.1.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +20 -4
  3. data/.gitignore +2 -0
  4. data/CHANGELOG.md +18 -1
  5. data/{LICENSE.txt → LICENSE} +1 -1
  6. data/README.md +34 -2
  7. data/benchmark.rb +42 -3
  8. data/ext/ob64/ob64_ext.c +5 -3
  9. data/lib/ob64/core_ext.rb +2 -0
  10. data/lib/ob64/version.rb +1 -1
  11. data/lib/ob64.rb +52 -0
  12. data/ob64.gemspec +12 -6
  13. data/vendor/libbase64/.gitignore +12 -0
  14. data/vendor/libbase64/.travis.yml +71 -0
  15. data/vendor/libbase64/CMakeLists.txt +264 -0
  16. data/vendor/libbase64/LICENSE +28 -0
  17. data/vendor/libbase64/Makefile +93 -0
  18. data/vendor/libbase64/README.md +474 -0
  19. data/vendor/libbase64/base64-benchmarks.png +0 -0
  20. data/vendor/libbase64/bin/base64.c +132 -0
  21. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  22. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  23. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  24. data/vendor/libbase64/cmake/config.h.in +25 -0
  25. data/vendor/libbase64/cmake/test-arch.c +35 -0
  26. data/vendor/libbase64/include/libbase64.h +145 -0
  27. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  28. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  29. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  30. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  31. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  32. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  33. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  34. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  35. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  36. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  37. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  38. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  39. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  40. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  41. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  42. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  43. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  44. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  45. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  46. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  47. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  48. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  49. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  50. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  51. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  52. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  53. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  54. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  55. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  56. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  57. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  58. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  59. data/vendor/libbase64/lib/codec_choose.c +281 -0
  60. data/vendor/libbase64/lib/codecs.h +65 -0
  61. data/vendor/libbase64/lib/env.h +67 -0
  62. data/vendor/libbase64/lib/exports.txt +7 -0
  63. data/vendor/libbase64/lib/lib.c +164 -0
  64. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  65. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  66. data/vendor/libbase64/lib/tables/Makefile +17 -0
  67. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  68. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  69. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  70. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  71. data/vendor/libbase64/lib/tables/tables.c +40 -0
  72. data/vendor/libbase64/lib/tables/tables.h +23 -0
  73. metadata +67 -6
  74. data/.byebug_history +0 -72
  75. data/.envrc +0 -1
data/vendor/libbase64/lib/arch/ssse3/dec_loop.c
@@ -0,0 +1,173 @@
+ // The input consists of six character sets in the Base64 alphabet, which we
+ // need to map back to the 6-bit values they represent. There are three ranges,
+ // two singles, and then there's the rest.
+ //
+ // # From To Add Characters
+ // 1 [43] [62] +19 +
+ // 2 [47] [63] +16 /
+ // 3 [48..57] [52..61] +4 0..9
+ // 4 [65..90] [0..25] -65 A..Z
+ // 5 [97..122] [26..51] -71 a..z
+ // (6) Everything else => invalid input
+ //
+ // We will use lookup tables for character validation and offset computation.
+ // Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8, this
+ // allows to mask with 0x2F instead of 0x0F and thus save one constant
+ // declaration (register and/or memory access).
+ //
+ // For offsets:
+ // Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
+ // 0000 = garbage
+ // 0001 = /
+ // 0010 = +
+ // 0011 = 0-9
+ // 0100 = A-Z
+ // 0101 = A-Z
+ // 0110 = a-z
+ // 0111 = a-z
+ // 1000 >= garbage
+ //
+ // For validation, here's the table.
+ // A character is valid if and only if the AND of the 2 lookups equals 0:
+ //
+ // hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
+ // LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
+ //
+ // 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
+ // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ //
+ // 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
+ // andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ //
+ // 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
+ // andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
+ //
+ // 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+ // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
+ //
+ // 0100 0x04 char @ A B C D E F G H I J K L M N O
+ // andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+ //
+ // 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
+ // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
+ //
+ // 0110 0x04 char ` a b c d e f g h i j k l m n o
+ // andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+ // 0111 0x08 char p q r s t u v w x y z { | } ~
+ // andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
+ //
+ // 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+ // 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+
+ static inline int
+ dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+ {
+ const __m128i lut_lo = _mm_setr_epi8(
+ 0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+ const __m128i lut_hi = _mm_setr_epi8(
+ 0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+ const __m128i lut_roll = _mm_setr_epi8(
+ 0, 16, 19, 4, -65, -65, -71, -71,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+
+ const __m128i mask_2F = _mm_set1_epi8(0x2F);
+
+ // Load input:
+ __m128i str = _mm_loadu_si128((__m128i *) *s);
+
+ // Table lookups:
+ const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
+ const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
+ const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
+ const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);
+
+ // Check for invalid input: if any "and" values from lo and hi are not
+ // zero, fall back on bytewise code to do error checking and reporting:
+ if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
+ return 0;
+ }
+
+ const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
+ const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
+
+ // Now simply add the delta values to the input:
+ str = _mm_add_epi8(str, roll);
+
+ // Reshuffle the input to packed 12-byte output format:
+ str = dec_reshuffle(str);
+
+ // Store the output:
+ _mm_storeu_si128((__m128i *) *o, str);
+
+ *s += 16;
+ *o += 12;
+ *rounds -= 1;
+
+ return 1;
+ }
+
+ static inline void
+ dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+ if (*slen < 24) {
+ return;
+ }
+
+ // Process blocks of 16 bytes per round. Because 4 extra zero bytes are
+ // written after the output, ensure that there will be at least 8 bytes
+ // of input data left to cover the gap. (6 data bytes and up to two
+ // end-of-string markers.)
+ size_t rounds = (*slen - 8) / 16;
+
+ *slen -= rounds * 16; // 16 bytes consumed per round
+ *olen += rounds * 12; // 12 bytes produced per round
+
+ do {
+ if (rounds >= 8) {
+ if (dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 4) {
+ if (dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ if (rounds >= 2) {
+ if (dec_loop_ssse3_inner(s, o, &rounds) &&
+ dec_loop_ssse3_inner(s, o, &rounds)) {
+ continue;
+ }
+ break;
+ }
+ dec_loop_ssse3_inner(s, o, &rounds);
+ break;
+
+ } while (rounds > 0);
+
+ // Adjust for any rounds that were skipped:
+ *slen += rounds * 16;
+ *olen -= rounds * 12;
+ }
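
The offset table and perfect hash described in the comment block above can be sanity-checked in plain C. The snippet below is an illustrative, standalone sketch (not part of the gem or of vendored libbase64); it walks the Base64 alphabet and asserts that the hash index plus the lut_roll delta recovers each character's 6-bit value:

#include <assert.h>
#include <stdint.h>
#include <string.h>

// Same deltas as lut_roll above (only the first 8 entries are reachable):
static const int8_t lut_roll[8] = { 0, 16, 19, 4, -65, -65, -71, -71 };

int main (void)
{
	const char *alphabet =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789+/";

	for (size_t i = 0; i < strlen(alphabet); i++) {
		uint8_t c = (uint8_t) alphabet[i];

		// Perfect hash from the comment: ((src >> 4) & 0x2F) + (src == '/' ? 0xFF : 0x00)
		uint8_t idx = (uint8_t) (((c >> 4) & 0x2F) + ((c == 0x2F) ? 0xFF : 0x00));

		// Adding the selected delta must recover the character's 6-bit value:
		assert((uint8_t) (c + lut_roll[idx]) == i);
	}
	return 0;
}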
data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c
@@ -0,0 +1,33 @@
+ static inline __m128i
+ dec_reshuffle (const __m128i in)
+ {
+ // in, bits, upper case are most significant bits, lower case are least significant bits
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+ const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
+ // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+ // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+ // 0000eeee FFffffff 0000DDDD DDddEEEE
+ // 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+ const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
+ // 00000000 JJJJJJjj KKKKkkkk LLllllll
+ // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+ // 00000000 DDDDDDdd EEEEeeee FFffffff
+ // 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+ // Pack bytes together:
+ return _mm_shuffle_epi8(out, _mm_setr_epi8(
+ 2, 1, 0,
+ 6, 5, 4,
+ 10, 9, 8,
+ 14, 13, 12,
+ -1, -1, -1, -1));
+ // 00000000 00000000 00000000 00000000
+ // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+ // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+ // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+ }
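
For reference, the byte packing that the two multiply-add steps above implement is equivalent to the following scalar routine (an illustrative sketch, not part of the package): four decoded 6-bit values are merged into three output bytes.

#include <stdint.h>

// Scalar equivalent of dec_reshuffle for one group of four 6-bit values:
static void dec_pack_scalar (const uint8_t in[4], uint8_t out[3])
{
	out[0] = (uint8_t) ((in[0] << 2) | (in[1] >> 4));
	out[1] = (uint8_t) ((in[1] << 4) | (in[2] >> 2));
	out[2] = (uint8_t) ((in[2] << 6) |  in[3]);
}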
data/vendor/libbase64/lib/arch/ssse3/enc_loop.c
@@ -0,0 +1,67 @@
+ static inline void
+ enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
+ {
+ // Load input:
+ __m128i str = _mm_loadu_si128((__m128i *) *s);
+
+ // Reshuffle:
+ str = enc_reshuffle(str);
+
+ // Translate reshuffled bytes to the Base64 alphabet:
+ str = enc_translate(str);
+
+ // Store:
+ _mm_storeu_si128((__m128i *) *o, str);
+
+ *s += 12;
+ *o += 16;
+ }
+
+ static inline void
+ enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+ if (*slen < 16) {
+ return;
+ }
+
+ // Process blocks of 12 bytes at a time. Because blocks are loaded 16
+ // bytes at a time, ensure that there will be at least 4 remaining
+ // bytes after the last round, so that the final read will not pass
+ // beyond the bounds of the input buffer:
+ size_t rounds = (*slen - 4) / 12;
+
+ *slen -= rounds * 12; // 12 bytes consumed per round
+ *olen += rounds * 16; // 16 bytes produced per round
+
+ do {
+ if (rounds >= 8) {
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ rounds -= 8;
+ continue;
+ }
+ if (rounds >= 4) {
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ rounds -= 4;
+ continue;
+ }
+ if (rounds >= 2) {
+ enc_loop_ssse3_inner(s, o);
+ enc_loop_ssse3_inner(s, o);
+ rounds -= 2;
+ continue;
+ }
+ enc_loop_ssse3_inner(s, o);
+ break;
+
+ } while (rounds > 0);
+ }
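
The buffer bound described in the comment above can be made concrete with a small check (illustrative only, not part of the package): with rounds = (slen - 4) / 12 and slen >= 16, the final 16-byte load ends at offset (rounds - 1) * 12 + 16, which never runs past slen.

#include <assert.h>
#include <stddef.h>

int main (void)
{
	for (size_t slen = 16; slen < 4096; slen++) {
		size_t rounds = (slen - 4) / 12;

		// At least one round, and the last load stays inside the input:
		assert(rounds >= 1);
		assert((rounds - 1) * 12 + 16 <= slen);
	}
	return 0;
}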
data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c
@@ -0,0 +1,48 @@
+ static inline __m128i
+ enc_reshuffle (__m128i in)
+ {
+ // Input, bytes MSB to LSB:
+ // 0 0 0 0 l k j i h g f e d c b a
+
+ in = _mm_shuffle_epi8(in, _mm_set_epi8(
+ 10, 11, 9, 10,
+ 7, 8, 6, 7,
+ 4, 5, 3, 4,
+ 1, 2, 0, 1));
+ // in, bytes MSB to LSB:
+ // k l j k
+ // h i g h
+ // e f d e
+ // b c a b
+
+ const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00));
+ // bits, upper case are most significant bits, lower case are least significant bits
+ // 0000kkkk LL000000 JJJJJJ00 00000000
+ // 0000hhhh II000000 GGGGGG00 00000000
+ // 0000eeee FF000000 DDDDDD00 00000000
+ // 0000bbbb CC000000 AAAAAA00 00000000
+
+ const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040));
+ // 00000000 00kkkkLL 00000000 00JJJJJJ
+ // 00000000 00hhhhII 00000000 00GGGGGG
+ // 00000000 00eeeeFF 00000000 00DDDDDD
+ // 00000000 00bbbbCC 00000000 00AAAAAA
+
+ const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003F03F0));
+ // 00000000 00llllll 000000jj KKKK0000
+ // 00000000 00iiiiii 000000gg HHHH0000
+ // 00000000 00ffffff 000000dd EEEE0000
+ // 00000000 00cccccc 000000aa BBBB0000
+
+ const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010));
+ // 00llllll 00000000 00jjKKKK 00000000
+ // 00iiiiii 00000000 00ggHHHH 00000000
+ // 00ffffff 00000000 00ddEEEE 00000000
+ // 00cccccc 00000000 00aaBBBB 00000000
+
+ return _mm_or_si128(t1, t3);
+ // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+ // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+ // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+ // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+ }
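
Per group of three input bytes, the shuffle-and-mask sequence above computes the same four 6-bit values as this scalar sketch (illustrative only, not part of the package):

#include <stdint.h>

// Scalar equivalent of enc_reshuffle for one group of three input bytes:
static void enc_split_scalar (const uint8_t in[3], uint8_t out[4])
{
	out[0] = in[0] >> 2;
	out[1] = (uint8_t) (((in[0] & 0x03) << 4) | (in[1] >> 4));
	out[2] = (uint8_t) (((in[1] & 0x0F) << 2) | (in[2] >> 6));
	out[3] = in[2] & 0x3F;
}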
data/vendor/libbase64/lib/arch/ssse3/enc_translate.c
@@ -0,0 +1,33 @@
+ static inline __m128i
+ enc_translate (const __m128i in)
+ {
+ // A lookup table containing the absolute offsets for all ranges:
+ const __m128i lut = _mm_setr_epi8(
+ 65, 71, -4, -4,
+ -4, -4, -4, -4,
+ -4, -4, -4, -4,
+ -19, -16, 0, 0
+ );
+
+ // Translate values 0..63 to the Base64 alphabet. There are five sets:
+ // # From To Abs Index Characters
+ // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ
+ // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz
+ // 2 [52..61] [48..57] -4 [2..11] 0123456789
+ // 3 [62] [43] -19 12 +
+ // 4 [63] [47] -16 13 /
+
+ // Create LUT indices from the input. The index for range #0 is right,
+ // others are 1 less than expected:
+ __m128i indices = _mm_subs_epu8(in, _mm_set1_epi8(51));
+
+ // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
+ __m128i mask = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));
+
+ // Subtract -1, so add 1 to indices for range #[1..4]. All indices are
+ // now correct:
+ indices = _mm_sub_epi8(indices, mask);
+
+ // Add offsets to input values:
+ return _mm_add_epi8(in, _mm_shuffle_epi8(lut, indices));
+ }
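
The five-range table above corresponds to this scalar translation (illustrative only, not part of the package):

#include <stdint.h>

// Scalar equivalent of enc_translate for a single 6-bit value v in 0..63:
static uint8_t enc_translate_scalar (uint8_t v)
{
	if (v < 26)  return (uint8_t) (v + 65);  // 'A'..'Z'
	if (v < 52)  return (uint8_t) (v + 71);  // 'a'..'z'
	if (v < 62)  return (uint8_t) (v - 4);   // '0'..'9'
	if (v == 62) return (uint8_t) (v - 19);  // '+'
	return (uint8_t) (v - 16);               // '/'
}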
data/vendor/libbase64/lib/codec_choose.c
@@ -0,0 +1,281 @@
+ #include <stdbool.h>
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <stdint.h>
+
+ #include "../include/libbase64.h"
+ #include "codecs.h"
+ #include "config.h"
+ #include "env.h"
+
+ #if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
+ #define BASE64_X86
+ #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2)
+ #define BASE64_X86_SIMD
+ #endif
+ #endif
+
+ #ifdef BASE64_X86
+ #ifdef _MSC_VER
+ #include <intrin.h>
+ #define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+ { \
+ int info[4]; \
+ __cpuidex(info, __level, __count); \
+ __eax = info[0]; \
+ __ebx = info[1]; \
+ __ecx = info[2]; \
+ __edx = info[3]; \
+ }
+ #define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+ __cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
+ #else
+ #include <cpuid.h>
+ #if HAVE_AVX2 || HAVE_AVX
+ #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
+ static inline uint64_t _xgetbv (uint32_t index)
+ {
+ uint32_t eax, edx;
+ __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+ return ((uint64_t)edx << 32) | eax;
+ }
+ #else
+ #error "Platform not supported"
+ #endif
+ #endif
+ #endif
+
+ #ifndef bit_AVX2
+ #define bit_AVX2 (1 << 5)
+ #endif
+ #ifndef bit_SSSE3
+ #define bit_SSSE3 (1 << 9)
+ #endif
+ #ifndef bit_SSE41
+ #define bit_SSE41 (1 << 19)
+ #endif
+ #ifndef bit_SSE42
+ #define bit_SSE42 (1 << 20)
+ #endif
+ #ifndef bit_AVX
+ #define bit_AVX (1 << 28)
+ #endif
+
+ #define bit_XSAVE_XRSTORE (1 << 27)
+
+ #ifndef _XCR_XFEATURE_ENABLED_MASK
+ #define _XCR_XFEATURE_ENABLED_MASK 0
+ #endif
+
+ #define _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS 0x6
+ #endif
+
+ // Function declarations:
+ #define BASE64_CODEC_FUNCS(arch) \
+ BASE64_ENC_FUNCTION(arch); \
+ BASE64_DEC_FUNCTION(arch); \
+
+ BASE64_CODEC_FUNCS(avx2)
+ BASE64_CODEC_FUNCS(neon32)
+ BASE64_CODEC_FUNCS(neon64)
+ BASE64_CODEC_FUNCS(plain)
+ BASE64_CODEC_FUNCS(ssse3)
+ BASE64_CODEC_FUNCS(sse41)
+ BASE64_CODEC_FUNCS(sse42)
+ BASE64_CODEC_FUNCS(avx)
+
+ static bool
+ codec_choose_forced (struct codec *codec, int flags)
+ {
+ // If the user wants to use a certain codec,
+ // always allow it, even if the codec is a no-op.
+ // For testing purposes.
+
+ if (!(flags & 0xFF)) {
+ return false;
+ }
+ if (flags & BASE64_FORCE_AVX2) {
+ codec->enc = base64_stream_encode_avx2;
+ codec->dec = base64_stream_decode_avx2;
+ return true;
+ }
+ if (flags & BASE64_FORCE_NEON32) {
+ codec->enc = base64_stream_encode_neon32;
+ codec->dec = base64_stream_decode_neon32;
+ return true;
+ }
+ if (flags & BASE64_FORCE_NEON64) {
+ codec->enc = base64_stream_encode_neon64;
+ codec->dec = base64_stream_decode_neon64;
+ return true;
+ }
+ if (flags & BASE64_FORCE_PLAIN) {
+ codec->enc = base64_stream_encode_plain;
+ codec->dec = base64_stream_decode_plain;
+ return true;
+ }
+ if (flags & BASE64_FORCE_SSSE3) {
+ codec->enc = base64_stream_encode_ssse3;
+ codec->dec = base64_stream_decode_ssse3;
+ return true;
+ }
+ if (flags & BASE64_FORCE_SSE41) {
+ codec->enc = base64_stream_encode_sse41;
+ codec->dec = base64_stream_decode_sse41;
+ return true;
+ }
+ if (flags & BASE64_FORCE_SSE42) {
+ codec->enc = base64_stream_encode_sse42;
+ codec->dec = base64_stream_decode_sse42;
+ return true;
+ }
+ if (flags & BASE64_FORCE_AVX) {
+ codec->enc = base64_stream_encode_avx;
+ codec->dec = base64_stream_decode_avx;
+ return true;
+ }
+ return false;
+ }
+
+ static bool
+ codec_choose_arm (struct codec *codec)
+ {
+ #if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && ((defined(__aarch64__) && HAVE_NEON64) || HAVE_NEON32)
+
+ // Unfortunately there is no portable way to check for NEON
+ // support at runtime from userland in the same way that x86
+ // has cpuid, so just stick to the compile-time configuration:
+
+ #if defined(__aarch64__) && HAVE_NEON64
+ codec->enc = base64_stream_encode_neon64;
+ codec->dec = base64_stream_decode_neon64;
+ #else
+ codec->enc = base64_stream_encode_neon32;
+ codec->dec = base64_stream_decode_neon32;
+ #endif
+
+ return true;
+
+ #else
+ (void)codec;
+ return false;
+ #endif
+ }
+
+ static bool
+ codec_choose_x86 (struct codec *codec)
+ {
+ #ifdef BASE64_X86_SIMD
+
+ unsigned int eax, ebx = 0, ecx = 0, edx;
+ unsigned int max_level;
+
+ #ifdef _MSC_VER
+ int info[4];
+ __cpuidex(info, 0, 0);
+ max_level = info[0];
+ #else
+ max_level = __get_cpuid_max(0, NULL);
+ #endif
+
+ #if HAVE_AVX2 || HAVE_AVX
+ // Check for AVX/AVX2 support:
+ // Checking for AVX requires 3 things:
+ // 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
+ // (allowing saving YMM registers on context switch)
+ // 2) CPUID indicates support for AVX
+ // 3) XGETBV indicates the AVX registers will be saved and restored on
+ // context switch
+ //
+ // Note that XGETBV is only available on 686 or later CPUs, so the
+ // instruction needs to be conditionally run.
+ if (max_level >= 1) {
+ __cpuid_count(1, 0, eax, ebx, ecx, edx);
+ if (ecx & bit_XSAVE_XRSTORE) {
+ uint64_t xcr_mask;
+ xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+ if (xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) {
+ #if HAVE_AVX2
+ if (max_level >= 7) {
+ __cpuid_count(7, 0, eax, ebx, ecx, edx);
+ if (ebx & bit_AVX2) {
+ codec->enc = base64_stream_encode_avx2;
+ codec->dec = base64_stream_decode_avx2;
+ return true;
+ }
+ }
+ #endif
+ #if HAVE_AVX
+ __cpuid_count(1, 0, eax, ebx, ecx, edx);
+ if (ecx & bit_AVX) {
+ codec->enc = base64_stream_encode_avx;
+ codec->dec = base64_stream_decode_avx;
+ return true;
+ }
+ #endif
+ }
+ }
+ }
+ #endif
+
+ #if HAVE_SSE42
+ // Check for SSE42 support:
+ if (max_level >= 1) {
+ __cpuid(1, eax, ebx, ecx, edx);
+ if (ecx & bit_SSE42) {
+ codec->enc = base64_stream_encode_sse42;
+ codec->dec = base64_stream_decode_sse42;
+ return true;
+ }
+ }
+ #endif
+
+ #if HAVE_SSE41
+ // Check for SSE41 support:
+ if (max_level >= 1) {
+ __cpuid(1, eax, ebx, ecx, edx);
+ if (ecx & bit_SSE41) {
+ codec->enc = base64_stream_encode_sse41;
+ codec->dec = base64_stream_decode_sse41;
+ return true;
+ }
+ }
+ #endif
+
+ #if HAVE_SSSE3
+ // Check for SSSE3 support:
+ if (max_level >= 1) {
+ __cpuid(1, eax, ebx, ecx, edx);
+ if (ecx & bit_SSSE3) {
+ codec->enc = base64_stream_encode_ssse3;
+ codec->dec = base64_stream_decode_ssse3;
+ return true;
+ }
+ }
+ #endif
+
+ #else
+ (void)codec;
+ #endif
+
+ return false;
+ }
+
+ void
+ codec_choose (struct codec *codec, int flags)
+ {
+ // User forced a codec:
+ if (codec_choose_forced(codec, flags)) {
+ return;
+ }
+
+ // Runtime feature detection:
+ if (codec_choose_arm(codec)) {
+ return;
+ }
+ if (codec_choose_x86(codec)) {
+ return;
+ }
+ codec->enc = base64_stream_encode_plain;
+ codec->dec = base64_stream_decode_plain;
+ }
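
The BASE64_FORCE_* flags handled by codec_choose_forced() are the same flags an application passes to the library's entry points. A minimal usage sketch (assuming the one-shot base64_encode() declared in include/libbase64.h, which passes its flags argument through to codec_choose(); illustrative only, not part of the package):

#include <stdio.h>
#include "libbase64.h"

int main (void)
{
	char out[64];
	size_t outlen;

	// Bypass runtime detection and force the portable codec:
	base64_encode("hello", 5, out, &outlen, BASE64_FORCE_PLAIN);
	printf("%.*s\n", (int) outlen, out);  // prints "aGVsbG8="
	return 0;
}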
data/vendor/libbase64/lib/codecs.h
@@ -0,0 +1,65 @@
+ #include <stdint.h>
+ #include <stddef.h>
+
+ #include "../include/libbase64.h"
+ #include "config.h"
+
+ // Function parameters for encoding functions:
+ #define BASE64_ENC_PARAMS \
+ ( struct base64_state *state \
+ , const char *src \
+ , size_t srclen \
+ , char *out \
+ , size_t *outlen \
+ )
+
+ // Function parameters for decoding functions:
+ #define BASE64_DEC_PARAMS \
+ ( struct base64_state *state \
+ , const char *src \
+ , size_t srclen \
+ , char *out \
+ , size_t *outlen \
+ )
+
+ // Function signature for encoding functions:
+ #define BASE64_ENC_FUNCTION(arch) \
+ void \
+ base64_stream_encode_ ## arch \
+ BASE64_ENC_PARAMS
+
+ // Function signature for decoding functions:
+ #define BASE64_DEC_FUNCTION(arch) \
+ int \
+ base64_stream_decode_ ## arch \
+ BASE64_DEC_PARAMS
+
+ // Cast away unused variable, silence compiler:
+ #define UNUSED(x) ((void)(x))
+
+ // Stub function when encoder arch unsupported:
+ #define BASE64_ENC_STUB \
+ UNUSED(state); \
+ UNUSED(src); \
+ UNUSED(srclen); \
+ UNUSED(out); \
+ \
+ *outlen = 0;
+
+ // Stub function when decoder arch unsupported:
+ #define BASE64_DEC_STUB \
+ UNUSED(state); \
+ UNUSED(src); \
+ UNUSED(srclen); \
+ UNUSED(out); \
+ UNUSED(outlen); \
+ \
+ return -1;
+
+ struct codec
+ {
+ void (* enc) BASE64_ENC_PARAMS;
+ int (* dec) BASE64_DEC_PARAMS;
+ };
+
+ extern void codec_choose (struct codec *, int flags);
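
For context, the per-architecture codec.c files under lib/arch/ use these macros to define their entry points, and fall back to the stub macros when an architecture is compiled out. An illustrative expansion with a hypothetical arch name "foo" (assumes this header is in scope; a sketch, not code from the package):

// Stub encoder: discards its arguments and reports zero output.
BASE64_ENC_FUNCTION(foo)
{
	BASE64_ENC_STUB
}

// Stub decoder: discards its arguments and reports failure (-1).
BASE64_DEC_FUNCTION(foo)
{
	BASE64_DEC_STUB
}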