ob64 0.1.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +20 -4
  3. data/.gitignore +2 -0
  4. data/CHANGELOG.md +18 -1
  5. data/{LICENSE.txt → LICENSE} +1 -1
  6. data/README.md +34 -2
  7. data/benchmark.rb +42 -3
  8. data/ext/ob64/ob64_ext.c +5 -3
  9. data/lib/ob64/core_ext.rb +2 -0
  10. data/lib/ob64/version.rb +1 -1
  11. data/lib/ob64.rb +52 -0
  12. data/ob64.gemspec +12 -6
  13. data/vendor/libbase64/.gitignore +12 -0
  14. data/vendor/libbase64/.travis.yml +71 -0
  15. data/vendor/libbase64/CMakeLists.txt +264 -0
  16. data/vendor/libbase64/LICENSE +28 -0
  17. data/vendor/libbase64/Makefile +93 -0
  18. data/vendor/libbase64/README.md +474 -0
  19. data/vendor/libbase64/base64-benchmarks.png +0 -0
  20. data/vendor/libbase64/bin/base64.c +132 -0
  21. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  22. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  23. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  24. data/vendor/libbase64/cmake/config.h.in +25 -0
  25. data/vendor/libbase64/cmake/test-arch.c +35 -0
  26. data/vendor/libbase64/include/libbase64.h +145 -0
  27. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  28. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  29. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  30. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  31. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  32. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  33. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  34. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  35. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  36. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  37. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  38. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  39. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  40. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  41. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  42. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  43. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  44. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  45. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  46. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  47. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  48. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  49. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  50. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  51. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  52. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  53. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  54. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  55. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  56. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  57. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  58. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  59. data/vendor/libbase64/lib/codec_choose.c +281 -0
  60. data/vendor/libbase64/lib/codecs.h +65 -0
  61. data/vendor/libbase64/lib/env.h +67 -0
  62. data/vendor/libbase64/lib/exports.txt +7 -0
  63. data/vendor/libbase64/lib/lib.c +164 -0
  64. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  65. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  66. data/vendor/libbase64/lib/tables/Makefile +17 -0
  67. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  68. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  69. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  70. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  71. data/vendor/libbase64/lib/tables/tables.c +40 -0
  72. data/vendor/libbase64/lib/tables/tables.h +23 -0
  73. metadata +67 -6
  74. data/.byebug_history +0 -72
  75. data/.envrc +0 -1
@@ -0,0 +1,474 @@
1
+ # Fast Base64 stream encoder/decoder
2
+
3
+ [![Build Status](https://travis-ci.org/aklomp/base64.png?branch=master)](https://travis-ci.org/aklomp/base64)
4
+
5
+ This is an implementation of a base64 stream encoding/decoding library in C99
6
+ with SIMD (AVX2, NEON, AArch64/NEON, SSSE3, SSE4.1, SSE4.2, AVX) and
7
+ [OpenMP](http://www.openmp.org) acceleration. It also contains wrapper functions
8
+ to encode/decode simple length-delimited strings. This library aims to be:
9
+
10
+ - FAST;
11
+ - easy to use;
12
+ - elegant.
13
+
14
+ On x86, the library does runtime feature detection. The first time it's called,
15
+ the library will determine the appropriate encoding/decoding routines for the
16
+ machine. It then remembers them for the lifetime of the program. If your
17
+ processor supports AVX2, SSSE3, SSE4.1, SSE4.2 or AVX instructions, the library
18
+ will pick an optimized codec that lets it encode/decode 12 or 24 bytes at a
19
+ time, which gives a speedup of four or more times compared to the "plain"
20
+ bytewise codec.
21
+
22
+ NEON support is hardcoded to on or off at compile time, because portable
23
+ runtime feature detection is unavailable on ARM.
24
+
25
+ Even if your processor does not support SIMD instructions, this is a very fast
26
+ library. The fallback routine can process 32 or 64 bits of input in one round,
27
+ depending on your processor's word width, which still makes it significantly
28
+ faster than naive bytewise implementations. On some 64-bit machines, the 64-bit
29
+ routines even outperform the SSSE3 ones.
30
+
31
+ To the author's knowledge, at the time of original release, this was the only
32
+ Base64 library to offer SIMD acceleration. The author wrote
33
+ [an article](http://www.alfredklomp.com/programming/sse-base64) explaining one
34
+ possible SIMD approach to encoding/decoding Base64. The article can help figure
35
+ out what the code is doing, and why.
36
+
37
+ Notable features:
38
+
39
+ - Really fast on x86 and ARM systems by using SIMD vector processing;
40
+ - Can use [OpenMP](http://www.openmp.org) for even more parallel speedups;
41
+ - Really fast on other 32 or 64-bit platforms through optimized routines;
42
+ - Reads/writes blocks of streaming data;
43
+ - Does not dynamically allocate memory;
44
+ - Valid C99 that compiles with pedantic options on;
45
+ - Re-entrant and threadsafe;
46
+ - Unit tested;
47
+ - Uses Duff's Device.
48
+
49
+ ## Acknowledgements
50
+
51
+ The original AVX2, NEON and Aarch64/NEON codecs were generously contributed by
52
+ [Inkymail](https://github.com/inkymail/base64), who, in their fork, also
53
+ implemented some additional features. Their work is slowly being backported
54
+ into this project.
55
+
56
+ The SSSE3 and AVX2 codecs were substantially improved by using some very clever
57
+ optimizations described by Wojciech Muła in a
58
+ [series](http://0x80.pl/notesen/2016-01-12-sse-base64-encoding.html) of
59
+ [articles](http://0x80.pl/notesen/2016-01-17-sse-base64-decoding.html).
60
+ His own code is [here](https://github.com/WojciechMula/toys/tree/master/base64).
61
+
62
+ The OpenMP implementation was added by Ferry Toth (@htot) from [Exalon Delft](http://www.exalondelft.nl).
63
+
64
+ ## Building
65
+
66
+ The `lib` directory contains the code for the actual library.
67
+ Typing `make` in the toplevel directory will build `lib/libbase64.o` and `bin/base64`.
68
+ The first is a single, self-contained object file that you can link into your own project.
69
+ The second is a standalone test binary that works similarly to the `base64` system utility.
70
+
71
+ The matching header file needed to use this library is in `include/libbase64.h`.
72
+
73
+ To compile just the "plain" library without SIMD codecs, type:
74
+
75
+ ```sh
76
+ make lib/libbase64.o
77
+ ```
78
+
79
+ Optional SIMD codecs can be included by specifying the `AVX2_CFLAGS`, `NEON32_CFLAGS`, `NEON64_CFLAGS`,
80
+ `SSSE3_CFLAGS`, `SSE41_CFLAGS`, `SSE42_CFLAGS` and/or `AVX_CFLAGS` environment variables.
81
+ A typical build invocation on x86 looks like this:
82
+
83
+ ```sh
84
+ AVX2_CFLAGS=-mavx2 SSSE3_CFLAGS=-mssse3 SSE41_CFLAGS=-msse4.1 SSE42_CFLAGS=-msse4.2 AVX_CFLAGS=-mavx make lib/libbase64.o
85
+ ```
86
+
87
+ ### AVX2
88
+
89
+ To build and include the AVX2 codec, set the `AVX2_CFLAGS` environment variable to a value that will turn on AVX2 support in your compiler, typically `-mavx2`.
90
+ Example:
91
+
92
+ ```sh
93
+ AVX2_CFLAGS=-mavx2 make
94
+ ```
95
+
96
+ The codec will only be used if runtime feature detection shows that the target machine supports AVX2.
97
+
98
+ ### SSSE3
99
+
100
+ To build and include the SSSE3 codec, set the `SSSE3_CFLAGS` environment variable to a value that will turn on SSSE3 support in your compiler, typically `-mssse3`.
101
+ Example:
102
+
103
+ ```sh
104
+ SSSE3_CFLAGS=-mssse3 make
105
+ ```
106
+
107
+ The codec will only be used if runtime feature detection shows that the target machine supports SSSE3.
108
+
109
+ ### NEON
110
+
111
+ This library includes two NEON codecs: one for regular 32-bit ARM and one for the 64-bit AArch64 with NEON, which has double the amount of SIMD registers and can do full 64-byte table lookups.
112
+ These codecs encode in 48-byte chunks and decode in massive 64-byte chunks, so they had to be augmented with an uint32/64 codec to stay fast on smaller inputs!
113
+
114
+ Use LLVM/Clang for compiling the NEON codecs.
115
+ The code generation of at least GCC 4.6 (the version shipped with Raspbian and used for testing) contains a bug when compiling `vst4q_u8()`, and the generated assembly code is of low quality.
116
+ NEON intrinsics are a known weak area of GCC.
117
+ Clang does a better job.
118
+
119
+ NEON support can unfortunately not be portably detected at runtime from userland (the `mrc` instruction is privileged), so the default value for using the NEON codec is determined at compile-time.
120
+ But you can do your own runtime detection.
121
+ You can include the NEON codec and make it the default, then do a runtime check if the CPU has NEON support, and if not, force a downgrade to non-NEON with `BASE64_FORCE_PLAIN`.
122
+
123
+ These are your options:
124
+
125
+ 1. Don't include NEON support;
126
+ 2. build NEON support and make it the default, but build all other code without NEON flags so that you can override the default at runtime with `BASE64_FORCE_PLAIN`;
127
+ 3. build everything with NEON support and make it the default;
128
+ 4. build everything with NEON support, but don't make it the default (which makes no sense).
129
+
130
+ For option 1, simply don't specify any NEON-specific compiler flags at all, like so:
131
+
132
+ ```sh
133
+ CC=clang CFLAGS="-march=armv6" make
134
+ ```
135
+
136
+ For option 2, keep your `CFLAGS` plain, but set the `NEON32_CFLAGS` environment variable to a value that will build NEON support.
137
+ The line below, for instance, will build all the code at ARMv6 level, except for the NEON codec, which is built at ARMv7.
138
+ It will also make the NEON codec the default.
139
+ For ARMv6 platforms, override that default at runtime with the `BASE64_FORCE_PLAIN` flag.
140
+ No ARMv7/NEON code will then be touched.
141
+
142
+ ```sh
143
+ CC=clang CFLAGS="-march=armv6" NEON32_CFLAGS="-march=armv7 -mfpu=neon" make
144
+ ```
145
+
146
+ For option 3, put everything in your `CFLAGS` and use a stub, but non-empty, `NEON32_CFLAGS`.
147
+ This example works for the Raspberry Pi 2B V1.1, which has NEON support:
148
+
149
+ ```sh
150
+ CC=clang CFLAGS="-march=armv7 -mtune=cortex-a7" NEON32_CFLAGS="-mfpu=neon" make
151
+ ```
152
+
153
+ To build and include the NEON64 codec, use `CFLAGS` as usual to define the platform and set `NEON64_CFLAGS` to a nonempty stub.
154
+ (The AArch64 target has mandatory NEON64 support.)
155
+ Example:
156
+
157
+ ```sh
158
+ CC=clang CFLAGS="--target=aarch64-linux-gnu -march=armv8-a" NEON64_CFLAGS=" " make
159
+ ```
160
+
161
+ ### OpenMP
162
+
163
+ To enable OpenMP on GCC you need to build with `-fopenmp`. This can be done by setting the `OPENMP` environment variable to `1`.
164
+
165
+ Example:
166
+
167
+ ```sh
168
+ OPENMP=1 make
169
+ ```
170
+
171
+ This will let the compiler define `_OPENMP`, which in turn will include the OpenMP optimized `lib_openmp.c` into `lib.c`.
172
+
173
+ By default the number of parallel threads will be equal to the number of cores of the processor.
174
+ On a quad core with hyperthreading eight cores will be detected, but hyperthreading will not increase the performance.
175
+
176
+ To get verbose information about OpenMP start the program with `OMP_DISPLAY_ENV=VERBOSE`, for instance
177
+
178
+ ```sh
179
+ OMP_DISPLAY_ENV=VERBOSE test/benchmark
180
+ ```
181
+
182
+ To put a limit on the number of threads, start the program with `OMP_THREAD_LIMIT=n`, for instance
183
+
184
+ ```sh
185
+ OMP_THREAD_LIMIT=2 test/benchmark
186
+ ```
187
+
188
+ An example of running a benchmark with OpenMP, SSSE3 and AVX2 enabled:
189
+
190
+ ```sh
191
+ make clean && OPENMP=1 SSSE3_CFLAGS=-mssse3 AVX2_CFLAGS=-mavx2 make && OPENMP=1 make -C test
192
+ ```
193
+
194
+ ## API reference
195
+
196
+ Strings are represented as a pointer and a length; they are not
197
+ zero-terminated. This was a conscious design decision. In the decoding step,
198
+ relying on zero-termination would make no sense since the output could contain
199
+ legitimate zero bytes. In the encoding step, returning the length saves the
200
+ overhead of calling `strlen()` on the output. If you insist on the trailing
201
+ zero, you can easily add it yourself at the given offset.
202
+
203
+ ### Flags
204
+
205
+ Some API calls take a `flags` argument.
206
+ That argument can be used to force the use of a specific codec, even if that codec is a no-op in the current build.
207
+ Mainly there for testing purposes, this is also useful on ARM where the only way to do runtime NEON detection is to ask the OS if it's available.
208
+ The following constants can be used:
209
+
210
+ - `BASE64_FORCE_AVX2`
211
+ - `BASE64_FORCE_NEON32`
212
+ - `BASE64_FORCE_NEON64`
213
+ - `BASE64_FORCE_PLAIN`
214
+ - `BASE64_FORCE_SSSE3`
215
+ - `BASE64_FORCE_SSE41`
216
+ - `BASE64_FORCE_SSE42`
217
+ - `BASE64_FORCE_AVX`
218
+
219
+ Set `flags` to `0` for the default behavior, which is runtime feature detection on x86, a compile-time fixed codec on ARM, and the plain codec on other platforms.
220
+
221
+ ### Encoding
222
+
223
+ #### base64_encode
224
+
225
+ ```c
226
+ void base64_encode
227
+ ( const char *src
228
+ , size_t srclen
229
+ , char *out
230
+ , size_t *outlen
231
+ , int flags
232
+ ) ;
233
+ ```
234
+
235
+ Wrapper function to encode a plain string of given length.
236
+ Output is written to `out` without trailing zero.
237
+ Output length in bytes is written to `outlen`.
238
+ The buffer in `out` has been allocated by the caller and is at least 4/3 the size of the input.
239
+
240
+ #### base64_stream_encode_init
241
+
242
+ ```c
243
+ void base64_stream_encode_init
244
+ ( struct base64_state *state
245
+ , int flags
246
+ ) ;
247
+ ```
248
+
249
+ Call this before calling `base64_stream_encode()` to init the state.
250
+
251
+ #### base64_stream_encode
252
+
253
+ ```c
254
+ void base64_stream_encode
255
+ ( struct base64_state *state
256
+ , const char *src
257
+ , size_t srclen
258
+ , char *out
259
+ , size_t *outlen
260
+ ) ;
261
+ ```
262
+
263
+ Encodes the block of data of given length at `src`, into the buffer at `out`.
264
+ Caller is responsible for allocating a large enough out-buffer; it must be at least 4/3 the size of the in-buffer, but take some margin.
265
+ Places the number of new bytes written into `outlen` (which is set to zero when the function starts).
266
+ Does not zero-terminate or finalize the output.
267
+
268
+ #### base64_stream_encode_final
269
+
270
+ ```c
271
+ void base64_stream_encode_final
272
+ ( struct base64_state *state
273
+ , char *out
274
+ , size_t *outlen
275
+ ) ;
276
+ ```
277
+
278
+ Finalizes the output begun by previous calls to `base64_stream_encode()`.
279
+ Adds the required end-of-stream markers if appropriate.
280
+ `outlen` is modified and will contain the number of new bytes written at `out` (which will quite often be zero).
281
+
282
+ ### Decoding
283
+
284
+ #### base64_decode
285
+
286
+ ```c
287
+ int base64_decode
288
+ ( const char *src
289
+ , size_t srclen
290
+ , char *out
291
+ , size_t *outlen
292
+ , int flags
293
+ ) ;
294
+ ```
295
+
296
+ Wrapper function to decode a plain string of given length.
297
+ Output is written to `out` without trailing zero. Output length in bytes is written to `outlen`.
298
+ The buffer in `out` has been allocated by the caller and is at least 3/4 the size of the input.
299
+ Returns `1` for success, and `0` when a decode error has occurred due to invalid input.
300
+ Returns `-1` if the chosen codec is not included in the current build.
301
+
302
+ #### base64_stream_decode_init
303
+
304
+ ```c
305
+ void base64_stream_decode_init
306
+ ( struct base64_state *state
307
+ , int flags
308
+ ) ;
309
+ ```
310
+
311
+ Call this before calling `base64_stream_decode()` to init the state.
312
+
313
+ #### base64_stream_decode
314
+
315
+ ```c
316
+ int base64_stream_decode
317
+ ( struct base64_state *state
318
+ , const char *src
319
+ , size_t srclen
320
+ , char *out
321
+ , size_t *outlen
322
+ ) ;
323
+ ```
324
+
325
+ Decodes the block of data of given length at `src`, into the buffer at `out`.
326
+ Caller is responsible for allocating a large enough out-buffer; it must be at least 3/4 the size of the in-buffer, but take some margin.
327
+ Places the number of new bytes written into `outlen` (which is set to zero when the function starts).
328
+ Does not zero-terminate the output.
329
+ Returns 1 if all is well, and 0 if a decoding error was found, such as an invalid character.
330
+ Returns -1 if the chosen codec is not included in the current build.
331
+ Used by the test harness to check whether a codec is available for testing.
332
+
333
+ ## Examples
334
+
335
+ A simple example of encoding a static string to base64 and printing the output
336
+ to stdout:
337
+
338
+ ```c
339
+ #include <stdio.h> /* fwrite */
340
+ #include "libbase64.h"
341
+
342
+ int main ()
343
+ {
344
+ char src[] = "hello world";
345
+ char out[20];
346
+ size_t srclen = sizeof(src) - 1;
347
+ size_t outlen;
348
+
349
+ base64_encode(src, srclen, out, &outlen, 0);
350
+
351
+ fwrite(out, outlen, 1, stdout);
352
+
353
+ return 0;
354
+ }
355
+ ```
356
+
357
+ A simple example (no error checking, etc) of stream encoding standard input to
358
+ standard output:
359
+
360
+ ```c
361
+ #include <stdio.h>
362
+ #include "libbase64.h"
363
+
364
+ int main ()
365
+ {
366
+ size_t nread, nout;
367
+ char buf[12000], out[16000];
368
+ struct base64_state state;
369
+
370
+ // Initialize stream encoder:
371
+ base64_stream_encode_init(&state, 0);
372
+
373
+ // Read contents of stdin into buffer:
374
+ while ((nread = fread(buf, 1, sizeof(buf), stdin)) > 0) {
375
+
376
+ // Encode buffer:
377
+ base64_stream_encode(&state, buf, nread, out, &nout);
378
+
379
+ // If there's output, print it to stdout:
380
+ if (nout) {
381
+ fwrite(out, nout, 1, stdout);
382
+ }
383
+
384
+ // If the end of the input was reached, exit the loop:
385
+ if (feof(stdin)) {
386
+ break;
387
+ }
388
+ }
389
+
390
+ // Finalize encoding:
391
+ base64_stream_encode_final(&state, out, &nout);
392
+
393
+ // If the finalizing resulted in extra output bytes, print them:
394
+ if (nout) {
395
+ fwrite(out, nout, 1, stdout);
396
+ }
397
+
398
+ return 0;
399
+ }
400
+ ```
401
+
402
+ Also see `bin/base64.c` for a simple re-implementation of the `base64` utility.
403
+ A file or standard input is fed through the encoder/decoder, and the output is
404
+ written to standard output.
405
+
406
+ ## Tests
407
+
408
+ See `test/` for a small test suite. Testing is automated with [Travis
409
+ CI](https://travis-ci.org/aklomp/base64), which builds and tests the code
410
+ across various architectures.
411
+
412
+ ## Benchmarks
413
+
414
+ Benchmarks can be run with the built-in benchmark program as follows:
415
+
416
+ ```sh
417
+ make -C test benchmark <buildflags> && test/benchmark
418
+ ```
419
+
420
+ It will run an encoding and decoding benchmark for all of the compiled-in codecs.
421
+
422
+ The tables below contain some results on random machines. All numbers measured with a 10MB buffer in MB/sec, rounded to the nearest integer.
423
+
424
+ \*: Update needed
425
+
426
+ x86 processors
427
+
428
+ | Processor | Plain enc | Plain dec | SSSE3 enc | SSSE3 dec | AVX enc | AVX dec | AVX2 enc | AVX2 dec |
429
+ |-------------------------------------------|----------:|----------:|----------:|----------:|--------:|--------:|---------:|---------:|
430
+ | i7-4771 @ 3.5 GHz | 833\* | 1111\* | 3333\* | 4444\* | TBD | TBD | 4999\* | 6666\* |
431
+ | i7-4770 @ 3.4 GHz DDR1600 | 1790\* | 3038\* | 4899\* | 4043\* | 4796\* | 5709\* | 4681\* | 6386\* |
432
+ | i7-4770 @ 3.4 GHz DDR1600 OPENMP 1 thread | 1784\* | 3041\* | 4945\* | 4035\* | 4776\* | 5719\* | 4661\* | 6294\* |
433
+ | i7-4770 @ 3.4 GHz DDR1600 OPENMP 2 thread | 3401\* | 5729\* | 5489\* | 7444\* | 5003\* | 8624\* | 5105\* | 8558\* |
434
+ | i7-4770 @ 3.4 GHz DDR1600 OPENMP 4 thread | 4884\* | 7099\* | 4917\* | 7057\* | 4799\* | 7143\* | 4902\* | 7219\* |
435
+ | i7-4770 @ 3.4 GHz DDR1600 OPENMP 8 thread | 5212\* | 8849\* | 5284\* | 9099\* | 5289\* | 9220\* | 4849\* | 9200\* |
436
+ | i7-4870HQ @ 2.5 GHz | 1471\* | 3066\* | 6721\* | 6962\* | 7015\* | 8267\* | 8328\* | 11576\* |
437
+ | i5-4590S @ 3.0 GHz | 3356 | 3197 | 4363 | 6104 | 4243 | 6233 | 4160 | 6344 |
438
+ | Xeon X5570 @ 2.93 GHz | 2161 | 1508 | 3160 | 3915 | - | - | - | - |
439
+ | Pentium4 @ 3.4 GHz | 896 | 740 | - | - | - | - | - | - |
440
+ | Atom N270 | 243 | 266 | 508 | 387 | - | - | - | - |
441
+ | AMD E-450 | 645 | 564 | 625 | 634 | - | - | - | - |
442
+ | Intel Edison @ 500 MHz | 79\* | 92\* | 152\* | 172\* | - | - | - | - |
443
+ | Intel Edison @ 500 MHz OPENMP 2 thread | 158\* | 184\* | 300\* | 343\* | - | - | - | - |
444
+ | Intel Edison @ 500 MHz (x86-64) | 97\* | 146\* | 197\* | 207\* | - | - | - | - |
445
+ | Intel Edison @ 500 MHz (x86-64) 2 thread | 193\* | 288\* | 389\* | 410\* | - | - | - | - |
446
+
447
+ ARM processors
448
+
449
+ | Processor | Plain enc | Plain dec | NEON32 enc | NEON32 dec | NEON64 enc | NEON64 dec |
450
+ |-------------------------------------------|----------:|----------:|-----------:|-----------:|-----------:|-----------:|
451
+ | Raspberry PI B+ V1.2 | 46\* | 40\* | - | - | - | - |
452
+ | Raspberry PI 2 B V1.1 | 85 | 141 | 282 | 225 | - | - |
453
+ | Apple iPhone SE armv7 | 1056\* | 895\* | 2943\* | 2618\* | - | - |
454
+ | Apple iPhone SE arm64 | 1061\* | 1239\* | - | - | 4098\* | 3983\* |
455
+
456
+ PowerPC processors
457
+
458
+ | Processor | Plain enc | Plain dec |
459
+ |-------------------------------------------|----------:|----------:|
460
+ | PowerPC E6500 @ 1.8GHz | 270\* | 265\* |
461
+
462
+
463
+ Benchmarks on i7-4770 @ 3.4 GHz DDR1600 with varying buffer sizes:
464
+ ![Benchmarks](base64-benchmarks.png)
465
+
466
+ Note: optimal buffer size to take advantage of the cache is in the range of 100 kB to 1 MB, leading to 12x faster AVX encoding/decoding compared to Plain, or a throughput of 24/27GB/sec.
467
+ Also note the performance degradation when the buffer size is less than 10 kB due to thread creation overhead.
468
+ To prevent this from happening `lib_openmp.c` defines `OMP_THRESHOLD 20000`, requiring at least a 20000 byte buffer to enable multithreading.
469
+
470
+ ## License
471
+
472
+ This repository is licensed under the
473
+ [BSD 2-clause License](http://opensource.org/licenses/BSD-2-Clause). See the
474
+ LICENSE file.
@@ -0,0 +1,132 @@
1
+ #include <stddef.h> // size_t
2
+ #include <stdio.h> // fopen()
3
+ #include <string.h> // strlen()
4
+ #include <getopt.h>
5
+ #include "../include/libbase64.h"
6
+
7
// Size in bytes of the input chunk read per iteration. Parenthesized so the
// macro expands safely inside larger expressions (e.g. `x / BUFSIZE`).
#define BUFSIZE (1024 * 1024)

// Input chunk buffer, filled by fread().
static char buf[BUFSIZE];

// Output buffer for one encoded/decoded chunk.
static char out[(BUFSIZE * 5) / 3];	// Technically 4/3 of input, but take some margin

// Number of bytes obtained by the most recent fread().
size_t nread;

// Number of bytes produced by the most recent codec call.
size_t nout;
13
+
14
+ static int
15
+ enc (FILE *fp)
16
+ {
17
+ int ret = 1;
18
+ struct base64_state state;
19
+ size_t acc = 0;
20
+
21
+ base64_stream_encode_init(&state, 0);
22
+
23
+ while ((nread = fread(buf, 1, BUFSIZE, fp)) > 0) {
24
+ base64_stream_encode(&state, buf, nread, out + acc, &nout);
25
+ if (nout) {
26
+ fwrite(out + acc, nout, 1, stdout);
27
+ acc += nout;
28
+ }
29
+ if (feof(fp)) {
30
+ break;
31
+ }
32
+ }
33
+ if (ferror(fp)) {
34
+ fprintf(stderr, "read error\n");
35
+ ret = 0;
36
+ goto out;
37
+ }
38
+ base64_stream_encode_final(&state, out + acc, &nout);
39
+
40
+ if (nout) {
41
+ fwrite(out + acc, nout, 1, stdout);
42
+ }
43
+ out: fclose(fp);
44
+ fclose(stdout);
45
+ return ret;
46
+ }
47
+
48
+ static int
49
+ dec (FILE *fp)
50
+ {
51
+ int ret = 1;
52
+ struct base64_state state;
53
+ size_t acc = 0;
54
+
55
+ base64_stream_decode_init(&state, 0);
56
+
57
+ while ((nread = fread(buf, 1, BUFSIZE, fp)) > 0) {
58
+ if (!base64_stream_decode(&state, buf, nread, out + acc, &nout)) {
59
+ fprintf(stderr, "decoding error\n");
60
+ ret = 0;
61
+ goto out;
62
+ }
63
+ if (nout) {
64
+ fwrite(out + acc, nout, 1, stdout);
65
+ acc += nout;
66
+ }
67
+ if (feof(fp)) {
68
+ break;
69
+ }
70
+ }
71
+ if (ferror(fp)) {
72
+ fprintf(stderr, "read error\n");
73
+ ret = 0;
74
+ }
75
+ out: fclose(fp);
76
+ fclose(stdout);
77
+ return ret;
78
+ }
79
+
80
// Entry point: encode (default) or decode (-d / --decode) a file or stdin.
//
// With no positional argument, or with "-", input is read from stdin. With
// exactly one positional argument, that file is opened in binary mode. Any
// other invocation prints a usage message.
//
// Returns 0 on success and 1 on failure (shell convention).
int
main (int argc, char **argv)
{
	char *file;
	FILE *fp;
	int decode = 0;

	// Parse options:
	for (;;)
	{
		int c;
		int opt_index = 0;
		static struct option opt_long[] = {
			{ "decode", 0, 0, 'd' },
			{ 0, 0, 0, 0 }
		};
		if ((c = getopt_long(argc, argv, "d", opt_long, &opt_index)) == -1) {
			break;
		}
		switch (c)
		{
		case 'd':
			decode = 1;
			break;

		// Unknown option: do not silently carry on with a command line
		// the user did not intend.
		default:
			fprintf(stderr, "Usage: %s <file>\n", argv[0]);
			return 1;
		}
	}

	// No options left on command line? Read from stdin:
	if (optind >= argc) {
		fp = stdin;
	}

	// One option left on command line? Treat it as a file:
	else if (optind + 1 == argc) {
		file = argv[optind];
		if (strcmp(file, "-") == 0) {
			fp = stdin;
		}
		else if ((fp = fopen(file, "rb")) == NULL) {
			// Diagnostics go to stderr so they never mix with the
			// encoded/decoded data on stdout.
			fprintf(stderr, "cannot open %s\n", file);
			return 1;
		}
	}

	// More than one option left on command line? Syntax error:
	else {
		fprintf(stderr, "Usage: %s <file>\n", argv[0]);
		return 1;
	}

	// Invert return codes to create shell return code:
	return (decode) ? !dec(fp) : !enc(fp);
}
@@ -0,0 +1,29 @@
1
+ # Written in 2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
2
+ #
3
+ # To the extent possible under law, the author(s) have dedicated all
4
+ # copyright and related and neighboring rights to this software to the
5
+ # public domain worldwide. This software is distributed without any warranty.
6
+ #
7
+ # You should have received a copy of the CC0 Public Domain Dedication
8
+ # along with this software. If not, see
9
+ #
10
+ # http://creativecommons.org/publicdomain/zero/1.0/
11
+ #
12
+ ########################################################################
13
+
14
# C source file compiled by detect_target_architecture() to probe the target.
set(TARGET_ARCHITECTURE_TEST_FILE "${CMAKE_CURRENT_LIST_DIR}/../test-arch.c")

# detect_target_architecture(<OUTPUT_VARIABLE>)
#
# Compiles TARGET_ARCHITECTURE_TEST_FILE with try_compile() and searches the
# captured compiler output for a marker of the form `##arch=<name>##`.
# <name> is stored in <OUTPUT_VARIABLE> in the caller's scope; it is the empty
# string when no marker is present in the output.
#
# A warning is issued when the detected name is "unknown" or when no marker
# could be found at all.
#
# NOTE(review): presumably the probe file reports the architecture through a
# deliberate compiler diagnostic; confirm against test-arch.c.
function(detect_target_architecture OUTPUT_VARIABLE)
    message(STATUS "${CMAKE_CURRENT_LIST_DIR}")

    # The result of the compilation itself is irrelevant; only the compiler
    # output captured in _LOG is inspected.
    try_compile(_IGNORED "${CMAKE_CURRENT_BINARY_DIR}"
        "${TARGET_ARCHITECTURE_TEST_FILE}"
        OUTPUT_VARIABLE _LOG
    )

    string(REGEX MATCH "##arch=([^#]+)##" _IGNORED "${_LOG}")

    # Copy the capture group right away so later regex operations cannot
    # clobber it.
    set(_detected_arch "${CMAKE_MATCH_1}")

    set(${OUTPUT_VARIABLE} "${_detected_arch}" PARENT_SCOPE)
    if (_detected_arch STREQUAL "" OR _detected_arch STREQUAL "unknown")
        message(WARNING "could not detect the target architecture.")
    endif()
endfunction()
@@ -0,0 +1,34 @@
1
+ # Written in 2016-2017 by Henrik Steffen Gaßmann henrik@gassmann.onl
2
+ #
3
+ # To the extent possible under law, the author(s) have dedicated all
4
+ # copyright and related and neighboring rights to this software to the
5
+ # public domain worldwide. This software is distributed without any warranty.
6
+ #
7
+ # You should have received a copy of the CC0 Public Domain Dedication
8
+ # along with this software. If not, see
9
+ #
10
+ # http://creativecommons.org/publicdomain/zero/1.0/
11
+ #
12
+ ########################################################################
13
+
14
+ ########################################################################
15
+ # compiler flags definition
16
# define_SIMD_compile_flags()
#
# Defines the COMPILE_FLAGS_<ISA> variables holding the compiler switch that
# enables each SIMD instruction set. This is a macro on purpose: the variables
# are set directly in the caller's scope.
#
#   GNU / Clang / AppleClang: SSSE3, SSE41, SSE42, AVX, AVX2 and NEON32.
#   MSVC:                     SSSE3, SSE41, SSE42, AVX and AVX2. The SSE
#                             variants are set to a single space (no switch).
#
# For any other compiler no variable is set.
macro(define_SIMD_compile_flags)
    if (CMAKE_C_COMPILER_ID STREQUAL "GNU"
        OR CMAKE_C_COMPILER_ID STREQUAL "Clang"
        OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang")
        # x86
        set(COMPILE_FLAGS_SSSE3 "-mssse3")
        set(COMPILE_FLAGS_SSE41 "-msse4.1")
        set(COMPILE_FLAGS_SSE42 "-msse4.2")
        set(COMPILE_FLAGS_AVX "-mavx")
        set(COMPILE_FLAGS_AVX2 "-mavx2")

        # arm
        set(COMPILE_FLAGS_NEON32 "-mfpu=neon")
    elseif(MSVC)
        set(COMPILE_FLAGS_SSSE3 " ")
        set(COMPILE_FLAGS_SSE41 " ")
        set(COMPILE_FLAGS_SSE42 " ")
        # The AVX codec must be built with /arch:AVX only: with /arch:AVX2
        # the compiler may emit AVX2 instructions into code that is selected
        # at runtime for CPUs that support AVX but not AVX2.
        set(COMPILE_FLAGS_AVX "/arch:AVX")
        set(COMPILE_FLAGS_AVX2 "/arch:AVX2")
    endif()
endmacro()
@@ -0,0 +1,5 @@
1
# Package configuration template for base64.
# @PACKAGE_INIT@ is a placeholder substituted when this template is configured.
@PACKAGE_INIT@

# Load the targets file that sits next to this configuration file.
include(
    "${CMAKE_CURRENT_LIST_DIR}/base64-targets.cmake"
)

# Validate the components requested by the consuming project.
check_required_components(base64)