ob64 0.1.0 → 0.5.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in the public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/main.yml +20 -4
  3. data/.gitignore +2 -0
  4. data/CHANGELOG.md +18 -1
  5. data/{LICENSE.txt → LICENSE} +1 -1
  6. data/README.md +34 -2
  7. data/benchmark.rb +42 -3
  8. data/ext/ob64/ob64_ext.c +5 -3
  9. data/lib/ob64/core_ext.rb +2 -0
  10. data/lib/ob64/version.rb +1 -1
  11. data/lib/ob64.rb +52 -0
  12. data/ob64.gemspec +12 -6
  13. data/vendor/libbase64/.gitignore +12 -0
  14. data/vendor/libbase64/.travis.yml +71 -0
  15. data/vendor/libbase64/CMakeLists.txt +264 -0
  16. data/vendor/libbase64/LICENSE +28 -0
  17. data/vendor/libbase64/Makefile +93 -0
  18. data/vendor/libbase64/README.md +474 -0
  19. data/vendor/libbase64/base64-benchmarks.png +0 -0
  20. data/vendor/libbase64/bin/base64.c +132 -0
  21. data/vendor/libbase64/cmake/Modules/TargetArch.cmake +29 -0
  22. data/vendor/libbase64/cmake/Modules/TargetSIMDInstructionSet.cmake +34 -0
  23. data/vendor/libbase64/cmake/base64-config.cmake.in +5 -0
  24. data/vendor/libbase64/cmake/config.h.in +25 -0
  25. data/vendor/libbase64/cmake/test-arch.c +35 -0
  26. data/vendor/libbase64/include/libbase64.h +145 -0
  27. data/vendor/libbase64/lib/arch/avx/codec.c +42 -0
  28. data/vendor/libbase64/lib/arch/avx2/codec.c +42 -0
  29. data/vendor/libbase64/lib/arch/avx2/dec_loop.c +110 -0
  30. data/vendor/libbase64/lib/arch/avx2/dec_reshuffle.c +34 -0
  31. data/vendor/libbase64/lib/arch/avx2/enc_loop.c +89 -0
  32. data/vendor/libbase64/lib/arch/avx2/enc_reshuffle.c +83 -0
  33. data/vendor/libbase64/lib/arch/avx2/enc_translate.c +30 -0
  34. data/vendor/libbase64/lib/arch/generic/32/dec_loop.c +86 -0
  35. data/vendor/libbase64/lib/arch/generic/32/enc_loop.c +73 -0
  36. data/vendor/libbase64/lib/arch/generic/64/enc_loop.c +77 -0
  37. data/vendor/libbase64/lib/arch/generic/codec.c +39 -0
  38. data/vendor/libbase64/lib/arch/generic/dec_head.c +37 -0
  39. data/vendor/libbase64/lib/arch/generic/dec_tail.c +91 -0
  40. data/vendor/libbase64/lib/arch/generic/enc_head.c +24 -0
  41. data/vendor/libbase64/lib/arch/generic/enc_tail.c +34 -0
  42. data/vendor/libbase64/lib/arch/neon32/codec.c +72 -0
  43. data/vendor/libbase64/lib/arch/neon32/dec_loop.c +106 -0
  44. data/vendor/libbase64/lib/arch/neon32/enc_loop.c +58 -0
  45. data/vendor/libbase64/lib/arch/neon32/enc_reshuffle.c +54 -0
  46. data/vendor/libbase64/lib/arch/neon32/enc_translate.c +57 -0
  47. data/vendor/libbase64/lib/arch/neon64/codec.c +70 -0
  48. data/vendor/libbase64/lib/arch/neon64/dec_loop.c +129 -0
  49. data/vendor/libbase64/lib/arch/neon64/enc_loop.c +66 -0
  50. data/vendor/libbase64/lib/arch/neon64/enc_reshuffle.c +54 -0
  51. data/vendor/libbase64/lib/arch/sse41/codec.c +42 -0
  52. data/vendor/libbase64/lib/arch/sse42/codec.c +42 -0
  53. data/vendor/libbase64/lib/arch/ssse3/codec.c +42 -0
  54. data/vendor/libbase64/lib/arch/ssse3/dec_loop.c +173 -0
  55. data/vendor/libbase64/lib/arch/ssse3/dec_reshuffle.c +33 -0
  56. data/vendor/libbase64/lib/arch/ssse3/enc_loop.c +67 -0
  57. data/vendor/libbase64/lib/arch/ssse3/enc_reshuffle.c +48 -0
  58. data/vendor/libbase64/lib/arch/ssse3/enc_translate.c +33 -0
  59. data/vendor/libbase64/lib/codec_choose.c +281 -0
  60. data/vendor/libbase64/lib/codecs.h +65 -0
  61. data/vendor/libbase64/lib/env.h +67 -0
  62. data/vendor/libbase64/lib/exports.txt +7 -0
  63. data/vendor/libbase64/lib/lib.c +164 -0
  64. data/vendor/libbase64/lib/lib_openmp.c +149 -0
  65. data/vendor/libbase64/lib/tables/.gitignore +1 -0
  66. data/vendor/libbase64/lib/tables/Makefile +17 -0
  67. data/vendor/libbase64/lib/tables/table_dec_32bit.h +393 -0
  68. data/vendor/libbase64/lib/tables/table_enc_12bit.h +1031 -0
  69. data/vendor/libbase64/lib/tables/table_enc_12bit.py +45 -0
  70. data/vendor/libbase64/lib/tables/table_generator.c +184 -0
  71. data/vendor/libbase64/lib/tables/tables.c +40 -0
  72. data/vendor/libbase64/lib/tables/tables.h +23 -0
  73. metadata +67 -6
  74. data/.byebug_history +0 -72
  75. data/.envrc +0 -1
data/vendor/libbase64/lib/arch/generic/32/dec_loop.c
@@ -0,0 +1,86 @@
+ static inline int
+ dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+ {
+     const uint32_t str
+         = base64_table_dec_32bit_d0[(*s)[0]]
+         | base64_table_dec_32bit_d1[(*s)[1]]
+         | base64_table_dec_32bit_d2[(*s)[2]]
+         | base64_table_dec_32bit_d3[(*s)[3]];
+
+ #if BASE64_LITTLE_ENDIAN
+
+     // LUTs for little-endian set MSB in case of invalid character:
+     if (str & UINT32_C(0x80000000)) {
+         return 0;
+     }
+ #else
+     // LUTs for big-endian set LSB in case of invalid character:
+     if (str & UINT32_C(1)) {
+         return 0;
+     }
+ #endif
+     // Store the output:
+     memcpy(*o, &str, sizeof (str));
+
+     *s += 4;
+     *o += 3;
+     *rounds -= 1;
+
+     return 1;
+ }
+
+ static inline void
+ dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 8) {
+         return;
+     }
+
+     // Process blocks of 4 bytes per round. Because one extra zero byte is
+     // written after the output, ensure that there will be at least 4 bytes
+     // of input data left to cover the gap. (Two data bytes and up to two
+     // end-of-string markers.)
+     size_t rounds = (*slen - 4) / 4;
+
+     *slen -= rounds * 4;   // 4 bytes consumed per round
+     *olen += rounds * 3;   // 3 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         if (rounds >= 4) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         if (rounds >= 2) {
+             if (dec_loop_generic_32_inner(s, o, &rounds) &&
+                 dec_loop_generic_32_inner(s, o, &rounds)) {
+                 continue;
+             }
+             break;
+         }
+         dec_loop_generic_32_inner(s, o, &rounds);
+         break;
+
+     } while (rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 4;
+     *olen -= rounds * 3;
+ }
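For orientation, here is a minimal scalar sketch of what the table-driven inner round above computes; it is not part of the vendored source, and the helper names (sextet, decode_group) are illustrative only. The four base64_table_dec_32bit_d* tables hold these 6-bit values pre-shifted into position, so a single OR assembles the decoded bytes and one bit test (MSB on little-endian, LSB on big-endian) rejects any invalid character.

#include <stdint.h>
#include <stdio.h>

// Map one Base64 character to its 6-bit value; -1 for anything else.
static int sextet(uint8_t c)
{
    if (c >= 'A' && c <= 'Z') return c - 'A';
    if (c >= 'a' && c <= 'z') return c - 'a' + 26;
    if (c >= '0' && c <= '9') return c - '0' + 52;
    if (c == '+') return 62;
    if (c == '/') return 63;
    return -1;
}

// Decode one 4-character group into 3 bytes, the slow way.
static int decode_group(const uint8_t in[4], uint8_t out[3])
{
    int v0 = sextet(in[0]), v1 = sextet(in[1]),
        v2 = sextet(in[2]), v3 = sextet(in[3]);
    if ((v0 | v1 | v2 | v3) < 0) {
        return 0;   // one test catches any invalid character
    }
    uint32_t v = ((uint32_t) v0 << 18) | ((uint32_t) v1 << 12)
               | ((uint32_t) v2 << 6) | (uint32_t) v3;
    out[0] = (uint8_t) (v >> 16);
    out[1] = (uint8_t) (v >> 8);
    out[2] = (uint8_t) v;
    return 1;
}

int main(void)
{
    uint8_t out[4] = {0};
    if (decode_group((const uint8_t *) "TWFu", out)) {
        printf("%s\n", (const char *) out);   // prints "Man"
    }
    return 0;
}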
data/vendor/libbase64/lib/arch/generic/32/enc_loop.c
@@ -0,0 +1,73 @@
+ static inline void
+ enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
+ {
+     uint32_t src;
+
+     // Load input:
+     memcpy(&src, *s, sizeof (src));
+
+     // Reorder to 32-bit big-endian, if not already in that format. The
+     // workset must be in big-endian, otherwise the shifted bits do not
+     // carry over properly among adjacent bytes:
+     src = BASE64_HTOBE32(src);
+
+     // Two indices for the 12-bit lookup table:
+     const size_t index0 = (src >> 20) & 0xFFFU;
+     const size_t index1 = (src >> 8) & 0xFFFU;
+
+     // Table lookup and store:
+     memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
+     memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
+
+     *s += 3;
+     *o += 4;
+ }
+
+ static inline void
+ enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 4) {
+         return;
+     }
+
+     // Process blocks of 3 bytes at a time. Because blocks are loaded 4
+     // bytes at a time, ensure that there will be at least one remaining
+     // byte after the last round, so that the final read will not pass
+     // beyond the bounds of the input buffer:
+     size_t rounds = (*slen - 1) / 3;
+
+     *slen -= rounds * 3;   // 3 bytes consumed per round
+     *olen += rounds * 4;   // 4 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_generic_32_inner(s, o);
+             enc_loop_generic_32_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_generic_32_inner(s, o);
+         break;
+
+     } while (rounds > 0);
+ }
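As a hedged illustration of the 12-bit lookup used above (not vendored code; the helper names are hypothetical): each entry of base64_table_enc_12bit stores the two ASCII output characters for one 12-bit slice of the big-endian word, so a single 2-byte table read replaces two separate 6-bit alphabet lookups. The sketch below spells out the same slicing with an explicit alphabet.

#include <stdint.h>
#include <stdio.h>

static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

// What one entry of the 12-bit table precomputes: the two output characters
// for a 12-bit slice of the input word.
static void enc_12bit(unsigned index, char out[2])
{
    out[0] = alphabet[(index >> 6) & 0x3F];
    out[1] = alphabet[index & 0x3F];
}

int main(void)
{
    // Encode the 3-byte block "Man" exactly as enc_loop_generic_32_inner
    // slices it: load as big-endian, take bits 31..20 and 19..8.
    const uint8_t src[3] = { 'M', 'a', 'n' };
    const uint32_t be = ((uint32_t) src[0] << 24)
                      | ((uint32_t) src[1] << 16)
                      | ((uint32_t) src[2] << 8);
    char out[5] = {0};
    enc_12bit((be >> 20) & 0xFFF, out + 0);
    enc_12bit((be >> 8)  & 0xFFF, out + 2);
    printf("%s\n", out);   // prints "TWFu"
    return 0;
}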
data/vendor/libbase64/lib/arch/generic/64/enc_loop.c
@@ -0,0 +1,77 @@
+ static inline void
+ enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
+ {
+     uint64_t src;
+
+     // Load input:
+     memcpy(&src, *s, sizeof (src));
+
+     // Reorder to 64-bit big-endian, if not already in that format. The
+     // workset must be in big-endian, otherwise the shifted bits do not
+     // carry over properly among adjacent bytes:
+     src = BASE64_HTOBE64(src);
+
+     // Four indices for the 12-bit lookup table:
+     const size_t index0 = (src >> 52) & 0xFFFU;
+     const size_t index1 = (src >> 40) & 0xFFFU;
+     const size_t index2 = (src >> 28) & 0xFFFU;
+     const size_t index3 = (src >> 16) & 0xFFFU;
+
+     // Table lookup and store:
+     memcpy(*o + 0, base64_table_enc_12bit + index0, 2);
+     memcpy(*o + 2, base64_table_enc_12bit + index1, 2);
+     memcpy(*o + 4, base64_table_enc_12bit + index2, 2);
+     memcpy(*o + 6, base64_table_enc_12bit + index3, 2);
+
+     *s += 6;
+     *o += 8;
+ }
+
+ static inline void
+ enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 8) {
+         return;
+     }
+
+     // Process blocks of 6 bytes at a time. Because blocks are loaded 8
+     // bytes at a time, ensure that there will be at least 2 remaining
+     // bytes after the last round, so that the final read will not pass
+     // beyond the bounds of the input buffer:
+     size_t rounds = (*slen - 2) / 6;
+
+     *slen -= rounds * 6;   // 6 bytes consumed per round
+     *olen += rounds * 8;   // 8 bytes produced per round
+
+     do {
+         if (rounds >= 8) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_generic_64_inner(s, o);
+             enc_loop_generic_64_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_generic_64_inner(s, o);
+         break;
+
+     } while (rounds > 0);
+ }
data/vendor/libbase64/lib/arch/generic/codec.c
@@ -0,0 +1,39 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <string.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #if BASE64_WORDSIZE == 32
+ # include "32/enc_loop.c"
+ #elif BASE64_WORDSIZE == 64
+ # include "64/enc_loop.c"
+ #endif
+
+ #if BASE64_WORDSIZE >= 32
+ # include "32/dec_loop.c"
+ #endif
+
+ BASE64_ENC_FUNCTION(plain)
+ {
+     #include "enc_head.c"
+ #if BASE64_WORDSIZE == 32
+     enc_loop_generic_32(&s, &slen, &o, &olen);
+ #elif BASE64_WORDSIZE == 64
+     enc_loop_generic_64(&s, &slen, &o, &olen);
+ #endif
+     #include "enc_tail.c"
+ }
+
+ BASE64_DEC_FUNCTION(plain)
+ {
+     #include "dec_head.c"
+ #if BASE64_WORDSIZE >= 32
+     dec_loop_generic_32(&s, &slen, &o, &olen);
+ #endif
+     #include "dec_tail.c"
+ }
data/vendor/libbase64/lib/arch/generic/dec_head.c
@@ -0,0 +1,37 @@
+ int ret = 0;
+ const uint8_t *s = (const uint8_t *) src;
+ uint8_t *o = (uint8_t *) out;
+ uint8_t q;
+
+ // Use local temporaries to avoid cache thrashing:
+ size_t olen = 0;
+ size_t slen = srclen;
+ struct base64_state st;
+ st.eof = state->eof;
+ st.bytes = state->bytes;
+ st.carry = state->carry;
+
+ // If we previously saw an EOF or an invalid character, bail out:
+ if (st.eof) {
+     *outlen = 0;
+     ret = 0;
+     // If there was a trailing '=' to check, check it:
+     if (slen && (st.eof == BASE64_AEOF)) {
+         state->bytes = 0;
+         state->eof = BASE64_EOF;
+         ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
+     }
+     return ret;
+ }
+
+ // Turn four 6-bit numbers into three bytes:
+ // out[0] = 11111122
+ // out[1] = 22223333
+ // out[2] = 33444444
+
+ // Duff's device again:
+ switch (st.bytes)
+ {
+     for (;;)
+     {
+     case 0:
data/vendor/libbase64/lib/arch/generic/dec_tail.c
@@ -0,0 +1,91 @@
+ if (slen-- == 0) {
+     ret = 1;
+     break;
+ }
+ if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+     st.eof = BASE64_EOF;
+     // Treat character '=' as invalid for byte 0:
+     break;
+ }
+ st.carry = q << 2;
+ st.bytes++;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 1: if (slen-- == 0) {
+     ret = 1;
+     break;
+ }
+ if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+     st.eof = BASE64_EOF;
+     // Treat character '=' as invalid for byte 1:
+     break;
+ }
+ *o++ = st.carry | (q >> 4);
+ st.carry = q << 4;
+ st.bytes++;
+ olen++;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 2: if (slen-- == 0) {
+     ret = 1;
+     break;
+ }
+ if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+     st.bytes++;
+     // When q == 254, the input char is '='.
+     // Check if next byte is also '=':
+     if (q == 254) {
+         if (slen-- != 0) {
+             st.bytes = 0;
+             // EOF:
+             st.eof = BASE64_EOF;
+             q = base64_table_dec_8bit[*s++];
+             ret = ((q == 254) && (slen == 0)) ? 1 : 0;
+             break;
+         }
+         else {
+             // Almost EOF
+             st.eof = BASE64_AEOF;
+             ret = 1;
+             break;
+         }
+     }
+     // If we get here, there was an error:
+     break;
+ }
+ *o++ = st.carry | (q >> 2);
+ st.carry = q << 6;
+ st.bytes++;
+ olen++;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 3: if (slen-- == 0) {
+     ret = 1;
+     break;
+ }
+ if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+     st.bytes = 0;
+     st.eof = BASE64_EOF;
+     // When q == 254, the input char is '='. Return 1 and EOF.
+     // When q == 255, the input char is invalid. Return 0 and EOF.
+     ret = ((q == 254) && (slen == 0)) ? 1 : 0;
+     break;
+ }
+ *o++ = st.carry | q;
+ st.carry = 0;
+ st.bytes = 0;
+ olen++;
+ }
+ }
+
+ state->eof = st.eof;
+ state->bytes = st.bytes;
+ state->carry = st.carry;
+ *outlen = olen;
+ return ret;
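The `>= 254` tests above rely on how the scalar decode table encodes its sentinels. The stand-alone sketch below (not the vendored table; the helper name is hypothetical) spells out the convention stated by the comments: valid characters map to their 6-bit value, '=' maps to 254, and anything else to 255, so one comparison rejects both cases and the exact value then distinguishes padding from garbage.

#include <stdint.h>
#include <stdio.h>

// Illustrative equivalent of base64_table_dec_8bit's value scheme.
static uint8_t dec_8bit(uint8_t c)
{
    if (c >= 'A' && c <= 'Z') return (uint8_t) (c - 'A');
    if (c >= 'a' && c <= 'z') return (uint8_t) (c - 'a' + 26);
    if (c >= '0' && c <= '9') return (uint8_t) (c - '0' + 52);
    if (c == '+') return 62;
    if (c == '/') return 63;
    if (c == '=') return 254;   // padding: legal only at the end of input
    return 255;                 // invalid character
}

int main(void)
{
    printf("%d %d %d\n", dec_8bit('M'), dec_8bit('='), dec_8bit('!'));
    // prints "12 254 255"
    return 0;
}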
data/vendor/libbase64/lib/arch/generic/enc_head.c
@@ -0,0 +1,24 @@
+ // Assume that *out is large enough to contain the output.
+ // Theoretically it should be 4/3 the length of src.
+ const uint8_t *s = (const uint8_t *) src;
+ uint8_t *o = (uint8_t *) out;
+
+ // Use local temporaries to avoid cache thrashing:
+ size_t olen = 0;
+ size_t slen = srclen;
+ struct base64_state st;
+ st.bytes = state->bytes;
+ st.carry = state->carry;
+
+ // Turn three bytes into four 6-bit numbers:
+ // in[0] = 00111111
+ // in[1] = 00112222
+ // in[2] = 00222233
+ // in[3] = 00333333
+
+ // Duff's device, a for() loop inside a switch() statement. Legal!
+ switch (st.bytes)
+ {
+     for (;;)
+     {
+     case 0:
data/vendor/libbase64/lib/arch/generic/enc_tail.c
@@ -0,0 +1,34 @@
+ if (slen-- == 0) {
+     break;
+ }
+ *o++ = base64_table_enc_6bit[*s >> 2];
+ st.carry = (*s++ << 4) & 0x30;
+ st.bytes++;
+ olen += 1;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 1: if (slen-- == 0) {
+     break;
+ }
+ *o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];
+ st.carry = (*s++ << 2) & 0x3C;
+ st.bytes++;
+ olen += 1;
+
+ // Deliberate fallthrough:
+ BASE64_FALLTHROUGH
+
+ case 2: if (slen-- == 0) {
+     break;
+ }
+ *o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];
+ *o++ = base64_table_enc_6bit[*s++ & 0x3F];
+ st.bytes = 0;
+ olen += 2;
+ }
+ }
+ state->bytes = st.bytes;
+ state->carry = st.carry;
+ *outlen = olen;
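enc_head.c and enc_tail.c together form one Duff's device: the switch on st.bytes jumps into the middle of the for() loop, so a 3-byte group that was interrupted at the end of the previous buffer is resumed exactly where it stopped. Below is a minimal stand-alone sketch of the same control-flow shape; the names and the toy payload are illustrative, nothing here is taken from the diff.

#include <stdio.h>

static void spell_phases(char *dst, size_t nsteps, size_t phase)
{
    size_t written = 0;

    // Duff's device: the for() loop sits inside the switch(), and the case
    // labels let the first iteration start partway through the 3-step group,
    // just as st.bytes selects the resume point in enc_head.c/enc_tail.c.
    switch (phase % 3)
    {
        for (;;)
        {
        case 0:
            if (nsteps-- == 0) break;
            dst[written++] = '0';
            // fallthrough
        case 1:
            if (nsteps-- == 0) break;
            dst[written++] = '1';
            // fallthrough
        case 2:
            if (nsteps-- == 0) break;
            dst[written++] = '2';
        }
    }
    dst[written] = '\0';
}

int main(void)
{
    char buf[16];

    spell_phases(buf, 5, 0);
    printf("%s\n", buf);   // prints "01201"

    spell_phases(buf, 5, 1);
    printf("%s\n", buf);   // prints "12012": the first step was skipped
    return 0;
}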
data/vendor/libbase64/lib/arch/neon32/codec.c
@@ -0,0 +1,72 @@
+ #include <stdint.h>
+ #include <stddef.h>
+ #include <string.h>
+
+ #include "../../../include/libbase64.h"
+ #include "../../tables/tables.h"
+ #include "../../codecs.h"
+ #include "config.h"
+ #include "../../env.h"
+
+ #ifdef __arm__
+ # if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
+ # define BASE64_USE_NEON32
+ # endif
+ #endif
+
+ #ifdef BASE64_USE_NEON32
+ #include <arm_neon.h>
+
+ static inline uint8x16_t
+ vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
+ {
+     // NEON32 only supports 64-bit wide lookups in 128-bit tables. Emulate
+     // the NEON64 `vqtbl1q_u8` intrinsic to do 128-bit wide lookups.
+     uint8x8x2_t lut2;
+     uint8x8x2_t result;
+
+     lut2.val[0] = vget_low_u8(lut);
+     lut2.val[1] = vget_high_u8(lut);
+
+     result.val[0] = vtbl2_u8(lut2, vget_low_u8(indices));
+     result.val[1] = vtbl2_u8(lut2, vget_high_u8(indices));
+
+     return vcombine_u8(result.val[0], result.val[1]);
+ }
+
+ #include "../generic/32/dec_loop.c"
+ #include "../generic/32/enc_loop.c"
+ #include "dec_loop.c"
+ #include "enc_reshuffle.c"
+ #include "enc_translate.c"
+ #include "enc_loop.c"
+
+ #endif // BASE64_USE_NEON32
+
+ // Stride size is so large on these NEON 32-bit functions
+ // (48 bytes encode, 32 bytes decode) that we inline the
+ // uint32 codec to stay performant on smaller inputs.
+
+ BASE64_ENC_FUNCTION(neon32)
+ {
+ #ifdef BASE64_USE_NEON32
+     #include "../generic/enc_head.c"
+     enc_loop_neon32(&s, &slen, &o, &olen);
+     enc_loop_generic_32(&s, &slen, &o, &olen);
+     #include "../generic/enc_tail.c"
+ #else
+     BASE64_ENC_STUB
+ #endif
+ }
+
+ BASE64_DEC_FUNCTION(neon32)
+ {
+ #ifdef BASE64_USE_NEON32
+     #include "../generic/dec_head.c"
+     dec_loop_neon32(&s, &slen, &o, &olen);
+     dec_loop_generic_32(&s, &slen, &o, &olen);
+     #include "../generic/dec_tail.c"
+ #else
+     BASE64_DEC_STUB
+ #endif
+ }
data/vendor/libbase64/lib/arch/neon32/dec_loop.c
@@ -0,0 +1,106 @@
+ static inline int
+ is_nonzero (const uint8x16_t v)
+ {
+     uint64_t u64;
+     const uint64x2_t v64 = vreinterpretq_u64_u8(v);
+     const uint32x2_t v32 = vqmovn_u64(v64);
+
+     vst1_u64(&u64, vreinterpret_u64_u32(v32));
+     return u64 != 0;
+ }
+
+ static inline uint8x16_t
+ delta_lookup (const uint8x16_t v)
+ {
+     const uint8x8_t lut = {
+         0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
+     };
+
+     return vcombine_u8(
+         vtbl1_u8(lut, vget_low_u8(v)),
+         vtbl1_u8(lut, vget_high_u8(v)));
+ }
+
+ static inline uint8x16_t
+ dec_loop_neon32_lane (uint8x16_t *lane)
+ {
+     // See the SSSE3 decoder for an explanation of the algorithm.
+     const uint8x16_t lut_lo = {
+         0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+         0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
+     };
+
+     const uint8x16_t lut_hi = {
+         0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+     };
+
+     const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
+     const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
+
+     const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
+     const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
+     const uint8x16_t eq_2F = vceqq_u8(*lane, mask_2F);
+
+     const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
+     const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
+
+     // Now simply add the delta values to the input:
+     *lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));
+
+     // Return the validity mask:
+     return vandq_u8(lo, hi);
+ }
+
+ static inline void
+ dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     if (*slen < 64) {
+         return;
+     }
+
+     // Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+     // extra trailing zero bytes are written, so it is not necessary to
+     // reserve extra input bytes:
+     size_t rounds = *slen / 64;
+
+     *slen -= rounds * 64;   // 64 bytes consumed per round
+     *olen += rounds * 48;   // 48 bytes produced per round
+
+     do {
+         uint8x16x3_t dec;
+
+         // Load 64 bytes and deinterleave:
+         uint8x16x4_t str = vld4q_u8(*s);
+
+         // Decode each lane, collect a mask of invalid inputs:
+         const uint8x16_t classified
+             = dec_loop_neon32_lane(&str.val[0])
+             | dec_loop_neon32_lane(&str.val[1])
+             | dec_loop_neon32_lane(&str.val[2])
+             | dec_loop_neon32_lane(&str.val[3]);
+
+         // Check for invalid input: if any of the delta values are
+         // zero, fall back on bytewise code to do error checking and
+         // reporting:
+         if (is_nonzero(classified)) {
+             break;
+         }
+
+         // Compress four bytes into three:
+         dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+         dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+         dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+         // Interleave and store decoded result:
+         vst3q_u8(*o, dec);
+
+         *s += 64;
+         *o += 48;
+
+     } while (--rounds > 0);
+
+     // Adjust for any rounds that were skipped:
+     *slen += rounds * 64;
+     *olen -= rounds * 48;
+ }
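A hedged scalar rendering of the delta_lookup translation used above (not vendored code; the names are hypothetical): each byte is moved into the 0..63 range by adding a delta selected from its high nibble, with '/' (0x2F) nudged into its own slot by the eq_2F adjustment. Validity is checked separately via the lut_lo/lut_hi nibble masks, so this sketch only covers the translation of bytes already known to be valid.

#include <stdint.h>
#include <stdio.h>

// Delta per high-nibble group, matching the 8-entry LUT above:
static const int8_t delta[8] = { 0, 16, 19, 4, -65, -65, -71, -71 };

// Translate one (assumed valid) Base64 character to its 6-bit value:
static uint8_t translate(uint8_t c)
{
    // '/' shares high nibble 2 with '+'; shifting it to group 1 gives it
    // the +16 delta (0x2F + 16 == 63) while '+' keeps +19 (0x2B + 19 == 62).
    int idx = (c >> 4) - (c == 0x2F ? 1 : 0);
    return (uint8_t) (c + delta[idx]);
}

int main(void)
{
    // 'M' -> 12, 'a' -> 26, '0' -> 52, '+' -> 62, '/' -> 63
    printf("%d %d %d %d %d\n",
           translate('M'), translate('a'), translate('0'),
           translate('+'), translate('/'));
    return 0;
}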
data/vendor/libbase64/lib/arch/neon32/enc_loop.c
@@ -0,0 +1,58 @@
+ static inline void
+ enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
+ {
+     // Load 48 bytes and deinterleave:
+     uint8x16x3_t src = vld3q_u8(*s);
+
+     // Reshuffle:
+     uint8x16x4_t out = enc_reshuffle(src);
+
+     // Translate reshuffled bytes to the Base64 alphabet:
+     out = enc_translate(out);
+
+     // Interleave and store output:
+     vst4q_u8(*o, out);
+
+     *s += 48;
+     *o += 64;
+ }
+
+ static inline void
+ enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+ {
+     size_t rounds = *slen / 48;
+
+     *slen -= rounds * 48;   // 48 bytes consumed per round
+     *olen += rounds * 64;   // 64 bytes produced per round
+
+     while (rounds > 0) {
+         if (rounds >= 8) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 8;
+             continue;
+         }
+         if (rounds >= 4) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 4;
+             continue;
+         }
+         if (rounds >= 2) {
+             enc_loop_neon32_inner(s, o);
+             enc_loop_neon32_inner(s, o);
+             rounds -= 2;
+             continue;
+         }
+         enc_loop_neon32_inner(s, o);
+         break;
+     }
+ }
+ }