libdeflate 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/.gitmodules +3 -0
- data/.rspec +2 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +9 -0
- data/.travis.yml +5 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +52 -0
- data/Rakefile +15 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ext/libdeflate/extconf.rb +14 -0
- data/ext/libdeflate/libdeflate/.gitignore +19 -0
- data/ext/libdeflate/libdeflate/COPYING +21 -0
- data/ext/libdeflate/libdeflate/Makefile +231 -0
- data/ext/libdeflate/libdeflate/Makefile.msc +64 -0
- data/ext/libdeflate/libdeflate/NEWS +57 -0
- data/ext/libdeflate/libdeflate/README.md +170 -0
- data/ext/libdeflate/libdeflate/common/common_defs.h +351 -0
- data/ext/libdeflate/libdeflate/common/compiler_gcc.h +134 -0
- data/ext/libdeflate/libdeflate/common/compiler_msc.h +95 -0
- data/ext/libdeflate/libdeflate/lib/adler32.c +213 -0
- data/ext/libdeflate/libdeflate/lib/adler32_impl.h +281 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.c +57 -0
- data/ext/libdeflate/libdeflate/lib/aligned_malloc.h +13 -0
- data/ext/libdeflate/libdeflate/lib/bt_matchfinder.h +357 -0
- data/ext/libdeflate/libdeflate/lib/crc32.c +368 -0
- data/ext/libdeflate/libdeflate/lib/crc32_impl.h +286 -0
- data/ext/libdeflate/libdeflate/lib/crc32_table.h +526 -0
- data/ext/libdeflate/libdeflate/lib/decompress_impl.h +404 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.c +2817 -0
- data/ext/libdeflate/libdeflate/lib/deflate_compress.h +14 -0
- data/ext/libdeflate/libdeflate/lib/deflate_constants.h +66 -0
- data/ext/libdeflate/libdeflate/lib/deflate_decompress.c +889 -0
- data/ext/libdeflate/libdeflate/lib/gzip_compress.c +95 -0
- data/ext/libdeflate/libdeflate/lib/gzip_constants.h +45 -0
- data/ext/libdeflate/libdeflate/lib/gzip_decompress.c +130 -0
- data/ext/libdeflate/libdeflate/lib/hc_matchfinder.h +405 -0
- data/ext/libdeflate/libdeflate/lib/lib_common.h +35 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_avx2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_common.h +205 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_neon.h +61 -0
- data/ext/libdeflate/libdeflate/lib/matchfinder_sse2.h +53 -0
- data/ext/libdeflate/libdeflate/lib/unaligned.h +202 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.c +169 -0
- data/ext/libdeflate/libdeflate/lib/x86_cpu_features.h +48 -0
- data/ext/libdeflate/libdeflate/lib/zlib_compress.c +87 -0
- data/ext/libdeflate/libdeflate/lib/zlib_constants.h +21 -0
- data/ext/libdeflate/libdeflate/lib/zlib_decompress.c +91 -0
- data/ext/libdeflate/libdeflate/libdeflate.h +274 -0
- data/ext/libdeflate/libdeflate/programs/benchmark.c +558 -0
- data/ext/libdeflate/libdeflate/programs/checksum.c +197 -0
- data/ext/libdeflate/libdeflate/programs/detect.sh +62 -0
- data/ext/libdeflate/libdeflate/programs/gzip.c +603 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.c +530 -0
- data/ext/libdeflate/libdeflate/programs/prog_util.h +162 -0
- data/ext/libdeflate/libdeflate/programs/test_checksums.c +135 -0
- data/ext/libdeflate/libdeflate/programs/tgetopt.c +118 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/Makefile +12 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/fuzz.c +40 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_compress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/deflate_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/gzip_decompress/inputs/0 +0 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/prepare_for_fuzz.sh +14 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/fuzz.c +28 -0
- data/ext/libdeflate/libdeflate/tools/afl-fuzz/zlib_decompress/inputs/0 +3 -0
- data/ext/libdeflate/libdeflate/tools/android_build.sh +104 -0
- data/ext/libdeflate/libdeflate/tools/checksum_benchmarks.sh +76 -0
- data/ext/libdeflate/libdeflate/tools/exec_tests.sh +30 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_multipliers.c +108 -0
- data/ext/libdeflate/libdeflate/tools/gen_crc32_table.c +100 -0
- data/ext/libdeflate/libdeflate/tools/gzip_tests.sh +412 -0
- data/ext/libdeflate/libdeflate/tools/make-windows-releases +21 -0
- data/ext/libdeflate/libdeflate/tools/mips_build.sh +9 -0
- data/ext/libdeflate/libdeflate/tools/msc_test.bat +3 -0
- data/ext/libdeflate/libdeflate/tools/pgo_build.sh +23 -0
- data/ext/libdeflate/libdeflate/tools/produce_gzip_benchmark_table.sh +37 -0
- data/ext/libdeflate/libdeflate/tools/run_tests.sh +305 -0
- data/ext/libdeflate/libdeflate/tools/windows_build.sh +10 -0
- data/ext/libdeflate/libdeflate_ext.c +389 -0
- data/ext/libdeflate/libdeflate_ext.h +8 -0
- data/lib/libdeflate.rb +2 -0
- data/lib/libdeflate/version.rb +3 -0
- data/libdeflate.gemspec +33 -0
- metadata +230 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* compiler_gcc.h - definitions for the GNU C Compiler. This also handles clang
|
|
3
|
+
* and the Intel C Compiler (icc).
|
|
4
|
+
*
|
|
5
|
+
* TODO: icc is not well tested, so some things are currently disabled even
|
|
6
|
+
* though they maybe can be enabled on some icc versions.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#if !defined(__clang__) && !defined(__INTEL_COMPILER)
|
|
10
|
+
# define GCC_PREREQ(major, minor) \
|
|
11
|
+
(__GNUC__ > (major) || \
|
|
12
|
+
(__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
|
|
13
|
+
#else
|
|
14
|
+
# define GCC_PREREQ(major, minor) 0
|
|
15
|
+
#endif
|
|
16
|
+
|
|
17
|
+
/* Note: only check the clang version when absolutely necessary!
|
|
18
|
+
* "Vendors" such as Apple can use different version numbers. */
|
|
19
|
+
#ifdef __clang__
|
|
20
|
+
# ifdef __apple_build_version__
|
|
21
|
+
# define CLANG_PREREQ(major, minor, apple_version) \
|
|
22
|
+
(__apple_build_version__ >= (apple_version))
|
|
23
|
+
# else
|
|
24
|
+
# define CLANG_PREREQ(major, minor, apple_version) \
|
|
25
|
+
(__clang_major__ > (major) || \
|
|
26
|
+
(__clang_major__ == (major) && __clang_minor__ >= (minor)))
|
|
27
|
+
# endif
|
|
28
|
+
#else
|
|
29
|
+
# define CLANG_PREREQ(major, minor, apple_version) 0
|
|
30
|
+
#endif
|
|
31
|
+
|
|
32
|
+
#ifndef __has_attribute
|
|
33
|
+
# define __has_attribute(attribute) 0
|
|
34
|
+
#endif
|
|
35
|
+
#ifndef __has_feature
|
|
36
|
+
# define __has_feature(feature) 0
|
|
37
|
+
#endif
|
|
38
|
+
#ifndef __has_builtin
|
|
39
|
+
# define __has_builtin(builtin) 0
|
|
40
|
+
#endif
|
|
41
|
+
|
|
42
|
+
#ifdef _WIN32
|
|
43
|
+
# define LIBEXPORT __declspec(dllexport)
|
|
44
|
+
#else
|
|
45
|
+
# define LIBEXPORT __attribute__((visibility("default")))
|
|
46
|
+
#endif
|
|
47
|
+
|
|
48
|
+
#define inline inline
|
|
49
|
+
#define forceinline inline __attribute__((always_inline))
|
|
50
|
+
#define restrict __restrict__
|
|
51
|
+
#define likely(expr) __builtin_expect(!!(expr), 1)
|
|
52
|
+
#define unlikely(expr) __builtin_expect(!!(expr), 0)
|
|
53
|
+
#define prefetchr(addr) __builtin_prefetch((addr), 0)
|
|
54
|
+
#define prefetchw(addr) __builtin_prefetch((addr), 1)
|
|
55
|
+
#define _aligned_attribute(n) __attribute__((aligned(n)))
|
|
56
|
+
|
|
57
|
+
/*
|
|
58
|
+
* Support for the following x86 instruction set extensions was introduced by
|
|
59
|
+
* the following gcc versions:
|
|
60
|
+
*
|
|
61
|
+
* PCLMUL 4.4
|
|
62
|
+
* AVX 4.6
|
|
63
|
+
* BMI2 4.7
|
|
64
|
+
* AVX2 4.7
|
|
65
|
+
*
|
|
66
|
+
* With clang, __has_builtin() can be used to detect the presence of one of the
|
|
67
|
+
* associated builtins.
|
|
68
|
+
*
|
|
69
|
+
* Additionally, gcc 4.4 introduced the 'target' function attribute. With
|
|
70
|
+
* clang, support for this can be detected with with __has_attribute(target).
|
|
71
|
+
*
|
|
72
|
+
* However, prior to gcc 4.9 and clang 3.8, x86 intrinsics not available in the
|
|
73
|
+
* main target could not be used in 'target' attribute functions. Unfortunately
|
|
74
|
+
* clang has no feature test macro for this so we have to check its version.
|
|
75
|
+
*/
|
|
76
|
+
#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \
|
|
77
|
+
(GCC_PREREQ(4, 4) || __has_attribute(target))
|
|
78
|
+
#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
|
|
79
|
+
# define COMPILER_SUPPORTS_TARGET_INTRINSICS \
|
|
80
|
+
(GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000))
|
|
81
|
+
# define COMPILER_SUPPORTS_PCLMUL_TARGET \
|
|
82
|
+
(GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128))
|
|
83
|
+
# define COMPILER_SUPPORTS_AVX_TARGET \
|
|
84
|
+
(GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256))
|
|
85
|
+
# define COMPILER_SUPPORTS_BMI2_TARGET \
|
|
86
|
+
(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))
|
|
87
|
+
# define COMPILER_SUPPORTS_AVX2_TARGET \
|
|
88
|
+
(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pmaddwd256))
|
|
89
|
+
#endif
|
|
90
|
+
|
|
91
|
+
/* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't. */
|
|
92
|
+
#ifdef __BYTE_ORDER__
|
|
93
|
+
# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
|
|
94
|
+
#endif
|
|
95
|
+
|
|
96
|
+
#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
|
|
97
|
+
# define bswap16 __builtin_bswap16
|
|
98
|
+
#endif
|
|
99
|
+
|
|
100
|
+
#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
|
|
101
|
+
# define bswap32 __builtin_bswap32
|
|
102
|
+
#endif
|
|
103
|
+
|
|
104
|
+
#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
|
|
105
|
+
# define bswap64 __builtin_bswap64
|
|
106
|
+
#endif
|
|
107
|
+
|
|
108
|
+
#if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED)
|
|
109
|
+
# define UNALIGNED_ACCESS_IS_FAST 1
|
|
110
|
+
#endif
|
|
111
|
+
|
|
112
|
+
/* With gcc, we can access unaligned memory through 'packed' structures. */
|
|
113
|
+
#define DEFINE_UNALIGNED_TYPE(type) \
|
|
114
|
+
\
|
|
115
|
+
struct type##unaligned { \
|
|
116
|
+
type v; \
|
|
117
|
+
} __attribute__((packed)); \
|
|
118
|
+
\
|
|
119
|
+
static forceinline type \
|
|
120
|
+
load_##type##_unaligned(const void *p) \
|
|
121
|
+
{ \
|
|
122
|
+
return ((const struct type##unaligned *)p)->v; \
|
|
123
|
+
} \
|
|
124
|
+
\
|
|
125
|
+
static forceinline void \
|
|
126
|
+
store_##type##_unaligned(type v, void *p) \
|
|
127
|
+
{ \
|
|
128
|
+
((struct type##unaligned *)p)->v = v; \
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
#define bsr32(n) (31 - __builtin_clz(n))
|
|
132
|
+
#define bsr64(n) (63 - __builtin_clzll(n))
|
|
133
|
+
#define bsf32(n) __builtin_ctz(n)
|
|
134
|
+
#define bsf64(n) __builtin_ctzll(n)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* compiler_msc.h - definitions for the Microsoft C Compiler
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
#define LIBEXPORT __declspec(dllexport)
|
|
6
|
+
|
|
7
|
+
/*
|
|
8
|
+
* Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h.
|
|
9
|
+
* Beware: the below replacement isn't fully standard, since normally any value
|
|
10
|
+
* != 0 should be implicitly cast to a bool with value 1... but that doesn't
|
|
11
|
+
* happen if bool is really just an 'int'.
|
|
12
|
+
*/
|
|
13
|
+
typedef int bool;
|
|
14
|
+
#define true 1
|
|
15
|
+
#define false 0
|
|
16
|
+
#define __bool_true_false_are_defined 1
|
|
17
|
+
|
|
18
|
+
/* Define ssize_t */
|
|
19
|
+
#ifdef _WIN64
|
|
20
|
+
typedef long long ssize_t;
|
|
21
|
+
#else
|
|
22
|
+
typedef int ssize_t;
|
|
23
|
+
#endif
|
|
24
|
+
|
|
25
|
+
/*
|
|
26
|
+
* Old versions (e.g. VS2010) of MSC have stdint.h but not the C99 header
|
|
27
|
+
* inttypes.h. Work around this by defining the PRI* macros ourselves.
|
|
28
|
+
*/
|
|
29
|
+
#include <stdint.h>
|
|
30
|
+
#define PRIu8 "hhu"
|
|
31
|
+
#define PRIu16 "hu"
|
|
32
|
+
#define PRIu32 "u"
|
|
33
|
+
#define PRIu64 "llu"
|
|
34
|
+
#define PRIi8 "hhi"
|
|
35
|
+
#define PRIi16 "hi"
|
|
36
|
+
#define PRIi32 "i"
|
|
37
|
+
#define PRIi64 "lli"
|
|
38
|
+
#define PRIx8 "hhx"
|
|
39
|
+
#define PRIx16 "hx"
|
|
40
|
+
#define PRIx32 "x"
|
|
41
|
+
#define PRIx64 "llx"
|
|
42
|
+
|
|
43
|
+
/* Assume a little endian architecture with fast unaligned access */
|
|
44
|
+
#define CPU_IS_LITTLE_ENDIAN() 1
|
|
45
|
+
#define UNALIGNED_ACCESS_IS_FAST 1
|
|
46
|
+
|
|
47
|
+
/* __restrict has nonstandard behavior; don't use it */
|
|
48
|
+
#define restrict
|
|
49
|
+
|
|
50
|
+
/* ... but we can use __inline and __forceinline */
|
|
51
|
+
#define inline __inline
|
|
52
|
+
#define forceinline __forceinline
|
|
53
|
+
|
|
54
|
+
/* Byte swap functions */
|
|
55
|
+
#define bswap16 _byteswap_ushort
|
|
56
|
+
#define bswap32 _byteswap_ulong
|
|
57
|
+
#define bswap64 _byteswap_uint64
|
|
58
|
+
|
|
59
|
+
/* Bit scan functions (32-bit) */
|
|
60
|
+
|
|
61
|
+
/*
 * bsr32() - Bit Scan Reverse: return the 0-based index of the highest set bit
 * in 'n'.  The result is undefined if n == 0 (mirrors the gcc build, which
 * maps bsr32 onto __builtin_clz).
 */
static forceinline unsigned
bsr32(uint32_t n)
{
	/*
	 * _BitScanReverse() writes the bit index through an 'unsigned long *'
	 * out-parameter.  Use a correctly typed local instead of aliasing 'n'
	 * itself ('uint32_t *' is not 'unsigned long *'; MSVC warns C4057).
	 */
	unsigned long i;

	_BitScanReverse(&i, n);
	return i;
}
#define bsr32 bsr32
|
|
68
|
+
|
|
69
|
+
/*
 * bsf32() - Bit Scan Forward: return the 0-based index of the lowest set bit
 * in 'n'.  The result is undefined if n == 0 (mirrors the gcc build, which
 * maps bsf32 onto __builtin_ctz).
 */
static forceinline unsigned
bsf32(uint32_t n)
{
	/*
	 * _BitScanForward() writes the bit index through an 'unsigned long *'
	 * out-parameter; use a correctly typed local rather than passing
	 * '&n' (a 'uint32_t *'), which is a mismatched pointer type.
	 */
	unsigned long i;

	_BitScanForward(&i, n);
	return i;
}
#define bsf32 bsf32
|
|
76
|
+
|
|
77
|
+
#ifdef _M_X64 /* Bit scan functions (64-bit) */
|
|
78
|
+
|
|
79
|
+
/*
 * bsr64() - Bit Scan Reverse, 64-bit: return the 0-based index of the highest
 * set bit in 'n'.  Undefined if n == 0.
 */
static forceinline unsigned
bsr64(uint64_t n)
{
	/*
	 * _BitScanReverse64() takes an 'unsigned long *' index out-parameter.
	 * Passing '&n' (a 'uint64_t *') is a type mismatch: the intrinsic
	 * would write only the low 4 bytes of the 8-byte variable.  Use a
	 * proper 'unsigned long' local for the result.
	 */
	unsigned long i;

	_BitScanReverse64(&i, n);
	return i;
}
#define bsr64 bsr64
|
|
86
|
+
|
|
87
|
+
/*
 * bsf64() - Bit Scan Forward, 64-bit: return the 0-based index of the lowest
 * set bit in 'n'.  Undefined if n == 0.
 */
static forceinline unsigned
bsf64(uint64_t n)
{
	/*
	 * _BitScanForward64() takes an 'unsigned long *' index out-parameter.
	 * Passing '&n' (a 'uint64_t *') is a type mismatch and would write
	 * only the low 4 bytes of the 8-byte variable; use a correctly typed
	 * local instead.
	 */
	unsigned long i;

	_BitScanForward64(&i, n);
	return i;
}
#define bsf64 bsf64
|
|
94
|
+
|
|
95
|
+
#endif /* _M_X64 */
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* adler32.c - Adler-32 checksum algorithm
|
|
3
|
+
*
|
|
4
|
+
* Originally public domain; changes after 2016-09-07 are copyrighted.
|
|
5
|
+
*
|
|
6
|
+
* Copyright 2016 Eric Biggers
|
|
7
|
+
*
|
|
8
|
+
* Permission is hereby granted, free of charge, to any person
|
|
9
|
+
* obtaining a copy of this software and associated documentation
|
|
10
|
+
* files (the "Software"), to deal in the Software without
|
|
11
|
+
* restriction, including without limitation the rights to use,
|
|
12
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
* copies of the Software, and to permit persons to whom the
|
|
14
|
+
* Software is furnished to do so, subject to the following
|
|
15
|
+
* conditions:
|
|
16
|
+
*
|
|
17
|
+
* The above copyright notice and this permission notice shall be
|
|
18
|
+
* included in all copies or substantial portions of the Software.
|
|
19
|
+
*
|
|
20
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
21
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
22
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
23
|
+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
24
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
25
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
26
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
27
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
#include "x86_cpu_features.h"
|
|
31
|
+
|
|
32
|
+
#include "libdeflate.h"
|
|
33
|
+
|
|
34
|
+
/* The Adler-32 divisor, or "base", value. */
|
|
35
|
+
#define DIVISOR 65521
|
|
36
|
+
|
|
37
|
+
/*
|
|
38
|
+
* MAX_BYTES_PER_CHUNK is the most bytes that can be processed without the
|
|
39
|
+
* possibility of s2 overflowing when it is represented as an unsigned 32-bit
|
|
40
|
+
* integer. This value was computed using the following Python script:
|
|
41
|
+
*
|
|
42
|
+
* divisor = 65521
|
|
43
|
+
* count = 0
|
|
44
|
+
* s1 = divisor - 1
|
|
45
|
+
* s2 = divisor - 1
|
|
46
|
+
* while True:
|
|
47
|
+
* s1 += 0xFF
|
|
48
|
+
* s2 += s1
|
|
49
|
+
* if s2 > 0xFFFFFFFF:
|
|
50
|
+
* break
|
|
51
|
+
* count += 1
|
|
52
|
+
* print(count)
|
|
53
|
+
*
|
|
54
|
+
* Note that to get the correct worst-case value, we must assume that every byte
|
|
55
|
+
* has value 0xFF and that s1 and s2 started with the highest possible values
|
|
56
|
+
* modulo the divisor.
|
|
57
|
+
*/
|
|
58
|
+
#define MAX_BYTES_PER_CHUNK 5552
|
|
59
|
+
|
|
60
|
+
/* Select the implementations to compile in. */
|
|
61
|
+
|
|
62
|
+
#define NEED_GENERIC_IMPL 1 /* include generic impl unless overridden */
|
|
63
|
+
|
|
64
|
+
/* Include the SSE2 implementation? */
|
|
65
|
+
#define NEED_SSE2_IMPL 0
|
|
66
|
+
#ifdef __SSE2__
|
|
67
|
+
# include <emmintrin.h>
|
|
68
|
+
# undef NEED_SSE2_IMPL
|
|
69
|
+
# define NEED_SSE2_IMPL 1
|
|
70
|
+
# undef NEED_GENERIC_IMPL
|
|
71
|
+
# define NEED_GENERIC_IMPL 0 /* generic impl not needed */
|
|
72
|
+
#endif
|
|
73
|
+
|
|
74
|
+
/* Include the AVX2 implementation? */
|
|
75
|
+
#define NEED_AVX2_IMPL 0
|
|
76
|
+
#if defined(__AVX2__) || \
|
|
77
|
+
(X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX2_TARGET && \
|
|
78
|
+
COMPILER_SUPPORTS_TARGET_INTRINSICS)
|
|
79
|
+
# include <immintrin.h>
|
|
80
|
+
# undef NEED_AVX2_IMPL
|
|
81
|
+
# define NEED_AVX2_IMPL 1
|
|
82
|
+
# ifdef __AVX2__ /* compiling for AVX2, i.e. can we assume it's there? */
|
|
83
|
+
# undef NEED_GENERIC_IMPL
|
|
84
|
+
# define NEED_GENERIC_IMPL 0 /* generic impl not needed */
|
|
85
|
+
# undef NEED_SSE2_IMPL
|
|
86
|
+
# define NEED_SSE2_IMPL 0 /* SSE2 impl not needed */
|
|
87
|
+
# endif /* otherwise, we can build an AVX2 version, but we won't know whether
|
|
88
|
+
we can use it until runtime */
|
|
89
|
+
#endif
|
|
90
|
+
|
|
91
|
+
/* Include the NEON implementation? */
|
|
92
|
+
#define NEED_NEON_IMPL 0
|
|
93
|
+
#ifdef __ARM_NEON
|
|
94
|
+
# include <arm_neon.h>
|
|
95
|
+
# undef NEED_NEON_IMPL
|
|
96
|
+
# define NEED_NEON_IMPL 1
|
|
97
|
+
# undef NEED_GENERIC_IMPL
|
|
98
|
+
# define NEED_GENERIC_IMPL 0 /* generic impl not needed */
|
|
99
|
+
#endif
|
|
100
|
+
|
|
101
|
+
#define NUM_IMPLS (NEED_GENERIC_IMPL + NEED_SSE2_IMPL + NEED_AVX2_IMPL + \
|
|
102
|
+
NEED_NEON_IMPL)
|
|
103
|
+
|
|
104
|
+
/* Define the generic implementation if needed. */
|
|
105
|
+
#if NEED_GENERIC_IMPL
|
|
106
|
+
/*
 * adler32_generic() - portable scalar Adler-32 update.
 *
 * @adler:  running checksum (s2 in the high 16 bits, s1 in the low 16 bits)
 * @buffer: input bytes
 * @size:   number of input bytes
 *
 * The input is consumed in chunks of at most MAX_BYTES_PER_CHUNK bytes so
 * that the 32-bit accumulators cannot overflow before each reduction modulo
 * DIVISOR.  The inner loop is unrolled by four for speed.
 *
 * Returns the updated checksum, packed the same way as @adler.
 */
static u32 adler32_generic(u32 adler, const void *buffer, size_t size)
{
	const u8 *in = buffer;
	size_t remaining = size;
	u32 lo = adler & 0xFFFF;	/* s1: running sum of bytes */
	u32 hi = adler >> 16;		/* s2: running sum of s1 values */

	while (remaining != 0) {
		size_t n = MIN(remaining, MAX_BYTES_PER_CHUNK);
		size_t i;

		remaining -= n;

		/* Main loop, unrolled by 4. */
		for (i = 0; i + 4 <= n; i += 4) {
			lo += in[i + 0];
			hi += lo;
			lo += in[i + 1];
			hi += lo;
			lo += in[i + 2];
			hi += lo;
			lo += in[i + 3];
			hi += lo;
		}

		/* Up to 3 leftover bytes of this chunk. */
		for (; i < n; i++) {
			lo += in[i];
			hi += lo;
		}

		in += n;

		/* Chunk sizing guarantees no overflow occurred above. */
		lo %= DIVISOR;
		hi %= DIVISOR;
	}

	return (hi << 16) | lo;
}
|
|
138
|
+
#define DEFAULT_IMPL adler32_generic
|
|
139
|
+
#endif /* NEED_GENERIC_IMPL */
|
|
140
|
+
|
|
141
|
+
#define TARGET_SSE2 100
|
|
142
|
+
#define TARGET_AVX2 200
|
|
143
|
+
#define TARGET_NEON 300
|
|
144
|
+
|
|
145
|
+
/* Define the SSE2 implementation if needed. */
|
|
146
|
+
#if NEED_SSE2_IMPL
|
|
147
|
+
# define FUNCNAME adler32_sse2
|
|
148
|
+
# define TARGET TARGET_SSE2
|
|
149
|
+
# define ALIGNMENT_REQUIRED 16
|
|
150
|
+
# define BYTES_PER_ITERATION 32
|
|
151
|
+
# define ATTRIBUTES
|
|
152
|
+
# define DEFAULT_IMPL adler32_sse2
|
|
153
|
+
# include "adler32_impl.h"
|
|
154
|
+
#endif
|
|
155
|
+
|
|
156
|
+
/* Define the AVX2 implementation if needed. */
|
|
157
|
+
#if NEED_AVX2_IMPL
|
|
158
|
+
# define FUNCNAME adler32_avx2
|
|
159
|
+
# define TARGET TARGET_AVX2
|
|
160
|
+
# define ALIGNMENT_REQUIRED 32
|
|
161
|
+
# define BYTES_PER_ITERATION 32
|
|
162
|
+
# ifdef __AVX2__
|
|
163
|
+
# define ATTRIBUTES
|
|
164
|
+
# define DEFAULT_IMPL adler32_avx2
|
|
165
|
+
# else
|
|
166
|
+
# define ATTRIBUTES __attribute__((target("avx2")))
|
|
167
|
+
# endif
|
|
168
|
+
# include "adler32_impl.h"
|
|
169
|
+
#endif
|
|
170
|
+
|
|
171
|
+
/* Define the NEON implementation if needed. */
|
|
172
|
+
#if NEED_NEON_IMPL
|
|
173
|
+
# define FUNCNAME adler32_neon
|
|
174
|
+
# define TARGET TARGET_NEON
|
|
175
|
+
# define ALIGNMENT_REQUIRED 16
|
|
176
|
+
# define BYTES_PER_ITERATION 32
|
|
177
|
+
# define ATTRIBUTES
|
|
178
|
+
# define DEFAULT_IMPL adler32_neon
|
|
179
|
+
# include "adler32_impl.h"
|
|
180
|
+
#endif
|
|
181
|
+
|
|
182
|
+
typedef u32 (*adler32_func_t)(u32, const void *, size_t);
|
|
183
|
+
|
|
184
|
+
/*
|
|
185
|
+
* If multiple implementations are available, then dispatch among them based on
|
|
186
|
+
* CPU features at runtime. Otherwise just call the single one directly.
|
|
187
|
+
*/
|
|
188
|
+
#if NUM_IMPLS == 1
|
|
189
|
+
# define adler32_impl DEFAULT_IMPL
|
|
190
|
+
#else
|
|
191
|
+
/* Forward declaration: dispatch() must be visible before it can be used as
 * the initial value of the function pointer below. */
static u32 dispatch(u32, const void *, size_t);

/* Function pointer through which all Adler-32 computation goes.  It starts
 * out pointing at dispatch(), which picks the best implementation for the
 * running CPU on the first call and then redirects this pointer so that
 * later calls skip the selection. */
static adler32_func_t adler32_impl = dispatch;

/* One-shot implementation selector: choose the fastest compiled-in
 * implementation supported by the current CPU, cache that choice in
 * 'adler32_impl', and compute this first call's result with it. */
static u32 dispatch(u32 adler, const void *buffer, size_t size)
{
	adler32_func_t f = DEFAULT_IMPL;
#if NEED_AVX2_IMPL && !defined(__AVX2__)
	/* AVX2 code was compiled in but the build target does not guarantee
	 * AVX2, so probe for it at runtime. */
	if (x86_have_cpu_features(X86_CPU_FEATURE_AVX2))
		f = adler32_avx2;
#endif
	/* NOTE(review): this pointer update is not atomic; presumably racing
	 * first calls writing the same value are considered benign on the
	 * supported platforms — confirm if thread-safety matters. */
	adler32_impl = f;
	return adler32_impl(adler, buffer, size);
}
|
|
205
|
+
#endif /* NUM_IMPLS != 1 */
|
|
206
|
+
|
|
207
|
+
LIBDEFLATEAPI u32
|
|
208
|
+
libdeflate_adler32(u32 adler, const void *buffer, size_t size)
|
|
209
|
+
{
|
|
210
|
+
if (buffer == NULL) /* return initial value */
|
|
211
|
+
return 1;
|
|
212
|
+
return adler32_impl(adler, buffer, size);
|
|
213
|
+
}
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* adler32_impl.h
|
|
3
|
+
*
|
|
4
|
+
* Originally public domain; changes after 2016-09-07 are copyrighted.
|
|
5
|
+
*
|
|
6
|
+
* Copyright 2016 Eric Biggers
|
|
7
|
+
*
|
|
8
|
+
* Permission is hereby granted, free of charge, to any person
|
|
9
|
+
* obtaining a copy of this software and associated documentation
|
|
10
|
+
* files (the "Software"), to deal in the Software without
|
|
11
|
+
* restriction, including without limitation the rights to use,
|
|
12
|
+
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
* copies of the Software, and to permit persons to whom the
|
|
14
|
+
* Software is furnished to do so, subject to the following
|
|
15
|
+
* conditions:
|
|
16
|
+
*
|
|
17
|
+
* The above copyright notice and this permission notice shall be
|
|
18
|
+
* included in all copies or substantial portions of the Software.
|
|
19
|
+
*
|
|
20
|
+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
21
|
+
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
22
|
+
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
23
|
+
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
24
|
+
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
25
|
+
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
26
|
+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
27
|
+
* OTHER DEALINGS IN THE SOFTWARE.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
/*
|
|
31
|
+
* This file contains a template for vectorized Adler-32 implementations.
|
|
32
|
+
*
|
|
33
|
+
* The inner loop between reductions modulo 65521 of an unvectorized Adler-32
|
|
34
|
+
* implementation looks something like this:
|
|
35
|
+
*
|
|
36
|
+
* do {
|
|
37
|
+
* s1 += *p;
|
|
38
|
+
* s2 += s1;
|
|
39
|
+
* } while (++p != chunk_end);
|
|
40
|
+
*
|
|
41
|
+
* For vectorized calculation of s1, we only need to sum the input bytes. They
|
|
42
|
+
* can be accumulated into multiple counters which are eventually summed
|
|
43
|
+
* together.
|
|
44
|
+
*
|
|
45
|
+
* For vectorized calculation of s2, the basic idea is that for each iteration
|
|
46
|
+
* that processes N bytes, we can perform the following vectorizable
|
|
47
|
+
* calculation:
|
|
48
|
+
*
|
|
49
|
+
* s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
|
|
50
|
+
*
|
|
51
|
+
* Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
|
|
52
|
+
* separate counters, then do the multiplications by N...1 just once at the end
|
|
53
|
+
* rather than once per iteration.
|
|
54
|
+
*
|
|
55
|
+
* Also, we must account for how previous bytes will affect s2 by doing the
|
|
56
|
+
* following at beginning of each iteration:
|
|
57
|
+
*
|
|
58
|
+
* s2 += s1 * N
|
|
59
|
+
*
|
|
60
|
+
* Furthermore, like s1, "s2" can actually be multiple counters which are
|
|
61
|
+
* eventually summed together.
|
|
62
|
+
*/
|
|
63
|
+
|
|
64
|
+
/*
 * FUNCNAME() - vectorized Adler-32 update (template instantiated per TARGET:
 * AVX2, SSE2, or NEON).
 *
 * @adler:  running checksum (s2 in the high 16 bits, s1 in the low 16 bits)
 * @buffer: input bytes
 * @size:   number of input bytes
 *
 * Returns the updated checksum, packed the same way as @adler.
 */
static u32 ATTRIBUTES
FUNCNAME(u32 adler, const void *buffer, size_t size)
{
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;
	const u8 *p = buffer;
	const u8 * const end = p + size;
	const u8 *vend;

	/* Process a byte at a time until the required alignment is reached. */
	if (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED) {
		do {
			s1 += *p++;
			s2 += s1;
		} while (p != end && (uintptr_t)p % ALIGNMENT_REQUIRED);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	/*
	 * Process "chunks" of bytes using vector instructions.  Chunk sizes are
	 * limited to MAX_BYTES_PER_CHUNK, which guarantees that s1 and s2 never
	 * overflow before being reduced modulo DIVISOR.  For vector processing,
	 * chunk sizes are also made evenly divisible by BYTES_PER_ITERATION.
	 */
	STATIC_ASSERT(BYTES_PER_ITERATION % ALIGNMENT_REQUIRED == 0);
	vend = end - ((size_t)(end - p) % BYTES_PER_ITERATION);
	while (p != vend) {
		size_t chunk_size;
		const u8 *chunk_end;

		chunk_size = MIN((size_t)(vend - p), MAX_BYTES_PER_CHUNK);
#if TARGET == TARGET_SSE2
		/* SSE2: the 16-bit precision byte counters must not undergo
		 * *signed* overflow, otherwise the signed multiplication at the
		 * end will not behave as desired. */
		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0x7FFF / 0xFF));
#elif TARGET == TARGET_NEON
		/* NEON: the 16-bit precision counters must not undergo
		 * *unsigned* overflow. */
		chunk_size = MIN(chunk_size, BYTES_PER_ITERATION * (0xFFFF / 0xFF));
#endif
		chunk_size -= chunk_size % BYTES_PER_ITERATION;

		chunk_end = p + chunk_size;

		/* Account for the s1 contribution of all bytes already summed:
		 * each of the chunk_size upcoming additions to s2 includes the
		 * current s1. */
		s2 += s1 * chunk_size;
		{
#if TARGET == TARGET_AVX2
			/* AVX2 implementation */
			const __m256i zeroes = _mm256_setzero_si256();
			/* Weights N..1 applied to each byte's contribution to s2. */
			const __v32qi multipliers = (__v32qi) { 32, 31, 30, 29, 28, 27, 26, 25,
								24, 23, 22, 21, 20, 19, 18, 17,
								16, 15, 14, 13, 12, 11, 10, 9,
								8,  7,  6,  5,  4,  3,  2,  1 };
			const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
			__v8si v_s1 = (__v8si)zeroes;
			__v8si v_s1_sums = (__v8si)zeroes;
			__v8si v_s2 = (__v8si)zeroes;
			STATIC_ASSERT(ALIGNMENT_REQUIRED == 32 && BYTES_PER_ITERATION == 32);
			do {
				/* NOTE(review): aligned vector load; relies on the
				 * alignment loop above having brought p to a
				 * 32-byte boundary. */
				__m256i bytes = *(const __m256i *)p;
				__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
							bytes, (__m256i)multipliers);
				v_s1_sums += v_s1;
				v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
				v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
			} while ((p += BYTES_PER_ITERATION) != chunk_end);

			/* Horizontal reduction of the s1 lanes. */
			v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
			v_s1 = (__v8si)_mm256_hadd_epi32((__m256i)v_s1, zeroes);
			s1 += v_s1[0] + v_s1[4];

			/* Fold in the deferred s1 sums (x32 == << 5), then reduce
			 * the s2 lanes. */
			v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
			v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
			v_s2 = (__v8si)_mm256_hadd_epi32((__m256i)v_s2, zeroes);
			s2 += v_s2[0] + v_s2[4];

#elif TARGET == TARGET_SSE2
			/* SSE2 implementation */
			const __m128i zeroes = _mm_setzero_si128();

			/* s1 counters: 32-bit, sum of bytes */
			__v4si v_s1 = (__v4si)zeroes;

			/* s2 counters: 32-bit, sum of s1 values */
			__v4si v_s2 = (__v4si)zeroes;

			/*
			 * Thirty-two 16-bit counters for byte sums.  Each accumulates
			 * the bytes that eventually need to be multiplied by a number
			 * 32...1 for addition into s2.
			 */
			__v8hi v_byte_sums_a = (__v8hi)zeroes;
			__v8hi v_byte_sums_b = (__v8hi)zeroes;
			__v8hi v_byte_sums_c = (__v8hi)zeroes;
			__v8hi v_byte_sums_d = (__v8hi)zeroes;

			STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
			do {
				/* Load the next 32 bytes. */
				const __m128i bytes1 = *(const __m128i *)p;
				const __m128i bytes2 = *(const __m128i *)(p + 16);

				/*
				 * Accumulate the previous s1 counters into the s2
				 * counters.  Logically, this really should be
				 * v_s2 += v_s1 * BYTES_PER_ITERATION, but we can do the
				 * multiplication (or left shift) later.
				 */
				v_s2 += v_s1;

				/*
				 * s1 update: use "Packed Sum of Absolute Differences"
				 * to add the bytes horizontally with 8 bytes per sum.
				 * Then add the sums to the s1 counters.
				 */
				v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
				v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);

				/*
				 * Also accumulate the bytes into 32 separate counters
				 * that have 16-bit precision.
				 */
				v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
				v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
				v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
				v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);

			} while ((p += BYTES_PER_ITERATION) != chunk_end);

			/* Finish calculating the s2 counters. */
			v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
			v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
						       (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
			v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
						       (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
			v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
						       (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
			v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
						       (__m128i)(__v8hi){ 8,  7,  6,  5,  4,  3,  2,  1 });

			/* Now accumulate what we computed into the real s1 and s2. */
			v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x31);
			v_s1 += (__v4si)_mm_shuffle_epi32((__m128i)v_s1, 0x02);
			s1 += _mm_cvtsi128_si32((__m128i)v_s1);

			v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x31);
			v_s2 += (__v4si)_mm_shuffle_epi32((__m128i)v_s2, 0x02);
			s2 += _mm_cvtsi128_si32((__m128i)v_s2);

#elif TARGET == TARGET_NEON
			/* ARM NEON (Advanced SIMD) implementation */
			uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
			uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
			/* 16-bit per-position byte sums, weighted 32..1 at the end. */
			uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
			uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
			uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
			uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };

			STATIC_ASSERT(ALIGNMENT_REQUIRED == 16 && BYTES_PER_ITERATION == 32);
			do {
				const uint8x16_t bytes1 = *(const uint8x16_t *)p;
				const uint8x16_t bytes2 = *(const uint8x16_t *)(p + 16);
				uint16x8_t tmp;

				/* Deferred s2 += s1 * 32; the shift happens below. */
				v_s2 += v_s1;

				/* Pairwise-add the 32 bytes down to 16-bit lanes,
				 * then fold into the 32-bit s1 counters. */
				tmp = vpaddlq_u8(bytes1);
				tmp = vpadalq_u8(tmp, bytes2);
				v_s1 = vpadalq_u16(v_s1, tmp);

				v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
				v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
				v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
				v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));

			} while ((p += BYTES_PER_ITERATION) != chunk_end);

			/* Apply the deferred x32, then the per-position weights. */
			v_s2 = vqshlq_n_u32(v_s2, 5);
			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 });
			v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 });
			v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
			v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 });
			v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 });
			v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) { 8,  7,  6,  5 });
			v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4,  3,  2,  1 });

			s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
			s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
#else
#  error "BUG: unknown target"
#endif
		}

		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	/* Process any remaining bytes. */
	if (p != end) {
		do {
			s1 += *p++;
			s2 += s1;
		} while (p != end);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	return (s2 << 16) | s1;
}

/* Clean up the template parameters so this header can be included again
 * for the next TARGET. */
#undef FUNCNAME
#undef TARGET
#undef ALIGNMENT_REQUIRED
#undef BYTES_PER_ITERATION
#undef ATTRIBUTES
|