deflate-ruby 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CLAUDE.md +138 -0
- data/LICENSE.txt +21 -0
- data/README.md +117 -0
- data/ext/deflate_ruby/deflate_ruby.c +301 -0
- data/ext/deflate_ruby/extconf.rb +34 -0
- data/ext/deflate_ruby/libdeflate/CMakeLists.txt +270 -0
- data/ext/deflate_ruby/libdeflate/COPYING +22 -0
- data/ext/deflate_ruby/libdeflate/NEWS.md +494 -0
- data/ext/deflate_ruby/libdeflate/README.md +228 -0
- data/ext/deflate_ruby/libdeflate/common_defs.h +747 -0
- data/ext/deflate_ruby/libdeflate/lib/adler32.c +162 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/adler32_impl.h +358 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.c +230 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/cpu_features.h +214 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/crc32_impl.h +600 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h +156 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h +226 -0
- data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h +78 -0
- data/ext/deflate_ruby/libdeflate/lib/bt_matchfinder.h +342 -0
- data/ext/deflate_ruby/libdeflate/lib/cpu_features_common.h +93 -0
- data/ext/deflate_ruby/libdeflate/lib/crc32.c +262 -0
- data/ext/deflate_ruby/libdeflate/lib/crc32_multipliers.h +377 -0
- data/ext/deflate_ruby/libdeflate/lib/crc32_tables.h +587 -0
- data/ext/deflate_ruby/libdeflate/lib/decompress_template.h +777 -0
- data/ext/deflate_ruby/libdeflate/lib/deflate_compress.c +4129 -0
- data/ext/deflate_ruby/libdeflate/lib/deflate_compress.h +15 -0
- data/ext/deflate_ruby/libdeflate/lib/deflate_constants.h +56 -0
- data/ext/deflate_ruby/libdeflate/lib/deflate_decompress.c +1208 -0
- data/ext/deflate_ruby/libdeflate/lib/gzip_compress.c +90 -0
- data/ext/deflate_ruby/libdeflate/lib/gzip_constants.h +45 -0
- data/ext/deflate_ruby/libdeflate/lib/gzip_decompress.c +144 -0
- data/ext/deflate_ruby/libdeflate/lib/hc_matchfinder.h +401 -0
- data/ext/deflate_ruby/libdeflate/lib/ht_matchfinder.h +234 -0
- data/ext/deflate_ruby/libdeflate/lib/lib_common.h +106 -0
- data/ext/deflate_ruby/libdeflate/lib/matchfinder_common.h +224 -0
- data/ext/deflate_ruby/libdeflate/lib/riscv/matchfinder_impl.h +97 -0
- data/ext/deflate_ruby/libdeflate/lib/utils.c +141 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/adler32_impl.h +134 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/adler32_template.h +518 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.c +183 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/cpu_features.h +169 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/crc32_impl.h +160 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/crc32_pclmul_template.h +495 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/decompress_impl.h +57 -0
- data/ext/deflate_ruby/libdeflate/lib/x86/matchfinder_impl.h +122 -0
- data/ext/deflate_ruby/libdeflate/lib/zlib_compress.c +82 -0
- data/ext/deflate_ruby/libdeflate/lib/zlib_constants.h +21 -0
- data/ext/deflate_ruby/libdeflate/lib/zlib_decompress.c +104 -0
- data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +3 -0
- data/ext/deflate_ruby/libdeflate/libdeflate.h +411 -0
- data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +18 -0
- data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +105 -0
- data/ext/deflate_ruby/libdeflate/programs/benchmark.c +696 -0
- data/ext/deflate_ruby/libdeflate/programs/checksum.c +218 -0
- data/ext/deflate_ruby/libdeflate/programs/config.h.in +19 -0
- data/ext/deflate_ruby/libdeflate/programs/gzip.c +688 -0
- data/ext/deflate_ruby/libdeflate/programs/prog_util.c +521 -0
- data/ext/deflate_ruby/libdeflate/programs/prog_util.h +225 -0
- data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +200 -0
- data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +155 -0
- data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +385 -0
- data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +130 -0
- data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +72 -0
- data/ext/deflate_ruby/libdeflate/programs/test_overread.c +95 -0
- data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +472 -0
- data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +151 -0
- data/ext/deflate_ruby/libdeflate/programs/test_util.c +237 -0
- data/ext/deflate_ruby/libdeflate/programs/test_util.h +61 -0
- data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +118 -0
- data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +118 -0
- data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +69 -0
- data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +10 -0
- data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +10 -0
- data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +253 -0
- data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +17 -0
- data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +119 -0
- data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +38 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +37 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +19 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +199 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +105 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +44 -0
- data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +29 -0
- data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +523 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +95 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +3 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +62 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +108 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +19 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +3 -0
- data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +19 -0
- data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +416 -0
- data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +8 -0
- data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +8 -0
- data/lib/deflate_ruby/version.rb +5 -0
- data/lib/deflate_ruby.rb +71 -0
- metadata +191 -0
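
Editorial note: the extension source data/ext/deflate_ruby/deflate_ruby.c binds Ruby to the vendored libdeflate, whose public C API is declared in libdeflate.h (both listed above). As a rough orientation only — the Ruby-level wrapper in data/lib/deflate_ruby.rb is not reproduced in this diff — a minimal round-trip through that C API looks like the sketch below; buffer sizing and error handling are simplified.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "libdeflate.h"

int main(void)
{
	const char msg[] = "hello, deflate";
	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
	size_t bound = libdeflate_deflate_compress_bound(c, sizeof(msg));
	void *cbuf = malloc(bound);
	char out[sizeof(msg)];
	size_t csize, outsize;

	/* Compress, then decompress into a buffer of the known original size. */
	csize = libdeflate_deflate_compress(c, msg, sizeof(msg), cbuf, bound);
	if (csize == 0 ||
	    libdeflate_deflate_decompress(d, cbuf, csize, out, sizeof(out),
					  &outsize) != LIBDEFLATE_SUCCESS)
		return 1;
	printf("%zu -> %zu -> %zu bytes\n", sizeof(msg), csize, outsize);
	libdeflate_free_compressor(c);
	libdeflate_free_decompressor(d);
	free(cbuf);
	return 0;
}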
data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_helpers.h
@@ -0,0 +1,156 @@
+/*
+ * arm/crc32_pmull_helpers.h - helper functions for CRC-32 folding with PMULL
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file is a "template" for instantiating helper functions for CRC folding
+ * with pmull instructions. It accepts the following parameters:
+ *
+ * SUFFIX:
+ *	Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ *	Target function attributes to use.
+ * ENABLE_EOR3:
+ *	Use the eor3 instruction (from the sha3 extension).
+ */
+
+/* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */
+#undef u32_to_bytevec
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(u32_to_bytevec)(u32 a)
+{
+	return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0));
+}
+#define u32_to_bytevec	ADD_SUFFIX(u32_to_bytevec)
+
+/* Load two 64-bit values into a vector. */
+#undef load_multipliers
+static forceinline ATTRIBUTES poly64x2_t
+ADD_SUFFIX(load_multipliers)(const u64 p[2])
+{
+	return vreinterpretq_p64_u64(vld1q_u64(p));
+}
+#define load_multipliers	ADD_SUFFIX(load_multipliers)
+
+/* Do carryless multiplication of the low halves of two vectors. */
+#undef clmul_low
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b)
+{
+	return vreinterpretq_u8_p128(
+		compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0),
+				 vgetq_lane_p64(b, 0)));
+}
+#define clmul_low	ADD_SUFFIX(clmul_low)
+
+/* Do carryless multiplication of the high halves of two vectors. */
+#undef clmul_high
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b)
+{
+#ifdef __clang__
+	/*
+	 * Use inline asm to ensure that pmull2 is really used. This works
+	 * around clang bug https://github.com/llvm/llvm-project/issues/52868.
+	 */
+	uint8x16_t res;
+
+	__asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b));
+	return res;
+#else
+	return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b));
+#endif
+}
+#define clmul_high	ADD_SUFFIX(clmul_high)
+
+#undef eor3
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+{
+#if ENABLE_EOR3
+	return veor3q_u8(a, b, c);
+#else
+	return veorq_u8(veorq_u8(a, b), c);
+#endif
+}
+#define eor3	ADD_SUFFIX(eor3)
+
+#undef fold_vec
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers)
+{
+	uint8x16_t a = clmul_low(src, multipliers);
+	uint8x16_t b = clmul_high(src, multipliers);
+
+	return eor3(a, b, dst);
+}
+#define fold_vec	ADD_SUFFIX(fold_vec)
+
+/*
+ * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the
+ * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the
+ * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+ * respectively. Then fold x0 into x1 and return the result. Assumes that
+ * 'p + len - 16' is in-bounds.
+ */
+#undef fold_partial_vec
+static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t
+ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len,
+			     poly64x2_t multipliers_1)
+{
+	/*
+	 * vqtbl1q_u8(v, shift_tab[len..len+15]) left shifts v by 16-len bytes.
+	 * vqtbl1q_u8(v, shift_tab[len+16..len+31]) right shifts v by len bytes.
+	 */
+	static const u8 shift_tab[48] = {
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	};
+	const uint8x16_t lshift = vld1q_u8(&shift_tab[len]);
+	const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]);
+	uint8x16_t x0, x1, bsl_mask;
+
+	/* x0 = v left-shifted by '16 - len' bytes */
+	x0 = vqtbl1q_u8(v, lshift);
+
+	/* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */
+	bsl_mask = vreinterpretq_u8_s8(
+		vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7));
+
+	/*
+	 * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len'
+	 * bytes) followed by the remaining data.
+	 */
+	x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */,
+		      vld1q_u8(p + len - 16), vqtbl1q_u8(v, rshift));
+
+	return fold_vec(x0, x1, multipliers_1);
+}
+#define fold_partial_vec	ADD_SUFFIX(fold_partial_vec)
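
Editorial note: the SUFFIX / ATTRIBUTES / ENABLE_EOR3 parameters documented at the top of the file above are supplied by the including translation unit via the preprocessor. The sketch below shows the instantiation pattern such a template expects; the suffix, attribute string, and ADD_SUFFIX pasting helpers are illustrative, not copied from lib/arm/crc32_impl.h (where the real instantiations in this package live).

/* Token-pasting helpers so that SUFFIX is macro-expanded before pasting. */
#define ADD_SUFFIX2(name, suffix)	name##suffix
#define ADD_SUFFIX1(name, suffix)	ADD_SUFFIX2(name, suffix)
#define ADD_SUFFIX(name)		ADD_SUFFIX1(name, SUFFIX)

#define SUFFIX		_pmull_demo			/* illustrative suffix */
#define ATTRIBUTES	_target_attribute("+crypto")	/* assumed attribute string */
#define ENABLE_EOR3	0
#include "crc32_pmull_helpers.h"
/*
 * After this include, u32_to_bytevec, load_multipliers, clmul_low/high, eor3,
 * fold_vec, and fold_partial_vec expand to _pmull_demo-suffixed functions
 * compiled with the chosen target attributes.
 */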
data/ext/deflate_ruby/libdeflate/lib/arm/crc32_pmull_wide.h
@@ -0,0 +1,226 @@
+/*
+ * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version)
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file is a "template" for instantiating PMULL-based crc32_arm functions.
+ * The "parameters" are:
+ *
+ * SUFFIX:
+ *	Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ *	Target function attributes to use.
+ * ENABLE_EOR3:
+ *	Use the eor3 instruction (from the sha3 extension).
+ *
+ * This is the extra-wide version; it uses an unusually large stride length of
+ * 12, and it assumes that crc32 instructions are available too. It's intended
+ * for powerful CPUs that support both pmull and crc32 instructions, but where
+ * throughput of pmull and xor (given enough instructions issued in parallel) is
+ * significantly higher than that of crc32, thus making the crc32 instructions
+ * (counterintuitively) not actually the fastest way to compute the CRC-32. The
+ * Apple M1 processor is an example of such a CPU.
+ */
+
+#include "crc32_pmull_helpers.h"
+
+static ATTRIBUTES u32
+ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
+{
+	uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
+
+	if (len < 3 * 192) {
+		static const u64 _aligned_attribute(16) mults[3][2] = {
+			{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
+			{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
+			{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
+		};
+		poly64x2_t multipliers_4, multipliers_2, multipliers_1;
+
+		if (len < 64)
+			goto tail;
+		multipliers_4 = load_multipliers(mults[0]);
+		multipliers_2 = load_multipliers(mults[1]);
+		multipliers_1 = load_multipliers(mults[2]);
+		/*
+		 * Short length; don't bother aligning the pointer, and fold
+		 * 64 bytes (4 vectors) at a time, at most.
+		 */
+		v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc));
+		v1 = vld1q_u8(p + 16);
+		v2 = vld1q_u8(p + 32);
+		v3 = vld1q_u8(p + 48);
+		p += 64;
+		len -= 64;
+		while (len >= 64) {
+			v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4);
+			v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4);
+			v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4);
+			v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4);
+			p += 64;
+			len -= 64;
+		}
+		v0 = fold_vec(v0, v2, multipliers_2);
+		v1 = fold_vec(v1, v3, multipliers_2);
+		if (len >= 32) {
+			v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2);
+			v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2);
+			p += 32;
+			len -= 32;
+		}
+		v0 = fold_vec(v0, v1, multipliers_1);
+	} else {
+		static const u64 _aligned_attribute(16) mults[4][2] = {
+			{ CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */
+			{ CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */
+			{ CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */
+			{ CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */
+		};
+		const poly64x2_t multipliers_12 = load_multipliers(mults[0]);
+		const poly64x2_t multipliers_6 = load_multipliers(mults[1]);
+		const poly64x2_t multipliers_3 = load_multipliers(mults[2]);
+		const poly64x2_t multipliers_1 = load_multipliers(mults[3]);
+		const size_t align = -(uintptr_t)p & 15;
+		const uint8x16_t *vp;
+
+		/* Align p to the next 16-byte boundary. */
+		if (align) {
+			if (align & 1)
+				crc = __crc32b(crc, *p++);
+			if (align & 2) {
+				crc = __crc32h(crc, le16_bswap(*(u16 *)p));
+				p += 2;
+			}
+			if (align & 4) {
+				crc = __crc32w(crc, le32_bswap(*(u32 *)p));
+				p += 4;
+			}
+			if (align & 8) {
+				crc = __crc32d(crc, le64_bswap(*(u64 *)p));
+				p += 8;
+			}
+			len -= align;
+		}
+		vp = (const uint8x16_t *)p;
+		v0 = veorq_u8(*vp++, u32_to_bytevec(crc));
+		v1 = *vp++;
+		v2 = *vp++;
+		v3 = *vp++;
+		v4 = *vp++;
+		v5 = *vp++;
+		v6 = *vp++;
+		v7 = *vp++;
+		v8 = *vp++;
+		v9 = *vp++;
+		v10 = *vp++;
+		v11 = *vp++;
+		len -= 192;
+		/* Fold 192 bytes (12 vectors) at a time. */
+		do {
+			v0 = fold_vec(v0, *vp++, multipliers_12);
+			v1 = fold_vec(v1, *vp++, multipliers_12);
+			v2 = fold_vec(v2, *vp++, multipliers_12);
+			v3 = fold_vec(v3, *vp++, multipliers_12);
+			v4 = fold_vec(v4, *vp++, multipliers_12);
+			v5 = fold_vec(v5, *vp++, multipliers_12);
+			v6 = fold_vec(v6, *vp++, multipliers_12);
+			v7 = fold_vec(v7, *vp++, multipliers_12);
+			v8 = fold_vec(v8, *vp++, multipliers_12);
+			v9 = fold_vec(v9, *vp++, multipliers_12);
+			v10 = fold_vec(v10, *vp++, multipliers_12);
+			v11 = fold_vec(v11, *vp++, multipliers_12);
+			len -= 192;
+		} while (len >= 192);
+
+		/*
+		 * Fewer than 192 bytes left. Fold v0-v11 down to just v0,
+		 * while processing up to 144 more bytes.
+		 */
+		v0 = fold_vec(v0, v6, multipliers_6);
+		v1 = fold_vec(v1, v7, multipliers_6);
+		v2 = fold_vec(v2, v8, multipliers_6);
+		v3 = fold_vec(v3, v9, multipliers_6);
+		v4 = fold_vec(v4, v10, multipliers_6);
+		v5 = fold_vec(v5, v11, multipliers_6);
+		if (len >= 96) {
+			v0 = fold_vec(v0, *vp++, multipliers_6);
+			v1 = fold_vec(v1, *vp++, multipliers_6);
+			v2 = fold_vec(v2, *vp++, multipliers_6);
+			v3 = fold_vec(v3, *vp++, multipliers_6);
+			v4 = fold_vec(v4, *vp++, multipliers_6);
+			v5 = fold_vec(v5, *vp++, multipliers_6);
+			len -= 96;
+		}
+		v0 = fold_vec(v0, v3, multipliers_3);
+		v1 = fold_vec(v1, v4, multipliers_3);
+		v2 = fold_vec(v2, v5, multipliers_3);
+		if (len >= 48) {
+			v0 = fold_vec(v0, *vp++, multipliers_3);
+			v1 = fold_vec(v1, *vp++, multipliers_3);
+			v2 = fold_vec(v2, *vp++, multipliers_3);
+			len -= 48;
+		}
+		v0 = fold_vec(v0, v1, multipliers_1);
+		v0 = fold_vec(v0, v2, multipliers_1);
+		p = (const u8 *)vp;
+	}
+	/* Reduce 128 to 32 bits using crc32 instructions. */
+	crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0));
+	crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1));
+tail:
+	/* Finish up the remainder using crc32 instructions. */
+	if (len & 32) {
+		crc = __crc32d(crc, get_unaligned_le64(p + 0));
+		crc = __crc32d(crc, get_unaligned_le64(p + 8));
+		crc = __crc32d(crc, get_unaligned_le64(p + 16));
+		crc = __crc32d(crc, get_unaligned_le64(p + 24));
+		p += 32;
+	}
+	if (len & 16) {
+		crc = __crc32d(crc, get_unaligned_le64(p + 0));
+		crc = __crc32d(crc, get_unaligned_le64(p + 8));
+		p += 16;
+	}
+	if (len & 8) {
+		crc = __crc32d(crc, get_unaligned_le64(p));
+		p += 8;
+	}
+	if (len & 4) {
+		crc = __crc32w(crc, get_unaligned_le32(p));
+		p += 4;
+	}
+	if (len & 2) {
+		crc = __crc32h(crc, get_unaligned_le16(p));
+		p += 2;
+	}
+	if (len & 1)
+		crc = __crc32b(crc, *p);
+	return crc;
+}
+
+#undef SUFFIX
+#undef ATTRIBUTES
+#undef ENABLE_EOR3
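
Editorial note on the mults tables above (not from the package): for a fold distance of n vectors (16·n bytes), the multiplier pair is { x^(128n+31) mod G(x), x^(128n−33) mod G(x) }. Worked example: n = 12 gives 128·12 + 31 = 1567 and 128·12 − 33 = 1503, matching CRC32_X1567_MODG / CRC32_X1503_MODG, and n = 1 gives 159 and 95. The +31/−33 offsets presumably come from the bit-reflected representation used by the gzip CRC-32; the constants themselves are produced by scripts/gen_crc32_multipliers.c and stored in lib/crc32_multipliers.h, both listed above.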
data/ext/deflate_ruby/libdeflate/lib/arm/matchfinder_impl.h
@@ -0,0 +1,78 @@
+/*
+ * arm/matchfinder_impl.h - ARM implementations of matchfinder functions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_MATCHFINDER_IMPL_H
+#define LIB_ARM_MATCHFINDER_IMPL_H
+
+#include "cpu_features.h"
+
+#if HAVE_NEON_NATIVE
+static forceinline void
+matchfinder_init_neon(mf_pos_t *data, size_t size)
+{
+	int16x8_t *p = (int16x8_t *)data;
+	int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL);
+
+	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+	do {
+		p[0] = v;
+		p[1] = v;
+		p[2] = v;
+		p[3] = v;
+		p += 4;
+		size -= 4 * sizeof(*p);
+	} while (size != 0);
+}
+#define matchfinder_init matchfinder_init_neon
+
+static forceinline void
+matchfinder_rebase_neon(mf_pos_t *data, size_t size)
+{
+	int16x8_t *p = (int16x8_t *)data;
+	int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+	STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+	STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+	do {
+		p[0] = vqaddq_s16(p[0], v);
+		p[1] = vqaddq_s16(p[1], v);
+		p[2] = vqaddq_s16(p[2], v);
+		p[3] = vqaddq_s16(p[3], v);
+		p += 4;
+		size -= 4 * sizeof(*p);
+	} while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_neon
+
+#endif /* HAVE_NEON_NATIVE */
+
+#endif /* LIB_ARM_MATCHFINDER_IMPL_H */
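
Editorial note: the NEON loops above are a vectorized fill (init) and a saturating add of -MATCHFINDER_WINDOW_SIZE (rebase) over an array of 16-bit match-finder positions. As a sketch of what matchfinder_init_neon accomplishes in plain C (the package's actual portable fallback presumably lives in lib/matchfinder_common.h, listed above):

/* Scalar sketch: fill every 16-bit match-finder position with
 * MATCHFINDER_INITVAL, i.e. what matchfinder_init_neon does 32 bytes at a time. */
static void
matchfinder_init_scalar(mf_pos_t *data, size_t size)
{
	size_t n = size / sizeof(*data);
	size_t i;

	for (i = 0; i < n; i++)
		data[i] = MATCHFINDER_INITVAL;
}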