deflate-ruby 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CLAUDE.md +95 -92
  3. data/LICENSE.txt +6 -6
  4. data/README.md +87 -65
  5. data/Rakefile +23 -0
  6. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
  7. data/ext/deflate_ruby/common_defs.h +748 -0
  8. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
  9. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
  10. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
  11. data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
  12. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
  13. data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
  14. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
  15. data/ext/deflate_ruby/deflate_ruby.c +392 -218
  16. data/ext/deflate_ruby/deflate_ruby.h +6 -0
  17. data/ext/deflate_ruby/extconf.rb +35 -25
  18. data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
  19. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
  20. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
  21. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
  22. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
  23. data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
  24. data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
  25. data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
  26. data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
  27. data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
  28. data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
  29. data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
  30. data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
  31. data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
  32. data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
  33. data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
  34. data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
  35. data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
  36. data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
  37. data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
  38. data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
  39. data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
  40. data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
  41. data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
  42. data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
  43. data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
  44. data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
  45. data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
  46. data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
  47. data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
  48. data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
  49. data/ext/deflate_ruby/libdeflate.h +411 -0
  50. data/ext/deflate_ruby/matchfinder_common.h +224 -0
  51. data/ext/deflate_ruby/matchfinder_impl.h +122 -0
  52. data/ext/deflate_ruby/utils.c +141 -0
  53. data/ext/deflate_ruby/zlib_compress.c +82 -0
  54. data/ext/deflate_ruby/zlib_constants.h +21 -0
  55. data/ext/deflate_ruby/zlib_decompress.c +104 -0
  56. data/lib/deflate_ruby/version.rb +1 -1
  57. data/lib/deflate_ruby.rb +1 -63
  58. data/sig/deflate_ruby.rbs +4 -0
  59. data/test/test_deflate_ruby.rb +220 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +89 -144
  62. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
  63. data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
  64. data/ext/deflate_ruby/libdeflate/README.md +0 -228
  65. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
  66. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
  67. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
  68. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
  69. data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
  70. data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
  71. data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
  72. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
  73. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
  74. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
  75. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
  76. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
  77. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
  78. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
  79. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
  80. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
  81. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
  82. data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
  83. data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
  84. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
  85. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
  86. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
  87. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
  88. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
  89. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
  90. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
  91. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
  92. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
  93. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
  94. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
  95. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
  96. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
  97. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
  98. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
  99. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
  100. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  101. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
  102. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
  103. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
  104. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
  105. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  106. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
  107. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
  108. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
  109. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
  110. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
  111. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
  112. /data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
  113. /data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
  114. /data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
  115. /data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
  116. /data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
  117. /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
  118. /data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
  119. /data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
  120. /data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
  121. /data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
  122. /data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
  123. /data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
  124. /data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
  125. /data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
  126. /data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
  127. /data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
  128. /data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
  129. /data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
  130. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
  131. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
  132. /data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
  133. /data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
  134. /data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
  135. /data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
  136. /data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
  137. /data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
  138. /data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/libdeflate/x86/adler32_template.h (new file)
@@ -0,0 +1,518 @@
+ /*
+ * x86/adler32_template.h - template for vectorized Adler-32 implementations
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ /*
+ * This file is a "template" for instantiating Adler-32 functions for x86.
+ * The "parameters" are:
+ *
+ * SUFFIX:
+ * Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ * Target function attributes to use. Must satisfy the dependencies of the
+ * other parameters as follows:
+ * VL=16 && USE_VNNI=0 && USE_AVX512=0: at least sse2
+ * VL=32 && USE_VNNI=0 && USE_AVX512=0: at least avx2
+ * VL=32 && USE_VNNI=1 && USE_AVX512=0: at least avx2,avxvnni
+ * VL=32 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vl,avx512vnni
+ * VL=64 && USE_VNNI=1 && USE_AVX512=1: at least avx512bw,avx512vnni
+ * (Other combinations are not useful and have not been tested.)
+ * VL:
+ * Vector length in bytes. Must be 16, 32, or 64.
+ * USE_VNNI:
+ * If 1, use the VNNI dot product based algorithm.
+ * If 0, use the legacy SSE2 and AVX2 compatible algorithm.
+ * USE_AVX512:
+ * If 1, take advantage of AVX-512 features such as masking. This doesn't
+ * enable the use of 512-bit vectors; the vector length is controlled by
+ * VL. If 0, assume that the CPU might not support AVX-512.
+ */
+
+ #if VL == 16
+ # define vec_t __m128i
+ # define mask_t u16
+ # define LOG2_VL 4
+ # define VADD8(a, b) _mm_add_epi8((a), (b))
+ # define VADD16(a, b) _mm_add_epi16((a), (b))
+ # define VADD32(a, b) _mm_add_epi32((a), (b))
+ # if USE_AVX512
+ # define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c))
+ # else
+ # define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c))
+ # endif
+ # define VLOAD(p) _mm_load_si128((const void *)(p))
+ # define VLOADU(p) _mm_loadu_si128((const void *)(p))
+ # define VMADD16(a, b) _mm_madd_epi16((a), (b))
+ # define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p))
+ # define VMULLO32(a, b) _mm_mullo_epi32((a), (b))
+ # define VSAD8(a, b) _mm_sad_epu8((a), (b))
+ # define VSET1_8(a) _mm_set1_epi8(a)
+ # define VSET1_32(a) _mm_set1_epi32(a)
+ # define VSETZERO() _mm_setzero_si128()
+ # define VSLL32(a, b) _mm_slli_epi32((a), (b))
+ # define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b))
+ # define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b))
+ #elif VL == 32
+ # define vec_t __m256i
+ # define mask_t u32
+ # define LOG2_VL 5
+ # define VADD8(a, b) _mm256_add_epi8((a), (b))
+ # define VADD16(a, b) _mm256_add_epi16((a), (b))
+ # define VADD32(a, b) _mm256_add_epi32((a), (b))
+ # if USE_AVX512
+ # define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c))
+ # else
+ # define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c))
+ # endif
+ # define VLOAD(p) _mm256_load_si256((const void *)(p))
+ # define VLOADU(p) _mm256_loadu_si256((const void *)(p))
+ # define VMADD16(a, b) _mm256_madd_epi16((a), (b))
+ # define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p))
+ # define VMULLO32(a, b) _mm256_mullo_epi32((a), (b))
+ # define VSAD8(a, b) _mm256_sad_epu8((a), (b))
+ # define VSET1_8(a) _mm256_set1_epi8(a)
+ # define VSET1_32(a) _mm256_set1_epi32(a)
+ # define VSETZERO() _mm256_setzero_si256()
+ # define VSLL32(a, b) _mm256_slli_epi32((a), (b))
+ # define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b))
+ # define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b))
+ #elif VL == 64
+ # define vec_t __m512i
+ # define mask_t u64
+ # define LOG2_VL 6
+ # define VADD8(a, b) _mm512_add_epi8((a), (b))
+ # define VADD16(a, b) _mm512_add_epi16((a), (b))
+ # define VADD32(a, b) _mm512_add_epi32((a), (b))
+ # define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c))
+ # define VLOAD(p) _mm512_load_si512((const void *)(p))
+ # define VLOADU(p) _mm512_loadu_si512((const void *)(p))
+ # define VMADD16(a, b) _mm512_madd_epi16((a), (b))
+ # define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p))
+ # define VMULLO32(a, b) _mm512_mullo_epi32((a), (b))
+ # define VSAD8(a, b) _mm512_sad_epu8((a), (b))
+ # define VSET1_8(a) _mm512_set1_epi8(a)
+ # define VSET1_32(a) _mm512_set1_epi32(a)
+ # define VSETZERO() _mm512_setzero_si512()
+ # define VSLL32(a, b) _mm512_slli_epi32((a), (b))
+ # define VUNPACKLO8(a, b) _mm512_unpacklo_epi8((a), (b))
+ # define VUNPACKHI8(a, b) _mm512_unpackhi_epi8((a), (b))
+ #else
+ # error "unsupported vector length"
+ #endif
+
+ #define VADD32_3X(a, b, c) VADD32(VADD32((a), (b)), (c))
+ #define VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d)))
+ #define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e)))
+ #define VADD32_7X(a, b, c, d, e, f, g) \
+ VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g)))
+
+ /* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. */
+ #undef reduce_to_32bits
+ static forceinline ATTRIBUTES void
+ ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p)
+ {
+ __m128i v_s1_128, v_s2_128;
+ #if VL == 16
+ {
+ v_s1_128 = v_s1;
+ v_s2_128 = v_s2;
+ }
+ #else
+ {
+ __m256i v_s1_256, v_s2_256;
+ #if VL == 32
+ v_s1_256 = v_s1;
+ v_s2_256 = v_s2;
+ #else
+ /* Reduce 512 bits to 256 bits. */
+ v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0),
+ _mm512_extracti64x4_epi64(v_s1, 1));
+ v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0),
+ _mm512_extracti64x4_epi64(v_s2, 1));
+ #endif
+ /* Reduce 256 bits to 128 bits. */
+ v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0),
+ _mm256_extracti128_si256(v_s1_256, 1));
+ v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0),
+ _mm256_extracti128_si256(v_s2_256, 1));
+ }
+ #endif
+
+ /*
+ * Reduce 128 bits to 32 bits.
+ *
+ * If the bytes were summed into v_s1 using psadbw + paddd, then ignore
+ * the odd-indexed elements of v_s1_128 since they are zero.
+ */
+ #if USE_VNNI
+ v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31));
+ #endif
+ v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31));
+ v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02));
+ v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02));
+
+ *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128);
+ *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128);
+ }
+ #define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits)
+
+ static ATTRIBUTES u32
+ ADD_SUFFIX(adler32_x86)(u32 adler, const u8 *p, size_t len)
+ {
+ #if USE_VNNI
+ /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */
+ static const u8 _aligned_attribute(VL) raw_mults[VL] = {
+ #if VL == 64
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+ #endif
+ #if VL >= 32
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ #endif
+ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ };
+ const vec_t ones = VSET1_8(1);
+ #else
+ /*
+ * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1].
+ * For VL==32 the ordering is weird because it has to match the way that
+ * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately.
+ */
+ static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = {
+ #if VL == 16
+ { 32, 31, 30, 29, 28, 27, 26, 25 },
+ { 24, 23, 22, 21, 20, 19, 18, 17 },
+ { 16, 15, 14, 13, 12, 11, 10, 9 },
+ { 8, 7, 6, 5, 4, 3, 2, 1 },
+ #elif VL == 32
+ { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 },
+ { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 },
+ { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 },
+ { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 },
+ #else
+ # error "unsupported parameters"
+ #endif
+ };
+ const vec_t mults_a = VLOAD(raw_mults[0]);
+ const vec_t mults_b = VLOAD(raw_mults[1]);
+ const vec_t mults_c = VLOAD(raw_mults[2]);
+ const vec_t mults_d = VLOAD(raw_mults[3]);
+ #endif
+ const vec_t zeroes = VSETZERO();
+ u32 s1 = adler & 0xFFFF;
+ u32 s2 = adler >> 16;
+
+ /*
+ * If the length is large and the pointer is misaligned, align it.
+ * For smaller lengths, just take the misaligned load penalty.
+ */
+ if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) {
+ do {
+ s1 += *p++;
+ s2 += s1;
+ len--;
+ } while ((uintptr_t)p & (VL-1));
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+
+ #if USE_VNNI
+ /*
+ * This is Adler-32 using the vpdpbusd instruction from AVX512VNNI or
+ * AVX-VNNI. vpdpbusd multiplies the unsigned bytes of one vector by
+ * the signed bytes of another vector and adds the sums in groups of 4
+ * to the 32-bit elements of a third vector. We use it in two ways:
+ * multiplying the data bytes by a sequence like 64,63,62,...,1 for
+ * calculating part of s2, and multiplying the data bytes by an all-ones
+ * sequence 1,1,1,...,1 for calculating s1 and part of s2. The all-ones
+ * trick seems to be faster than the alternative of vpsadbw + vpaddd.
+ */
+ while (len) {
+ /*
+ * Calculate the length of the next data chunk such that s1 and
+ * s2 are guaranteed to not exceed UINT32_MAX.
+ */
+ size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1));
+ vec_t mults = VLOAD(raw_mults);
+ vec_t v_s1 = zeroes;
+ vec_t v_s2 = zeroes;
+
+ s2 += s1 * n;
+ len -= n;
+
+ if (n >= 4*VL) {
+ vec_t v_s1_b = zeroes;
+ vec_t v_s1_c = zeroes;
+ vec_t v_s1_d = zeroes;
+ vec_t v_s2_b = zeroes;
+ vec_t v_s2_c = zeroes;
+ vec_t v_s2_d = zeroes;
+ vec_t v_s1_sums = zeroes;
+ vec_t v_s1_sums_b = zeroes;
+ vec_t v_s1_sums_c = zeroes;
+ vec_t v_s1_sums_d = zeroes;
+ vec_t tmp0, tmp1;
+
+ do {
+ vec_t data_a = VLOADU(p + 0*VL);
+ vec_t data_b = VLOADU(p + 1*VL);
+ vec_t data_c = VLOADU(p + 2*VL);
+ vec_t data_d = VLOADU(p + 3*VL);
+
+ /*
+ * Workaround for gcc bug where it generates
+ * unnecessary move instructions
+ * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
+ */
+ #if GCC_PREREQ(1, 0)
+ __asm__("" : "+v" (data_a), "+v" (data_b),
+ "+v" (data_c), "+v" (data_d));
+ #endif
+
+ v_s2 = VDPBUSD(v_s2, data_a, mults);
+ v_s2_b = VDPBUSD(v_s2_b, data_b, mults);
+ v_s2_c = VDPBUSD(v_s2_c, data_c, mults);
+ v_s2_d = VDPBUSD(v_s2_d, data_d, mults);
+
+ v_s1_sums = VADD32(v_s1_sums, v_s1);
+ v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b);
+ v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c);
+ v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d);
+
+ v_s1 = VDPBUSD(v_s1, data_a, ones);
+ v_s1_b = VDPBUSD(v_s1_b, data_b, ones);
+ v_s1_c = VDPBUSD(v_s1_c, data_c, ones);
+ v_s1_d = VDPBUSD(v_s1_d, data_d, ones);
+
+ /* Same gcc bug workaround. See above */
+ #if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32)
+ __asm__("" : "+v" (v_s2), "+v" (v_s2_b),
+ "+v" (v_s2_c), "+v" (v_s2_d),
+ "+v" (v_s1_sums),
+ "+v" (v_s1_sums_b),
+ "+v" (v_s1_sums_c),
+ "+v" (v_s1_sums_d),
+ "+v" (v_s1), "+v" (v_s1_b),
+ "+v" (v_s1_c), "+v" (v_s1_d));
+ #endif
+ p += 4*VL;
+ n -= 4*VL;
+ } while (n >= 4*VL);
+
+ /*
+ * Reduce into v_s1 and v_s2 as follows:
+ *
+ * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d +
+ * (4*VL)*(v_s1_sums + v_s1_sums_b +
+ * v_s1_sums_c + v_s1_sums_d) +
+ * (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c
+ * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d
+ */
+ tmp0 = VADD32(v_s1, v_s1_b);
+ tmp1 = VADD32(v_s1, v_s1_c);
+ v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b,
+ v_s1_sums_c, v_s1_sums_d);
+ v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d);
+ v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2),
+ VSLL32(tmp0, LOG2_VL + 1),
+ VSLL32(tmp1, LOG2_VL),
+ v_s2, v_s2_b, v_s2_c, v_s2_d);
+ }
+
+ /* Process the last 0 <= n < 4*VL bytes of the chunk. */
+ if (n >= 2*VL) {
+ const vec_t data_a = VLOADU(p + 0*VL);
+ const vec_t data_b = VLOADU(p + 1*VL);
+
+ v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1));
+ v_s1 = VDPBUSD(v_s1, data_a, ones);
+ v_s1 = VDPBUSD(v_s1, data_b, ones);
+ v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL));
+ v_s2 = VDPBUSD(v_s2, data_a, mults);
+ v_s2 = VDPBUSD(v_s2, data_b, mults);
+ p += 2*VL;
+ n -= 2*VL;
+ }
+ if (n) {
+ /* Process the last 0 < n < 2*VL bytes of the chunk. */
+ vec_t data;
+
+ v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n)));
+
+ mults = VADD8(mults, VSET1_8((int)n - VL));
+ if (n > VL) {
+ data = VLOADU(p);
+ v_s1 = VDPBUSD(v_s1, data, ones);
+ v_s2 = VDPBUSD(v_s2, data, mults);
+ p += VL;
+ n -= VL;
+ mults = VADD8(mults, VSET1_8(-VL));
+ }
+ /*
+ * Process the last 0 < n <= VL bytes of the chunk.
+ * Utilize a masked load if it's available.
+ */
+ #if USE_AVX512
+ data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p);
+ #else
+ data = zeroes;
+ memcpy(&data, p, n);
+ #endif
+ v_s1 = VDPBUSD(v_s1, data, ones);
+ v_s2 = VDPBUSD(v_s2, data, mults);
+ p += n;
+ }
+
+ reduce_to_32bits(v_s1, v_s2, &s1, &s2);
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+ #else /* USE_VNNI */
+ /*
+ * This is Adler-32 for SSE2 and AVX2.
+ *
+ * To horizontally sum bytes, use psadbw + paddd, where one of the
+ * arguments to psadbw is all-zeroes.
+ *
+ * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL
+ * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw
+ * to sum, for each i across iterations, byte i into a corresponding
+ * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddwd
+ * to multiply each counter by (2*VL - i), then add the products to s2.
+ *
+ * An alternative implementation would use pmaddubsw and pmaddwd in the
+ * inner loop to do (2*VL - i)*data[i] directly and add the products in
+ * groups of 4 to 32-bit counters. However, on average that approach
+ * seems to be slower than the current approach which delays the
+ * multiplications. Also, pmaddubsw requires SSSE3; the current
+ * approach keeps the implementation aligned between SSE2 and AVX2.
+ *
+ * The inner loop processes 2*VL bytes per iteration. Increasing this
+ * to 4*VL doesn't seem to be helpful here.
+ */
+ while (len) {
+ /*
+ * Calculate the length of the next data chunk such that s1 and
+ * s2 are guaranteed to not exceed UINT32_MAX, and every
+ * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX.
+ * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are
+ * used with pmaddwd which does signed multiplication. In the
+ * SSE2 case this limits chunks to 4096 bytes instead of 5536.
+ */
+ size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX),
+ MAX_CHUNK_LEN) & ~(2*VL - 1));
+ len -= n;
+
+ if (n >= 2*VL) {
+ vec_t v_s1 = zeroes;
+ vec_t v_s1_sums = zeroes;
+ vec_t v_byte_sums_a = zeroes;
+ vec_t v_byte_sums_b = zeroes;
+ vec_t v_byte_sums_c = zeroes;
+ vec_t v_byte_sums_d = zeroes;
+ vec_t v_s2;
+
+ s2 += s1 * (n & ~(2*VL - 1));
+
+ do {
+ vec_t data_a = VLOADU(p + 0*VL);
+ vec_t data_b = VLOADU(p + 1*VL);
+
+ v_s1_sums = VADD32(v_s1_sums, v_s1);
+ v_byte_sums_a = VADD16(v_byte_sums_a,
+ VUNPACKLO8(data_a, zeroes));
+ v_byte_sums_b = VADD16(v_byte_sums_b,
+ VUNPACKHI8(data_a, zeroes));
+ v_byte_sums_c = VADD16(v_byte_sums_c,
+ VUNPACKLO8(data_b, zeroes));
+ v_byte_sums_d = VADD16(v_byte_sums_d,
+ VUNPACKHI8(data_b, zeroes));
+ v_s1 = VADD32(v_s1,
+ VADD32(VSAD8(data_a, zeroes),
+ VSAD8(data_b, zeroes)));
+ /*
+ * Workaround for gcc bug where it generates
+ * unnecessary move instructions
+ * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892)
+ */
+ #if GCC_PREREQ(1, 0)
+ __asm__("" : "+x" (v_s1), "+x" (v_s1_sums),
+ "+x" (v_byte_sums_a),
+ "+x" (v_byte_sums_b),
+ "+x" (v_byte_sums_c),
+ "+x" (v_byte_sums_d));
+ #endif
+ p += 2*VL;
+ n -= 2*VL;
+ } while (n >= 2*VL);
+
+ /*
+ * Calculate v_s2 as (2*VL)*v_s1_sums +
+ * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums.
+ * Then update s1 and s2 from v_s1 and v_s2.
+ */
+ v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1),
+ VMADD16(v_byte_sums_a, mults_a),
+ VMADD16(v_byte_sums_b, mults_b),
+ VMADD16(v_byte_sums_c, mults_c),
+ VMADD16(v_byte_sums_d, mults_d));
+ reduce_to_32bits(v_s1, v_s2, &s1, &s2);
+ }
+ /*
+ * Process the last 0 <= n < 2*VL bytes of the chunk using
+ * scalar instructions and reduce s1 and s2 mod DIVISOR.
+ */
+ ADLER32_CHUNK(s1, s2, p, n);
+ }
+ #endif /* !USE_VNNI */
+ return (s2 << 16) | s1;
+ }
+
+ #undef vec_t
+ #undef mask_t
+ #undef LOG2_VL
+ #undef VADD8
+ #undef VADD16
+ #undef VADD32
+ #undef VDPBUSD
+ #undef VLOAD
+ #undef VLOADU
+ #undef VMADD16
+ #undef VMASKZ_LOADU
+ #undef VMULLO32
+ #undef VSAD8
+ #undef VSET1_8
+ #undef VSET1_32
+ #undef VSETZERO
+ #undef VSLL32
+ #undef VUNPACKLO8
+ #undef VUNPACKHI8
+
+ #undef SUFFIX
+ #undef ATTRIBUTES
+ #undef VL
+ #undef USE_VNNI
+ #undef USE_AVX512
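
Note on reading this file: the template is never compiled on its own; libdeflate includes it once per target variant, with SUFFIX, ATTRIBUTES, VL, USE_VNNI, and USE_AVX512 defined beforehand (the per-variant glue lives in x86/adler32_impl.h, also added in this release). A minimal sketch of one such instantiation is shown below; it is illustrative only, not the exact contents of the package's adler32_impl.h, and the _target_attribute helper macro is assumed here.

    /* Illustrative sketch: instantiate the baseline SSE2 variant of the template. */
    #define VL          16                         /* 128-bit vectors */
    #define USE_VNNI    0                          /* legacy psadbw/pmaddwd path */
    #define USE_AVX512  0
    #define SUFFIX      _sse2
    #define ATTRIBUTES  _target_attribute("sse2")  /* assumed libdeflate helper */
    #include "adler32_template.h"
    /* ADD_SUFFIX(adler32_x86) in the template now expands to adler32_x86_sse2(). */

Each variant defined this way is then selected at runtime based on the CPU features detected in x86/cpu_features.c.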