deflate-ruby 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. checksums.yaml +4 -4
  2. data/CLAUDE.md +95 -92
  3. data/LICENSE.txt +6 -6
  4. data/README.md +87 -65
  5. data/Rakefile +23 -0
  6. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_impl.h → adler32_impl.h} +8 -7
  7. data/ext/deflate_ruby/common_defs.h +748 -0
  8. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.c → cpu_features.c} +46 -16
  9. data/ext/deflate_ruby/{libdeflate/lib/x86/cpu_features.h → cpu_features.h} +2 -1
  10. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_impl.h → crc32_impl.h} +22 -23
  11. data/ext/deflate_ruby/{libdeflate/lib/crc32_multipliers.h → crc32_multipliers.h} +2 -4
  12. data/ext/deflate_ruby/{libdeflate/lib/x86/crc32_pclmul_template.h → crc32_pclmul_template.h} +23 -94
  13. data/ext/deflate_ruby/{libdeflate/lib/crc32_tables.h → crc32_tables.h} +1 -1
  14. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.c → deflate_compress.c} +59 -60
  15. data/ext/deflate_ruby/deflate_ruby.c +392 -218
  16. data/ext/deflate_ruby/deflate_ruby.h +6 -0
  17. data/ext/deflate_ruby/extconf.rb +35 -25
  18. data/ext/deflate_ruby/libdeflate/adler32.c +162 -0
  19. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/adler32_impl.h +14 -7
  20. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/crc32_impl.h +25 -31
  21. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h +156 -0
  22. data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_wide.h +226 -0
  23. data/ext/deflate_ruby/libdeflate/bt_matchfinder.h +342 -0
  24. data/ext/deflate_ruby/libdeflate/common_defs.h +2 -1
  25. data/ext/deflate_ruby/libdeflate/cpu_features_common.h +93 -0
  26. data/ext/deflate_ruby/libdeflate/crc32.c +262 -0
  27. data/ext/deflate_ruby/libdeflate/crc32_multipliers.h +375 -0
  28. data/ext/deflate_ruby/libdeflate/crc32_tables.h +587 -0
  29. data/ext/deflate_ruby/libdeflate/decompress_template.h +777 -0
  30. data/ext/deflate_ruby/libdeflate/deflate_compress.c +4128 -0
  31. data/ext/deflate_ruby/libdeflate/deflate_compress.h +15 -0
  32. data/ext/deflate_ruby/libdeflate/deflate_constants.h +56 -0
  33. data/ext/deflate_ruby/libdeflate/deflate_decompress.c +1208 -0
  34. data/ext/deflate_ruby/libdeflate/gzip_compress.c +90 -0
  35. data/ext/deflate_ruby/libdeflate/gzip_constants.h +45 -0
  36. data/ext/deflate_ruby/libdeflate/gzip_decompress.c +144 -0
  37. data/ext/deflate_ruby/libdeflate/hc_matchfinder.h +401 -0
  38. data/ext/deflate_ruby/libdeflate/ht_matchfinder.h +234 -0
  39. data/ext/deflate_ruby/libdeflate/lib_common.h +106 -0
  40. data/ext/deflate_ruby/libdeflate/libdeflate.h +2 -2
  41. data/ext/deflate_ruby/libdeflate/{lib/matchfinder_common.h → matchfinder_common.h} +3 -3
  42. data/ext/deflate_ruby/libdeflate/x86/adler32_impl.h +135 -0
  43. data/ext/deflate_ruby/libdeflate/x86/adler32_template.h +518 -0
  44. data/ext/deflate_ruby/libdeflate/x86/cpu_features.c +213 -0
  45. data/ext/deflate_ruby/libdeflate/x86/cpu_features.h +170 -0
  46. data/ext/deflate_ruby/libdeflate/x86/crc32_impl.h +159 -0
  47. data/ext/deflate_ruby/libdeflate/x86/crc32_pclmul_template.h +424 -0
  48. data/ext/deflate_ruby/libdeflate/x86/decompress_impl.h +57 -0
  49. data/ext/deflate_ruby/libdeflate.h +411 -0
  50. data/ext/deflate_ruby/matchfinder_common.h +224 -0
  51. data/ext/deflate_ruby/matchfinder_impl.h +122 -0
  52. data/ext/deflate_ruby/utils.c +141 -0
  53. data/ext/deflate_ruby/zlib_compress.c +82 -0
  54. data/ext/deflate_ruby/zlib_constants.h +21 -0
  55. data/ext/deflate_ruby/zlib_decompress.c +104 -0
  56. data/lib/deflate_ruby/version.rb +1 -1
  57. data/lib/deflate_ruby.rb +1 -63
  58. data/sig/deflate_ruby.rbs +4 -0
  59. data/test/test_deflate_ruby.rb +220 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +89 -144
  62. data/ext/deflate_ruby/libdeflate/CMakeLists.txt +0 -270
  63. data/ext/deflate_ruby/libdeflate/NEWS.md +0 -494
  64. data/ext/deflate_ruby/libdeflate/README.md +0 -228
  65. data/ext/deflate_ruby/libdeflate/libdeflate-config.cmake.in +0 -3
  66. data/ext/deflate_ruby/libdeflate/libdeflate.pc.in +0 -18
  67. data/ext/deflate_ruby/libdeflate/programs/CMakeLists.txt +0 -105
  68. data/ext/deflate_ruby/libdeflate/programs/benchmark.c +0 -696
  69. data/ext/deflate_ruby/libdeflate/programs/checksum.c +0 -218
  70. data/ext/deflate_ruby/libdeflate/programs/config.h.in +0 -19
  71. data/ext/deflate_ruby/libdeflate/programs/gzip.c +0 -688
  72. data/ext/deflate_ruby/libdeflate/programs/prog_util.c +0 -521
  73. data/ext/deflate_ruby/libdeflate/programs/prog_util.h +0 -225
  74. data/ext/deflate_ruby/libdeflate/programs/test_checksums.c +0 -200
  75. data/ext/deflate_ruby/libdeflate/programs/test_custom_malloc.c +0 -155
  76. data/ext/deflate_ruby/libdeflate/programs/test_incomplete_codes.c +0 -385
  77. data/ext/deflate_ruby/libdeflate/programs/test_invalid_streams.c +0 -130
  78. data/ext/deflate_ruby/libdeflate/programs/test_litrunlen_overflow.c +0 -72
  79. data/ext/deflate_ruby/libdeflate/programs/test_overread.c +0 -95
  80. data/ext/deflate_ruby/libdeflate/programs/test_slow_decompression.c +0 -472
  81. data/ext/deflate_ruby/libdeflate/programs/test_trailing_bytes.c +0 -151
  82. data/ext/deflate_ruby/libdeflate/programs/test_util.c +0 -237
  83. data/ext/deflate_ruby/libdeflate/programs/test_util.h +0 -61
  84. data/ext/deflate_ruby/libdeflate/programs/tgetopt.c +0 -118
  85. data/ext/deflate_ruby/libdeflate/scripts/android_build.sh +0 -118
  86. data/ext/deflate_ruby/libdeflate/scripts/android_tests.sh +0 -69
  87. data/ext/deflate_ruby/libdeflate/scripts/benchmark.sh +0 -10
  88. data/ext/deflate_ruby/libdeflate/scripts/checksum.sh +0 -10
  89. data/ext/deflate_ruby/libdeflate/scripts/checksum_benchmarks.sh +0 -253
  90. data/ext/deflate_ruby/libdeflate/scripts/cmake-helper.sh +0 -17
  91. data/ext/deflate_ruby/libdeflate/scripts/deflate_benchmarks.sh +0 -119
  92. data/ext/deflate_ruby/libdeflate/scripts/exec_tests.sh +0 -38
  93. data/ext/deflate_ruby/libdeflate/scripts/gen-release-archives.sh +0 -37
  94. data/ext/deflate_ruby/libdeflate/scripts/gen_bitreverse_tab.py +0 -19
  95. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_multipliers.c +0 -199
  96. data/ext/deflate_ruby/libdeflate/scripts/gen_crc32_tables.c +0 -105
  97. data/ext/deflate_ruby/libdeflate/scripts/gen_default_litlen_costs.py +0 -44
  98. data/ext/deflate_ruby/libdeflate/scripts/gen_offset_slot_map.py +0 -29
  99. data/ext/deflate_ruby/libdeflate/scripts/gzip_tests.sh +0 -523
  100. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/corpus/0 +0 -0
  101. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_compress/fuzz.c +0 -95
  102. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/corpus/0 +0 -3
  103. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/deflate_decompress/fuzz.c +0 -62
  104. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/fuzz.sh +0 -108
  105. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/corpus/0 +0 -0
  106. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/gzip_decompress/fuzz.c +0 -19
  107. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/corpus/0 +0 -3
  108. data/ext/deflate_ruby/libdeflate/scripts/libFuzzer/zlib_decompress/fuzz.c +0 -19
  109. data/ext/deflate_ruby/libdeflate/scripts/run_tests.sh +0 -416
  110. data/ext/deflate_ruby/libdeflate/scripts/toolchain-i686-w64-mingw32.cmake +0 -8
  111. data/ext/deflate_ruby/libdeflate/scripts/toolchain-x86_64-w64-mingw32.cmake +0 -8
  112. data/ext/deflate_ruby/{libdeflate/lib/adler32.c → adler32.c} +0 -0
  113. data/ext/deflate_ruby/{libdeflate/lib/x86/adler32_template.h → adler32_template.h} +0 -0
  114. data/ext/deflate_ruby/{libdeflate/lib/bt_matchfinder.h → bt_matchfinder.h} +0 -0
  115. data/ext/deflate_ruby/{libdeflate/lib/cpu_features_common.h → cpu_features_common.h} +0 -0
  116. data/ext/deflate_ruby/{libdeflate/lib/crc32.c → crc32.c} +0 -0
  117. data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_helpers.h → crc32_pmull_helpers.h} +0 -0
  118. data/ext/deflate_ruby/{libdeflate/lib/arm/crc32_pmull_wide.h → crc32_pmull_wide.h} +0 -0
  119. data/ext/deflate_ruby/{libdeflate/lib/x86/decompress_impl.h → decompress_impl.h} +0 -0
  120. data/ext/deflate_ruby/{libdeflate/lib/decompress_template.h → decompress_template.h} +0 -0
  121. data/ext/deflate_ruby/{libdeflate/lib/deflate_compress.h → deflate_compress.h} +0 -0
  122. data/ext/deflate_ruby/{libdeflate/lib/deflate_constants.h → deflate_constants.h} +0 -0
  123. data/ext/deflate_ruby/{libdeflate/lib/deflate_decompress.c → deflate_decompress.c} +0 -0
  124. data/ext/deflate_ruby/{libdeflate/lib/gzip_compress.c → gzip_compress.c} +0 -0
  125. data/ext/deflate_ruby/{libdeflate/lib/gzip_constants.h → gzip_constants.h} +0 -0
  126. data/ext/deflate_ruby/{libdeflate/lib/gzip_decompress.c → gzip_decompress.c} +0 -0
  127. data/ext/deflate_ruby/{libdeflate/lib/hc_matchfinder.h → hc_matchfinder.h} +0 -0
  128. data/ext/deflate_ruby/{libdeflate/lib/ht_matchfinder.h → ht_matchfinder.h} +0 -0
  129. data/ext/deflate_ruby/{libdeflate/lib/lib_common.h → lib_common.h} +0 -0
  130. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.c +0 -0
  131. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/cpu_features.h +0 -0
  132. data/ext/deflate_ruby/libdeflate/{lib/arm → arm}/matchfinder_impl.h +0 -0
  133. data/ext/deflate_ruby/libdeflate/{lib/riscv → riscv}/matchfinder_impl.h +0 -0
  134. data/ext/deflate_ruby/libdeflate/{lib/utils.c → utils.c} +0 -0
  135. data/ext/deflate_ruby/libdeflate/{lib/x86 → x86}/matchfinder_impl.h +0 -0
  136. data/ext/deflate_ruby/libdeflate/{lib/zlib_compress.c → zlib_compress.c} +0 -0
  137. data/ext/deflate_ruby/libdeflate/{lib/zlib_constants.h → zlib_constants.h} +0 -0
  138. data/ext/deflate_ruby/libdeflate/{lib/zlib_decompress.c → zlib_decompress.c} +0 -0
data/ext/deflate_ruby/deflate_ruby.h
@@ -0,0 +1,6 @@
+ #ifndef DEFLATE_RUBY_H
+ #define DEFLATE_RUBY_H 1
+
+ #include "ruby.h"
+
+ #endif /* DEFLATE_RUBY_H */
data/ext/deflate_ruby/extconf.rb
@@ -1,34 +1,44 @@
  # frozen_string_literal: true

  require "mkmf"
+ require "rbconfig"

- # Add libdeflate source directory to the include path
+ # Makes all symbols private by default to avoid unintended conflict
+ # with other gems. To explicitly export symbols you can use RUBY_FUNC_EXPORTED
+ # selectively, or entirely remove this flag.
+ append_cflags("-fvisibility=hidden")
+
+ # Add libdeflate source directory and subdirectories to include path
  $INCFLAGS << " -I$(srcdir)/libdeflate"
- $CFLAGS << " -O2 -std=c99"
-
- # Define source files to compile
- libdeflate_sources = %w[
-   libdeflate/lib/deflate_compress.c
-   libdeflate/lib/deflate_decompress.c
-   libdeflate/lib/zlib_compress.c
-   libdeflate/lib/zlib_decompress.c
-   libdeflate/lib/gzip_compress.c
-   libdeflate/lib/gzip_decompress.c
-   libdeflate/lib/adler32.c
-   libdeflate/lib/crc32.c
-   libdeflate/lib/utils.c
- ]
-
- # Add CPU architecture-specific files
- arch_dirs = Dir.glob("libdeflate/lib/*/").select { |d| File.directory?(d) }
- arch_dirs.each do |dir|
-   Dir.glob("#{dir}*.c").each do |source|
-     libdeflate_sources << source
-   end
+ $INCFLAGS << " -I$(srcdir)/libdeflate/arm"
+ $INCFLAGS << " -I$(srcdir)/libdeflate/x86"
+ $INCFLAGS << " -I$(srcdir)/libdeflate/riscv"
+
+ # Detect CPU architecture
+ arch = RbConfig::CONFIG['host_cpu']
+
+ # Get base libdeflate C files (not in subdirectories)
+ libdeflate_sources = Dir.glob("#{__dir__}/libdeflate/*.c")
+
+ # Add architecture-specific files
+ if arch =~ /arm|aarch64/
+   libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/arm/*.c")
+ elsif arch =~ /x86_64|i686|i386/
+   libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/x86/*.c")
+ elsif arch =~ /riscv/
+   libdeflate_sources += Dir.glob("#{__dir__}/libdeflate/riscv/*.c")
  end

- # Set object files for libdeflate
- $objs = libdeflate_sources.map { |src| src.sub(/\.c$/, ".o") }
- $objs << "deflate_ruby.o"
+ # Build source file list for mkmf
+ $srcs = ["deflate_ruby.c"] + libdeflate_sources.map { |f| File.basename(f) }
+
+ # Optimization flags for better performance
+ append_cflags("-O3")
+
+ # Platform-specific optimizations
+ if arch =~ /x86_64|i686|i386/
+   # Enable SSE2 on x86 (generally available on x86_64)
+   have_func("__builtin_cpu_supports")
+ end

  create_makefile("deflate_ruby/deflate_ruby")
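
A note on the new -fvisibility=hidden flag above: every symbol in the compiled extension now defaults to private, so the one thing the dynamic loader must still see is the entry point implied by create_makefile("deflate_ruby/deflate_ruby"), namely Init_deflate_ruby. A minimal sketch of that contract using only what ruby.h provides (the module layout and the noop method are invented for illustration, not the gem's actual API):

#include "ruby.h"

/* static => internal linkage; invisible outside the .so regardless of flags */
static VALUE
noop(VALUE self)
{
        return Qnil;
}

/* RUBY_FUNC_EXPORTED overrides -fvisibility=hidden so that dlopen() can
 * find the function Ruby calls when the extension is required. */
RUBY_FUNC_EXPORTED void
Init_deflate_ruby(void)
{
        VALUE mod = rb_define_module("DeflateRuby");
        rb_define_singleton_method(mod, "noop", noop, 0);
}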
data/ext/deflate_ruby/libdeflate/adler32.c
@@ -0,0 +1,162 @@
+ /*
+  * adler32.c - Adler-32 checksum algorithm
+  *
+  * Copyright 2016 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ #include "lib_common.h"
+
+ /* The Adler-32 divisor, or "base", value */
+ #define DIVISOR 65521
+
+ /*
+  * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility
+  * of s2 overflowing when it is represented as an unsigned 32-bit integer. This
+  * value was computed using the following Python script:
+  *
+  *      divisor = 65521
+  *      count = 0
+  *      s1 = divisor - 1
+  *      s2 = divisor - 1
+  *      while True:
+  *              s1 += 0xFF
+  *              s2 += s1
+  *              if s2 > 0xFFFFFFFF:
+  *                      break
+  *              count += 1
+  *      print(count)
+  *
+  * Note that to get the correct worst-case value, we must assume that every byte
+  * has value 0xFF and that s1 and s2 started with the highest possible values
+  * modulo the divisor.
+  */
+ #define MAX_CHUNK_LEN 5552
+
+ /*
+  * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n,
+  * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither
+  * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes
+  * already processed after the last reduction must not exceed MAX_CHUNK_LEN.
+  *
+  * This uses only portable C code. This is used as a fallback when a vectorized
+  * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform.
+  *
+  * Some of the vectorized implementations also use this to handle the end of the
+  * data when the data isn't evenly divisible by the length the vectorized code
+  * works on. To avoid compiler errors about target-specific option mismatches
+  * when this is used in that way, this is a macro rather than a function.
+  *
+  * Although this is unvectorized, this does include an optimization where the
+  * main loop processes four bytes at a time using a strategy similar to that
+  * used by vectorized implementations. This provides increased instruction-
+  * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'.
+  */
+ #define ADLER32_CHUNK(s1, s2, p, n)                                     \
+ do {                                                                    \
+         if (n >= 4) {                                                   \
+                 u32 s1_sum = 0;                                         \
+                 u32 byte_0_sum = 0;                                     \
+                 u32 byte_1_sum = 0;                                     \
+                 u32 byte_2_sum = 0;                                     \
+                 u32 byte_3_sum = 0;                                     \
+                                                                         \
+                 do {                                                    \
+                         s1_sum += s1;                                   \
+                         s1 += p[0] + p[1] + p[2] + p[3];                \
+                         byte_0_sum += p[0];                             \
+                         byte_1_sum += p[1];                             \
+                         byte_2_sum += p[2];                             \
+                         byte_3_sum += p[3];                             \
+                         p += 4;                                         \
+                         n -= 4;                                         \
+                 } while (n >= 4);                                       \
+                 s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) +  \
+                       (2 * byte_2_sum) + byte_3_sum;                    \
+         }                                                               \
+         for (; n; n--, p++) {                                           \
+                 s1 += *p;                                               \
+                 s2 += s1;                                               \
+         }                                                               \
+         s1 %= DIVISOR;                                                  \
+         s2 %= DIVISOR;                                                  \
+ } while (0)
+
+ static u32 MAYBE_UNUSED
+ adler32_generic(u32 adler, const u8 *p, size_t len)
+ {
+         u32 s1 = adler & 0xFFFF;
+         u32 s2 = adler >> 16;
+
+         while (len) {
+                 size_t n = MIN(len, MAX_CHUNK_LEN & ~3);
+
+                 len -= n;
+                 ADLER32_CHUNK(s1, s2, p, n);
+         }
+
+         return (s2 << 16) | s1;
+ }
+
+ /* Include architecture-specific implementation(s) if available. */
+ #undef DEFAULT_IMPL
+ #undef arch_select_adler32_func
+ typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len);
+ #if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+ #  include "arm/adler32_impl.h"
+ #elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
+ #  include "x86/adler32_impl.h"
+ #endif
+
+ #ifndef DEFAULT_IMPL
+ #  define DEFAULT_IMPL adler32_generic
+ #endif
+
+ #ifdef arch_select_adler32_func
+ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len);
+
+ static volatile adler32_func_t adler32_impl = dispatch_adler32;
+
+ /* Choose the best implementation at runtime. */
+ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
+ {
+         adler32_func_t f = arch_select_adler32_func();
+
+         if (f == NULL)
+                 f = DEFAULT_IMPL;
+
+         adler32_impl = f;
+         return f(adler, p, len);
+ }
+ #else
+ /* The best implementation is statically known, so call it directly. */
+ #define adler32_impl DEFAULT_IMPL
+ #endif
+
+ LIBDEFLATEAPI u32
+ libdeflate_adler32(u32 adler, const void *buffer, size_t len)
+ {
+         if (buffer == NULL) /* Return initial value. */
+                 return 1;
+         return adler32_impl(adler, buffer, len);
+ }
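
Two notes on the file above. First, the deferred update at the end of ADLER32_CHUNK is just the serial recurrence regrouped: processing one group b0..b3 a byte at a time would add (s1+b0) + (s1+b0+b1) + (s1+b0+b1+b2) + (s1+b0+b1+b2+b3) = 4*s1 + 4*b0 + 3*b1 + 2*b2 + b3 to s2, so summing over all groups and collecting the per-group s1 values into s1_sum yields exactly s2 += 4*(s1_sum + byte_0_sum) + 3*byte_1_sum + 2*byte_2_sum + byte_3_sum.

Second, the tail of the file installs a one-shot runtime dispatcher: adler32_impl initially points at dispatch_adler32, which probes the CPU once via arch_select_adler32_func(), caches the winner, and forwards the first call. A self-contained model of that pattern (every name and the stand-in loop below are invented for illustration, not libdeflate code):

#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*checksum_fn)(uint32_t state, const uint8_t *p, size_t len);

/* Portable fallback; stands in for adler32_generic. */
static uint32_t checksum_portable(uint32_t state, const uint8_t *p, size_t len)
{
        while (len--)
                state += *p++;
        return state;
}

static uint32_t checksum_dispatch(uint32_t state, const uint8_t *p, size_t len);

/* The first call lands in the resolver; later calls hit the cached choice.
 * volatile because the pointer may be rewritten concurrently; every value
 * it can hold is a valid function, so any interleaving is safe. */
static volatile checksum_fn checksum_impl = checksum_dispatch;

static uint32_t checksum_dispatch(uint32_t state, const uint8_t *p, size_t len)
{
        checksum_fn f = checksum_portable;  /* a real CPU probe would go here */

        checksum_impl = f;                  /* cache the choice */
        return f(state, p, len);
}

uint32_t checksum(uint32_t state, const uint8_t *p, size_t len)
{
        return checksum_impl(state, p, len);
}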
data/ext/deflate_ruby/libdeflate/arm/adler32_impl.h
@@ -209,18 +209,25 @@ adler32_arm_neon(u32 adler, const u8 *p, size_t len)
  #endif /* Regular NEON implementation */

  /* NEON+dotprod implementation */
- #if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+ #if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() && \
+     !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_DOTPROD)
  # define adler32_arm_neon_dotprod adler32_arm_neon_dotprod
  # ifdef __clang__
  #  define ATTRIBUTES _target_attribute("dotprod")
  /*
-  * With gcc 13.1 and earlier (before gcc commit 73d3bc348190 or 9aac37ab8a7b,
-  * "aarch64: Remove architecture dependencies from intrinsics"),
-  * arch=armv8.2-a is needed for the dotprod intrinsics, unless the default
-  * target is armv8.3-a or later in which case it must be omitted. armv8.3-a
-  * or later can be detected by checking for __ARM_FEATURE_JCVT.
+  * Both gcc and binutils originally considered dotprod to depend on
+  * arch=armv8.2-a or later. This was fixed in gcc 13.2 by commit
+  * 9aac37ab8a7b ("aarch64: Remove architecture dependencies from intrinsics")
+  * and in binutils 2.41 by commit 205e4380c800 ("aarch64: Remove version
+  * dependencies from features"). Unfortunately, always using arch=armv8.2-a
+  * causes build errors with some compiler options because it may reduce the
+  * arch rather than increase it. Therefore we try to omit the arch whenever
+  * possible. If gcc is 14 or later, then both gcc and binutils are probably
+  * fixed, so we omit the arch. We also omit the arch if a feature that
+  * depends on armv8.2-a or later (in gcc 13.1 and earlier) is present.
   */
- # elif GCC_PREREQ(13, 2) || defined(__ARM_FEATURE_JCVT)
+ # elif GCC_PREREQ(14, 0) || defined(__ARM_FEATURE_JCVT) \
+       || defined(__ARM_FEATURE_DOTPROD)
  #  define ATTRIBUTES _target_attribute("+dotprod")
  # else
  #  define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod")
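
The payoff of the ATTRIBUTES selection above: upstream libdeflate's NEON+dotprod Adler-32 path accumulates its byte sums with the udot instruction, reached through the vdotq_u32 intrinsic. A hedged sketch of both halves, simplified from the diff (the real code also checks GCC_PREREQ(14, 0) and uses libdeflate's _target_attribute wrapper; dot_bytes is an invented example function):

#include <arm_neon.h>
#include <stdint.h>

#if defined(__clang__)
#  define ATTRS __attribute__((target("dotprod")))
#elif defined(__ARM_FEATURE_JCVT) || defined(__ARM_FEATURE_DOTPROD)
#  define ATTRS __attribute__((target("+dotprod")))  /* arch omitted */
#else
#  define ATTRS __attribute__((target("arch=armv8.2-a+dotprod")))
#endif

/* udot: for each 32-bit lane, acc += a0*b0 + a1*b1 + a2*b2 + a3*b3 over
 * that lane's four byte pairs. */
static ATTRS uint32x4_t
dot_bytes(uint32x4_t acc, uint8x16_t a, uint8x16_t b)
{
        return vdotq_u32(acc, a, b);
}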
data/ext/deflate_ruby/libdeflate/arm/crc32_impl.h
@@ -434,13 +434,11 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
  		{ CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */
  		{ CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */
  	};
- 	static const u64 _aligned_attribute(16) final_mults[3][2] = {
- 		{ CRC32_X63_MODG, 0 },
- 		{ CRC32_BARRETT_CONSTANT_1, 0 },
- 		{ CRC32_BARRETT_CONSTANT_2, 0 },
+ 	static const u64 _aligned_attribute(16) barrett_consts[3][2] = {
+ 		{ CRC32_X95_MODG, },
+ 		{ CRC32_BARRETT_CONSTANT_1, },
+ 		{ CRC32_BARRETT_CONSTANT_2, },
  	};
- 	const uint8x16_t zeroes = vdupq_n_u8(0);
- 	const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
  	const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
  	uint8x16_t v0, v1, v2, v3;

@@ -497,24 +495,13 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
  	if (len)
  		v0 = fold_partial_vec(v0, p, len, multipliers_1);

- 	/*
- 	 * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
- 	 * which is equivalent to multiplying by x^32. This is needed because
- 	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
- 	 */
-
- 	v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
- 		      clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
-
- 	/* Fold 96 => 64 bits. */
- 	v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
- 		      clmul_low(vandq_u8(v0, mask32),
- 				load_multipliers(final_mults[0])));
-
- 	/* Reduce 64 => 32 bits using Barrett reduction. */
- 	v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
- 	v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
- 	return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+ 	/* Reduce to 32 bits, following lib/x86/crc32_pclmul_template.h */
+ 	v0 = veorq_u8(clmul_low(v0, load_multipliers(barrett_consts[0])),
+ 		      vextq_u8(v0, vdupq_n_u8(0), 8));
+ 	v1 = clmul_low(v0, load_multipliers(barrett_consts[1]));
+ 	v1 = clmul_low(v1, load_multipliers(barrett_consts[2]));
+ 	v0 = veorq_u8(v0, v1);
+ 	return vgetq_lane_u32(vreinterpretq_u32_u8(v0), 2);
  }
  #undef SUFFIX
  #undef ATTRIBUTES
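
The comment removed above was the only place this function spelled out the definition CRC(M) = M(x) * x^32 mod G(x). For reference, the Barrett step that both the old and the new code end with reduces a folded 64-bit polynomial v(x) to the 32-bit CRC without a polynomial division. In the standard formulation (bit-reflection details elided), with mu(x) = floor(x^64 / G(x)) precomputed:

	q(x) = floor( floor(v(x) / x^32) * mu(x) / x^32 )
	r(x) = ( v(x) + q(x) * G(x) ) mod x^32

where addition is carryless (XOR). Reading the constant names, CRC32_BARRETT_CONSTANT_1 should correspond to mu(x) and CRC32_BARRETT_CONSTANT_2 to G(x); that mapping is inferred from the names, not stated in the diff.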
@@ -545,19 +532,26 @@ crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
   * This is like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
   * the sha3 extension) for even better performance.
   */
- #if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN
+ #if HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && HAVE_SHA3_INTRIN && \
+     !defined(LIBDEFLATE_ASSEMBLER_DOES_NOT_SUPPORT_SHA3)
  # define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
  # define SUFFIX _pmullx12_crc_eor3
  # ifdef __clang__
  #  define ATTRIBUTES _target_attribute("aes,crc,sha3")
  /*
-  * With gcc 13.1 and earlier (before gcc commit 73d3bc348190 or 9aac37ab8a7b,
-  * "aarch64: Remove architecture dependencies from intrinsics"),
-  * arch=armv8.2-a is needed for the sha3 intrinsics, unless the default
-  * target is armv8.3-a or later in which case it must be omitted. armv8.3-a
-  * or later can be detected by checking for __ARM_FEATURE_JCVT.
+  * Both gcc and binutils originally considered sha3 to depend on
+  * arch=armv8.2-a or later. This was fixed in gcc 13.2 by commit
+  * 9aac37ab8a7b ("aarch64: Remove architecture dependencies from intrinsics")
+  * and in binutils 2.41 by commit 205e4380c800 ("aarch64: Remove version
+  * dependencies from features"). Unfortunately, always using arch=armv8.2-a
+  * causes build errors with some compiler options because it may reduce the
+  * arch rather than increase it. Therefore we try to omit the arch whenever
+  * possible. If gcc is 14 or later, then both gcc and binutils are probably
+  * fixed, so we omit the arch. We also omit the arch if a feature that
+  * depends on armv8.2-a or later (in gcc 13.1 and earlier) is present.
   */
- # elif GCC_PREREQ(13, 2) || defined(__ARM_FEATURE_JCVT)
+ # elif GCC_PREREQ(14, 0) || defined(__ARM_FEATURE_JCVT) \
+       || defined(__ARM_FEATURE_DOTPROD)
  #  define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3")
  # else
  #  define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3")
data/ext/deflate_ruby/libdeflate/arm/crc32_pmull_helpers.h
@@ -0,0 +1,156 @@
+ /*
+  * arm/crc32_pmull_helpers.h - helper functions for CRC-32 folding with PMULL
+  *
+  * Copyright 2022 Eric Biggers
+  *
+  * Permission is hereby granted, free of charge, to any person
+  * obtaining a copy of this software and associated documentation
+  * files (the "Software"), to deal in the Software without
+  * restriction, including without limitation the rights to use,
+  * copy, modify, merge, publish, distribute, sublicense, and/or sell
+  * copies of the Software, and to permit persons to whom the
+  * Software is furnished to do so, subject to the following
+  * conditions:
+  *
+  * The above copyright notice and this permission notice shall be
+  * included in all copies or substantial portions of the Software.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  * OTHER DEALINGS IN THE SOFTWARE.
+  */
+
+ /*
+  * This file is a "template" for instantiating helper functions for CRC folding
+  * with pmull instructions. It accepts the following parameters:
+  *
+  * SUFFIX:
+  *      Name suffix to append to all instantiated functions.
+  * ATTRIBUTES:
+  *      Target function attributes to use.
+  * ENABLE_EOR3:
+  *      Use the eor3 instruction (from the sha3 extension).
+  */
+
+ /* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */
+ #undef u32_to_bytevec
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(u32_to_bytevec)(u32 a)
+ {
+         return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0));
+ }
+ #define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec)
+
+ /* Load two 64-bit values into a vector. */
+ #undef load_multipliers
+ static forceinline ATTRIBUTES poly64x2_t
+ ADD_SUFFIX(load_multipliers)(const u64 p[2])
+ {
+         return vreinterpretq_p64_u64(vld1q_u64(p));
+ }
+ #define load_multipliers ADD_SUFFIX(load_multipliers)
+
+ /* Do carryless multiplication of the low halves of two vectors. */
+ #undef clmul_low
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b)
+ {
+         return vreinterpretq_u8_p128(
+                 compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0),
+                                  vgetq_lane_p64(b, 0)));
+ }
+ #define clmul_low ADD_SUFFIX(clmul_low)
+
+ /* Do carryless multiplication of the high halves of two vectors. */
+ #undef clmul_high
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b)
+ {
+ #ifdef __clang__
+         /*
+          * Use inline asm to ensure that pmull2 is really used. This works
+          * around clang bug https://github.com/llvm/llvm-project/issues/52868.
+          */
+         uint8x16_t res;
+
+         __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b));
+         return res;
+ #else
+         return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b));
+ #endif
+ }
+ #define clmul_high ADD_SUFFIX(clmul_high)
+
+ #undef eor3
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+ {
+ #if ENABLE_EOR3
+         return veor3q_u8(a, b, c);
+ #else
+         return veorq_u8(veorq_u8(a, b), c);
+ #endif
+ }
+ #define eor3 ADD_SUFFIX(eor3)
+
+ #undef fold_vec
+ static forceinline ATTRIBUTES uint8x16_t
+ ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers)
+ {
+         uint8x16_t a = clmul_low(src, multipliers);
+         uint8x16_t b = clmul_high(src, multipliers);
+
+         return eor3(a, b, dst);
+ }
+ #define fold_vec ADD_SUFFIX(fold_vec)
+
+ /*
+  * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the
+  * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the
+  * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+  * respectively. Then fold x0 into x1 and return the result. Assumes that
+  * 'p + len - 16' is in-bounds.
+  */
+ #undef fold_partial_vec
+ static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t
+ ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len,
+                              poly64x2_t multipliers_1)
+ {
+         /*
+          * vqtbl1q_u8(v, shift_tab[len..len+15]) left shifts v by 16-len bytes.
+          * vqtbl1q_u8(v, shift_tab[len+16..len+31]) right shifts v by len bytes.
+          */
+         static const u8 shift_tab[48] = {
+                 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+                 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+                 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+         };
+         const uint8x16_t lshift = vld1q_u8(&shift_tab[len]);
+         const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]);
+         uint8x16_t x0, x1, bsl_mask;
+
+         /* x0 = v left-shifted by '16 - len' bytes */
+         x0 = vqtbl1q_u8(v, lshift);
+
+         /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */
+         bsl_mask = vreinterpretq_u8_s8(
+                 vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7));
+
+         /*
+          * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len'
+          * bytes) followed by the remaining data.
+          */
+         x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */,
+                       vld1q_u8(p + len - 16), vqtbl1q_u8(v, rshift));
+
+         return fold_vec(x0, x1, multipliers_1);
+ }
+ #define fold_partial_vec ADD_SUFFIX(fold_partial_vec)
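
A scalar model of the shift_tab trick in fold_partial_vec above may help: the AArch64 TBL instruction (vqtbl1q_u8) returns 0 for any out-of-range index, so the 0xff entries clear bytes, and sliding a 16-byte window across the 0xff / 0x00..0x0f / 0xff table turns one table lookup into a variable byte shift. The function below is illustration only, not part of the gem:

#include <stdint.h>

/* Mirrors vqtbl1q_u8: out[i] = v[idx[i]] if idx[i] < 16, else 0. */
static void
tbl16(const uint8_t v[16], const uint8_t idx[16], uint8_t out[16])
{
        for (int i = 0; i < 16; i++)
                out[i] = (idx[i] < 16) ? v[idx[i]] : 0;
}

With idx = &shift_tab[len] this yields v left-shifted by 16 - len bytes; with idx = &shift_tab[len + 16] it yields v right-shifted by len bytes, matching the comment inside fold_partial_vec.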