zstdlib 0.13.0-x86-linux

Sign up to get free protection for your applications and to get access to all the features.
Files changed (129) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +6 -0
  3. data/CHANGES.md +107 -0
  4. data/Gemfile +3 -0
  5. data/README.md +107 -0
  6. data/Rakefile +59 -0
  7. data/ext/zstdlib_c/extconf.rb +59 -0
  8. data/ext/zstdlib_c/ruby/zlib-2.2/zstdlib.c +4675 -0
  9. data/ext/zstdlib_c/ruby/zlib-2.3/zstdlib.c +4702 -0
  10. data/ext/zstdlib_c/ruby/zlib-2.4/zstdlib.c +4859 -0
  11. data/ext/zstdlib_c/ruby/zlib-2.5/zstdlib.c +4864 -0
  12. data/ext/zstdlib_c/ruby/zlib-2.6/zstdlib.c +4906 -0
  13. data/ext/zstdlib_c/ruby/zlib-2.7/zstdlib.c +4895 -0
  14. data/ext/zstdlib_c/ruby/zlib-3.0/zstdlib.c +4994 -0
  15. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  16. data/ext/zstdlib_c/ruby/zlib-3.2/zstdlib.c +5090 -0
  17. data/ext/zstdlib_c/ruby/zlib-3.3/zstdlib.c +5090 -0
  18. data/ext/zstdlib_c/zlib-1.3.1/adler32.c +164 -0
  19. data/ext/zstdlib_c/zlib-1.3.1/compress.c +75 -0
  20. data/ext/zstdlib_c/zlib-1.3.1/crc32.c +1049 -0
  21. data/ext/zstdlib_c/zlib-1.3.1/crc32.h +9446 -0
  22. data/ext/zstdlib_c/zlib-1.3.1/deflate.c +2139 -0
  23. data/ext/zstdlib_c/zlib-1.3.1/deflate.h +377 -0
  24. data/ext/zstdlib_c/zlib-1.3.1/gzclose.c +23 -0
  25. data/ext/zstdlib_c/zlib-1.3.1/gzguts.h +214 -0
  26. data/ext/zstdlib_c/zlib-1.3.1/gzlib.c +582 -0
  27. data/ext/zstdlib_c/zlib-1.3.1/gzread.c +602 -0
  28. data/ext/zstdlib_c/zlib-1.3.1/gzwrite.c +631 -0
  29. data/ext/zstdlib_c/zlib-1.3.1/infback.c +628 -0
  30. data/ext/zstdlib_c/zlib-1.3.1/inffast.c +320 -0
  31. data/ext/zstdlib_c/zlib-1.3.1/inffast.h +11 -0
  32. data/ext/zstdlib_c/zlib-1.3.1/inffixed.h +94 -0
  33. data/ext/zstdlib_c/zlib-1.3.1/inflate.c +1526 -0
  34. data/ext/zstdlib_c/zlib-1.3.1/inflate.h +126 -0
  35. data/ext/zstdlib_c/zlib-1.3.1/inftrees.c +299 -0
  36. data/ext/zstdlib_c/zlib-1.3.1/inftrees.h +62 -0
  37. data/ext/zstdlib_c/zlib-1.3.1/trees.c +1117 -0
  38. data/ext/zstdlib_c/zlib-1.3.1/trees.h +128 -0
  39. data/ext/zstdlib_c/zlib-1.3.1/uncompr.c +85 -0
  40. data/ext/zstdlib_c/zlib-1.3.1/zconf.h +543 -0
  41. data/ext/zstdlib_c/zlib-1.3.1/zlib.h +1938 -0
  42. data/ext/zstdlib_c/zlib-1.3.1/zutil.c +299 -0
  43. data/ext/zstdlib_c/zlib-1.3.1/zutil.h +254 -0
  44. data/ext/zstdlib_c/zlib.mk +14 -0
  45. data/ext/zstdlib_c/zlibwrapper/zlibwrapper.c +10 -0
  46. data/ext/zstdlib_c/zlibwrapper.mk +14 -0
  47. data/ext/zstdlib_c/zstd-1.5.6/lib/common/allocations.h +55 -0
  48. data/ext/zstdlib_c/zstd-1.5.6/lib/common/bits.h +200 -0
  49. data/ext/zstdlib_c/zstd-1.5.6/lib/common/bitstream.h +457 -0
  50. data/ext/zstdlib_c/zstd-1.5.6/lib/common/compiler.h +450 -0
  51. data/ext/zstdlib_c/zstd-1.5.6/lib/common/cpu.h +249 -0
  52. data/ext/zstdlib_c/zstd-1.5.6/lib/common/debug.c +30 -0
  53. data/ext/zstdlib_c/zstd-1.5.6/lib/common/debug.h +116 -0
  54. data/ext/zstdlib_c/zstd-1.5.6/lib/common/entropy_common.c +340 -0
  55. data/ext/zstdlib_c/zstd-1.5.6/lib/common/error_private.c +63 -0
  56. data/ext/zstdlib_c/zstd-1.5.6/lib/common/error_private.h +168 -0
  57. data/ext/zstdlib_c/zstd-1.5.6/lib/common/fse.h +640 -0
  58. data/ext/zstdlib_c/zstd-1.5.6/lib/common/fse_decompress.c +313 -0
  59. data/ext/zstdlib_c/zstd-1.5.6/lib/common/huf.h +286 -0
  60. data/ext/zstdlib_c/zstd-1.5.6/lib/common/mem.h +426 -0
  61. data/ext/zstdlib_c/zstd-1.5.6/lib/common/pool.c +371 -0
  62. data/ext/zstdlib_c/zstd-1.5.6/lib/common/pool.h +90 -0
  63. data/ext/zstdlib_c/zstd-1.5.6/lib/common/portability_macros.h +158 -0
  64. data/ext/zstdlib_c/zstd-1.5.6/lib/common/threading.c +182 -0
  65. data/ext/zstdlib_c/zstd-1.5.6/lib/common/threading.h +150 -0
  66. data/ext/zstdlib_c/zstd-1.5.6/lib/common/xxhash.c +18 -0
  67. data/ext/zstdlib_c/zstd-1.5.6/lib/common/xxhash.h +7020 -0
  68. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_common.c +48 -0
  69. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_deps.h +111 -0
  70. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_internal.h +392 -0
  71. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_trace.h +163 -0
  72. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/clevels.h +134 -0
  73. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/fse_compress.c +625 -0
  74. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/hist.c +181 -0
  75. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/hist.h +75 -0
  76. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/huf_compress.c +1464 -0
  77. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress.c +7153 -0
  78. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_internal.h +1534 -0
  79. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_literals.c +235 -0
  80. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_literals.h +39 -0
  81. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_sequences.c +442 -0
  82. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_sequences.h +54 -0
  83. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_superblock.c +688 -0
  84. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_superblock.h +32 -0
  85. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_cwksp.h +748 -0
  86. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_double_fast.c +770 -0
  87. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_double_fast.h +50 -0
  88. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_fast.c +968 -0
  89. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_fast.h +38 -0
  90. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_lazy.c +2199 -0
  91. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_lazy.h +202 -0
  92. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm.c +730 -0
  93. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm.h +117 -0
  94. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm_geartab.h +106 -0
  95. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_opt.c +1576 -0
  96. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_opt.h +80 -0
  97. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstdmt_compress.c +1882 -0
  98. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstdmt_compress.h +113 -0
  99. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/huf_decompress.c +1944 -0
  100. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/huf_decompress_amd64.S +595 -0
  101. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_ddict.c +244 -0
  102. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_ddict.h +44 -0
  103. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress.c +2407 -0
  104. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_block.c +2215 -0
  105. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_block.h +73 -0
  106. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_internal.h +240 -0
  107. data/ext/zstdlib_c/zstd-1.5.6/lib/zdict.h +474 -0
  108. data/ext/zstdlib_c/zstd-1.5.6/lib/zstd.h +3089 -0
  109. data/ext/zstdlib_c/zstd-1.5.6/lib/zstd_errors.h +114 -0
  110. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzclose.c +26 -0
  111. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzcompatibility.h +68 -0
  112. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzguts.h +229 -0
  113. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzlib.c +587 -0
  114. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzread.c +637 -0
  115. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzwrite.c +631 -0
  116. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/zstd_zlibwrapper.c +1200 -0
  117. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/zstd_zlibwrapper.h +91 -0
  118. data/ext/zstdlib_c/zstd.mk +15 -0
  119. data/lib/2.4/zstdlib_c.so +0 -0
  120. data/lib/2.5/zstdlib_c.so +0 -0
  121. data/lib/2.6/zstdlib_c.so +0 -0
  122. data/lib/2.7/zstdlib_c.so +0 -0
  123. data/lib/3.0/zstdlib_c.so +0 -0
  124. data/lib/3.1/zstdlib_c.so +0 -0
  125. data/lib/3.2/zstdlib_c.so +0 -0
  126. data/lib/3.3/zstdlib_c.so +0 -0
  127. data/lib/zstdlib.rb +6 -0
  128. data/test/zstdlib_test.rb +21 -0
  129. metadata +243 -0
@@ -0,0 +1,595 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include "../common/portability_macros.h"
12
+
13
+ #if defined(__ELF__) && defined(__GNUC__)
14
+ /* Stack marking
15
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
16
+ */
17
+ .section .note.GNU-stack,"",%progbits
18
+
19
+ #if defined(__aarch64__)
20
+ /* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
21
+ * See: https://github.com/facebook/zstd/issues/3841
22
+ * See: https://gcc.godbolt.org/z/sqr5T4ffK
23
+ * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
24
+ * See: https://reviews.llvm.org/D62609
25
+ */
26
+ .pushsection .note.gnu.property, "a"
27
+ .p2align 3
28
+ .long 4 /* size of the name - "GNU\0" */
29
+ .long 0x10 /* size of descriptor */
30
+ .long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */
31
+ .asciz "GNU"
32
+ .long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
33
+ .long 4 /* pr_datasz - 4 bytes */
34
+ .long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
35
+ .p2align 3 /* pr_padding - bring everything to 8 byte alignment */
36
+ .popsection
37
+ #endif
38
+
39
+ #endif
40
+
41
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
42
+
43
+ /* Calling convention:
44
+ *
45
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
46
+ * %rbp isn't maintained (no frame pointer).
47
+ * %rsp contains the stack pointer that grows down.
48
+ * No red-zone is assumed, only addresses >= %rsp are used.
49
+ * All register contents are preserved.
50
+ *
51
+ * TODO: Support Windows calling convention.
52
+ */
53
+
54
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
55
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
56
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
57
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
58
+ .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
59
+ .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
60
+ .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
61
+ .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
62
+ .text
63
+
64
+ /* Sets up register mappings for clarity.
65
+ * op[], bits[], dtable & ip[0] each get their own register.
66
+ * ip[1,2,3] & olimit alias var[].
67
+ * %rax is a scratch register.
68
+ */
69
+
70
+ #define op0 rsi
71
+ #define op1 rbx
72
+ #define op2 rcx
73
+ #define op3 rdi
74
+
75
+ #define ip0 r8
76
+ #define ip1 r9
77
+ #define ip2 r10
78
+ #define ip3 r11
79
+
80
+ #define bits0 rbp
81
+ #define bits1 rdx
82
+ #define bits2 r12
83
+ #define bits3 r13
84
+ #define dtable r14
85
+ #define olimit r15
86
+
87
+ /* var[] aliases ip[1,2,3] & olimit
88
+ * ip[1,2,3] are saved every iteration.
89
+ * olimit is only used in compute_olimit.
90
+ */
91
+ #define var0 r15
92
+ #define var1 r9
93
+ #define var2 r10
94
+ #define var3 r11
95
+
96
+ /* 32-bit var registers */
97
+ #define vard0 r15d
98
+ #define vard1 r9d
99
+ #define vard2 r10d
100
+ #define vard3 r11d
101
+
102
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
103
+ #define FOR_EACH_STREAM(X) \
104
+ X(0); \
105
+ X(1); \
106
+ X(2); \
107
+ X(3)
108
+
109
+ /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
110
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
111
+ X(0, idx); \
112
+ X(1, idx); \
113
+ X(2, idx); \
114
+ X(3, idx)
115
+
116
+ /* Define both _HUF_* & HUF_* symbols because MacOS
117
+ * C symbols are prefixed with '_' & Linux symbols aren't.
118
+ */
119
+ _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
120
+ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
121
+ ZSTD_CET_ENDBRANCH
122
+ /* Save all registers - even the caller-saved ones - for simplicity. */
123
+ push %rax
124
+ push %rbx
125
+ push %rcx
126
+ push %rdx
127
+ push %rbp
128
+ push %rsi
129
+ push %rdi
130
+ push %r8
131
+ push %r9
132
+ push %r10
133
+ push %r11
134
+ push %r12
135
+ push %r13
136
+ push %r14
137
+ push %r15
138
+
139
+ /* Read HUF_DecompressAsmArgs* args from %rax */
140
+ movq %rdi, %rax
141
+ movq 0(%rax), %ip0
142
+ movq 8(%rax), %ip1
143
+ movq 16(%rax), %ip2
144
+ movq 24(%rax), %ip3
145
+ movq 32(%rax), %op0
146
+ movq 40(%rax), %op1
147
+ movq 48(%rax), %op2
148
+ movq 56(%rax), %op3
149
+ movq 64(%rax), %bits0
150
+ movq 72(%rax), %bits1
151
+ movq 80(%rax), %bits2
152
+ movq 88(%rax), %bits3
153
+ movq 96(%rax), %dtable
154
+ push %rax /* argument */
155
+ push 104(%rax) /* ilowest */
156
+ push 112(%rax) /* oend */
157
+ push %olimit /* olimit space */
158
+
159
+ subq $24, %rsp
160
+
161
+ .L_4X1_compute_olimit:
162
+ /* Computes how many iterations we can do safely
163
+ * %r15, %rax may be clobbered
164
+ * rbx, rdx must be saved
165
+ * op3 & ip0 mustn't be clobbered
166
+ */
167
+ movq %rbx, 0(%rsp)
168
+ movq %rdx, 8(%rsp)
169
+
170
+ movq 32(%rsp), %rax /* rax = oend */
171
+ subq %op3, %rax /* rax = oend - op3 */
172
+
173
+ /* r15 = (oend - op3) / 5 */
174
+ movabsq $-3689348814741910323, %rdx
175
+ mulq %rdx
176
+ movq %rdx, %r15
177
+ shrq $2, %r15
178
+
179
+ movq %ip0, %rax /* rax = ip0 */
180
+ movq 40(%rsp), %rdx /* rdx = ilowest */
181
+ subq %rdx, %rax /* rax = ip0 - ilowest */
182
+ movq %rax, %rbx /* rbx = ip0 - ilowest */
183
+
184
+ /* rdx = (ip0 - ilowest) / 7 */
185
+ movabsq $2635249153387078803, %rdx
186
+ mulq %rdx
187
+ subq %rdx, %rbx
188
+ shrq %rbx
189
+ addq %rbx, %rdx
190
+ shrq $2, %rdx
191
+
192
+ /* r15 = min(%rdx, %r15) */
193
+ cmpq %rdx, %r15
194
+ cmova %rdx, %r15
195
+
196
+ /* r15 = r15 * 5 */
197
+ leaq (%r15, %r15, 4), %r15
198
+
199
+ /* olimit = op3 + r15 */
200
+ addq %op3, %olimit
201
+
202
+ movq 8(%rsp), %rdx
203
+ movq 0(%rsp), %rbx
204
+
205
+ /* If (olimit == op3) not even one more iteration fits: go to exit */
206
+ movq %op3, %rax /* rax = op3 */
207
+ cmpq %rax, %olimit /* op3 == olimit */
208
+ je .L_4X1_exit
209
+
210
+ /* If (ip1 < ip0) go to exit */
211
+ cmpq %ip0, %ip1
212
+ jb .L_4X1_exit
213
+
214
+ /* If (ip2 < ip1) go to exit */
215
+ cmpq %ip1, %ip2
216
+ jb .L_4X1_exit
217
+
218
+ /* If (ip3 < ip2) go to exit */
219
+ cmpq %ip2, %ip3
220
+ jb .L_4X1_exit
221
+
222
+ /* Reads top 11 bits from bits[n]
223
+ * Loads dt[bits[n]] into var[n]
224
+ */
225
+ #define GET_NEXT_DELT(n) \
226
+ movq $53, %var##n; \
227
+ shrxq %var##n, %bits##n, %var##n; \
228
+ movzwl (%dtable,%var##n,2),%vard##n
229
+
230
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
231
+ * Moves var[n] to %rax
232
+ * bits[n] <<= var[n] & 63
233
+ * op[n][idx] = %rax >> 8
234
+ * %ah is a way to access bits [8, 16) of %rax
235
+ */
236
+ #define DECODE_FROM_DELT(n, idx) \
237
+ movq %var##n, %rax; \
238
+ shlxq %var##n, %bits##n, %bits##n; \
239
+ movb %ah, idx(%op##n)
240
+
241
+ /* Assumes GET_NEXT_DELT has been called.
242
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
243
+ */
244
+ #define DECODE_AND_GET_NEXT(n, idx) \
245
+ DECODE_FROM_DELT(n, idx); \
246
+ GET_NEXT_DELT(n) \
247
+
248
+ /* // ctz & nbBytes is stored in bits[n]
249
+ * // nbBits is stored in %rax
250
+ * ctz = CTZ[bits[n]]
251
+ * nbBits = ctz & 7
252
+ * nbBytes = ctz >> 3
253
+ * op[n] += 5
254
+ * ip[n] -= nbBytes
255
+ * // Note: x86-64 is little-endian ==> no bswap
256
+ * bits[n] = MEM_readST(ip[n]) | 1
257
+ * bits[n] <<= nbBits
258
+ */
259
+ #define RELOAD_BITS(n) \
260
+ bsfq %bits##n, %bits##n; \
261
+ movq %bits##n, %rax; \
262
+ andq $7, %rax; \
263
+ shrq $3, %bits##n; \
264
+ leaq 5(%op##n), %op##n; \
265
+ subq %bits##n, %ip##n; \
266
+ movq (%ip##n), %bits##n; \
267
+ orq $1, %bits##n; \
268
+ shlx %rax, %bits##n, %bits##n
269
+
270
+ /* Store clobbered variables on the stack */
271
+ movq %olimit, 24(%rsp)
272
+ movq %ip1, 0(%rsp)
273
+ movq %ip2, 8(%rsp)
274
+ movq %ip3, 16(%rsp)
275
+
276
+ /* Call GET_NEXT_DELT for each stream */
277
+ FOR_EACH_STREAM(GET_NEXT_DELT)
278
+
279
+ .p2align 6
280
+
281
+ .L_4X1_loop_body:
282
+ /* Decode 5 symbols in each of the 4 streams (20 total)
283
+ * Must have called GET_NEXT_DELT for each stream
284
+ */
285
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
286
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
287
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
288
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
289
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
290
+
291
+ /* Load ip[1,2,3] from stack (var[] aliases them)
292
+ * ip[] is needed for RELOAD_BITS
293
+ * Each will be stored back to the stack after RELOAD
294
+ */
295
+ movq 0(%rsp), %ip1
296
+ movq 8(%rsp), %ip2
297
+ movq 16(%rsp), %ip3
298
+
299
+ /* Reload each stream & fetch the next table entry
300
+ * to prepare for the next iteration
301
+ */
302
+ RELOAD_BITS(0)
303
+ GET_NEXT_DELT(0)
304
+
305
+ RELOAD_BITS(1)
306
+ movq %ip1, 0(%rsp)
307
+ GET_NEXT_DELT(1)
308
+
309
+ RELOAD_BITS(2)
310
+ movq %ip2, 8(%rsp)
311
+ GET_NEXT_DELT(2)
312
+
313
+ RELOAD_BITS(3)
314
+ movq %ip3, 16(%rsp)
315
+ GET_NEXT_DELT(3)
316
+
317
+ /* If op3 < olimit: continue the loop */
318
+ cmp %op3, 24(%rsp)
319
+ ja .L_4X1_loop_body
320
+
321
+ /* Reload ip[1,2,3] from stack */
322
+ movq 0(%rsp), %ip1
323
+ movq 8(%rsp), %ip2
324
+ movq 16(%rsp), %ip3
325
+
326
+ /* Re-compute olimit */
327
+ jmp .L_4X1_compute_olimit
328
+
329
+ #undef GET_NEXT_DELT
330
+ #undef DECODE_FROM_DELT
331
+ #undef DECODE
332
+ #undef RELOAD_BITS
333
+ .L_4X1_exit:
334
+ addq $24, %rsp
335
+
336
+ /* Restore stack (olimit, oend, ilowest & arg); arg is left in %rax */
337
+ pop %rax /* olimit */
338
+ pop %rax /* oend */
339
+ pop %rax /* ilowest */
340
+ pop %rax /* arg */
341
+
342
+ /* Save ip / op / bits */
343
+ movq %ip0, 0(%rax)
344
+ movq %ip1, 8(%rax)
345
+ movq %ip2, 16(%rax)
346
+ movq %ip3, 24(%rax)
347
+ movq %op0, 32(%rax)
348
+ movq %op1, 40(%rax)
349
+ movq %op2, 48(%rax)
350
+ movq %op3, 56(%rax)
351
+ movq %bits0, 64(%rax)
352
+ movq %bits1, 72(%rax)
353
+ movq %bits2, 80(%rax)
354
+ movq %bits3, 88(%rax)
355
+
356
+ /* Restore registers */
357
+ pop %r15
358
+ pop %r14
359
+ pop %r13
360
+ pop %r12
361
+ pop %r11
362
+ pop %r10
363
+ pop %r9
364
+ pop %r8
365
+ pop %rdi
366
+ pop %rsi
367
+ pop %rbp
368
+ pop %rdx
369
+ pop %rcx
370
+ pop %rbx
371
+ pop %rax
372
+ ret
373
+
374
+ _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
375
+ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
376
+ ZSTD_CET_ENDBRANCH
377
+ /* Save all registers - even the caller-saved ones - for simplicity. */
378
+ push %rax
379
+ push %rbx
380
+ push %rcx
381
+ push %rdx
382
+ push %rbp
383
+ push %rsi
384
+ push %rdi
385
+ push %r8
386
+ push %r9
387
+ push %r10
388
+ push %r11
389
+ push %r12
390
+ push %r13
391
+ push %r14
392
+ push %r15
393
+
394
+ movq %rdi, %rax
395
+ movq 0(%rax), %ip0
396
+ movq 8(%rax), %ip1
397
+ movq 16(%rax), %ip2
398
+ movq 24(%rax), %ip3
399
+ movq 32(%rax), %op0
400
+ movq 40(%rax), %op1
401
+ movq 48(%rax), %op2
402
+ movq 56(%rax), %op3
403
+ movq 64(%rax), %bits0
404
+ movq 72(%rax), %bits1
405
+ movq 80(%rax), %bits2
406
+ movq 88(%rax), %bits3
407
+ movq 96(%rax), %dtable
408
+ push %rax /* argument */
409
+ push %rax /* olimit */
410
+ push 104(%rax) /* ilowest */
411
+
412
+ movq 112(%rax), %rax
413
+ push %rax /* oend3 */
414
+
415
+ movq %op3, %rax
416
+ push %rax /* oend2 */
417
+
418
+ movq %op2, %rax
419
+ push %rax /* oend1 */
420
+
421
+ movq %op1, %rax
422
+ push %rax /* oend0 */
423
+
424
+ /* Scratch space */
425
+ subq $8, %rsp
426
+
427
+ .L_4X2_compute_olimit:
428
+ /* Computes how many iterations we can do safely
429
+ * %r15, %rax may be clobbered
430
+ * rdx must be saved
431
+ * op[0,1,2,3] & ip0 mustn't be clobbered
432
+ */
433
+ movq %rdx, 0(%rsp)
434
+
435
+ /* We can consume up to 7 input bytes each iteration. */
436
+ movq %ip0, %rax /* rax = ip0 */
437
+ movq 40(%rsp), %rdx /* rdx = ilowest */
438
+ subq %rdx, %rax /* rax = ip0 - ilowest */
439
+ movq %rax, %r15 /* r15 = ip0 - ilowest */
440
+
441
+ /* rdx = rax / 7 */
442
+ movabsq $2635249153387078803, %rdx
443
+ mulq %rdx
444
+ subq %rdx, %r15
445
+ shrq %r15
446
+ addq %r15, %rdx
447
+ shrq $2, %rdx
448
+
449
+ /* r15 = (ip0 - ilowest) / 7 */
450
+ movq %rdx, %r15
451
+
452
+ /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
453
+ movq 8(%rsp), %rax /* rax = oend0 */
454
+ subq %op0, %rax /* rax = oend0 - op0 */
455
+ movq 16(%rsp), %rdx /* rdx = oend1 */
456
+ subq %op1, %rdx /* rdx = oend1 - op1 */
457
+
458
+ cmpq %rax, %rdx
459
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
460
+
461
+ movq 24(%rsp), %rax /* rax = oend2 */
462
+ subq %op2, %rax /* rax = oend2 - op2 */
463
+
464
+ cmpq %rax, %rdx
465
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
466
+
467
+ movq 32(%rsp), %rax /* rax = oend3 */
468
+ subq %op3, %rax /* rax = oend3 - op3 */
469
+
470
+ cmpq %rax, %rdx
471
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
472
+
473
+ movabsq $-3689348814741910323, %rax
474
+ mulq %rdx
475
+ shrq $3, %rdx /* rdx = rdx / 10 */
476
+
477
+ /* r15 = min(%rdx, %r15) */
478
+ cmpq %rdx, %r15
479
+ cmova %rdx, %r15
480
+
481
+ /* olimit = op3 + 5 * r15 */
482
+ movq %r15, %rax
483
+ leaq (%op3, %rax, 4), %olimit
484
+ addq %rax, %olimit
485
+
486
+ movq 0(%rsp), %rdx
487
+
488
+ /* If (olimit == op3) not even one more iteration fits: go to exit */
489
+ movq %op3, %rax /* rax = op3 */
490
+ cmpq %rax, %olimit /* op3 == olimit */
491
+ je .L_4X2_exit
492
+
493
+ /* If (ip1 < ip0) go to exit */
494
+ cmpq %ip0, %ip1
495
+ jb .L_4X2_exit
496
+
497
+ /* If (ip2 < ip1) go to exit */
498
+ cmpq %ip1, %ip2
499
+ jb .L_4X2_exit
500
+
501
+ /* If (ip3 < ip2) go to exit */
502
+ cmpq %ip2, %ip3
503
+ jb .L_4X2_exit
504
+
505
+ #define DECODE(n, idx) \
506
+ movq %bits##n, %rax; \
507
+ shrq $53, %rax; \
508
+ movzwl 0(%dtable,%rax,4),%r8d; \
509
+ movzbl 2(%dtable,%rax,4),%r15d; \
510
+ movzbl 3(%dtable,%rax,4),%eax; \
511
+ movw %r8w, (%op##n); \
512
+ shlxq %r15, %bits##n, %bits##n; \
513
+ addq %rax, %op##n
514
+
515
+ #define RELOAD_BITS(n) \
516
+ bsfq %bits##n, %bits##n; \
517
+ movq %bits##n, %rax; \
518
+ shrq $3, %bits##n; \
519
+ andq $7, %rax; \
520
+ subq %bits##n, %ip##n; \
521
+ movq (%ip##n), %bits##n; \
522
+ orq $1, %bits##n; \
523
+ shlxq %rax, %bits##n, %bits##n
524
+
525
+
526
+ movq %olimit, 48(%rsp)
527
+
528
+ .p2align 6
529
+
530
+ .L_4X2_loop_body:
531
+ /* We clobber r8, so store it on the stack */
532
+ movq %r8, 0(%rsp)
533
+
534
+ /* Run 5 DECODE steps on each of the 4 streams (20 steps total); each step stores 2 bytes and advances op[n] by the length read from the DTable entry. */
535
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
536
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
537
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
538
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
539
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
540
+
541
+ /* Reload r8 */
542
+ movq 0(%rsp), %r8
543
+
544
+ FOR_EACH_STREAM(RELOAD_BITS)
545
+
546
+ cmp %op3, 48(%rsp)
547
+ ja .L_4X2_loop_body
548
+ jmp .L_4X2_compute_olimit
549
+
550
+ #undef DECODE
551
+ #undef RELOAD_BITS
552
+ .L_4X2_exit:
553
+ addq $8, %rsp
554
+ /* Restore stack (oend[0-3], ilowest, olimit & arg); arg is left in %rax */
555
+ pop %rax /* oend0 */
556
+ pop %rax /* oend1 */
557
+ pop %rax /* oend2 */
558
+ pop %rax /* oend3 */
559
+ pop %rax /* ilowest */
560
+ pop %rax /* olimit */
561
+ pop %rax /* arg */
562
+
563
+ /* Save ip / op / bits */
564
+ movq %ip0, 0(%rax)
565
+ movq %ip1, 8(%rax)
566
+ movq %ip2, 16(%rax)
567
+ movq %ip3, 24(%rax)
568
+ movq %op0, 32(%rax)
569
+ movq %op1, 40(%rax)
570
+ movq %op2, 48(%rax)
571
+ movq %op3, 56(%rax)
572
+ movq %bits0, 64(%rax)
573
+ movq %bits1, 72(%rax)
574
+ movq %bits2, 80(%rax)
575
+ movq %bits3, 88(%rax)
576
+
577
+ /* Restore registers */
578
+ pop %r15
579
+ pop %r14
580
+ pop %r13
581
+ pop %r12
582
+ pop %r11
583
+ pop %r10
584
+ pop %r9
585
+ pop %r8
586
+ pop %rdi
587
+ pop %rsi
588
+ pop %rbp
589
+ pop %rdx
590
+ pop %rcx
591
+ pop %rbx
592
+ pop %rax
593
+ ret
594
+
595
+ #endif