zstdlib 0.13.0-x86-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. checksums.yaml +7 -0
  2. data/.yardopts +6 -0
  3. data/CHANGES.md +107 -0
  4. data/Gemfile +3 -0
  5. data/README.md +107 -0
  6. data/Rakefile +59 -0
  7. data/ext/zstdlib_c/extconf.rb +59 -0
  8. data/ext/zstdlib_c/ruby/zlib-2.2/zstdlib.c +4675 -0
  9. data/ext/zstdlib_c/ruby/zlib-2.3/zstdlib.c +4702 -0
  10. data/ext/zstdlib_c/ruby/zlib-2.4/zstdlib.c +4859 -0
  11. data/ext/zstdlib_c/ruby/zlib-2.5/zstdlib.c +4864 -0
  12. data/ext/zstdlib_c/ruby/zlib-2.6/zstdlib.c +4906 -0
  13. data/ext/zstdlib_c/ruby/zlib-2.7/zstdlib.c +4895 -0
  14. data/ext/zstdlib_c/ruby/zlib-3.0/zstdlib.c +4994 -0
  15. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  16. data/ext/zstdlib_c/ruby/zlib-3.2/zstdlib.c +5090 -0
  17. data/ext/zstdlib_c/ruby/zlib-3.3/zstdlib.c +5090 -0
  18. data/ext/zstdlib_c/zlib-1.3.1/adler32.c +164 -0
  19. data/ext/zstdlib_c/zlib-1.3.1/compress.c +75 -0
  20. data/ext/zstdlib_c/zlib-1.3.1/crc32.c +1049 -0
  21. data/ext/zstdlib_c/zlib-1.3.1/crc32.h +9446 -0
  22. data/ext/zstdlib_c/zlib-1.3.1/deflate.c +2139 -0
  23. data/ext/zstdlib_c/zlib-1.3.1/deflate.h +377 -0
  24. data/ext/zstdlib_c/zlib-1.3.1/gzclose.c +23 -0
  25. data/ext/zstdlib_c/zlib-1.3.1/gzguts.h +214 -0
  26. data/ext/zstdlib_c/zlib-1.3.1/gzlib.c +582 -0
  27. data/ext/zstdlib_c/zlib-1.3.1/gzread.c +602 -0
  28. data/ext/zstdlib_c/zlib-1.3.1/gzwrite.c +631 -0
  29. data/ext/zstdlib_c/zlib-1.3.1/infback.c +628 -0
  30. data/ext/zstdlib_c/zlib-1.3.1/inffast.c +320 -0
  31. data/ext/zstdlib_c/zlib-1.3.1/inffast.h +11 -0
  32. data/ext/zstdlib_c/zlib-1.3.1/inffixed.h +94 -0
  33. data/ext/zstdlib_c/zlib-1.3.1/inflate.c +1526 -0
  34. data/ext/zstdlib_c/zlib-1.3.1/inflate.h +126 -0
  35. data/ext/zstdlib_c/zlib-1.3.1/inftrees.c +299 -0
  36. data/ext/zstdlib_c/zlib-1.3.1/inftrees.h +62 -0
  37. data/ext/zstdlib_c/zlib-1.3.1/trees.c +1117 -0
  38. data/ext/zstdlib_c/zlib-1.3.1/trees.h +128 -0
  39. data/ext/zstdlib_c/zlib-1.3.1/uncompr.c +85 -0
  40. data/ext/zstdlib_c/zlib-1.3.1/zconf.h +543 -0
  41. data/ext/zstdlib_c/zlib-1.3.1/zlib.h +1938 -0
  42. data/ext/zstdlib_c/zlib-1.3.1/zutil.c +299 -0
  43. data/ext/zstdlib_c/zlib-1.3.1/zutil.h +254 -0
  44. data/ext/zstdlib_c/zlib.mk +14 -0
  45. data/ext/zstdlib_c/zlibwrapper/zlibwrapper.c +10 -0
  46. data/ext/zstdlib_c/zlibwrapper.mk +14 -0
  47. data/ext/zstdlib_c/zstd-1.5.6/lib/common/allocations.h +55 -0
  48. data/ext/zstdlib_c/zstd-1.5.6/lib/common/bits.h +200 -0
  49. data/ext/zstdlib_c/zstd-1.5.6/lib/common/bitstream.h +457 -0
  50. data/ext/zstdlib_c/zstd-1.5.6/lib/common/compiler.h +450 -0
  51. data/ext/zstdlib_c/zstd-1.5.6/lib/common/cpu.h +249 -0
  52. data/ext/zstdlib_c/zstd-1.5.6/lib/common/debug.c +30 -0
  53. data/ext/zstdlib_c/zstd-1.5.6/lib/common/debug.h +116 -0
  54. data/ext/zstdlib_c/zstd-1.5.6/lib/common/entropy_common.c +340 -0
  55. data/ext/zstdlib_c/zstd-1.5.6/lib/common/error_private.c +63 -0
  56. data/ext/zstdlib_c/zstd-1.5.6/lib/common/error_private.h +168 -0
  57. data/ext/zstdlib_c/zstd-1.5.6/lib/common/fse.h +640 -0
  58. data/ext/zstdlib_c/zstd-1.5.6/lib/common/fse_decompress.c +313 -0
  59. data/ext/zstdlib_c/zstd-1.5.6/lib/common/huf.h +286 -0
  60. data/ext/zstdlib_c/zstd-1.5.6/lib/common/mem.h +426 -0
  61. data/ext/zstdlib_c/zstd-1.5.6/lib/common/pool.c +371 -0
  62. data/ext/zstdlib_c/zstd-1.5.6/lib/common/pool.h +90 -0
  63. data/ext/zstdlib_c/zstd-1.5.6/lib/common/portability_macros.h +158 -0
  64. data/ext/zstdlib_c/zstd-1.5.6/lib/common/threading.c +182 -0
  65. data/ext/zstdlib_c/zstd-1.5.6/lib/common/threading.h +150 -0
  66. data/ext/zstdlib_c/zstd-1.5.6/lib/common/xxhash.c +18 -0
  67. data/ext/zstdlib_c/zstd-1.5.6/lib/common/xxhash.h +7020 -0
  68. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_common.c +48 -0
  69. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_deps.h +111 -0
  70. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_internal.h +392 -0
  71. data/ext/zstdlib_c/zstd-1.5.6/lib/common/zstd_trace.h +163 -0
  72. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/clevels.h +134 -0
  73. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/fse_compress.c +625 -0
  74. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/hist.c +181 -0
  75. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/hist.h +75 -0
  76. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/huf_compress.c +1464 -0
  77. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress.c +7153 -0
  78. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_internal.h +1534 -0
  79. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_literals.c +235 -0
  80. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_literals.h +39 -0
  81. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_sequences.c +442 -0
  82. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_sequences.h +54 -0
  83. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_superblock.c +688 -0
  84. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_compress_superblock.h +32 -0
  85. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_cwksp.h +748 -0
  86. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_double_fast.c +770 -0
  87. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_double_fast.h +50 -0
  88. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_fast.c +968 -0
  89. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_fast.h +38 -0
  90. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_lazy.c +2199 -0
  91. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_lazy.h +202 -0
  92. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm.c +730 -0
  93. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm.h +117 -0
  94. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_ldm_geartab.h +106 -0
  95. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_opt.c +1576 -0
  96. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstd_opt.h +80 -0
  97. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstdmt_compress.c +1882 -0
  98. data/ext/zstdlib_c/zstd-1.5.6/lib/compress/zstdmt_compress.h +113 -0
  99. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/huf_decompress.c +1944 -0
  100. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/huf_decompress_amd64.S +595 -0
  101. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_ddict.c +244 -0
  102. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_ddict.h +44 -0
  103. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress.c +2407 -0
  104. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_block.c +2215 -0
  105. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_block.h +73 -0
  106. data/ext/zstdlib_c/zstd-1.5.6/lib/decompress/zstd_decompress_internal.h +240 -0
  107. data/ext/zstdlib_c/zstd-1.5.6/lib/zdict.h +474 -0
  108. data/ext/zstdlib_c/zstd-1.5.6/lib/zstd.h +3089 -0
  109. data/ext/zstdlib_c/zstd-1.5.6/lib/zstd_errors.h +114 -0
  110. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzclose.c +26 -0
  111. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzcompatibility.h +68 -0
  112. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzguts.h +229 -0
  113. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzlib.c +587 -0
  114. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzread.c +637 -0
  115. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/gzwrite.c +631 -0
  116. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/zstd_zlibwrapper.c +1200 -0
  117. data/ext/zstdlib_c/zstd-1.5.6/zlibWrapper/zstd_zlibwrapper.h +91 -0
  118. data/ext/zstdlib_c/zstd.mk +15 -0
  119. data/lib/2.4/zstdlib_c.so +0 -0
  120. data/lib/2.5/zstdlib_c.so +0 -0
  121. data/lib/2.6/zstdlib_c.so +0 -0
  122. data/lib/2.7/zstdlib_c.so +0 -0
  123. data/lib/3.0/zstdlib_c.so +0 -0
  124. data/lib/3.1/zstdlib_c.so +0 -0
  125. data/lib/3.2/zstdlib_c.so +0 -0
  126. data/lib/3.3/zstdlib_c.so +0 -0
  127. data/lib/zstdlib.rb +6 -0
  128. data/test/zstdlib_test.rb +21 -0
  129. metadata +243 -0
@@ -0,0 +1,595 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include "../common/portability_macros.h"
12
+
13
+ #if defined(__ELF__) && defined(__GNUC__)
14
+ /* Stack marking: the flag-less .note.GNU-stack section below declares that this object does not need an executable stack.
15
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
16
+ */
17
+ .section .note.GNU-stack,"",%progbits
18
+
19
+ #if defined(__aarch64__)
20
+ /* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
21
+ * See: https://github.com/facebook/zstd/issues/3841
22
+ * See: https://gcc.godbolt.org/z/sqr5T4ffK
23
+ * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
24
+ * See: https://reviews.llvm.org/D62609
25
+ */
26
+ .pushsection .note.gnu.property, "a"
27
+ .p2align 3 /* start the note on an 8-byte boundary */
28
+ .long 4 /* size of the name - "GNU\0" */
29
+ .long 0x10 /* size of descriptor: pr_type + pr_datasz + pr_data + padding = 16 bytes */
30
+ .long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */
31
+ .asciz "GNU"
32
+ .long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
33
+ .long 4 /* pr_datasz - 4 bytes */
34
+ .long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
35
+ .p2align 3 /* pr_padding - bring everything to 8 byte alignment */
36
+ .popsection
37
+ #endif
38
+
39
+ #endif
40
+
41
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
42
+
43
+ /* Calling convention:
44
+ *
45
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*. Both entry points read it as 8-byte slots: ip[0..3] at offsets 0-24, op[0..3] at 32-56, bits[0..3] at 64-88, dtable at 96, ilowest at 104, oend at 112.
46
+ * %rbp isn't maintained (no frame pointer); it is used as a data register (bits0).
47
+ * %rsp contains the stack pointer that grows down.
48
+ * No red-zone is assumed, only addresses >= %rsp are used.
49
+ * All register contents are preserved: every general-purpose register except %rsp is pushed on entry and popped before ret (flags are not saved). Results are returned by writing ip/op/bits back into the args struct.
50
+ *
51
+ * TODO: Support Windows calling convention.
52
+ */
53
+
54
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
55
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
56
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
57
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
58
+ .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
59
+ .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
60
+ .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
61
+ .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
62
+ .text
63
+
64
+ /* Sets up register mappings for clarity.
65
+ * op[], bits[], dtable & ip[0] each get their own register.
66
+ * ip[1,2,3] & olimit alias var[] (var0 shares r15 with olimit, var1-3 share r9-r11 with ip1-3).
67
+ * %rax is a scratch register.
68
+ */
69
+
70
+ #define op0 rsi
71
+ #define op1 rbx
72
+ #define op2 rcx
73
+ #define op3 rdi /* also the incoming argument register; the entry points copy it to %rax before loading op3 */
74
+
75
+ #define ip0 r8
76
+ #define ip1 r9
77
+ #define ip2 r10
78
+ #define ip3 r11
79
+
80
+ #define bits0 rbp
81
+ #define bits1 rdx
82
+ #define bits2 r12
83
+ #define bits3 r13
84
+ #define dtable r14
85
+ #define olimit r15
86
+
87
+ /* var[] aliases ip[1,2,3] & olimit
88
+ * ip[1,2,3] are saved every iteration.
89
+ * olimit is only used in compute_olimit.
90
+ */
91
+ #define var0 r15
92
+ #define var1 r9
93
+ #define var2 r10
94
+ #define var3 r11
95
+
96
+ /* 32-bit views of the var registers (used as the zero-extending load destination in GET_NEXT_DELT) */
97
+ #define vard0 r15d
98
+ #define vard1 r9d
99
+ #define vard2 r10d
100
+ #define vard3 r11d
101
+
102
+ /* Calls X(N) for each stream 0, 1, 2, 3. The expansions are joined with ';' so they form one line of statements. */
103
+ #define FOR_EACH_STREAM(X) \
104
+ X(0); \
105
+ X(1); \
106
+ X(2); \
107
+ X(3)
108
+
109
+ /* Calls X(N, idx) for each stream 0, 1, 2, 3, passing the same idx to every stream. */
110
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
111
+ X(0, idx); \
112
+ X(1, idx); \
113
+ X(2, idx); \
114
+ X(3, idx)
115
+
116
+ /* 4X1 fast loop: loads ip/op/bits/dtable from the args struct, decodes 5 bytes per stream per iteration across 4 streams until no full iteration fits or the ip ordering checks fail, then writes ip/op/bits back. Define both _HUF_* & HUF_* symbols because MacOS
117
+ * C symbols are prefixed with '_' & Linux symbols aren't.
118
+ */
119
+ _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
120
+ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
121
+ ZSTD_CET_ENDBRANCH
122
+ /* Save all registers - even if they are callee saved for simplicity. */
123
+ push %rax
124
+ push %rbx
125
+ push %rcx
126
+ push %rdx
127
+ push %rbp
128
+ push %rsi
129
+ push %rdi
130
+ push %r8
131
+ push %r9
132
+ push %r10
133
+ push %r11
134
+ push %r12
135
+ push %r13
136
+ push %r14
137
+ push %r15
138
+
139
+ /* Copy the HUF_DecompressAsmArgs* to %rax (since %rdi is about to become op3) and load the fields through %rax */
140
+ movq %rdi, %rax
141
+ movq 0(%rax), %ip0
142
+ movq 8(%rax), %ip1
143
+ movq 16(%rax), %ip2
144
+ movq 24(%rax), %ip3
145
+ movq 32(%rax), %op0
146
+ movq 40(%rax), %op1
147
+ movq 48(%rax), %op2
148
+ movq 56(%rax), %op3
149
+ movq 64(%rax), %bits0
150
+ movq 72(%rax), %bits1
151
+ movq 80(%rax), %bits2
152
+ movq 88(%rax), %bits3
153
+ movq 96(%rax), %dtable
154
+ push %rax /* argument */
155
+ push 104(%rax) /* ilowest */
156
+ push 112(%rax) /* oend */
157
+ push %olimit /* olimit space: the pushed value is a placeholder, the slot is written before the loop */
158
+
159
+ subq $24, %rsp /* 3 scratch slots. Layout from here on: 0/8/16(%rsp) = scratch, 24 = olimit, 32 = oend, 40 = ilowest, 48 = args pointer */
160
+
161
+ .L_4X1_compute_olimit:
162
+ /* Computes how many iterations we can do safely
163
+ * %r15, %rax may be clobbered
164
+ * rbx, rdx must be saved (they hold op1 and bits1); the 0/8(%rsp) slots used for that are shared with the ip1/ip2 spill slots, which is safe because ip[1,2,3] are in registers whenever this label is reached
165
+ * op3 & ip0 mustn't be clobbered
166
+ */
167
+ movq %rbx, 0(%rsp)
168
+ movq %rdx, 8(%rsp)
169
+
170
+ movq 32(%rsp), %rax /* rax = oend */
171
+ subq %op3, %rax /* rax = oend - op3 */
172
+
173
+ /* r15 = (oend - op3) / 5, as an unsigned multiply by 0xCCCCCCCCCCCCCCCD (= ceil(2^66 / 5)) keeping the high 64 bits, then >> 2 */
174
+ movabsq $-3689348814741910323, %rdx
175
+ mulq %rdx
176
+ movq %rdx, %r15
177
+ shrq $2, %r15
178
+
179
+ movq %ip0, %rax /* rax = ip0 */
180
+ movq 40(%rsp), %rdx /* rdx = ilowest */
181
+ subq %rdx, %rax /* rax = ip0 - ilowest */
182
+ movq %rax, %rbx /* rbx = ip0 - ilowest */
183
+
184
+ /* rdx = (ip0 - ilowest) / 7: t = high64(x * 0x2492492492492493), then ((x - t) / 2 + t) >> 2. The fix-up steps stand in for a 65-bit multiplier (2^64 + constant = ceil(2^67 / 7)). */
185
+ movabsq $2635249153387078803, %rdx
186
+ mulq %rdx
187
+ subq %rdx, %rbx
188
+ shrq %rbx
189
+ addq %rbx, %rdx
190
+ shrq $2, %rdx
191
+
192
+ /* r15 = min(%rdx, %r15), unsigned */
193
+ cmpq %rdx, %r15
194
+ cmova %rdx, %r15
195
+
196
+ /* r15 = r15 * 5 (5 output bytes per stream per iteration) */
197
+ leaq (%r15, %r15, 4), %r15
198
+
199
+ /* olimit = op3 + r15 (olimit is %r15) */
200
+ addq %op3, %olimit
201
+
202
+ movq 8(%rsp), %rdx
203
+ movq 0(%rsp), %rbx
204
+
205
+ /* Exit if not even one full iteration fits, i.e. the iteration count was 0 and olimit == op3 */
206
+ movq %op3, %rax /* rax = op3 */
207
+ cmpq %rax, %olimit /* op3 == olimit */
208
+ je .L_4X1_exit
209
+
210
+ /* If (ip1 < ip0) go to exit */
211
+ cmpq %ip0, %ip1
212
+ jb .L_4X1_exit
213
+
214
+ /* If (ip2 < ip1) go to exit */
215
+ cmpq %ip1, %ip2
216
+ jb .L_4X1_exit
217
+
218
+ /* If (ip3 < ip2) go to exit */
219
+ cmpq %ip2, %ip3
220
+ jb .L_4X1_exit
221
+
222
+ /* Reads top 11 bits from bits[n] (logical shift right by 53)
223
+ * Loads dt[bits[n]] into var[n] (2-byte entry, zero-extended)
224
+ */
225
+ #define GET_NEXT_DELT(n) \
226
+ movq $53, %var##n; \
227
+ shrxq %var##n, %bits##n, %var##n; \
228
+ movzwl (%dtable,%var##n,2),%vard##n
229
+
230
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
231
+ * Moves var[n] to %rax
232
+ * bits[n] <<= var[n] & 63
233
+ * op[n][idx] = %rax >> 8
234
+ * %ah is a way to access bits [8, 16) of %rax
235
+ */
236
+ #define DECODE_FROM_DELT(n, idx) \
237
+ movq %var##n, %rax; \
238
+ shlxq %var##n, %bits##n, %bits##n; \
239
+ movb %ah, idx(%op##n)
240
+
241
+ /* Assumes GET_NEXT_DELT has been called.
242
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
243
+ */
244
+ #define DECODE_AND_GET_NEXT(n, idx) \
245
+ DECODE_FROM_DELT(n, idx); \
246
+ GET_NEXT_DELT(n) \
247
+
248
+ /* // ctz & nbBytes is stored in bits[n]
249
+ * // nbBits is stored in %rax
250
+ * ctz = CTZ[bits[n]]
251
+ * nbBits = ctz & 7
252
+ * nbBytes = ctz >> 3
253
+ * op[n] += 5
254
+ * ip[n] -= nbBytes
255
+ * // Note: x86-64 is little-endian ==> no bswap
256
+ * bits[n] = MEM_readST(ip[n]) | 1
257
+ * bits[n] <<= nbBits
258
+ */
259
+ #define RELOAD_BITS(n) \
260
+ bsfq %bits##n, %bits##n; \
261
+ movq %bits##n, %rax; \
262
+ andq $7, %rax; \
263
+ shrq $3, %bits##n; \
264
+ leaq 5(%op##n), %op##n; \
265
+ subq %bits##n, %ip##n; \
266
+ movq (%ip##n), %bits##n; \
267
+ orq $1, %bits##n; \
268
+ shlx %rax, %bits##n, %bits##n
269
+
270
+ /* Store clobbered variables on the stack: the var[] registers used below overwrite olimit and ip[1,2,3] */
271
+ movq %olimit, 24(%rsp)
272
+ movq %ip1, 0(%rsp)
273
+ movq %ip2, 8(%rsp)
274
+ movq %ip3, 16(%rsp)
275
+
276
+ /* Call GET_NEXT_DELT for each stream */
277
+ FOR_EACH_STREAM(GET_NEXT_DELT)
278
+
279
+ .p2align 6 /* align the loop entry to 64 bytes */
280
+
281
+ .L_4X1_loop_body:
282
+ /* Decode 5 symbols in each of the 4 streams (20 total)
283
+ * Must have called GET_NEXT_DELT for each stream
284
+ */
285
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
286
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
287
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
288
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
289
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
290
+
291
+ /* Load ip[1,2,3] from stack (var[] aliases them)
292
+ * ip[] is needed for RELOAD_BITS
293
+ * Each will be stored back to the stack after RELOAD
294
+ */
295
+ movq 0(%rsp), %ip1
296
+ movq 8(%rsp), %ip2
297
+ movq 16(%rsp), %ip3
298
+
299
+ /* Reload each stream & fetch the next table entry
300
+ * to prepare for the next iteration
301
+ */
302
+ RELOAD_BITS(0)
303
+ GET_NEXT_DELT(0)
304
+
305
+ RELOAD_BITS(1)
306
+ movq %ip1, 0(%rsp) /* spill ip1 before GET_NEXT_DELT(1) overwrites the shared register with var1 */
307
+ GET_NEXT_DELT(1)
308
+
309
+ RELOAD_BITS(2)
310
+ movq %ip2, 8(%rsp) /* spill ip2 before var2 overwrites it */
311
+ GET_NEXT_DELT(2)
312
+
313
+ RELOAD_BITS(3)
314
+ movq %ip3, 16(%rsp) /* spill ip3 before var3 overwrites it */
315
+ GET_NEXT_DELT(3)
316
+
317
+ /* If op3 < olimit: continue the loop (olimit is read from its stack slot because %r15 currently holds var0) */
318
+ cmp %op3, 24(%rsp)
319
+ ja .L_4X1_loop_body
320
+
321
+ /* Reload ip[1,2,3] from stack */
322
+ movq 0(%rsp), %ip1
323
+ movq 8(%rsp), %ip2
324
+ movq 16(%rsp), %ip3
325
+
326
+ /* Re-compute olimit */
327
+ jmp .L_4X1_compute_olimit
328
+
329
+ #undef GET_NEXT_DELT
330
+ #undef DECODE_FROM_DELT
331
+ #undef DECODE /* NOTE(review): DECODE is not defined in this section and DECODE_AND_GET_NEXT is left defined; harmless, but DECODE_AND_GET_NEXT may have been intended */
332
+ #undef RELOAD_BITS
333
+ .L_4X1_exit:
334
+ addq $24, %rsp /* drop the 3 scratch slots */
335
+
336
+ /* Drop the four pushed slots; the last pop leaves the args pointer in %rax */
337
+ pop %rax /* olimit */
338
+ pop %rax /* oend */
339
+ pop %rax /* ilowest */
340
+ pop %rax /* arg */
341
+
342
+ /* Save ip / op / bits back into the args struct */
343
+ movq %ip0, 0(%rax)
344
+ movq %ip1, 8(%rax)
345
+ movq %ip2, 16(%rax)
346
+ movq %ip3, 24(%rax)
347
+ movq %op0, 32(%rax)
348
+ movq %op1, 40(%rax)
349
+ movq %op2, 48(%rax)
350
+ movq %op3, 56(%rax)
351
+ movq %bits0, 64(%rax)
352
+ movq %bits1, 72(%rax)
353
+ movq %bits2, 80(%rax)
354
+ movq %bits3, 88(%rax)
355
+
356
+ /* Restore registers (reverse of the push order at entry) */
357
+ pop %r15
358
+ pop %r14
359
+ pop %r13
360
+ pop %r12
361
+ pop %r11
362
+ pop %r10
363
+ pop %r9
364
+ pop %r8
365
+ pop %rdi
366
+ pop %rsi
367
+ pop %rbp
368
+ pop %rdx
369
+ pop %rcx
370
+ pop %rbx
371
+ pop %rax
372
+ ret
373
+
374
+ _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
375
+ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
376
+ ZSTD_CET_ENDBRANCH
377
+ /* 4X2 fast loop entry: loads ip/op/bits/dtable from the args struct, runs the 4-stream decode loop, then writes ip/op/bits back. Save all registers - even if they are callee saved for simplicity. */
378
+ push %rax
379
+ push %rbx
380
+ push %rcx
381
+ push %rdx
382
+ push %rbp
383
+ push %rsi
384
+ push %rdi
385
+ push %r8
386
+ push %r9
387
+ push %r10
388
+ push %r11
389
+ push %r12
390
+ push %r13
391
+ push %r14
392
+ push %r15
393
+
394
+ movq %rdi, %rax /* keep the args pointer in %rax: %rdi is about to become op3 */
395
+ movq 0(%rax), %ip0
396
+ movq 8(%rax), %ip1
397
+ movq 16(%rax), %ip2
398
+ movq 24(%rax), %ip3
399
+ movq 32(%rax), %op0
400
+ movq 40(%rax), %op1
401
+ movq 48(%rax), %op2
402
+ movq 56(%rax), %op3
403
+ movq 64(%rax), %bits0
404
+ movq 72(%rax), %bits1
405
+ movq 80(%rax), %bits2
406
+ movq 88(%rax), %bits3
407
+ movq 96(%rax), %dtable
408
+ push %rax /* argument */
409
+ push %rax /* olimit: the pushed value is a placeholder, the slot is written before the loop */
410
+ push 104(%rax) /* ilowest */
411
+
412
+ movq 112(%rax), %rax
413
+ push %rax /* oend3 = oend field of the args struct */
414
+
415
+ movq %op3, %rax
416
+ push %rax /* oend2 = initial op3 */
417
+
418
+ movq %op2, %rax
419
+ push %rax /* oend1 = initial op2 */
420
+
421
+ movq %op1, %rax
422
+ push %rax /* oend0 = initial op1 */
423
+
424
+ /* Scratch space. Layout from here on: 0(%rsp) = scratch, 8/16/24/32 = oend0..3, 40 = ilowest, 48 = olimit, 56 = args pointer */
425
+ subq $8, %rsp
426
+
427
+ .L_4X2_compute_olimit:
428
+ /* Computes how many iterations we can do safely
429
+ * %r15, %rax may be clobbered
430
+ * rdx must be saved (it holds bits1)
431
+ * op[0,1,2,3] & ip0 mustn't be clobbered
432
+ */
433
+ movq %rdx, 0(%rsp)
434
+
435
+ /* We can consume up to 7 input bytes each iteration. */
436
+ movq %ip0, %rax /* rax = ip0 */
437
+ movq 40(%rsp), %rdx /* rdx = ilowest */
438
+ subq %rdx, %rax /* rax = ip0 - ilowest */
439
+ movq %rax, %r15 /* r15 = ip0 - ilowest */
440
+
441
+ /* rdx = rax / 7: t = high64(x * 0x2492492492492493), then ((x - t) / 2 + t) >> 2. The fix-up steps stand in for a 65-bit multiplier (2^64 + constant = ceil(2^67 / 7)). */
442
+ movabsq $2635249153387078803, %rdx
443
+ mulq %rdx
444
+ subq %rdx, %r15
445
+ shrq %r15
446
+ addq %r15, %rdx
447
+ shrq $2, %rdx
448
+
449
+ /* r15 = (ip0 - ilowest) / 7 */
450
+ movq %rdx, %r15
451
+
452
+ /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
453
+ movq 8(%rsp), %rax /* rax = oend0 */
454
+ subq %op0, %rax /* rax = oend0 - op0 */
455
+ movq 16(%rsp), %rdx /* rdx = oend1 */
456
+ subq %op1, %rdx /* rdx = oend1 - op1 */
457
+
458
+ cmpq %rax, %rdx
459
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
460
+
461
+ movq 24(%rsp), %rax /* rax = oend2 */
462
+ subq %op2, %rax /* rax = oend2 - op2 */
463
+
464
+ cmpq %rax, %rdx
465
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
466
+
467
+ movq 32(%rsp), %rax /* rax = oend3 */
468
+ subq %op3, %rax /* rax = oend3 - op3 */
469
+
470
+ cmpq %rax, %rdx
471
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
472
+
473
+ movabsq $-3689348814741910323, %rax /* 0xCCCCCCCCCCCCCCCD = ceil(2^66 / 5); high 64 bits of the product >> 3 gives x / 10 */
474
+ mulq %rdx
475
+ shrq $3, %rdx /* rdx = rdx / 10 */
476
+
477
+ /* r15 = min(%rdx, %r15), unsigned */
478
+ cmpq %rdx, %r15
479
+ cmova %rdx, %r15
480
+
481
+ /* olimit = op3 + 5 * r15 */
482
+ movq %r15, %rax
483
+ leaq (%op3, %rax, 4), %olimit
484
+ addq %rax, %olimit
485
+
486
+ movq 0(%rsp), %rdx
487
+
488
+ /* Exit if not even one full iteration fits, i.e. the iteration count was 0 and olimit == op3 */
489
+ movq %op3, %rax /* rax = op3 */
490
+ cmpq %rax, %olimit /* op3 == olimit */
491
+ je .L_4X2_exit
492
+
493
+ /* If (ip1 < ip0) go to exit */
494
+ cmpq %ip0, %ip1
495
+ jb .L_4X2_exit
496
+
497
+ /* If (ip2 < ip1) go to exit */
498
+ cmpq %ip1, %ip2
499
+ jb .L_4X2_exit
500
+
501
+ /* If (ip3 < ip2) go to exit */
502
+ cmpq %ip2, %ip3
503
+ jb .L_4X2_exit
504
+ /* DECODE(n, idx): rax = top 11 bits of bits[n]; from the 4-byte DTable entry, store the 16-bit value at +0 to op[n], shift bits[n] left by the byte at +2, advance op[n] by the byte at +3. Clobbers %rax, %r8 and %r15; idx is unused. */
505
+ #define DECODE(n, idx) \
506
+ movq %bits##n, %rax; \
507
+ shrq $53, %rax; \
508
+ movzwl 0(%dtable,%rax,4),%r8d; \
509
+ movzbl 2(%dtable,%rax,4),%r15d; \
510
+ movzbl 3(%dtable,%rax,4),%eax; \
511
+ movw %r8w, (%op##n); \
512
+ shlxq %r15, %bits##n, %bits##n; \
513
+ addq %rax, %op##n
514
+ /* RELOAD_BITS(n): ctz = bsf(bits[n]); ip[n] -= ctz >> 3; bits[n] = (8-byte load from ip[n] | 1) << (ctz & 7). Clobbers %rax. */
515
+ #define RELOAD_BITS(n) \
516
+ bsfq %bits##n, %bits##n; \
517
+ movq %bits##n, %rax; \
518
+ shrq $3, %bits##n; \
519
+ andq $7, %rax; \
520
+ subq %bits##n, %ip##n; \
521
+ movq (%ip##n), %bits##n; \
522
+ orq $1, %bits##n; \
523
+ shlxq %rax, %bits##n, %bits##n
524
+
525
+
526
+ movq %olimit, 48(%rsp) /* keep olimit on the stack: DECODE uses %r15 as scratch */
527
+
528
+ .p2align 6 /* align the loop entry to 64 bytes */
529
+
530
+ .L_4X2_loop_body:
531
+ /* We clobber r8 (ip0) inside DECODE, so store it on the stack */
532
+ movq %r8, 0(%rsp)
533
+
534
+ /* Five DECODE rounds over the 4 streams (20 table lookups). Each DECODE stores 2 bytes and advances op[n] by the entry's byte at offset 3. */
535
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
536
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
537
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
538
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
539
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
540
+
541
+ /* Reload r8 (ip0), needed by RELOAD_BITS(0) */
542
+ movq 0(%rsp), %r8
543
+
544
+ FOR_EACH_STREAM(RELOAD_BITS)
545
+
546
+ cmp %op3, 48(%rsp) /* continue while olimit > op3 (unsigned) */
547
+ ja .L_4X2_loop_body
548
+ jmp .L_4X2_compute_olimit
549
+
550
+ #undef DECODE
551
+ #undef RELOAD_BITS
552
+ .L_4X2_exit:
553
+ addq $8, %rsp /* drop the scratch slot */
554
+ /* Drop the seven pushed slots; the last pop leaves the args pointer in %rax */
555
+ pop %rax /* oend0 */
556
+ pop %rax /* oend1 */
557
+ pop %rax /* oend2 */
558
+ pop %rax /* oend3 */
559
+ pop %rax /* ilowest */
560
+ pop %rax /* olimit */
561
+ pop %rax /* arg */
562
+
563
+ /* Save ip / op / bits back into the args struct */
564
+ movq %ip0, 0(%rax)
565
+ movq %ip1, 8(%rax)
566
+ movq %ip2, 16(%rax)
567
+ movq %ip3, 24(%rax)
568
+ movq %op0, 32(%rax)
569
+ movq %op1, 40(%rax)
570
+ movq %op2, 48(%rax)
571
+ movq %op3, 56(%rax)
572
+ movq %bits0, 64(%rax)
573
+ movq %bits1, 72(%rax)
574
+ movq %bits2, 80(%rax)
575
+ movq %bits3, 88(%rax)
576
+
577
+ /* Restore registers (reverse of the push order at entry) */
578
+ pop %r15
579
+ pop %r14
580
+ pop %r13
581
+ pop %r12
582
+ pop %r11
583
+ pop %r10
584
+ pop %r9
585
+ pop %r8
586
+ pop %rdi
587
+ pop %rsi
588
+ pop %rbp
589
+ pop %rdx
590
+ pop %rcx
591
+ pop %rbx
592
+ pop %rax
593
+ ret
594
+
595
+ #endif