vibe_zstd 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. checksums.yaml +7 -0
  2. data/.standard.yml +3 -0
  3. data/CHANGELOG.md +22 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +978 -0
  6. data/Rakefile +20 -0
  7. data/benchmark/README.md +198 -0
  8. data/benchmark/compression_levels.rb +99 -0
  9. data/benchmark/context_reuse.rb +174 -0
  10. data/benchmark/decompression_speed_by_level.rb +65 -0
  11. data/benchmark/dictionary_training.rb +182 -0
  12. data/benchmark/dictionary_usage.rb +121 -0
  13. data/benchmark/for_readme.rb +157 -0
  14. data/benchmark/generate_fixture.rb +82 -0
  15. data/benchmark/helpers.rb +237 -0
  16. data/benchmark/multithreading.rb +105 -0
  17. data/benchmark/run_all.rb +150 -0
  18. data/benchmark/streaming.rb +154 -0
  19. data/ext/vibe_zstd/Makefile +270 -0
  20. data/ext/vibe_zstd/cctx.c +565 -0
  21. data/ext/vibe_zstd/dctx.c +493 -0
  22. data/ext/vibe_zstd/dict.c +587 -0
  23. data/ext/vibe_zstd/extconf.rb +52 -0
  24. data/ext/vibe_zstd/frames.c +132 -0
  25. data/ext/vibe_zstd/libzstd/LICENSE +30 -0
  26. data/ext/vibe_zstd/libzstd/common/allocations.h +55 -0
  27. data/ext/vibe_zstd/libzstd/common/bits.h +205 -0
  28. data/ext/vibe_zstd/libzstd/common/bitstream.h +454 -0
  29. data/ext/vibe_zstd/libzstd/common/compiler.h +464 -0
  30. data/ext/vibe_zstd/libzstd/common/cpu.h +249 -0
  31. data/ext/vibe_zstd/libzstd/common/debug.c +30 -0
  32. data/ext/vibe_zstd/libzstd/common/debug.h +107 -0
  33. data/ext/vibe_zstd/libzstd/common/entropy_common.c +340 -0
  34. data/ext/vibe_zstd/libzstd/common/error_private.c +64 -0
  35. data/ext/vibe_zstd/libzstd/common/error_private.h +158 -0
  36. data/ext/vibe_zstd/libzstd/common/fse.h +625 -0
  37. data/ext/vibe_zstd/libzstd/common/fse_decompress.c +315 -0
  38. data/ext/vibe_zstd/libzstd/common/huf.h +277 -0
  39. data/ext/vibe_zstd/libzstd/common/mem.h +422 -0
  40. data/ext/vibe_zstd/libzstd/common/pool.c +371 -0
  41. data/ext/vibe_zstd/libzstd/common/pool.h +81 -0
  42. data/ext/vibe_zstd/libzstd/common/portability_macros.h +171 -0
  43. data/ext/vibe_zstd/libzstd/common/threading.c +182 -0
  44. data/ext/vibe_zstd/libzstd/common/threading.h +142 -0
  45. data/ext/vibe_zstd/libzstd/common/xxhash.c +18 -0
  46. data/ext/vibe_zstd/libzstd/common/xxhash.h +7094 -0
  47. data/ext/vibe_zstd/libzstd/common/zstd_common.c +48 -0
  48. data/ext/vibe_zstd/libzstd/common/zstd_deps.h +123 -0
  49. data/ext/vibe_zstd/libzstd/common/zstd_internal.h +324 -0
  50. data/ext/vibe_zstd/libzstd/common/zstd_trace.h +156 -0
  51. data/ext/vibe_zstd/libzstd/compress/clevels.h +134 -0
  52. data/ext/vibe_zstd/libzstd/compress/fse_compress.c +625 -0
  53. data/ext/vibe_zstd/libzstd/compress/hist.c +191 -0
  54. data/ext/vibe_zstd/libzstd/compress/hist.h +82 -0
  55. data/ext/vibe_zstd/libzstd/compress/huf_compress.c +1464 -0
  56. data/ext/vibe_zstd/libzstd/compress/zstd_compress.c +7843 -0
  57. data/ext/vibe_zstd/libzstd/compress/zstd_compress_internal.h +1636 -0
  58. data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.c +235 -0
  59. data/ext/vibe_zstd/libzstd/compress/zstd_compress_literals.h +39 -0
  60. data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.c +442 -0
  61. data/ext/vibe_zstd/libzstd/compress/zstd_compress_sequences.h +55 -0
  62. data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.c +688 -0
  63. data/ext/vibe_zstd/libzstd/compress/zstd_compress_superblock.h +32 -0
  64. data/ext/vibe_zstd/libzstd/compress/zstd_cwksp.h +765 -0
  65. data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.c +778 -0
  66. data/ext/vibe_zstd/libzstd/compress/zstd_double_fast.h +42 -0
  67. data/ext/vibe_zstd/libzstd/compress/zstd_fast.c +985 -0
  68. data/ext/vibe_zstd/libzstd/compress/zstd_fast.h +30 -0
  69. data/ext/vibe_zstd/libzstd/compress/zstd_lazy.c +2199 -0
  70. data/ext/vibe_zstd/libzstd/compress/zstd_lazy.h +193 -0
  71. data/ext/vibe_zstd/libzstd/compress/zstd_ldm.c +745 -0
  72. data/ext/vibe_zstd/libzstd/compress/zstd_ldm.h +109 -0
  73. data/ext/vibe_zstd/libzstd/compress/zstd_ldm_geartab.h +106 -0
  74. data/ext/vibe_zstd/libzstd/compress/zstd_opt.c +1580 -0
  75. data/ext/vibe_zstd/libzstd/compress/zstd_opt.h +72 -0
  76. data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.c +238 -0
  77. data/ext/vibe_zstd/libzstd/compress/zstd_preSplit.h +33 -0
  78. data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.c +1923 -0
  79. data/ext/vibe_zstd/libzstd/compress/zstdmt_compress.h +102 -0
  80. data/ext/vibe_zstd/libzstd/decompress/huf_decompress.c +1944 -0
  81. data/ext/vibe_zstd/libzstd/decompress/huf_decompress_amd64.S +602 -0
  82. data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.c +244 -0
  83. data/ext/vibe_zstd/libzstd/decompress/zstd_ddict.h +44 -0
  84. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress.c +2410 -0
  85. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.c +2209 -0
  86. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_block.h +73 -0
  87. data/ext/vibe_zstd/libzstd/decompress/zstd_decompress_internal.h +240 -0
  88. data/ext/vibe_zstd/libzstd/deprecated/zbuff.h +214 -0
  89. data/ext/vibe_zstd/libzstd/deprecated/zbuff_common.c +26 -0
  90. data/ext/vibe_zstd/libzstd/deprecated/zbuff_compress.c +167 -0
  91. data/ext/vibe_zstd/libzstd/deprecated/zbuff_decompress.c +77 -0
  92. data/ext/vibe_zstd/libzstd/dictBuilder/cover.c +1302 -0
  93. data/ext/vibe_zstd/libzstd/dictBuilder/cover.h +152 -0
  94. data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.c +1913 -0
  95. data/ext/vibe_zstd/libzstd/dictBuilder/divsufsort.h +57 -0
  96. data/ext/vibe_zstd/libzstd/dictBuilder/fastcover.c +766 -0
  97. data/ext/vibe_zstd/libzstd/dictBuilder/zdict.c +1133 -0
  98. data/ext/vibe_zstd/libzstd/zdict.h +481 -0
  99. data/ext/vibe_zstd/libzstd/zstd.h +3198 -0
  100. data/ext/vibe_zstd/libzstd/zstd_errors.h +107 -0
  101. data/ext/vibe_zstd/streaming.c +410 -0
  102. data/ext/vibe_zstd/vibe_zstd.c +293 -0
  103. data/ext/vibe_zstd/vibe_zstd.h +56 -0
  104. data/ext/vibe_zstd/vibe_zstd_internal.h +27 -0
  105. data/lib/vibe_zstd/constants.rb +67 -0
  106. data/lib/vibe_zstd/version.rb +5 -0
  107. data/lib/vibe_zstd.rb +255 -0
  108. data/sig/vibe_zstd.rbs +76 -0
  109. metadata +179 -0
@@ -0,0 +1,602 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include "../common/portability_macros.h"
12
+
13
+ #if defined(__ELF__) && defined(__GNUC__)
14
+ /* Stack marking
15
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
16
+ */
17
+ .section .note.GNU-stack,"",%progbits
18
+
19
+ #if defined(__aarch64__)
20
+ /* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
21
+ * See: https://github.com/facebook/zstd/issues/3841
22
+ * See: https://gcc.godbolt.org/z/sqr5T4ffK
23
+ * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
24
+ * See: https://reviews.llvm.org/D62609
25
+ */
26
+ .pushsection .note.gnu.property, "a"
27
+ .p2align 3
28
+ .long 4 /* size of the name - "GNU\0" */
29
+ .long 0x10 /* size of descriptor */
30
+ .long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */
31
+ .asciz "GNU"
32
+ .long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
33
+ .long 4 /* pr_datasz - 4 bytes */
34
+ .long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
35
+ .p2align 3 /* pr_padding - bring everything to 8 byte alignment */
36
+ .popsection
37
+ #endif
38
+
39
+ #endif
40
+
41
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
42
+
43
+ /* Calling convention:
44
+ *
45
+ * %rdi (or %rcx on Windows) contains the first argument: HUF_DecompressAsmArgs*.
46
+ * %rbp isn't maintained (no frame pointer).
47
+ * %rsp contains the stack pointer that grows down.
48
+ * No red-zone is assumed, only addresses >= %rsp are used.
49
+ * All register contents are preserved.
50
+ */
51
+
52
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
53
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
54
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
55
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
56
+ .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
57
+ .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
58
+ .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
59
+ .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
60
+ .text
61
+
62
+ /* Sets up register mappings for clarity.
63
+ * op[], bits[], dtable & ip[0] each get their own register.
64
+ * ip[1,2,3] & olimit alias var[].
65
+ * %rax is a scratch register.
66
+ */
67
+
68
+ #define op0 rsi
69
+ #define op1 rbx
70
+ #define op2 rcx
71
+ #define op3 rdi
72
+
73
+ #define ip0 r8
74
+ #define ip1 r9
75
+ #define ip2 r10
76
+ #define ip3 r11
77
+
78
+ #define bits0 rbp
79
+ #define bits1 rdx
80
+ #define bits2 r12
81
+ #define bits3 r13
82
+ #define dtable r14
83
+ #define olimit r15
84
+
85
+ /* var[] aliases ip[1,2,3] & olimit
86
+ * ip[1,2,3] are saved every iteration.
87
+ * olimit is only used in compute_olimit.
88
+ */
89
+ #define var0 r15
90
+ #define var1 r9
91
+ #define var2 r10
92
+ #define var3 r11
93
+
94
+ /* 32-bit var registers */
95
+ #define vard0 r15d
96
+ #define vard1 r9d
97
+ #define vard2 r10d
98
+ #define vard3 r11d
99
+
100
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
101
+ #define FOR_EACH_STREAM(X) \
102
+ X(0); \
103
+ X(1); \
104
+ X(2); \
105
+ X(3)
106
+
107
+ /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
108
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
109
+ X(0, idx); \
110
+ X(1, idx); \
111
+ X(2, idx); \
112
+ X(3, idx)
113
+
114
+ /* Define both _HUF_* & HUF_* symbols because MacOS
115
+ * C symbols are prefixed with '_' & Linux symbols aren't.
116
+ */
117
+ _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
118
+ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
119
+ ZSTD_CET_ENDBRANCH
120
+ /* Save all registers - even if they are callee saved for simplicity. */
121
+ push %rax
122
+ push %rbx
123
+ push %rcx
124
+ push %rdx
125
+ push %rbp
126
+ push %rsi
127
+ push %rdi
128
+ push %r8
129
+ push %r9
130
+ push %r10
131
+ push %r11
132
+ push %r12
133
+ push %r13
134
+ push %r14
135
+ push %r15
136
+
137
+ /* Read HUF_DecompressAsmArgs* args from %rax */
138
+ #if defined(_WIN32)
139
+ movq %rcx, %rax
140
+ #else
141
+ movq %rdi, %rax
142
+ #endif
143
+ movq 0(%rax), %ip0
144
+ movq 8(%rax), %ip1
145
+ movq 16(%rax), %ip2
146
+ movq 24(%rax), %ip3
147
+ movq 32(%rax), %op0
148
+ movq 40(%rax), %op1
149
+ movq 48(%rax), %op2
150
+ movq 56(%rax), %op3
151
+ movq 64(%rax), %bits0
152
+ movq 72(%rax), %bits1
153
+ movq 80(%rax), %bits2
154
+ movq 88(%rax), %bits3
155
+ movq 96(%rax), %dtable
156
+ push %rax /* argument */
157
+ push 104(%rax) /* ilowest */
158
+ push 112(%rax) /* oend */
159
+ push %olimit /* olimit space */
160
+
161
+ subq $24, %rsp
162
+
163
+ .L_4X1_compute_olimit:
164
+ /* Computes how many iterations we can do safely
165
+ * %r15, %rax may be clobbered
166
+ * rbx, rdx must be saved
167
+ * op3 & ip0 mustn't be clobbered
168
+ */
169
+ movq %rbx, 0(%rsp)
170
+ movq %rdx, 8(%rsp)
171
+
172
+ movq 32(%rsp), %rax /* rax = oend */
173
+ subq %op3, %rax /* rax = oend - op3 */
174
+
175
+ /* r15 = (oend - op3) / 5 */
176
+ movabsq $-3689348814741910323, %rdx
177
+ mulq %rdx
178
+ movq %rdx, %r15
179
+ shrq $2, %r15
180
+
181
+ movq %ip0, %rax /* rax = ip0 */
182
+ movq 40(%rsp), %rdx /* rdx = ilowest */
183
+ subq %rdx, %rax /* rax = ip0 - ilowest */
184
+ movq %rax, %rbx /* rbx = ip0 - ilowest */
185
+
186
+ /* rdx = (ip0 - ilowest) / 7 */
187
+ movabsq $2635249153387078803, %rdx
188
+ mulq %rdx
189
+ subq %rdx, %rbx
190
+ shrq %rbx
191
+ addq %rbx, %rdx
192
+ shrq $2, %rdx
193
+
194
+ /* r15 = min(%rdx, %r15) */
195
+ cmpq %rdx, %r15
196
+ cmova %rdx, %r15
197
+
198
+ /* r15 = r15 * 5 */
199
+ leaq (%r15, %r15, 4), %r15
200
+
201
+ /* olimit = op3 + r15 */
202
+ addq %op3, %olimit
203
+
204
+ movq 8(%rsp), %rdx
205
+ movq 0(%rsp), %rbx
206
+
207
+ /* If (op3 + 20 > olimit) */
208
+ movq %op3, %rax /* rax = op3 */
209
+ cmpq %rax, %olimit /* op3 == olimit */
210
+ je .L_4X1_exit
211
+
212
+ /* If (ip1 < ip0) go to exit */
213
+ cmpq %ip0, %ip1
214
+ jb .L_4X1_exit
215
+
216
+ /* If (ip2 < ip1) go to exit */
217
+ cmpq %ip1, %ip2
218
+ jb .L_4X1_exit
219
+
220
+ /* If (ip3 < ip2) go to exit */
221
+ cmpq %ip2, %ip3
222
+ jb .L_4X1_exit
223
+
224
+ /* Reads top 11 bits from bits[n]
225
+ * Loads dt[bits[n]] into var[n]
226
+ */
227
+ #define GET_NEXT_DELT(n) \
228
+ movq $53, %var##n; \
229
+ shrxq %var##n, %bits##n, %var##n; \
230
+ movzwl (%dtable,%var##n,2),%vard##n
231
+
232
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
233
+ * Moves var[n] to %rax
234
+ * bits[n] <<= var[n] & 63
235
+ * op[n][idx] = %rax >> 8
236
+ * %ah is a way to access bits [8, 16) of %rax
237
+ */
238
+ #define DECODE_FROM_DELT(n, idx) \
239
+ movq %var##n, %rax; \
240
+ shlxq %var##n, %bits##n, %bits##n; \
241
+ movb %ah, idx(%op##n)
242
+
243
+ /* Assumes GET_NEXT_DELT has been called.
244
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
245
+ */
246
+ #define DECODE_AND_GET_NEXT(n, idx) \
247
+ DECODE_FROM_DELT(n, idx); \
248
+ GET_NEXT_DELT(n) \
249
+
250
+ /* // ctz & nbBytes is stored in bits[n]
251
+ * // nbBits is stored in %rax
252
+ * ctz = CTZ[bits[n]]
253
+ * nbBits = ctz & 7
254
+ * nbBytes = ctz >> 3
255
+ * op[n] += 5
256
+ * ip[n] -= nbBytes
257
+ * // Note: x86-64 is little-endian ==> no bswap
258
+ * bits[n] = MEM_readST(ip[n]) | 1
259
+ * bits[n] <<= nbBits
260
+ */
261
+ #define RELOAD_BITS(n) \
262
+ bsfq %bits##n, %bits##n; \
263
+ movq %bits##n, %rax; \
264
+ andq $7, %rax; \
265
+ shrq $3, %bits##n; \
266
+ leaq 5(%op##n), %op##n; \
267
+ subq %bits##n, %ip##n; \
268
+ movq (%ip##n), %bits##n; \
269
+ orq $1, %bits##n; \
270
+ shlx %rax, %bits##n, %bits##n
271
+
272
+ /* Store clobbered variables on the stack */
273
+ movq %olimit, 24(%rsp)
274
+ movq %ip1, 0(%rsp)
275
+ movq %ip2, 8(%rsp)
276
+ movq %ip3, 16(%rsp)
277
+
278
+ /* Call GET_NEXT_DELT for each stream */
279
+ FOR_EACH_STREAM(GET_NEXT_DELT)
280
+
281
+ .p2align 6
282
+
283
+ .L_4X1_loop_body:
284
+ /* Decode 5 symbols in each of the 4 streams (20 total)
285
+ * Must have called GET_NEXT_DELT for each stream
286
+ */
287
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
288
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
289
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
290
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
291
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
292
+
293
+ /* Load ip[1,2,3] from stack (var[] aliases them)
294
+ * ip[] is needed for RELOAD_BITS
295
+ * Each will be stored back to the stack after RELOAD
296
+ */
297
+ movq 0(%rsp), %ip1
298
+ movq 8(%rsp), %ip2
299
+ movq 16(%rsp), %ip3
300
+
301
+ /* Reload each stream & fetch the next table entry
302
+ * to prepare for the next iteration
303
+ */
304
+ RELOAD_BITS(0)
305
+ GET_NEXT_DELT(0)
306
+
307
+ RELOAD_BITS(1)
308
+ movq %ip1, 0(%rsp)
309
+ GET_NEXT_DELT(1)
310
+
311
+ RELOAD_BITS(2)
312
+ movq %ip2, 8(%rsp)
313
+ GET_NEXT_DELT(2)
314
+
315
+ RELOAD_BITS(3)
316
+ movq %ip3, 16(%rsp)
317
+ GET_NEXT_DELT(3)
318
+
319
+ /* If op3 < olimit: continue the loop */
320
+ cmp %op3, 24(%rsp)
321
+ ja .L_4X1_loop_body
322
+
323
+ /* Reload ip[1,2,3] from stack */
324
+ movq 0(%rsp), %ip1
325
+ movq 8(%rsp), %ip2
326
+ movq 16(%rsp), %ip3
327
+
328
+ /* Re-compute olimit */
329
+ jmp .L_4X1_compute_olimit
330
+
331
+ #undef GET_NEXT_DELT
332
+ #undef DECODE_FROM_DELT
333
+ #undef DECODE
334
+ #undef RELOAD_BITS
335
+ .L_4X1_exit:
336
+ addq $24, %rsp
337
+
338
+ /* Restore stack (oend & olimit) */
339
+ pop %rax /* olimit */
340
+ pop %rax /* oend */
341
+ pop %rax /* ilowest */
342
+ pop %rax /* arg */
343
+
344
+ /* Save ip / op / bits */
345
+ movq %ip0, 0(%rax)
346
+ movq %ip1, 8(%rax)
347
+ movq %ip2, 16(%rax)
348
+ movq %ip3, 24(%rax)
349
+ movq %op0, 32(%rax)
350
+ movq %op1, 40(%rax)
351
+ movq %op2, 48(%rax)
352
+ movq %op3, 56(%rax)
353
+ movq %bits0, 64(%rax)
354
+ movq %bits1, 72(%rax)
355
+ movq %bits2, 80(%rax)
356
+ movq %bits3, 88(%rax)
357
+
358
+ /* Restore registers */
359
+ pop %r15
360
+ pop %r14
361
+ pop %r13
362
+ pop %r12
363
+ pop %r11
364
+ pop %r10
365
+ pop %r9
366
+ pop %r8
367
+ pop %rdi
368
+ pop %rsi
369
+ pop %rbp
370
+ pop %rdx
371
+ pop %rcx
372
+ pop %rbx
373
+ pop %rax
374
+ ret
375
+
376
+ _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
377
+ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
378
+ ZSTD_CET_ENDBRANCH
379
+ /* Save all registers - even if they are callee saved for simplicity. */
380
+ push %rax
381
+ push %rbx
382
+ push %rcx
383
+ push %rdx
384
+ push %rbp
385
+ push %rsi
386
+ push %rdi
387
+ push %r8
388
+ push %r9
389
+ push %r10
390
+ push %r11
391
+ push %r12
392
+ push %r13
393
+ push %r14
394
+ push %r15
395
+
396
+ /* Read HUF_DecompressAsmArgs* args from %rax */
397
+ #if defined(_WIN32)
398
+ movq %rcx, %rax
399
+ #else
400
+ movq %rdi, %rax
401
+ #endif
402
+ movq 0(%rax), %ip0
403
+ movq 8(%rax), %ip1
404
+ movq 16(%rax), %ip2
405
+ movq 24(%rax), %ip3
406
+ movq 32(%rax), %op0
407
+ movq 40(%rax), %op1
408
+ movq 48(%rax), %op2
409
+ movq 56(%rax), %op3
410
+ movq 64(%rax), %bits0
411
+ movq 72(%rax), %bits1
412
+ movq 80(%rax), %bits2
413
+ movq 88(%rax), %bits3
414
+ movq 96(%rax), %dtable
415
+ push %rax /* argument */
416
+ push %rax /* olimit */
417
+ push 104(%rax) /* ilowest */
418
+
419
+ movq 112(%rax), %rax
420
+ push %rax /* oend3 */
421
+
422
+ movq %op3, %rax
423
+ push %rax /* oend2 */
424
+
425
+ movq %op2, %rax
426
+ push %rax /* oend1 */
427
+
428
+ movq %op1, %rax
429
+ push %rax /* oend0 */
430
+
431
+ /* Scratch space */
432
+ subq $8, %rsp
433
+
434
+ .L_4X2_compute_olimit:
435
+ /* Computes how many iterations we can do safely
436
+ * %r15, %rax may be clobbered
437
+ * rdx must be saved
438
+ * op[1,2,3,4] & ip0 mustn't be clobbered
439
+ */
440
+ movq %rdx, 0(%rsp)
441
+
442
+ /* We can consume up to 7 input bytes each iteration. */
443
+ movq %ip0, %rax /* rax = ip0 */
444
+ movq 40(%rsp), %rdx /* rdx = ilowest */
445
+ subq %rdx, %rax /* rax = ip0 - ilowest */
446
+ movq %rax, %r15 /* r15 = ip0 - ilowest */
447
+
448
+ /* rdx = rax / 7 */
449
+ movabsq $2635249153387078803, %rdx
450
+ mulq %rdx
451
+ subq %rdx, %r15
452
+ shrq %r15
453
+ addq %r15, %rdx
454
+ shrq $2, %rdx
455
+
456
+ /* r15 = (ip0 - ilowest) / 7 */
457
+ movq %rdx, %r15
458
+
459
+ /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
460
+ movq 8(%rsp), %rax /* rax = oend0 */
461
+ subq %op0, %rax /* rax = oend0 - op0 */
462
+ movq 16(%rsp), %rdx /* rdx = oend1 */
463
+ subq %op1, %rdx /* rdx = oend1 - op1 */
464
+
465
+ cmpq %rax, %rdx
466
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
467
+
468
+ movq 24(%rsp), %rax /* rax = oend2 */
469
+ subq %op2, %rax /* rax = oend2 - op2 */
470
+
471
+ cmpq %rax, %rdx
472
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
473
+
474
+ movq 32(%rsp), %rax /* rax = oend3 */
475
+ subq %op3, %rax /* rax = oend3 - op3 */
476
+
477
+ cmpq %rax, %rdx
478
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
479
+
480
+ movabsq $-3689348814741910323, %rax
481
+ mulq %rdx
482
+ shrq $3, %rdx /* rdx = rdx / 10 */
483
+
484
+ /* r15 = min(%rdx, %r15) */
485
+ cmpq %rdx, %r15
486
+ cmova %rdx, %r15
487
+
488
+ /* olimit = op3 + 5 * r15 */
489
+ movq %r15, %rax
490
+ leaq (%op3, %rax, 4), %olimit
491
+ addq %rax, %olimit
492
+
493
+ movq 0(%rsp), %rdx
494
+
495
+ /* If (op3 + 10 > olimit) */
496
+ movq %op3, %rax /* rax = op3 */
497
+ cmpq %rax, %olimit /* op3 == olimit */
498
+ je .L_4X2_exit
499
+
500
+ /* If (ip1 < ip0) go to exit */
501
+ cmpq %ip0, %ip1
502
+ jb .L_4X2_exit
503
+
504
+ /* If (ip2 < ip1) go to exit */
505
+ cmpq %ip1, %ip2
506
+ jb .L_4X2_exit
507
+
508
+ /* If (ip3 < ip2) go to exit */
509
+ cmpq %ip2, %ip3
510
+ jb .L_4X2_exit
511
+
512
+ #define DECODE(n, idx) \
513
+ movq %bits##n, %rax; \
514
+ shrq $53, %rax; \
515
+ movzwl 0(%dtable,%rax,4),%r8d; \
516
+ movzbl 2(%dtable,%rax,4),%r15d; \
517
+ movzbl 3(%dtable,%rax,4),%eax; \
518
+ movw %r8w, (%op##n); \
519
+ shlxq %r15, %bits##n, %bits##n; \
520
+ addq %rax, %op##n
521
+
522
+ #define RELOAD_BITS(n) \
523
+ bsfq %bits##n, %bits##n; \
524
+ movq %bits##n, %rax; \
525
+ shrq $3, %bits##n; \
526
+ andq $7, %rax; \
527
+ subq %bits##n, %ip##n; \
528
+ movq (%ip##n), %bits##n; \
529
+ orq $1, %bits##n; \
530
+ shlxq %rax, %bits##n, %bits##n
531
+
532
+
533
+ movq %olimit, 48(%rsp)
534
+
535
+ .p2align 6
536
+
537
+ .L_4X2_loop_body:
538
+ /* We clobber r8, so store it on the stack */
539
+ movq %r8, 0(%rsp)
540
+
541
+ /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
542
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
543
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
544
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
545
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
546
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
547
+
548
+ /* Reload r8 */
549
+ movq 0(%rsp), %r8
550
+
551
+ FOR_EACH_STREAM(RELOAD_BITS)
552
+
553
+ cmp %op3, 48(%rsp)
554
+ ja .L_4X2_loop_body
555
+ jmp .L_4X2_compute_olimit
556
+
557
+ #undef DECODE
558
+ #undef RELOAD_BITS
559
+ .L_4X2_exit:
560
+ addq $8, %rsp
561
+ /* Restore stack (oend & olimit) */
562
+ pop %rax /* oend0 */
563
+ pop %rax /* oend1 */
564
+ pop %rax /* oend2 */
565
+ pop %rax /* oend3 */
566
+ pop %rax /* ilowest */
567
+ pop %rax /* olimit */
568
+ pop %rax /* arg */
569
+
570
+ /* Save ip / op / bits */
571
+ movq %ip0, 0(%rax)
572
+ movq %ip1, 8(%rax)
573
+ movq %ip2, 16(%rax)
574
+ movq %ip3, 24(%rax)
575
+ movq %op0, 32(%rax)
576
+ movq %op1, 40(%rax)
577
+ movq %op2, 48(%rax)
578
+ movq %op3, 56(%rax)
579
+ movq %bits0, 64(%rax)
580
+ movq %bits1, 72(%rax)
581
+ movq %bits2, 80(%rax)
582
+ movq %bits3, 88(%rax)
583
+
584
+ /* Restore registers */
585
+ pop %r15
586
+ pop %r14
587
+ pop %r13
588
+ pop %r12
589
+ pop %r11
590
+ pop %r10
591
+ pop %r9
592
+ pop %r8
593
+ pop %rdi
594
+ pop %rsi
595
+ pop %rbp
596
+ pop %rdx
597
+ pop %rcx
598
+ pop %rbx
599
+ pop %rax
600
+ ret
601
+
602
+ #endif