zstdlib 0.9.0-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. checksums.yaml +7 -0
  2. data/.yardopts +6 -0
  3. data/CHANGES.md +78 -0
  4. data/Gemfile +3 -0
  5. data/README.md +107 -0
  6. data/Rakefile +59 -0
  7. data/ext/zstdlib_c/extconf.rb +54 -0
  8. data/ext/zstdlib_c/ruby/zlib-2.2/zstdlib.c +4675 -0
  9. data/ext/zstdlib_c/ruby/zlib-2.3/zstdlib.c +4702 -0
  10. data/ext/zstdlib_c/ruby/zlib-2.4/zstdlib.c +4859 -0
  11. data/ext/zstdlib_c/ruby/zlib-2.5/zstdlib.c +4864 -0
  12. data/ext/zstdlib_c/ruby/zlib-2.6/zstdlib.c +4906 -0
  13. data/ext/zstdlib_c/ruby/zlib-2.7/zstdlib.c +4895 -0
  14. data/ext/zstdlib_c/ruby/zlib-3.0/zstdlib.c +4994 -0
  15. data/ext/zstdlib_c/ruby/zlib-3.1/zstdlib.c +5076 -0
  16. data/ext/zstdlib_c/zlib-1.2.11/adler32.c +186 -0
  17. data/ext/zstdlib_c/zlib-1.2.11/compress.c +86 -0
  18. data/ext/zstdlib_c/zlib-1.2.11/crc32.c +442 -0
  19. data/ext/zstdlib_c/zlib-1.2.11/crc32.h +441 -0
  20. data/ext/zstdlib_c/zlib-1.2.11/deflate.c +2163 -0
  21. data/ext/zstdlib_c/zlib-1.2.11/deflate.h +349 -0
  22. data/ext/zstdlib_c/zlib-1.2.11/gzclose.c +25 -0
  23. data/ext/zstdlib_c/zlib-1.2.11/gzguts.h +218 -0
  24. data/ext/zstdlib_c/zlib-1.2.11/gzlib.c +637 -0
  25. data/ext/zstdlib_c/zlib-1.2.11/gzread.c +654 -0
  26. data/ext/zstdlib_c/zlib-1.2.11/gzwrite.c +665 -0
  27. data/ext/zstdlib_c/zlib-1.2.11/infback.c +640 -0
  28. data/ext/zstdlib_c/zlib-1.2.11/inffast.c +323 -0
  29. data/ext/zstdlib_c/zlib-1.2.11/inffast.h +11 -0
  30. data/ext/zstdlib_c/zlib-1.2.11/inffixed.h +94 -0
  31. data/ext/zstdlib_c/zlib-1.2.11/inflate.c +1561 -0
  32. data/ext/zstdlib_c/zlib-1.2.11/inflate.h +125 -0
  33. data/ext/zstdlib_c/zlib-1.2.11/inftrees.c +304 -0
  34. data/ext/zstdlib_c/zlib-1.2.11/inftrees.h +62 -0
  35. data/ext/zstdlib_c/zlib-1.2.11/trees.c +1203 -0
  36. data/ext/zstdlib_c/zlib-1.2.11/trees.h +128 -0
  37. data/ext/zstdlib_c/zlib-1.2.11/uncompr.c +93 -0
  38. data/ext/zstdlib_c/zlib-1.2.11/zconf.h +534 -0
  39. data/ext/zstdlib_c/zlib-1.2.11/zlib.h +1912 -0
  40. data/ext/zstdlib_c/zlib-1.2.11/zutil.c +325 -0
  41. data/ext/zstdlib_c/zlib-1.2.11/zutil.h +271 -0
  42. data/ext/zstdlib_c/zlib.mk +14 -0
  43. data/ext/zstdlib_c/zlibwrapper/zlibwrapper.c +10 -0
  44. data/ext/zstdlib_c/zlibwrapper.mk +14 -0
  45. data/ext/zstdlib_c/zstd-1.5.2/lib/common/bitstream.h +478 -0
  46. data/ext/zstdlib_c/zstd-1.5.2/lib/common/compiler.h +335 -0
  47. data/ext/zstdlib_c/zstd-1.5.2/lib/common/cpu.h +213 -0
  48. data/ext/zstdlib_c/zstd-1.5.2/lib/common/debug.c +24 -0
  49. data/ext/zstdlib_c/zstd-1.5.2/lib/common/debug.h +107 -0
  50. data/ext/zstdlib_c/zstd-1.5.2/lib/common/entropy_common.c +368 -0
  51. data/ext/zstdlib_c/zstd-1.5.2/lib/common/error_private.c +56 -0
  52. data/ext/zstdlib_c/zstd-1.5.2/lib/common/error_private.h +159 -0
  53. data/ext/zstdlib_c/zstd-1.5.2/lib/common/fse.h +717 -0
  54. data/ext/zstdlib_c/zstd-1.5.2/lib/common/fse_decompress.c +403 -0
  55. data/ext/zstdlib_c/zstd-1.5.2/lib/common/huf.h +364 -0
  56. data/ext/zstdlib_c/zstd-1.5.2/lib/common/mem.h +442 -0
  57. data/ext/zstdlib_c/zstd-1.5.2/lib/common/pool.c +355 -0
  58. data/ext/zstdlib_c/zstd-1.5.2/lib/common/pool.h +84 -0
  59. data/ext/zstdlib_c/zstd-1.5.2/lib/common/portability_macros.h +137 -0
  60. data/ext/zstdlib_c/zstd-1.5.2/lib/common/threading.c +122 -0
  61. data/ext/zstdlib_c/zstd-1.5.2/lib/common/threading.h +155 -0
  62. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.c +24 -0
  63. data/ext/zstdlib_c/zstd-1.5.2/lib/common/xxhash.h +5686 -0
  64. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_common.c +83 -0
  65. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_deps.h +111 -0
  66. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_internal.h +493 -0
  67. data/ext/zstdlib_c/zstd-1.5.2/lib/common/zstd_trace.h +163 -0
  68. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/clevels.h +134 -0
  69. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/fse_compress.c +741 -0
  70. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/hist.c +181 -0
  71. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/hist.h +75 -0
  72. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/huf_compress.c +1370 -0
  73. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress.c +6327 -0
  74. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_internal.h +1458 -0
  75. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_literals.c +159 -0
  76. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_literals.h +31 -0
  77. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_sequences.c +442 -0
  78. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_sequences.h +54 -0
  79. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_superblock.c +573 -0
  80. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_compress_superblock.h +32 -0
  81. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_cwksp.h +676 -0
  82. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_double_fast.c +696 -0
  83. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_double_fast.h +38 -0
  84. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_fast.c +675 -0
  85. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_fast.h +37 -0
  86. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_lazy.c +2104 -0
  87. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_lazy.h +125 -0
  88. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_ldm.c +724 -0
  89. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_ldm.h +117 -0
  90. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_ldm_geartab.h +106 -0
  91. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_opt.c +1446 -0
  92. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstd_opt.h +56 -0
  93. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstdmt_compress.c +1859 -0
  94. data/ext/zstdlib_c/zstd-1.5.2/lib/compress/zstdmt_compress.h +113 -0
  95. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress.c +1889 -0
  96. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/huf_decompress_amd64.S +585 -0
  97. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_ddict.c +244 -0
  98. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_ddict.h +44 -0
  99. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress.c +2230 -0
  100. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress_block.c +2072 -0
  101. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress_block.h +68 -0
  102. data/ext/zstdlib_c/zstd-1.5.2/lib/decompress/zstd_decompress_internal.h +236 -0
  103. data/ext/zstdlib_c/zstd-1.5.2/lib/zdict.h +452 -0
  104. data/ext/zstdlib_c/zstd-1.5.2/lib/zstd.h +2575 -0
  105. data/ext/zstdlib_c/zstd-1.5.2/lib/zstd_errors.h +95 -0
  106. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzclose.c +28 -0
  107. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzcompatibility.h +68 -0
  108. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzguts.h +229 -0
  109. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzlib.c +640 -0
  110. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzread.c +678 -0
  111. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/gzwrite.c +671 -0
  112. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/zstd_zlibwrapper.c +1198 -0
  113. data/ext/zstdlib_c/zstd-1.5.2/zlibWrapper/zstd_zlibwrapper.h +88 -0
  114. data/ext/zstdlib_c/zstd.mk +15 -0
  115. data/lib/2.4/zstdlib_c.bundle +0 -0
  116. data/lib/2.5/zstdlib_c.bundle +0 -0
  117. data/lib/2.6/zstdlib_c.bundle +0 -0
  118. data/lib/2.7/zstdlib_c.bundle +0 -0
  119. data/lib/3.0/zstdlib_c.bundle +0 -0
  120. data/lib/3.1/zstdlib_c.bundle +0 -0
  121. data/lib/zstdlib.rb +6 -0
  122. data/test/zstdlib_test.rb +21 -0
  123. metadata +237 -0
@@ -0,0 +1,585 @@
1
+ /*
2
+ * Copyright (c) Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include "../common/portability_macros.h"
12
+
13
+ /* Stack marking
14
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
15
+ */
16
+ #if defined(__ELF__) && defined(__GNUC__)
17
+ .section .note.GNU-stack,"",%progbits
18
+ #endif
19
+
20
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
21
+
22
+ /* Calling convention:
23
+ *
24
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
25
+ * %rbp isn't maintained (no frame pointer).
26
+ * %rsp contains the stack pointer that grows down.
27
+ * No red-zone is assumed, only addresses >= %rsp are used.
28
+ * All register contents are preserved.
29
+ *
30
+ * TODO: Support Windows calling convention.
31
+ */
32
+
33
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
34
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
35
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
36
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
37
+ .global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
38
+ .global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
39
+ .global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
40
+ .global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
41
+ .text
42
+
43
+ /* Sets up register mappings for clarity.
44
+ * op[], bits[], dtable & ip[0] each get their own register.
45
+ * ip[1,2,3] & olimit alias var[].
46
+ * %rax is a scratch register.
47
+ */
48
+
49
+ #define op0 rsi
50
+ #define op1 rbx
51
+ #define op2 rcx
52
+ #define op3 rdi
53
+
54
+ #define ip0 r8
55
+ #define ip1 r9
56
+ #define ip2 r10
57
+ #define ip3 r11
58
+
59
+ #define bits0 rbp
60
+ #define bits1 rdx
61
+ #define bits2 r12
62
+ #define bits3 r13
63
+ #define dtable r14
64
+ #define olimit r15
65
+
66
+ /* var[] aliases ip[1,2,3] & olimit
67
+ * ip[1,2,3] are saved every iteration.
68
+ * olimit is only used in compute_olimit.
69
+ */
70
+ #define var0 r15
71
+ #define var1 r9
72
+ #define var2 r10
73
+ #define var3 r11
74
+
75
+ /* 32-bit var registers */
76
+ #define vard0 r15d
77
+ #define vard1 r9d
78
+ #define vard2 r10d
79
+ #define vard3 r11d
80
+
81
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
82
+ #define FOR_EACH_STREAM(X) \
83
+ X(0); \
84
+ X(1); \
85
+ X(2); \
86
+ X(3)
87
+
88
+ /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
89
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
90
+ X(0, idx); \
91
+ X(1, idx); \
92
+ X(2, idx); \
93
+ X(3, idx)
94
+
95
+ /* Define both _HUF_* & HUF_* symbols because MacOS
96
+ * C symbols are prefixed with '_' & Linux symbols aren't.
97
+ */
98
+ _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
99
+ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
100
+ /* Save all registers - even if they are callee saved for simplicity. */
101
+ push %rax
102
+ push %rbx
103
+ push %rcx
104
+ push %rdx
105
+ push %rbp
106
+ push %rsi
107
+ push %rdi
108
+ push %r8
109
+ push %r9
110
+ push %r10
111
+ push %r11
112
+ push %r12
113
+ push %r13
114
+ push %r14
115
+ push %r15
116
+
117
+ /* Read HUF_DecompressAsmArgs* args from %rax */
118
+ movq %rdi, %rax
119
+ movq 0(%rax), %ip0
120
+ movq 8(%rax), %ip1
121
+ movq 16(%rax), %ip2
122
+ movq 24(%rax), %ip3
123
+ movq 32(%rax), %op0
124
+ movq 40(%rax), %op1
125
+ movq 48(%rax), %op2
126
+ movq 56(%rax), %op3
127
+ movq 64(%rax), %bits0
128
+ movq 72(%rax), %bits1
129
+ movq 80(%rax), %bits2
130
+ movq 88(%rax), %bits3
131
+ movq 96(%rax), %dtable
132
+ push %rax /* argument */
133
+ push 104(%rax) /* ilimit */
134
+ push 112(%rax) /* oend */
135
+ push %olimit /* olimit space */
136
+
137
+ subq $24, %rsp
138
+
139
+ .L_4X1_compute_olimit:
140
+ /* Computes how many iterations we can do safely
141
+ * %r15, %rax may be clobbered
142
+ * rbx, rdx must be saved
143
+ * op3 & ip0 mustn't be clobbered
144
+ */
145
+ movq %rbx, 0(%rsp)
146
+ movq %rdx, 8(%rsp)
147
+
148
+ movq 32(%rsp), %rax /* rax = oend */
149
+ subq %op3, %rax /* rax = oend - op3 */
150
+
151
+ /* r15 = (oend - op3) / 5 */
152
+ movabsq $-3689348814741910323, %rdx
153
+ mulq %rdx
154
+ movq %rdx, %r15
155
+ shrq $2, %r15
156
+
157
+ movq %ip0, %rax /* rax = ip0 */
158
+ movq 40(%rsp), %rdx /* rdx = ilimit */
159
+ subq %rdx, %rax /* rax = ip0 - ilimit */
160
+ movq %rax, %rbx /* rbx = ip0 - ilimit */
161
+
162
+ /* rdx = (ip0 - ilimit) / 7 */
163
+ movabsq $2635249153387078803, %rdx
164
+ mulq %rdx
165
+ subq %rdx, %rbx
166
+ shrq %rbx
167
+ addq %rbx, %rdx
168
+ shrq $2, %rdx
169
+
170
+ /* r15 = min(%rdx, %r15) */
171
+ cmpq %rdx, %r15
172
+ cmova %rdx, %r15
173
+
174
+ /* r15 = r15 * 5 */
175
+ leaq (%r15, %r15, 4), %r15
176
+
177
+ /* olimit = op3 + r15 */
178
+ addq %op3, %olimit
179
+
180
+ movq 8(%rsp), %rdx
181
+ movq 0(%rsp), %rbx
182
+
183
+ /* If (op3 + 20 > olimit) */
184
+ movq %op3, %rax /* rax = op3 */
185
+ addq $20, %rax /* rax = op3 + 20 */
186
+ cmpq %rax, %olimit /* op3 + 20 > olimit */
187
+ jb .L_4X1_exit
188
+
189
+ /* If (ip1 < ip0) go to exit */
190
+ cmpq %ip0, %ip1
191
+ jb .L_4X1_exit
192
+
193
+ /* If (ip2 < ip1) go to exit */
194
+ cmpq %ip1, %ip2
195
+ jb .L_4X1_exit
196
+
197
+ /* If (ip3 < ip2) go to exit */
198
+ cmpq %ip2, %ip3
199
+ jb .L_4X1_exit
200
+
201
+ /* Reads top 11 bits from bits[n]
202
+ * Loads dt[bits[n]] into var[n]
203
+ */
204
+ #define GET_NEXT_DELT(n) \
205
+ movq $53, %var##n; \
206
+ shrxq %var##n, %bits##n, %var##n; \
207
+ movzwl (%dtable,%var##n,2),%vard##n
208
+
209
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
210
+ * Moves var[n] to %rax
211
+ * bits[n] <<= var[n] & 63
212
+ * op[n][idx] = %rax >> 8
213
+ * %ah is a way to access bits [8, 16) of %rax
214
+ */
215
+ #define DECODE_FROM_DELT(n, idx) \
216
+ movq %var##n, %rax; \
217
+ shlxq %var##n, %bits##n, %bits##n; \
218
+ movb %ah, idx(%op##n)
219
+
220
+ /* Assumes GET_NEXT_DELT has been called.
221
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
222
+ */
223
+ #define DECODE_AND_GET_NEXT(n, idx) \
224
+ DECODE_FROM_DELT(n, idx); \
225
+ GET_NEXT_DELT(n) \
226
+
227
+ /* // ctz & nbBytes is stored in bits[n]
228
+ * // nbBits is stored in %rax
229
+ * ctz = CTZ[bits[n]]
230
+ * nbBits = ctz & 7
231
+ * nbBytes = ctz >> 3
232
+ * op[n] += 5
233
+ * ip[n] -= nbBytes
234
+ * // Note: x86-64 is little-endian ==> no bswap
235
+ * bits[n] = MEM_readST(ip[n]) | 1
236
+ * bits[n] <<= nbBits
237
+ */
238
+ #define RELOAD_BITS(n) \
239
+ bsfq %bits##n, %bits##n; \
240
+ movq %bits##n, %rax; \
241
+ andq $7, %rax; \
242
+ shrq $3, %bits##n; \
243
+ leaq 5(%op##n), %op##n; \
244
+ subq %bits##n, %ip##n; \
245
+ movq (%ip##n), %bits##n; \
246
+ orq $1, %bits##n; \
247
+ shlx %rax, %bits##n, %bits##n
248
+
249
+ /* Store clobbered variables on the stack */
250
+ movq %olimit, 24(%rsp)
251
+ movq %ip1, 0(%rsp)
252
+ movq %ip2, 8(%rsp)
253
+ movq %ip3, 16(%rsp)
254
+
255
+ /* Call GET_NEXT_DELT for each stream */
256
+ FOR_EACH_STREAM(GET_NEXT_DELT)
257
+
258
+ .p2align 6
259
+
260
+ .L_4X1_loop_body:
261
+ /* Decode 5 symbols in each of the 4 streams (20 total)
262
+ * Must have called GET_NEXT_DELT for each stream
263
+ */
264
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
265
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
266
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
267
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
268
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
269
+
270
+ /* Load ip[1,2,3] from stack (var[] aliases them)
271
+ * ip[] is needed for RELOAD_BITS
272
+ * Each will be stored back to the stack after RELOAD
273
+ */
274
+ movq 0(%rsp), %ip1
275
+ movq 8(%rsp), %ip2
276
+ movq 16(%rsp), %ip3
277
+
278
+ /* Reload each stream & fetch the next table entry
279
+ * to prepare for the next iteration
280
+ */
281
+ RELOAD_BITS(0)
282
+ GET_NEXT_DELT(0)
283
+
284
+ RELOAD_BITS(1)
285
+ movq %ip1, 0(%rsp)
286
+ GET_NEXT_DELT(1)
287
+
288
+ RELOAD_BITS(2)
289
+ movq %ip2, 8(%rsp)
290
+ GET_NEXT_DELT(2)
291
+
292
+ RELOAD_BITS(3)
293
+ movq %ip3, 16(%rsp)
294
+ GET_NEXT_DELT(3)
295
+
296
+ /* If op3 < olimit: continue the loop */
297
+ cmp %op3, 24(%rsp)
298
+ ja .L_4X1_loop_body
299
+
300
+ /* Reload ip[1,2,3] from stack */
301
+ movq 0(%rsp), %ip1
302
+ movq 8(%rsp), %ip2
303
+ movq 16(%rsp), %ip3
304
+
305
+ /* Re-compute olimit */
306
+ jmp .L_4X1_compute_olimit
307
+
308
+ #undef GET_NEXT_DELT
309
+ #undef DECODE_FROM_DELT
310
+ #undef DECODE
311
+ #undef RELOAD_BITS
312
+ .L_4X1_exit:
313
+ addq $24, %rsp
314
+
315
+ /* Restore stack (oend & olimit) */
316
+ pop %rax /* olimit */
317
+ pop %rax /* oend */
318
+ pop %rax /* ilimit */
319
+ pop %rax /* arg */
320
+
321
+ /* Save ip / op / bits */
322
+ movq %ip0, 0(%rax)
323
+ movq %ip1, 8(%rax)
324
+ movq %ip2, 16(%rax)
325
+ movq %ip3, 24(%rax)
326
+ movq %op0, 32(%rax)
327
+ movq %op1, 40(%rax)
328
+ movq %op2, 48(%rax)
329
+ movq %op3, 56(%rax)
330
+ movq %bits0, 64(%rax)
331
+ movq %bits1, 72(%rax)
332
+ movq %bits2, 80(%rax)
333
+ movq %bits3, 88(%rax)
334
+
335
+ /* Restore registers */
336
+ pop %r15
337
+ pop %r14
338
+ pop %r13
339
+ pop %r12
340
+ pop %r11
341
+ pop %r10
342
+ pop %r9
343
+ pop %r8
344
+ pop %rdi
345
+ pop %rsi
346
+ pop %rbp
347
+ pop %rdx
348
+ pop %rcx
349
+ pop %rbx
350
+ pop %rax
351
+ ret
352
+
353
+ _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
354
+ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
355
+ /* Save all registers - even if they are callee saved for simplicity. */
356
+ push %rax
357
+ push %rbx
358
+ push %rcx
359
+ push %rdx
360
+ push %rbp
361
+ push %rsi
362
+ push %rdi
363
+ push %r8
364
+ push %r9
365
+ push %r10
366
+ push %r11
367
+ push %r12
368
+ push %r13
369
+ push %r14
370
+ push %r15
371
+
372
+ movq %rdi, %rax
373
+ movq 0(%rax), %ip0
374
+ movq 8(%rax), %ip1
375
+ movq 16(%rax), %ip2
376
+ movq 24(%rax), %ip3
377
+ movq 32(%rax), %op0
378
+ movq 40(%rax), %op1
379
+ movq 48(%rax), %op2
380
+ movq 56(%rax), %op3
381
+ movq 64(%rax), %bits0
382
+ movq 72(%rax), %bits1
383
+ movq 80(%rax), %bits2
384
+ movq 88(%rax), %bits3
385
+ movq 96(%rax), %dtable
386
+ push %rax /* argument */
387
+ push %rax /* olimit */
388
+ push 104(%rax) /* ilimit */
389
+
390
+ movq 112(%rax), %rax
391
+ push %rax /* oend3 */
392
+
393
+ movq %op3, %rax
394
+ push %rax /* oend2 */
395
+
396
+ movq %op2, %rax
397
+ push %rax /* oend1 */
398
+
399
+ movq %op1, %rax
400
+ push %rax /* oend0 */
401
+
402
+ /* Scratch space */
403
+ subq $8, %rsp
404
+
405
+ .L_4X2_compute_olimit:
406
+ /* Computes how many iterations we can do safely
407
+ * %r15, %rax may be clobbered
408
+ * rdx must be saved
409
+ * op[1,2,3,4] & ip0 mustn't be clobbered
410
+ */
411
+ movq %rdx, 0(%rsp)
412
+
413
+ /* We can consume up to 7 input bytes each iteration. */
414
+ movq %ip0, %rax /* rax = ip0 */
415
+ movq 40(%rsp), %rdx /* rdx = ilimit */
416
+ subq %rdx, %rax /* rax = ip0 - ilimit */
417
+ movq %rax, %r15 /* r15 = ip0 - ilimit */
418
+
419
+ /* rdx = rax / 7 */
420
+ movabsq $2635249153387078803, %rdx
421
+ mulq %rdx
422
+ subq %rdx, %r15
423
+ shrq %r15
424
+ addq %r15, %rdx
425
+ shrq $2, %rdx
426
+
427
+ /* r15 = (ip0 - ilimit) / 7 */
428
+ movq %rdx, %r15
429
+
430
+ movabsq $-3689348814741910323, %rdx
431
+ movq 8(%rsp), %rax /* rax = oend0 */
432
+ subq %op0, %rax /* rax = oend0 - op0 */
433
+ mulq %rdx
434
+ shrq $3, %rdx /* rdx = rax / 10 */
435
+
436
+ /* r15 = min(%rdx, %r15) */
437
+ cmpq %rdx, %r15
438
+ cmova %rdx, %r15
439
+
440
+ movabsq $-3689348814741910323, %rdx
441
+ movq 16(%rsp), %rax /* rax = oend1 */
442
+ subq %op1, %rax /* rax = oend1 - op1 */
443
+ mulq %rdx
444
+ shrq $3, %rdx /* rdx = rax / 10 */
445
+
446
+ /* r15 = min(%rdx, %r15) */
447
+ cmpq %rdx, %r15
448
+ cmova %rdx, %r15
449
+
450
+ movabsq $-3689348814741910323, %rdx
451
+ movq 24(%rsp), %rax /* rax = oend2 */
452
+ subq %op2, %rax /* rax = oend2 - op2 */
453
+ mulq %rdx
454
+ shrq $3, %rdx /* rdx = rax / 10 */
455
+
456
+ /* r15 = min(%rdx, %r15) */
457
+ cmpq %rdx, %r15
458
+ cmova %rdx, %r15
459
+
460
+ movabsq $-3689348814741910323, %rdx
461
+ movq 32(%rsp), %rax /* rax = oend3 */
462
+ subq %op3, %rax /* rax = oend3 - op3 */
463
+ mulq %rdx
464
+ shrq $3, %rdx /* rdx = rax / 10 */
465
+
466
+ /* r15 = min(%rdx, %r15) */
467
+ cmpq %rdx, %r15
468
+ cmova %rdx, %r15
469
+
470
+ /* olimit = op3 + 5 * r15 */
471
+ movq %r15, %rax
472
+ leaq (%op3, %rax, 4), %olimit
473
+ addq %rax, %olimit
474
+
475
+ movq 0(%rsp), %rdx
476
+
477
+ /* If (op3 + 10 > olimit) */
478
+ movq %op3, %rax /* rax = op3 */
479
+ addq $10, %rax /* rax = op3 + 10 */
480
+ cmpq %rax, %olimit /* op3 + 10 > olimit */
481
+ jb .L_4X2_exit
482
+
483
+ /* If (ip1 < ip0) go to exit */
484
+ cmpq %ip0, %ip1
485
+ jb .L_4X2_exit
486
+
487
+ /* If (ip2 < ip1) go to exit */
488
+ cmpq %ip1, %ip2
489
+ jb .L_4X2_exit
490
+
491
+ /* If (ip3 < ip2) go to exit */
492
+ cmpq %ip2, %ip3
493
+ jb .L_4X2_exit
494
+
495
+ #define DECODE(n, idx) \
496
+ movq %bits##n, %rax; \
497
+ shrq $53, %rax; \
498
+ movzwl 0(%dtable,%rax,4),%r8d; \
499
+ movzbl 2(%dtable,%rax,4),%r15d; \
500
+ movzbl 3(%dtable,%rax,4),%eax; \
501
+ movw %r8w, (%op##n); \
502
+ shlxq %r15, %bits##n, %bits##n; \
503
+ addq %rax, %op##n
504
+
505
+ #define RELOAD_BITS(n) \
506
+ bsfq %bits##n, %bits##n; \
507
+ movq %bits##n, %rax; \
508
+ shrq $3, %bits##n; \
509
+ andq $7, %rax; \
510
+ subq %bits##n, %ip##n; \
511
+ movq (%ip##n), %bits##n; \
512
+ orq $1, %bits##n; \
513
+ shlxq %rax, %bits##n, %bits##n
514
+
515
+
516
+ movq %olimit, 48(%rsp)
517
+
518
+ .p2align 6
519
+
520
+ .L_4X2_loop_body:
521
+ /* We clobber r8, so store it on the stack */
522
+ movq %r8, 0(%rsp)
523
+
524
+ /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
525
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
526
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
527
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
528
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
529
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
530
+
531
+ /* Reload r8 */
532
+ movq 0(%rsp), %r8
533
+
534
+ FOR_EACH_STREAM(RELOAD_BITS)
535
+
536
+ cmp %op3, 48(%rsp)
537
+ ja .L_4X2_loop_body
538
+ jmp .L_4X2_compute_olimit
539
+
540
+ #undef DECODE
541
+ #undef RELOAD_BITS
542
+ .L_4X2_exit:
543
+ addq $8, %rsp
544
+ /* Restore stack (oend & olimit) */
545
+ pop %rax /* oend0 */
546
+ pop %rax /* oend1 */
547
+ pop %rax /* oend2 */
548
+ pop %rax /* oend3 */
549
+ pop %rax /* ilimit */
550
+ pop %rax /* olimit */
551
+ pop %rax /* arg */
552
+
553
+ /* Save ip / op / bits */
554
+ movq %ip0, 0(%rax)
555
+ movq %ip1, 8(%rax)
556
+ movq %ip2, 16(%rax)
557
+ movq %ip3, 24(%rax)
558
+ movq %op0, 32(%rax)
559
+ movq %op1, 40(%rax)
560
+ movq %op2, 48(%rax)
561
+ movq %op3, 56(%rax)
562
+ movq %bits0, 64(%rax)
563
+ movq %bits1, 72(%rax)
564
+ movq %bits2, 80(%rax)
565
+ movq %bits3, 88(%rax)
566
+
567
+ /* Restore registers */
568
+ pop %r15
569
+ pop %r14
570
+ pop %r13
571
+ pop %r12
572
+ pop %r11
573
+ pop %r10
574
+ pop %r9
575
+ pop %r8
576
+ pop %rdi
577
+ pop %rsi
578
+ pop %rbp
579
+ pop %rdx
580
+ pop %rcx
581
+ pop %rbx
582
+ pop %rax
583
+ ret
584
+
585
+ #endif