extzstd 0.3.2 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (112) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +4 -3
  3. data/contrib/zstd/CHANGELOG +225 -1
  4. data/contrib/zstd/CONTRIBUTING.md +158 -75
  5. data/contrib/zstd/LICENSE +4 -4
  6. data/contrib/zstd/Makefile +106 -69
  7. data/contrib/zstd/Package.swift +36 -0
  8. data/contrib/zstd/README.md +64 -36
  9. data/contrib/zstd/SECURITY.md +15 -0
  10. data/contrib/zstd/TESTING.md +2 -3
  11. data/contrib/zstd/lib/BUCK +5 -7
  12. data/contrib/zstd/lib/Makefile +117 -199
  13. data/contrib/zstd/lib/README.md +37 -7
  14. data/contrib/zstd/lib/common/allocations.h +55 -0
  15. data/contrib/zstd/lib/common/bits.h +200 -0
  16. data/contrib/zstd/lib/common/bitstream.h +80 -86
  17. data/contrib/zstd/lib/common/compiler.h +225 -63
  18. data/contrib/zstd/lib/common/cpu.h +37 -1
  19. data/contrib/zstd/lib/common/debug.c +7 -1
  20. data/contrib/zstd/lib/common/debug.h +21 -12
  21. data/contrib/zstd/lib/common/entropy_common.c +15 -37
  22. data/contrib/zstd/lib/common/error_private.c +9 -2
  23. data/contrib/zstd/lib/common/error_private.h +93 -5
  24. data/contrib/zstd/lib/common/fse.h +12 -87
  25. data/contrib/zstd/lib/common/fse_decompress.c +37 -117
  26. data/contrib/zstd/lib/common/huf.h +97 -172
  27. data/contrib/zstd/lib/common/mem.h +58 -58
  28. data/contrib/zstd/lib/common/pool.c +38 -17
  29. data/contrib/zstd/lib/common/pool.h +10 -4
  30. data/contrib/zstd/lib/common/portability_macros.h +158 -0
  31. data/contrib/zstd/lib/common/threading.c +74 -14
  32. data/contrib/zstd/lib/common/threading.h +5 -10
  33. data/contrib/zstd/lib/common/xxhash.c +6 -814
  34. data/contrib/zstd/lib/common/xxhash.h +6930 -195
  35. data/contrib/zstd/lib/common/zstd_common.c +1 -36
  36. data/contrib/zstd/lib/common/zstd_deps.h +1 -1
  37. data/contrib/zstd/lib/common/zstd_internal.h +68 -154
  38. data/contrib/zstd/lib/common/zstd_trace.h +163 -0
  39. data/contrib/zstd/lib/compress/clevels.h +134 -0
  40. data/contrib/zstd/lib/compress/fse_compress.c +75 -155
  41. data/contrib/zstd/lib/compress/hist.c +1 -1
  42. data/contrib/zstd/lib/compress/hist.h +1 -1
  43. data/contrib/zstd/lib/compress/huf_compress.c +810 -259
  44. data/contrib/zstd/lib/compress/zstd_compress.c +2864 -919
  45. data/contrib/zstd/lib/compress/zstd_compress_internal.h +523 -192
  46. data/contrib/zstd/lib/compress/zstd_compress_literals.c +117 -40
  47. data/contrib/zstd/lib/compress/zstd_compress_literals.h +16 -6
  48. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +28 -19
  49. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +1 -1
  50. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +251 -412
  51. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  52. data/contrib/zstd/lib/compress/zstd_cwksp.h +284 -97
  53. data/contrib/zstd/lib/compress/zstd_double_fast.c +382 -133
  54. data/contrib/zstd/lib/compress/zstd_double_fast.h +14 -2
  55. data/contrib/zstd/lib/compress/zstd_fast.c +732 -260
  56. data/contrib/zstd/lib/compress/zstd_fast.h +3 -2
  57. data/contrib/zstd/lib/compress/zstd_lazy.c +1177 -390
  58. data/contrib/zstd/lib/compress/zstd_lazy.h +129 -14
  59. data/contrib/zstd/lib/compress/zstd_ldm.c +280 -210
  60. data/contrib/zstd/lib/compress/zstd_ldm.h +3 -2
  61. data/contrib/zstd/lib/compress/zstd_ldm_geartab.h +106 -0
  62. data/contrib/zstd/lib/compress/zstd_opt.c +516 -285
  63. data/contrib/zstd/lib/compress/zstd_opt.h +32 -8
  64. data/contrib/zstd/lib/compress/zstdmt_compress.c +202 -131
  65. data/contrib/zstd/lib/compress/zstdmt_compress.h +9 -6
  66. data/contrib/zstd/lib/decompress/huf_decompress.c +1149 -555
  67. data/contrib/zstd/lib/decompress/huf_decompress_amd64.S +595 -0
  68. data/contrib/zstd/lib/decompress/zstd_ddict.c +4 -4
  69. data/contrib/zstd/lib/decompress/zstd_ddict.h +1 -1
  70. data/contrib/zstd/lib/decompress/zstd_decompress.c +583 -106
  71. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +1054 -379
  72. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +14 -3
  73. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +56 -6
  74. data/contrib/zstd/lib/deprecated/zbuff.h +1 -1
  75. data/contrib/zstd/lib/deprecated/zbuff_common.c +1 -1
  76. data/contrib/zstd/lib/deprecated/zbuff_compress.c +24 -4
  77. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  78. data/contrib/zstd/lib/dictBuilder/cover.c +60 -44
  79. data/contrib/zstd/lib/dictBuilder/cover.h +6 -11
  80. data/contrib/zstd/lib/dictBuilder/divsufsort.c +1 -1
  81. data/contrib/zstd/lib/dictBuilder/fastcover.c +26 -18
  82. data/contrib/zstd/lib/dictBuilder/zdict.c +100 -101
  83. data/contrib/zstd/lib/legacy/zstd_legacy.h +38 -1
  84. data/contrib/zstd/lib/legacy/zstd_v01.c +18 -53
  85. data/contrib/zstd/lib/legacy/zstd_v01.h +1 -1
  86. data/contrib/zstd/lib/legacy/zstd_v02.c +28 -85
  87. data/contrib/zstd/lib/legacy/zstd_v02.h +1 -1
  88. data/contrib/zstd/lib/legacy/zstd_v03.c +29 -88
  89. data/contrib/zstd/lib/legacy/zstd_v03.h +1 -1
  90. data/contrib/zstd/lib/legacy/zstd_v04.c +27 -80
  91. data/contrib/zstd/lib/legacy/zstd_v04.h +1 -1
  92. data/contrib/zstd/lib/legacy/zstd_v05.c +36 -85
  93. data/contrib/zstd/lib/legacy/zstd_v05.h +1 -1
  94. data/contrib/zstd/lib/legacy/zstd_v06.c +44 -96
  95. data/contrib/zstd/lib/legacy/zstd_v06.h +1 -1
  96. data/contrib/zstd/lib/legacy/zstd_v07.c +37 -92
  97. data/contrib/zstd/lib/legacy/zstd_v07.h +1 -1
  98. data/contrib/zstd/lib/libzstd.mk +237 -0
  99. data/contrib/zstd/lib/libzstd.pc.in +4 -3
  100. data/contrib/zstd/lib/module.modulemap +35 -0
  101. data/contrib/zstd/lib/{dictBuilder/zdict.h → zdict.h} +202 -33
  102. data/contrib/zstd/lib/zstd.h +1030 -332
  103. data/contrib/zstd/lib/{common/zstd_errors.h → zstd_errors.h} +27 -8
  104. data/ext/extconf.rb +26 -7
  105. data/ext/extzstd.c +51 -24
  106. data/ext/extzstd.h +33 -6
  107. data/ext/extzstd_stream.c +74 -31
  108. data/ext/libzstd_conf.h +0 -1
  109. data/ext/zstd_decompress_asm.S +1 -0
  110. metadata +17 -7
  111. data/contrib/zstd/appveyor.yml +0 -292
  112. data/ext/depend +0 -2
@@ -0,0 +1,595 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include "../common/portability_macros.h"
12
+
13
+ #if defined(__ELF__) && defined(__GNUC__)
14
+ /* Stack marking
15
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
16
+ */
17
+ .section .note.GNU-stack,"",%progbits
18
+
19
+ #if defined(__aarch64__)
20
+ /* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
21
+ * See: https://github.com/facebook/zstd/issues/3841
22
+ * See: https://gcc.godbolt.org/z/sqr5T4ffK
23
+ * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
24
+ * See: https://reviews.llvm.org/D62609
25
+ */
26
+ .pushsection .note.gnu.property, "a"
27
+ .p2align 3
28
+ .long 4 /* size of the name - "GNU\0" */
29
+ .long 0x10 /* size of descriptor */
30
+ .long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */
31
+ .asciz "GNU"
32
+ .long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
33
+ .long 4 /* pr_datasz - 4 bytes */
34
+ .long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
35
+ .p2align 3 /* pr_padding - bring everything to 8 byte alignment */
36
+ .popsection
37
+ #endif
38
+
39
+ #endif
40
+
41
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
42
+
43
+ /* Calling convention:
44
+ *
45
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
46
+ * %rbp isn't maintained (no frame pointer).
47
+ * %rsp contains the stack pointer that grows down.
48
+ * No red-zone is assumed, only addresses >= %rsp are used.
49
+ * All register contents are preserved.
50
+ *
51
+ * TODO: Support Windows calling convention.
52
+ */
53
+
54
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
55
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
56
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
57
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
58
+ .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
59
+ .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
60
+ .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
61
+ .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
62
+ .text
63
+
64
+ /* Sets up register mappings for clarity.
65
+ * op[], bits[], dtable & ip[0] each get their own register.
66
+ * ip[1,2,3] & olimit alias var[].
67
+ * %rax is a scratch register.
68
+ */
69
+
70
+ #define op0 rsi
71
+ #define op1 rbx
72
+ #define op2 rcx
73
+ #define op3 rdi
74
+
75
+ #define ip0 r8
76
+ #define ip1 r9
77
+ #define ip2 r10
78
+ #define ip3 r11
79
+
80
+ #define bits0 rbp
81
+ #define bits1 rdx
82
+ #define bits2 r12
83
+ #define bits3 r13
84
+ #define dtable r14
85
+ #define olimit r15
86
+
87
+ /* var[] aliases ip[1,2,3] & olimit
88
+ * ip[1,2,3] are saved every iteration.
89
+ * olimit is only used in compute_olimit.
90
+ */
91
+ #define var0 r15
92
+ #define var1 r9
93
+ #define var2 r10
94
+ #define var3 r11
95
+
96
+ /* 32-bit var registers */
97
+ #define vard0 r15d
98
+ #define vard1 r9d
99
+ #define vard2 r10d
100
+ #define vard3 r11d
101
+
102
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
103
+ #define FOR_EACH_STREAM(X) \
104
+ X(0); \
105
+ X(1); \
106
+ X(2); \
107
+ X(3)
108
+
109
+ /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
110
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
111
+ X(0, idx); \
112
+ X(1, idx); \
113
+ X(2, idx); \
114
+ X(3, idx)
115
+
116
+ /* Define both _HUF_* & HUF_* symbols because MacOS
117
+ * C symbols are prefixed with '_' & Linux symbols aren't.
118
+ */
119
+ _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
120
+ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
121
+ ZSTD_CET_ENDBRANCH
122
+ /* Save all registers - even if they are callee saved for simplicity. */
123
+ push %rax
124
+ push %rbx
125
+ push %rcx
126
+ push %rdx
127
+ push %rbp
128
+ push %rsi
129
+ push %rdi
130
+ push %r8
131
+ push %r9
132
+ push %r10
133
+ push %r11
134
+ push %r12
135
+ push %r13
136
+ push %r14
137
+ push %r15
138
+
139
+ /* Copy the HUF_DecompressAsmArgs* argument from %rdi into %rax, then load its fields via %rax */
140
+ movq %rdi, %rax
141
+ movq 0(%rax), %ip0
142
+ movq 8(%rax), %ip1
143
+ movq 16(%rax), %ip2
144
+ movq 24(%rax), %ip3
145
+ movq 32(%rax), %op0
146
+ movq 40(%rax), %op1
147
+ movq 48(%rax), %op2
148
+ movq 56(%rax), %op3
149
+ movq 64(%rax), %bits0
150
+ movq 72(%rax), %bits1
151
+ movq 80(%rax), %bits2
152
+ movq 88(%rax), %bits3
153
+ movq 96(%rax), %dtable
154
+ push %rax /* argument */
155
+ push 104(%rax) /* ilowest */
156
+ push 112(%rax) /* oend */
157
+ push %olimit /* olimit space */
158
+
159
+ subq $24, %rsp
160
+
161
+ .L_4X1_compute_olimit:
162
+ /* Computes how many iterations we can do safely
163
+ * %r15, %rax may be clobbered
164
+ * rbx, rdx must be saved
165
+ * op3 & ip0 mustn't be clobbered
166
+ */
167
+ movq %rbx, 0(%rsp)
168
+ movq %rdx, 8(%rsp)
169
+
170
+ movq 32(%rsp), %rax /* rax = oend */
171
+ subq %op3, %rax /* rax = oend - op3 */
172
+
173
+ /* r15 = (oend - op3) / 5 */
174
+ movabsq $-3689348814741910323, %rdx
175
+ mulq %rdx
176
+ movq %rdx, %r15
177
+ shrq $2, %r15
178
+
179
+ movq %ip0, %rax /* rax = ip0 */
180
+ movq 40(%rsp), %rdx /* rdx = ilowest */
181
+ subq %rdx, %rax /* rax = ip0 - ilowest */
182
+ movq %rax, %rbx /* rbx = ip0 - ilowest */
183
+
184
+ /* rdx = (ip0 - ilowest) / 7 */
185
+ movabsq $2635249153387078803, %rdx
186
+ mulq %rdx
187
+ subq %rdx, %rbx
188
+ shrq %rbx
189
+ addq %rbx, %rdx
190
+ shrq $2, %rdx
191
+
192
+ /* r15 = min(%rdx, %r15) */
193
+ cmpq %rdx, %r15
194
+ cmova %rdx, %r15
195
+
196
+ /* r15 = r15 * 5 */
197
+ leaq (%r15, %r15, 4), %r15
198
+
199
+ /* olimit = op3 + r15 */
200
+ addq %op3, %olimit
201
+
202
+ movq 8(%rsp), %rdx
203
+ movq 0(%rsp), %rbx
204
+
205
+ /* If (op3 == olimit) no further iteration can be done: go to exit */
206
+ movq %op3, %rax /* rax = op3 */
207
+ cmpq %rax, %olimit /* op3 == olimit */
208
+ je .L_4X1_exit
209
+
210
+ /* If (ip1 < ip0) go to exit */
211
+ cmpq %ip0, %ip1
212
+ jb .L_4X1_exit
213
+
214
+ /* If (ip2 < ip1) go to exit */
215
+ cmpq %ip1, %ip2
216
+ jb .L_4X1_exit
217
+
218
+ /* If (ip3 < ip2) go to exit */
219
+ cmpq %ip2, %ip3
220
+ jb .L_4X1_exit
221
+
222
+ /* Reads top 11 bits from bits[n]
223
+ * Loads dt[bits[n]] into var[n]
224
+ */
225
+ #define GET_NEXT_DELT(n) \
226
+ movq $53, %var##n; \
227
+ shrxq %var##n, %bits##n, %var##n; \
228
+ movzwl (%dtable,%var##n,2),%vard##n
229
+
230
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
231
+ * Moves var[n] to %rax
232
+ * bits[n] <<= var[n] & 63
233
+ * op[n][idx] = %rax >> 8
234
+ * %ah is a way to access bits [8, 16) of %rax
235
+ */
236
+ #define DECODE_FROM_DELT(n, idx) \
237
+ movq %var##n, %rax; \
238
+ shlxq %var##n, %bits##n, %bits##n; \
239
+ movb %ah, idx(%op##n)
240
+
241
+ /* Assumes GET_NEXT_DELT has been called.
242
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
243
+ */
244
+ #define DECODE_AND_GET_NEXT(n, idx) \
245
+ DECODE_FROM_DELT(n, idx); \
246
+ GET_NEXT_DELT(n) \
247
+
248
+ /* // ctz & nbBytes is stored in bits[n]
249
+ * // nbBits is stored in %rax
250
+ * ctz = CTZ[bits[n]]
251
+ * nbBits = ctz & 7
252
+ * nbBytes = ctz >> 3
253
+ * op[n] += 5
254
+ * ip[n] -= nbBytes
255
+ * // Note: x86-64 is little-endian ==> no bswap
256
+ * bits[n] = MEM_readST(ip[n]) | 1
257
+ * bits[n] <<= nbBits
258
+ */
259
+ #define RELOAD_BITS(n) \
260
+ bsfq %bits##n, %bits##n; \
261
+ movq %bits##n, %rax; \
262
+ andq $7, %rax; \
263
+ shrq $3, %bits##n; \
264
+ leaq 5(%op##n), %op##n; \
265
+ subq %bits##n, %ip##n; \
266
+ movq (%ip##n), %bits##n; \
267
+ orq $1, %bits##n; \
268
+ shlx %rax, %bits##n, %bits##n
269
+
270
+ /* Store clobbered variables on the stack */
271
+ movq %olimit, 24(%rsp)
272
+ movq %ip1, 0(%rsp)
273
+ movq %ip2, 8(%rsp)
274
+ movq %ip3, 16(%rsp)
275
+
276
+ /* Call GET_NEXT_DELT for each stream */
277
+ FOR_EACH_STREAM(GET_NEXT_DELT)
278
+
279
+ .p2align 6
280
+
281
+ .L_4X1_loop_body:
282
+ /* Decode 5 symbols in each of the 4 streams (20 total)
283
+ * Must have called GET_NEXT_DELT for each stream
284
+ */
285
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
286
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
287
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
288
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
289
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
290
+
291
+ /* Load ip[1,2,3] from stack (var[] aliases them)
292
+ * ip[] is needed for RELOAD_BITS
293
+ * Each will be stored back to the stack after RELOAD
294
+ */
295
+ movq 0(%rsp), %ip1
296
+ movq 8(%rsp), %ip2
297
+ movq 16(%rsp), %ip3
298
+
299
+ /* Reload each stream & fetch the next table entry
300
+ * to prepare for the next iteration
301
+ */
302
+ RELOAD_BITS(0)
303
+ GET_NEXT_DELT(0)
304
+
305
+ RELOAD_BITS(1)
306
+ movq %ip1, 0(%rsp)
307
+ GET_NEXT_DELT(1)
308
+
309
+ RELOAD_BITS(2)
310
+ movq %ip2, 8(%rsp)
311
+ GET_NEXT_DELT(2)
312
+
313
+ RELOAD_BITS(3)
314
+ movq %ip3, 16(%rsp)
315
+ GET_NEXT_DELT(3)
316
+
317
+ /* If op3 < olimit: continue the loop */
318
+ cmp %op3, 24(%rsp)
319
+ ja .L_4X1_loop_body
320
+
321
+ /* Reload ip[1,2,3] from stack */
322
+ movq 0(%rsp), %ip1
323
+ movq 8(%rsp), %ip2
324
+ movq 16(%rsp), %ip3
325
+
326
+ /* Re-compute olimit */
327
+ jmp .L_4X1_compute_olimit
328
+
329
+ #undef GET_NEXT_DELT
330
+ #undef DECODE_FROM_DELT
331
+ #undef DECODE
332
+ #undef RELOAD_BITS
333
+ .L_4X1_exit:
334
+ addq $24, %rsp
335
+
336
+ /* Restore stack: discard olimit, oend & ilowest, then pop the argument pointer into %rax */
337
+ pop %rax /* olimit */
338
+ pop %rax /* oend */
339
+ pop %rax /* ilowest */
340
+ pop %rax /* arg */
341
+
342
+ /* Save ip / op / bits */
343
+ movq %ip0, 0(%rax)
344
+ movq %ip1, 8(%rax)
345
+ movq %ip2, 16(%rax)
346
+ movq %ip3, 24(%rax)
347
+ movq %op0, 32(%rax)
348
+ movq %op1, 40(%rax)
349
+ movq %op2, 48(%rax)
350
+ movq %op3, 56(%rax)
351
+ movq %bits0, 64(%rax)
352
+ movq %bits1, 72(%rax)
353
+ movq %bits2, 80(%rax)
354
+ movq %bits3, 88(%rax)
355
+
356
+ /* Restore registers */
357
+ pop %r15
358
+ pop %r14
359
+ pop %r13
360
+ pop %r12
361
+ pop %r11
362
+ pop %r10
363
+ pop %r9
364
+ pop %r8
365
+ pop %rdi
366
+ pop %rsi
367
+ pop %rbp
368
+ pop %rdx
369
+ pop %rcx
370
+ pop %rbx
371
+ pop %rax
372
+ ret
373
+
374
+ _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
375
+ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
376
+ ZSTD_CET_ENDBRANCH
377
+ /* Save all registers - even if they are callee saved for simplicity. */
378
+ push %rax
379
+ push %rbx
380
+ push %rcx
381
+ push %rdx
382
+ push %rbp
383
+ push %rsi
384
+ push %rdi
385
+ push %r8
386
+ push %r9
387
+ push %r10
388
+ push %r11
389
+ push %r12
390
+ push %r13
391
+ push %r14
392
+ push %r15
393
+
394
+ movq %rdi, %rax
395
+ movq 0(%rax), %ip0
396
+ movq 8(%rax), %ip1
397
+ movq 16(%rax), %ip2
398
+ movq 24(%rax), %ip3
399
+ movq 32(%rax), %op0
400
+ movq 40(%rax), %op1
401
+ movq 48(%rax), %op2
402
+ movq 56(%rax), %op3
403
+ movq 64(%rax), %bits0
404
+ movq 72(%rax), %bits1
405
+ movq 80(%rax), %bits2
406
+ movq 88(%rax), %bits3
407
+ movq 96(%rax), %dtable
408
+ push %rax /* argument */
409
+ push %rax /* olimit */
410
+ push 104(%rax) /* ilowest */
411
+
412
+ movq 112(%rax), %rax
413
+ push %rax /* oend3 */
414
+
415
+ movq %op3, %rax
416
+ push %rax /* oend2 */
417
+
418
+ movq %op2, %rax
419
+ push %rax /* oend1 */
420
+
421
+ movq %op1, %rax
422
+ push %rax /* oend0 */
423
+
424
+ /* Scratch space */
425
+ subq $8, %rsp
426
+
427
+ .L_4X2_compute_olimit:
428
+ /* Computes how many iterations we can do safely
429
+ * %r15, %rax may be clobbered
430
+ * rdx must be saved
431
+ * op[0,1,2,3] & ip0 mustn't be clobbered
432
+ */
433
+ movq %rdx, 0(%rsp)
434
+
435
+ /* We can consume up to 7 input bytes each iteration. */
436
+ movq %ip0, %rax /* rax = ip0 */
437
+ movq 40(%rsp), %rdx /* rdx = ilowest */
438
+ subq %rdx, %rax /* rax = ip0 - ilowest */
439
+ movq %rax, %r15 /* r15 = ip0 - ilowest */
440
+
441
+ /* rdx = rax / 7 */
442
+ movabsq $2635249153387078803, %rdx
443
+ mulq %rdx
444
+ subq %rdx, %r15
445
+ shrq %r15
446
+ addq %r15, %rdx
447
+ shrq $2, %rdx
448
+
449
+ /* r15 = (ip0 - ilowest) / 7 */
450
+ movq %rdx, %r15
451
+
452
+ /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
453
+ movq 8(%rsp), %rax /* rax = oend0 */
454
+ subq %op0, %rax /* rax = oend0 - op0 */
455
+ movq 16(%rsp), %rdx /* rdx = oend1 */
456
+ subq %op1, %rdx /* rdx = oend1 - op1 */
457
+
458
+ cmpq %rax, %rdx
459
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
460
+
461
+ movq 24(%rsp), %rax /* rax = oend2 */
462
+ subq %op2, %rax /* rax = oend2 - op2 */
463
+
464
+ cmpq %rax, %rdx
465
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
466
+
467
+ movq 32(%rsp), %rax /* rax = oend3 */
468
+ subq %op3, %rax /* rax = oend3 - op3 */
469
+
470
+ cmpq %rax, %rdx
471
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
472
+
473
+ movabsq $-3689348814741910323, %rax
474
+ mulq %rdx
475
+ shrq $3, %rdx /* rdx = rdx / 10 */
476
+
477
+ /* r15 = min(%rdx, %r15) */
478
+ cmpq %rdx, %r15
479
+ cmova %rdx, %r15
480
+
481
+ /* olimit = op3 + 5 * r15 */
482
+ movq %r15, %rax
483
+ leaq (%op3, %rax, 4), %olimit
484
+ addq %rax, %olimit
485
+
486
+ movq 0(%rsp), %rdx
487
+
488
+ /* If (op3 == olimit) no further iteration can be done: go to exit */
489
+ movq %op3, %rax /* rax = op3 */
490
+ cmpq %rax, %olimit /* op3 == olimit */
491
+ je .L_4X2_exit
492
+
493
+ /* If (ip1 < ip0) go to exit */
494
+ cmpq %ip0, %ip1
495
+ jb .L_4X2_exit
496
+
497
+ /* If (ip2 < ip1) go to exit */
498
+ cmpq %ip1, %ip2
499
+ jb .L_4X2_exit
500
+
501
+ /* If (ip3 < ip2) go to exit */
502
+ cmpq %ip2, %ip3
503
+ jb .L_4X2_exit
504
+
505
+ #define DECODE(n, idx) \
506
+ movq %bits##n, %rax; \
507
+ shrq $53, %rax; \
508
+ movzwl 0(%dtable,%rax,4),%r8d; \
509
+ movzbl 2(%dtable,%rax,4),%r15d; \
510
+ movzbl 3(%dtable,%rax,4),%eax; \
511
+ movw %r8w, (%op##n); \
512
+ shlxq %r15, %bits##n, %bits##n; \
513
+ addq %rax, %op##n
514
+
515
+ #define RELOAD_BITS(n) \
516
+ bsfq %bits##n, %bits##n; \
517
+ movq %bits##n, %rax; \
518
+ shrq $3, %bits##n; \
519
+ andq $7, %rax; \
520
+ subq %bits##n, %ip##n; \
521
+ movq (%ip##n), %bits##n; \
522
+ orq $1, %bits##n; \
523
+ shlxq %rax, %bits##n, %bits##n
524
+
525
+
526
+ movq %olimit, 48(%rsp)
527
+
528
+ .p2align 6
529
+
530
+ .L_4X2_loop_body:
531
+ /* We clobber r8, so store it on the stack */
532
+ movq %r8, 0(%rsp)
533
+
534
+ /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
535
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
536
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
537
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
538
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
539
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
540
+
541
+ /* Reload r8 */
542
+ movq 0(%rsp), %r8
543
+
544
+ FOR_EACH_STREAM(RELOAD_BITS)
545
+
546
+ cmp %op3, 48(%rsp)
547
+ ja .L_4X2_loop_body
548
+ jmp .L_4X2_compute_olimit
549
+
550
+ #undef DECODE
551
+ #undef RELOAD_BITS
552
+ .L_4X2_exit:
553
+ addq $8, %rsp
554
+ /* Restore stack: discard oend[0,1,2,3], ilowest & olimit, then pop the argument pointer into %rax */
555
+ pop %rax /* oend0 */
556
+ pop %rax /* oend1 */
557
+ pop %rax /* oend2 */
558
+ pop %rax /* oend3 */
559
+ pop %rax /* ilowest */
560
+ pop %rax /* olimit */
561
+ pop %rax /* arg */
562
+
563
+ /* Save ip / op / bits */
564
+ movq %ip0, 0(%rax)
565
+ movq %ip1, 8(%rax)
566
+ movq %ip2, 16(%rax)
567
+ movq %ip3, 24(%rax)
568
+ movq %op0, 32(%rax)
569
+ movq %op1, 40(%rax)
570
+ movq %op2, 48(%rax)
571
+ movq %op3, 56(%rax)
572
+ movq %bits0, 64(%rax)
573
+ movq %bits1, 72(%rax)
574
+ movq %bits2, 80(%rax)
575
+ movq %bits3, 88(%rax)
576
+
577
+ /* Restore registers */
578
+ pop %r15
579
+ pop %r14
580
+ pop %r13
581
+ pop %r12
582
+ pop %r11
583
+ pop %r10
584
+ pop %r9
585
+ pop %r8
586
+ pop %rdi
587
+ pop %rsi
588
+ pop %rbp
589
+ pop %rdx
590
+ pop %rcx
591
+ pop %rbx
592
+ pop %rax
593
+ ret
594
+
595
+ #endif
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,12 +14,12 @@
14
14
  /*-*******************************************************
15
15
  * Dependencies
16
16
  *********************************************************/
17
+ #include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */
17
18
  #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
18
19
  #include "../common/cpu.h" /* bmi2 */
19
20
  #include "../common/mem.h" /* low level memory routines */
20
21
  #define FSE_STATIC_LINKING_ONLY
21
22
  #include "../common/fse.h"
22
- #define HUF_STATIC_LINKING_ONLY
23
23
  #include "../common/huf.h"
24
24
  #include "zstd_decompress_internal.h"
25
25
  #include "zstd_ddict.h"
@@ -134,7 +134,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
134
134
  ZSTD_memcpy(internalBuffer, dict, dictSize);
135
135
  }
136
136
  ddict->dictSize = dictSize;
137
- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
137
+ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */
138
138
 
139
139
  /* parse dictionary content */
140
140
  FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
@@ -240,5 +240,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
240
240
  unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
241
241
  {
242
242
  if (ddict==NULL) return 0;
243
- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
243
+ return ddict->dictID;
244
244
  }
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the