extzstd 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (108) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +3 -3
  3. data/contrib/zstd/CHANGELOG +188 -1
  4. data/contrib/zstd/CONTRIBUTING.md +157 -74
  5. data/contrib/zstd/LICENSE +4 -4
  6. data/contrib/zstd/Makefile +81 -58
  7. data/contrib/zstd/Package.swift +36 -0
  8. data/contrib/zstd/README.md +59 -35
  9. data/contrib/zstd/TESTING.md +2 -3
  10. data/contrib/zstd/appveyor.yml +49 -136
  11. data/contrib/zstd/lib/BUCK +5 -7
  12. data/contrib/zstd/lib/Makefile +87 -181
  13. data/contrib/zstd/lib/README.md +23 -6
  14. data/contrib/zstd/lib/common/allocations.h +55 -0
  15. data/contrib/zstd/lib/common/bits.h +200 -0
  16. data/contrib/zstd/lib/common/bitstream.h +33 -59
  17. data/contrib/zstd/lib/common/compiler.h +115 -45
  18. data/contrib/zstd/lib/common/cpu.h +1 -1
  19. data/contrib/zstd/lib/common/debug.c +1 -1
  20. data/contrib/zstd/lib/common/debug.h +1 -1
  21. data/contrib/zstd/lib/common/entropy_common.c +15 -37
  22. data/contrib/zstd/lib/common/error_private.c +9 -2
  23. data/contrib/zstd/lib/common/error_private.h +82 -3
  24. data/contrib/zstd/lib/common/fse.h +9 -85
  25. data/contrib/zstd/lib/common/fse_decompress.c +29 -111
  26. data/contrib/zstd/lib/common/huf.h +84 -172
  27. data/contrib/zstd/lib/common/mem.h +58 -49
  28. data/contrib/zstd/lib/common/pool.c +37 -16
  29. data/contrib/zstd/lib/common/pool.h +9 -3
  30. data/contrib/zstd/lib/common/portability_macros.h +156 -0
  31. data/contrib/zstd/lib/common/threading.c +68 -14
  32. data/contrib/zstd/lib/common/threading.h +5 -10
  33. data/contrib/zstd/lib/common/xxhash.c +7 -809
  34. data/contrib/zstd/lib/common/xxhash.h +5568 -167
  35. data/contrib/zstd/lib/common/zstd_common.c +1 -36
  36. data/contrib/zstd/lib/common/zstd_deps.h +1 -1
  37. data/contrib/zstd/lib/common/zstd_internal.h +64 -150
  38. data/contrib/zstd/lib/common/zstd_trace.h +163 -0
  39. data/contrib/zstd/lib/compress/clevels.h +134 -0
  40. data/contrib/zstd/lib/compress/fse_compress.c +69 -150
  41. data/contrib/zstd/lib/compress/hist.c +1 -1
  42. data/contrib/zstd/lib/compress/hist.h +1 -1
  43. data/contrib/zstd/lib/compress/huf_compress.c +773 -251
  44. data/contrib/zstd/lib/compress/zstd_compress.c +2650 -826
  45. data/contrib/zstd/lib/compress/zstd_compress_internal.h +509 -180
  46. data/contrib/zstd/lib/compress/zstd_compress_literals.c +117 -40
  47. data/contrib/zstd/lib/compress/zstd_compress_literals.h +16 -6
  48. data/contrib/zstd/lib/compress/zstd_compress_sequences.c +28 -19
  49. data/contrib/zstd/lib/compress/zstd_compress_sequences.h +1 -1
  50. data/contrib/zstd/lib/compress/zstd_compress_superblock.c +33 -305
  51. data/contrib/zstd/lib/compress/zstd_compress_superblock.h +1 -1
  52. data/contrib/zstd/lib/compress/zstd_cwksp.h +266 -85
  53. data/contrib/zstd/lib/compress/zstd_double_fast.c +369 -132
  54. data/contrib/zstd/lib/compress/zstd_double_fast.h +3 -2
  55. data/contrib/zstd/lib/compress/zstd_fast.c +722 -258
  56. data/contrib/zstd/lib/compress/zstd_fast.h +3 -2
  57. data/contrib/zstd/lib/compress/zstd_lazy.c +1105 -360
  58. data/contrib/zstd/lib/compress/zstd_lazy.h +41 -1
  59. data/contrib/zstd/lib/compress/zstd_ldm.c +272 -208
  60. data/contrib/zstd/lib/compress/zstd_ldm.h +3 -2
  61. data/contrib/zstd/lib/compress/zstd_ldm_geartab.h +106 -0
  62. data/contrib/zstd/lib/compress/zstd_opt.c +324 -197
  63. data/contrib/zstd/lib/compress/zstd_opt.h +1 -1
  64. data/contrib/zstd/lib/compress/zstdmt_compress.c +109 -53
  65. data/contrib/zstd/lib/compress/zstdmt_compress.h +9 -6
  66. data/contrib/zstd/lib/decompress/huf_decompress.c +1071 -539
  67. data/contrib/zstd/lib/decompress/huf_decompress_amd64.S +576 -0
  68. data/contrib/zstd/lib/decompress/zstd_ddict.c +4 -4
  69. data/contrib/zstd/lib/decompress/zstd_ddict.h +1 -1
  70. data/contrib/zstd/lib/decompress/zstd_decompress.c +507 -82
  71. data/contrib/zstd/lib/decompress/zstd_decompress_block.c +962 -310
  72. data/contrib/zstd/lib/decompress/zstd_decompress_block.h +14 -3
  73. data/contrib/zstd/lib/decompress/zstd_decompress_internal.h +54 -6
  74. data/contrib/zstd/lib/deprecated/zbuff.h +1 -1
  75. data/contrib/zstd/lib/deprecated/zbuff_common.c +1 -1
  76. data/contrib/zstd/lib/deprecated/zbuff_compress.c +24 -4
  77. data/contrib/zstd/lib/deprecated/zbuff_decompress.c +3 -1
  78. data/contrib/zstd/lib/dictBuilder/cover.c +44 -32
  79. data/contrib/zstd/lib/dictBuilder/cover.h +6 -5
  80. data/contrib/zstd/lib/dictBuilder/divsufsort.c +1 -1
  81. data/contrib/zstd/lib/dictBuilder/fastcover.c +24 -16
  82. data/contrib/zstd/lib/dictBuilder/zdict.c +88 -95
  83. data/contrib/zstd/lib/legacy/zstd_legacy.h +8 -1
  84. data/contrib/zstd/lib/legacy/zstd_v01.c +16 -53
  85. data/contrib/zstd/lib/legacy/zstd_v01.h +1 -1
  86. data/contrib/zstd/lib/legacy/zstd_v02.c +24 -69
  87. data/contrib/zstd/lib/legacy/zstd_v02.h +1 -1
  88. data/contrib/zstd/lib/legacy/zstd_v03.c +25 -72
  89. data/contrib/zstd/lib/legacy/zstd_v03.h +1 -1
  90. data/contrib/zstd/lib/legacy/zstd_v04.c +23 -69
  91. data/contrib/zstd/lib/legacy/zstd_v04.h +1 -1
  92. data/contrib/zstd/lib/legacy/zstd_v05.c +35 -85
  93. data/contrib/zstd/lib/legacy/zstd_v05.h +1 -1
  94. data/contrib/zstd/lib/legacy/zstd_v06.c +42 -87
  95. data/contrib/zstd/lib/legacy/zstd_v06.h +1 -1
  96. data/contrib/zstd/lib/legacy/zstd_v07.c +35 -82
  97. data/contrib/zstd/lib/legacy/zstd_v07.h +1 -1
  98. data/contrib/zstd/lib/libzstd.mk +214 -0
  99. data/contrib/zstd/lib/libzstd.pc.in +4 -3
  100. data/contrib/zstd/lib/module.modulemap +35 -0
  101. data/contrib/zstd/lib/{dictBuilder/zdict.h → zdict.h} +202 -33
  102. data/contrib/zstd/lib/zstd.h +922 -293
  103. data/contrib/zstd/lib/{common/zstd_errors.h → zstd_errors.h} +27 -8
  104. data/ext/extconf.rb +7 -6
  105. data/ext/extzstd.c +13 -10
  106. data/ext/libzstd_conf.h +0 -1
  107. data/ext/zstd_decompress_asm.S +1 -0
  108. metadata +16 -5
@@ -0,0 +1,576 @@
1
+ /*
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include "../common/portability_macros.h"
12
+
13
+ /* Stack marking
14
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
15
+ */
16
+ #if defined(__ELF__) && defined(__GNUC__)
17
+ .section .note.GNU-stack,"",%progbits
18
+ #endif
19
+
20
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
21
+
22
+ /* Calling convention:
23
+ *
24
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
25
+ * %rbp isn't maintained (no frame pointer).
26
+ * %rsp contains the stack pointer that grows down.
27
+ * No red-zone is assumed, only addresses >= %rsp are used.
28
+ * All register contents are preserved.
29
+ *
30
+ * TODO: Support Windows calling convention.
31
+ */
32
+
33
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
34
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
35
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
36
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
37
+ .global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
38
+ .global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
39
+ .global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
40
+ .global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
41
+ .text
42
+
43
+ /* Sets up register mappings for clarity.
44
+ * op[], bits[], dtable & ip[0] each get their own register.
45
+ * ip[1,2,3] & olimit alias var[].
46
+ * %rax is a scratch register.
47
+ */
48
+
49
+ #define op0 rsi
50
+ #define op1 rbx
51
+ #define op2 rcx
52
+ #define op3 rdi
53
+
54
+ #define ip0 r8
55
+ #define ip1 r9
56
+ #define ip2 r10
57
+ #define ip3 r11
58
+
59
+ #define bits0 rbp
60
+ #define bits1 rdx
61
+ #define bits2 r12
62
+ #define bits3 r13
63
+ #define dtable r14
64
+ #define olimit r15
65
+
66
+ /* var[] aliases ip[1,2,3] & olimit
67
+ * ip[1,2,3] are saved every iteration.
68
+ * olimit is only used in compute_olimit.
69
+ */
70
+ #define var0 r15
71
+ #define var1 r9
72
+ #define var2 r10
73
+ #define var3 r11
74
+
75
+ /* 32-bit var registers */
76
+ #define vard0 r15d
77
+ #define vard1 r9d
78
+ #define vard2 r10d
79
+ #define vard3 r11d
80
+
81
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
82
+ #define FOR_EACH_STREAM(X) \
83
+ X(0); \
84
+ X(1); \
85
+ X(2); \
86
+ X(3)
87
+
88
+ /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
89
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
90
+ X(0, idx); \
91
+ X(1, idx); \
92
+ X(2, idx); \
93
+ X(3, idx)
94
+
95
+ /* Define both _HUF_* & HUF_* symbols because MacOS
96
+ * C symbols are prefixed with '_' & Linux symbols aren't.
97
+ */
98
+ _HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
99
+ HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
100
+ ZSTD_CET_ENDBRANCH
101
+ /* Save all registers - even if they are callee saved for simplicity. */
102
+ push %rax
103
+ push %rbx
104
+ push %rcx
105
+ push %rdx
106
+ push %rbp
107
+ push %rsi
108
+ push %rdi
109
+ push %r8
110
+ push %r9
111
+ push %r10
112
+ push %r11
113
+ push %r12
114
+ push %r13
115
+ push %r14
116
+ push %r15
117
+
118
+ /* Read HUF_DecompressAsmArgs* args from %rax */
119
+ movq %rdi, %rax
120
+ movq 0(%rax), %ip0
121
+ movq 8(%rax), %ip1
122
+ movq 16(%rax), %ip2
123
+ movq 24(%rax), %ip3
124
+ movq 32(%rax), %op0
125
+ movq 40(%rax), %op1
126
+ movq 48(%rax), %op2
127
+ movq 56(%rax), %op3
128
+ movq 64(%rax), %bits0
129
+ movq 72(%rax), %bits1
130
+ movq 80(%rax), %bits2
131
+ movq 88(%rax), %bits3
132
+ movq 96(%rax), %dtable
133
+ push %rax /* argument */
134
+ push 104(%rax) /* ilimit */
135
+ push 112(%rax) /* oend */
136
+ push %olimit /* olimit space */
137
+
138
+ subq $24, %rsp
139
+
140
+ .L_4X1_compute_olimit:
141
+ /* Computes how many iterations we can do safely
142
+ * %r15, %rax may be clobbered
143
+ * rbx, rdx must be saved
144
+ * op3 & ip0 mustn't be clobbered
145
+ */
146
+ movq %rbx, 0(%rsp)
147
+ movq %rdx, 8(%rsp)
148
+
149
+ movq 32(%rsp), %rax /* rax = oend */
150
+ subq %op3, %rax /* rax = oend - op3 */
151
+
152
+ /* r15 = (oend - op3) / 5 */
153
+ movabsq $-3689348814741910323, %rdx
154
+ mulq %rdx
155
+ movq %rdx, %r15
156
+ shrq $2, %r15
157
+
158
+ movq %ip0, %rax /* rax = ip0 */
159
+ movq 40(%rsp), %rdx /* rdx = ilimit */
160
+ subq %rdx, %rax /* rax = ip0 - ilimit */
161
+ movq %rax, %rbx /* rbx = ip0 - ilimit */
162
+
163
+ /* rdx = (ip0 - ilimit) / 7 */
164
+ movabsq $2635249153387078803, %rdx
165
+ mulq %rdx
166
+ subq %rdx, %rbx
167
+ shrq %rbx
168
+ addq %rbx, %rdx
169
+ shrq $2, %rdx
170
+
171
+ /* r15 = min(%rdx, %r15) */
172
+ cmpq %rdx, %r15
173
+ cmova %rdx, %r15
174
+
175
+ /* r15 = r15 * 5 */
176
+ leaq (%r15, %r15, 4), %r15
177
+
178
+ /* olimit = op3 + r15 */
179
+ addq %op3, %olimit
180
+
181
+ movq 8(%rsp), %rdx
182
+ movq 0(%rsp), %rbx
183
+
184
+ /* If (op3 + 20 > olimit) go to exit */
185
+ movq %op3, %rax /* rax = op3 */
186
+ addq $20, %rax /* rax = op3 + 20 */
187
+ cmpq %rax, %olimit /* op3 + 20 > olimit */
188
+ jb .L_4X1_exit
189
+
190
+ /* If (ip1 < ip0) go to exit */
191
+ cmpq %ip0, %ip1
192
+ jb .L_4X1_exit
193
+
194
+ /* If (ip2 < ip1) go to exit */
195
+ cmpq %ip1, %ip2
196
+ jb .L_4X1_exit
197
+
198
+ /* If (ip3 < ip2) go to exit */
199
+ cmpq %ip2, %ip3
200
+ jb .L_4X1_exit
201
+
202
+ /* Reads top 11 bits from bits[n]
203
+ * Loads dt[bits[n]] into var[n]
204
+ */
205
+ #define GET_NEXT_DELT(n) \
206
+ movq $53, %var##n; \
207
+ shrxq %var##n, %bits##n, %var##n; \
208
+ movzwl (%dtable,%var##n,2),%vard##n
209
+
210
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
211
+ * Moves var[n] to %rax
212
+ * bits[n] <<= var[n] & 63
213
+ * op[n][idx] = %rax >> 8
214
+ * %ah is a way to access bits [8, 16) of %rax
215
+ */
216
+ #define DECODE_FROM_DELT(n, idx) \
217
+ movq %var##n, %rax; \
218
+ shlxq %var##n, %bits##n, %bits##n; \
219
+ movb %ah, idx(%op##n)
220
+
221
+ /* Assumes GET_NEXT_DELT(n) has been called, so var[n] holds a DTable entry.
222
+ * Expands to DECODE_FROM_DELT(n, idx) followed by GET_NEXT_DELT(n).
223
+ */
224
+ #define DECODE_AND_GET_NEXT(n, idx) \
225
+ DECODE_FROM_DELT(n, idx); \
226
+ GET_NEXT_DELT(n)
227
+
228
+ /* // ctz & nbBytes is stored in bits[n]
229
+ * // nbBits is stored in %rax
230
+ * ctz = CTZ[bits[n]]
231
+ * nbBits = ctz & 7
232
+ * nbBytes = ctz >> 3
233
+ * op[n] += 5
234
+ * ip[n] -= nbBytes
235
+ * // Note: x86-64 is little-endian ==> no bswap
236
+ * bits[n] = MEM_readST(ip[n]) | 1
237
+ * bits[n] <<= nbBits
238
+ */
239
+ #define RELOAD_BITS(n) \
240
+ bsfq %bits##n, %bits##n; \
241
+ movq %bits##n, %rax; \
242
+ andq $7, %rax; \
243
+ shrq $3, %bits##n; \
244
+ leaq 5(%op##n), %op##n; \
245
+ subq %bits##n, %ip##n; \
246
+ movq (%ip##n), %bits##n; \
247
+ orq $1, %bits##n; \
248
+ shlx %rax, %bits##n, %bits##n
249
+
250
+ /* Store clobbered variables on the stack */
251
+ movq %olimit, 24(%rsp)
252
+ movq %ip1, 0(%rsp)
253
+ movq %ip2, 8(%rsp)
254
+ movq %ip3, 16(%rsp)
255
+
256
+ /* Call GET_NEXT_DELT for each stream */
257
+ FOR_EACH_STREAM(GET_NEXT_DELT)
258
+
259
+ .p2align 6
260
+
261
+ .L_4X1_loop_body:
262
+ /* Decode 5 symbols in each of the 4 streams (20 total)
263
+ * Must have called GET_NEXT_DELT for each stream
264
+ */
265
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
266
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
267
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
268
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
269
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
270
+
271
+ /* Load ip[1,2,3] from stack (var[] aliases them)
272
+ * ip[] is needed for RELOAD_BITS
273
+ * Each will be stored back to the stack after RELOAD
274
+ */
275
+ movq 0(%rsp), %ip1
276
+ movq 8(%rsp), %ip2
277
+ movq 16(%rsp), %ip3
278
+
279
+ /* Reload each stream & fetch the next table entry
280
+ * to prepare for the next iteration
281
+ */
282
+ RELOAD_BITS(0)
283
+ GET_NEXT_DELT(0)
284
+
285
+ RELOAD_BITS(1)
286
+ movq %ip1, 0(%rsp)
287
+ GET_NEXT_DELT(1)
288
+
289
+ RELOAD_BITS(2)
290
+ movq %ip2, 8(%rsp)
291
+ GET_NEXT_DELT(2)
292
+
293
+ RELOAD_BITS(3)
294
+ movq %ip3, 16(%rsp)
295
+ GET_NEXT_DELT(3)
296
+
297
+ /* If op3 < olimit: continue the loop */
298
+ cmp %op3, 24(%rsp)
299
+ ja .L_4X1_loop_body
300
+
301
+ /* Reload ip[1,2,3] from stack */
302
+ movq 0(%rsp), %ip1
303
+ movq 8(%rsp), %ip2
304
+ movq 16(%rsp), %ip3
305
+
306
+ /* Re-compute olimit */
307
+ jmp .L_4X1_compute_olimit
308
+ /* Drop the 4X1 helper macros; the 4X2 routine defines its own DECODE & RELOAD_BITS. */
309
+ #undef GET_NEXT_DELT
310
+ #undef DECODE_FROM_DELT
311
+ #undef DECODE_AND_GET_NEXT
312
+ #undef RELOAD_BITS
313
+ .L_4X1_exit:
314
+ addq $24, %rsp
315
+
316
+ /* Restore stack (olimit, oend, ilimit & arg) */
317
+ pop %rax /* olimit */
318
+ pop %rax /* oend */
319
+ pop %rax /* ilimit */
320
+ pop %rax /* arg */
321
+
322
+ /* Save ip / op / bits */
323
+ movq %ip0, 0(%rax)
324
+ movq %ip1, 8(%rax)
325
+ movq %ip2, 16(%rax)
326
+ movq %ip3, 24(%rax)
327
+ movq %op0, 32(%rax)
328
+ movq %op1, 40(%rax)
329
+ movq %op2, 48(%rax)
330
+ movq %op3, 56(%rax)
331
+ movq %bits0, 64(%rax)
332
+ movq %bits1, 72(%rax)
333
+ movq %bits2, 80(%rax)
334
+ movq %bits3, 88(%rax)
335
+
336
+ /* Restore registers */
337
+ pop %r15
338
+ pop %r14
339
+ pop %r13
340
+ pop %r12
341
+ pop %r11
342
+ pop %r10
343
+ pop %r9
344
+ pop %r8
345
+ pop %rdi
346
+ pop %rsi
347
+ pop %rbp
348
+ pop %rdx
349
+ pop %rcx
350
+ pop %rbx
351
+ pop %rax
352
+ ret
353
+
354
+ _HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
355
+ HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
356
+ ZSTD_CET_ENDBRANCH
357
+ /* Save all registers - even if they are callee saved for simplicity. */
358
+ push %rax
359
+ push %rbx
360
+ push %rcx
361
+ push %rdx
362
+ push %rbp
363
+ push %rsi
364
+ push %rdi
365
+ push %r8
366
+ push %r9
367
+ push %r10
368
+ push %r11
369
+ push %r12
370
+ push %r13
371
+ push %r14
372
+ push %r15
373
+
374
+ movq %rdi, %rax
375
+ movq 0(%rax), %ip0
376
+ movq 8(%rax), %ip1
377
+ movq 16(%rax), %ip2
378
+ movq 24(%rax), %ip3
379
+ movq 32(%rax), %op0
380
+ movq 40(%rax), %op1
381
+ movq 48(%rax), %op2
382
+ movq 56(%rax), %op3
383
+ movq 64(%rax), %bits0
384
+ movq 72(%rax), %bits1
385
+ movq 80(%rax), %bits2
386
+ movq 88(%rax), %bits3
387
+ movq 96(%rax), %dtable
388
+ push %rax /* argument */
389
+ push %rax /* olimit space; written after compute_olimit */
390
+ push 104(%rax) /* ilimit */
391
+ /* oend3 is read from the args struct; oend[0,1,2] = entry values of op[1,2,3] */
392
+ movq 112(%rax), %rax
393
+ push %rax /* oend3 */
394
+
395
+ movq %op3, %rax
396
+ push %rax /* oend2 */
397
+
398
+ movq %op2, %rax
399
+ push %rax /* oend1 */
400
+
401
+ movq %op1, %rax
402
+ push %rax /* oend0 */
403
+ /* Stack offsets once the scratch slot is reserved: 0 scratch, 8 oend0, 16 oend1, 24 oend2, 32 oend3, 40 ilimit, 48 olimit, 56 argument */
404
+ /* Scratch space: 0(%rsp) holds rdx during compute_olimit & r8 during the loop body */
405
+ subq $8, %rsp
406
+
407
+ .L_4X2_compute_olimit:
408
+ /* Computes how many iterations we can do safely
409
+ * %r15, %rax may be clobbered
410
+ * rdx must be saved
411
+ * op[0,1,2,3] & ip0 mustn't be clobbered
412
+ */
413
+ movq %rdx, 0(%rsp)
414
+
415
+ /* We can consume up to 7 input bytes each iteration. */
416
+ movq %ip0, %rax /* rax = ip0 */
417
+ movq 40(%rsp), %rdx /* rdx = ilimit */
418
+ subq %rdx, %rax /* rax = ip0 - ilimit */
419
+ movq %rax, %r15 /* r15 = ip0 - ilimit */
420
+
421
+ /* rdx = rax / 7 */
422
+ movabsq $2635249153387078803, %rdx
423
+ mulq %rdx
424
+ subq %rdx, %r15
425
+ shrq %r15
426
+ addq %r15, %rdx
427
+ shrq $2, %rdx
428
+
429
+ /* r15 = (ip0 - ilimit) / 7 */
430
+ movq %rdx, %r15
431
+
432
+ /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
433
+ movq 8(%rsp), %rax /* rax = oend0 */
434
+ subq %op0, %rax /* rax = oend0 - op0 */
435
+ movq 16(%rsp), %rdx /* rdx = oend1 */
436
+ subq %op1, %rdx /* rdx = oend1 - op1 */
437
+
438
+ cmpq %rax, %rdx
439
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
440
+
441
+ movq 24(%rsp), %rax /* rax = oend2 */
442
+ subq %op2, %rax /* rax = oend2 - op2 */
443
+
444
+ cmpq %rax, %rdx
445
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
446
+
447
+ movq 32(%rsp), %rax /* rax = oend3 */
448
+ subq %op3, %rax /* rax = oend3 - op3 */
449
+
450
+ cmpq %rax, %rdx
451
+ cmova %rax, %rdx /* rdx = min(%rdx, %rax) */
452
+
453
+ movabsq $-3689348814741910323, %rax
454
+ mulq %rdx
455
+ shrq $3, %rdx /* rdx = rdx / 10 */
456
+
457
+ /* r15 = min(%rdx, %r15) */
458
+ cmpq %rdx, %r15
459
+ cmova %rdx, %r15
460
+
461
+ /* olimit = op3 + 5 * r15 */
462
+ movq %r15, %rax
463
+ leaq (%op3, %rax, 4), %olimit
464
+ addq %rax, %olimit
465
+
466
+ movq 0(%rsp), %rdx
467
+
468
+ /* If (op3 + 10 > olimit) go to exit */
469
+ movq %op3, %rax /* rax = op3 */
470
+ addq $10, %rax /* rax = op3 + 10 */
471
+ cmpq %rax, %olimit /* op3 + 10 > olimit */
472
+ jb .L_4X2_exit
473
+
474
+ /* If (ip1 < ip0) go to exit */
475
+ cmpq %ip0, %ip1
476
+ jb .L_4X2_exit
477
+
478
+ /* If (ip2 < ip1) go to exit */
479
+ cmpq %ip1, %ip2
480
+ jb .L_4X2_exit
481
+
482
+ /* If (ip3 < ip2) go to exit */
483
+ cmpq %ip2, %ip3
484
+ jb .L_4X2_exit
485
+ /* DECODE(n, idx): rax = top 11 bits of bits[n]; reads the 4-byte DTable entry at index rax, stores its low 2 bytes at op[n], shifts bits[n] left by the entry's byte 2 & advances op[n] by the entry's byte 3. idx is unused. Clobbers rax, r8 & r15. */
486
+ #define DECODE(n, idx) \
487
+ movq %bits##n, %rax; \
488
+ shrq $53, %rax; \
489
+ movzwl 0(%dtable,%rax,4),%r8d; \
490
+ movzbl 2(%dtable,%rax,4),%r15d; \
491
+ movzbl 3(%dtable,%rax,4),%eax; \
492
+ movw %r8w, (%op##n); \
493
+ shlxq %r15, %bits##n, %bits##n; \
494
+ addq %rax, %op##n
495
+ /* RELOAD_BITS(n): ctz = index of the lowest set bit of bits[n]; ip[n] -= ctz >> 3; bits[n] = (8 bytes loaded from ip[n] | 1) << (ctz & 7). op[n] is left unchanged. Clobbers rax. */
496
+ #define RELOAD_BITS(n) \
497
+ bsfq %bits##n, %bits##n; \
498
+ movq %bits##n, %rax; \
499
+ shrq $3, %bits##n; \
500
+ andq $7, %rax; \
501
+ subq %bits##n, %ip##n; \
502
+ movq (%ip##n), %bits##n; \
503
+ orq $1, %bits##n; \
504
+ shlxq %rax, %bits##n, %bits##n
505
+
506
+
507
+ movq %olimit, 48(%rsp)
508
+
509
+ .p2align 6
510
+
511
+ .L_4X2_loop_body:
512
+ /* We clobber r8, so store it on the stack */
513
+ movq %r8, 0(%rsp)
514
+
515
+ /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
516
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
517
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
518
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
519
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
520
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
521
+
522
+ /* Reload r8 */
523
+ movq 0(%rsp), %r8
524
+
525
+ FOR_EACH_STREAM(RELOAD_BITS)
526
+
527
+ cmp %op3, 48(%rsp)
528
+ ja .L_4X2_loop_body
529
+ jmp .L_4X2_compute_olimit
530
+
531
+ #undef DECODE
532
+ #undef RELOAD_BITS
533
+ .L_4X2_exit:
534
+ addq $8, %rsp
535
+ /* Restore stack (oend[0-3], ilimit, olimit & arg) */
536
+ pop %rax /* oend0 */
537
+ pop %rax /* oend1 */
538
+ pop %rax /* oend2 */
539
+ pop %rax /* oend3 */
540
+ pop %rax /* ilimit */
541
+ pop %rax /* olimit */
542
+ pop %rax /* arg */
543
+
544
+ /* Save ip / op / bits */
545
+ movq %ip0, 0(%rax)
546
+ movq %ip1, 8(%rax)
547
+ movq %ip2, 16(%rax)
548
+ movq %ip3, 24(%rax)
549
+ movq %op0, 32(%rax)
550
+ movq %op1, 40(%rax)
551
+ movq %op2, 48(%rax)
552
+ movq %op3, 56(%rax)
553
+ movq %bits0, 64(%rax)
554
+ movq %bits1, 72(%rax)
555
+ movq %bits2, 80(%rax)
556
+ movq %bits3, 88(%rax)
557
+
558
+ /* Restore registers */
559
+ pop %r15
560
+ pop %r14
561
+ pop %r13
562
+ pop %r12
563
+ pop %r11
564
+ pop %r10
565
+ pop %r9
566
+ pop %r8
567
+ pop %rdi
568
+ pop %rsi
569
+ pop %rbp
570
+ pop %rdx
571
+ pop %rcx
572
+ pop %rbx
573
+ pop %rax
574
+ ret
575
+
576
+ #endif
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the
@@ -14,12 +14,12 @@
14
14
  /*-*******************************************************
15
15
  * Dependencies
16
16
  *********************************************************/
17
+ #include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */
17
18
  #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */
18
19
  #include "../common/cpu.h" /* bmi2 */
19
20
  #include "../common/mem.h" /* low level memory routines */
20
21
  #define FSE_STATIC_LINKING_ONLY
21
22
  #include "../common/fse.h"
22
- #define HUF_STATIC_LINKING_ONLY
23
23
  #include "../common/huf.h"
24
24
  #include "zstd_decompress_internal.h"
25
25
  #include "zstd_ddict.h"
@@ -134,7 +134,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
134
134
  ZSTD_memcpy(internalBuffer, dict, dictSize);
135
135
  }
136
136
  ddict->dictSize = dictSize;
137
- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
137
+ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */
138
138
 
139
139
  /* parse dictionary content */
140
140
  FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
@@ -240,5 +240,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
240
240
  unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
241
241
  {
242
242
  if (ddict==NULL) return 0;
243
- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
243
+ return ddict->dictID;
244
244
  }
@@ -1,5 +1,5 @@
1
1
  /*
2
- * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
2
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
3
3
  * All rights reserved.
4
4
  *
5
5
  * This source code is licensed under both the BSD-style license (found in the