zstd-ruby 1.5.0.0 → 1.5.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +2 -1
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/pool.c +11 -6
  16. data/ext/zstdruby/libzstd/common/pool.h +2 -2
  17. data/ext/zstdruby/libzstd/common/portability_macros.h +137 -0
  18. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  19. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  20. data/ext/zstdruby/libzstd/common/zstd_internal.h +95 -92
  21. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  22. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  23. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  24. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  25. data/ext/zstdruby/libzstd/compress/zstd_compress.c +307 -373
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +174 -83
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  29. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +15 -14
  30. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +4 -3
  31. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +41 -27
  32. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +295 -120
  33. data/ext/zstdruby/libzstd/compress/zstd_fast.c +309 -130
  34. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +482 -562
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +9 -7
  36. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  37. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  38. data/ext/zstdruby/libzstd/compress/zstd_opt.c +249 -148
  39. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +76 -38
  40. data/ext/zstdruby/libzstd/compress/zstdmt_compress.h +4 -1
  41. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  42. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +585 -0
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  44. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  45. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  46. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  47. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  48. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  49. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  50. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +101 -30
  51. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  52. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  55. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  56. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  57. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  58. data/ext/zstdruby/libzstd/libzstd.mk +203 -0
  59. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  60. data/ext/zstdruby/libzstd/module.modulemap +25 -0
  61. data/ext/zstdruby/libzstd/zdict.h +4 -4
  62. data/ext/zstdruby/libzstd/zstd.h +179 -136
  63. data/ext/zstdruby/zstdruby.c +2 -2
  64. data/lib/zstd-ruby/version.rb +1 -1
  65. metadata +11 -6
@@ -0,0 +1,585 @@
1
+ /*
2
+ * Copyright (c) Facebook, Inc.
3
+ * All rights reserved.
4
+ *
5
+ * This source code is licensed under both the BSD-style license (found in the
6
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7
+ * in the COPYING file in the root directory of this source tree).
8
+ * You may select, at your option, one of the above-listed licenses.
9
+ */
10
+
11
+ #include "../common/portability_macros.h"
12
+
13
+ /* Stack marking
14
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
15
+ */
16
+ #if defined(__ELF__) && defined(__GNUC__)
17
+ .section .note.GNU-stack,"",%progbits
18
+ #endif
19
+
20
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
21
+
22
+ /* Calling convention:
23
+ *
24
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
25
+ * %rbp isn't maintained (no frame pointer).
26
+ * %rsp contains the stack pointer that grows down.
27
+ * No red-zone is assumed, only addresses >= %rsp are used.
28
+ * All register contents are preserved.
29
+ *
30
+ * TODO: Support Windows calling convention.
31
+ */
32
+
33
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
34
+ ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
35
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop)
36
+ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop)
37
+ .global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
38
+ .global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
39
+ .global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
40
+ .global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
41
+ .text
42
+
43
+ /* Sets up register mappings for clarity.
44
+ * op[], bits[], dtable & ip[0] each get their own register.
45
+ * ip[1,2,3] & olimit alias var[].
46
+ * %rax is a scratch register.
47
+ */
48
+
49
+ #define op0 rsi
50
+ #define op1 rbx
51
+ #define op2 rcx
52
+ #define op3 rdi
53
+
54
+ #define ip0 r8
55
+ #define ip1 r9
56
+ #define ip2 r10
57
+ #define ip3 r11
58
+
59
+ #define bits0 rbp
60
+ #define bits1 rdx
61
+ #define bits2 r12
62
+ #define bits3 r13
63
+ #define dtable r14
64
+ #define olimit r15
65
+
66
+ /* var[] aliases ip[1,2,3] & olimit
67
+ * ip[1,2,3] are saved every iteration.
68
+ * olimit is only used in compute_olimit.
69
+ */
70
+ #define var0 r15
71
+ #define var1 r9
72
+ #define var2 r10
73
+ #define var3 r11
74
+
75
+ /* 32-bit var registers */
76
+ #define vard0 r15d
77
+ #define vard1 r9d
78
+ #define vard2 r10d
79
+ #define vard3 r11d
80
+
81
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
82
+ #define FOR_EACH_STREAM(X) \
83
+ X(0); \
84
+ X(1); \
85
+ X(2); \
86
+ X(3)
87
+
88
+ /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
89
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
90
+ X(0, idx); \
91
+ X(1, idx); \
92
+ X(2, idx); \
93
+ X(3, idx)
94
+
95
+ /* Define both _HUF_* & HUF_* symbols because MacOS
96
+ * C symbols are prefixed with '_' & Linux symbols aren't.
97
+ */
98
+ _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
99
+ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
100
+ /* Save all registers - even if they are callee saved for simplicity. */
101
+ push %rax
102
+ push %rbx
103
+ push %rcx
104
+ push %rdx
105
+ push %rbp
106
+ push %rsi
107
+ push %rdi
108
+ push %r8
109
+ push %r9
110
+ push %r10
111
+ push %r11
112
+ push %r12
113
+ push %r13
114
+ push %r14
115
+ push %r15
116
+
117
+ /* Read HUF_DecompressAsmArgs* args from %rax */
118
+ movq %rdi, %rax
119
+ movq 0(%rax), %ip0
120
+ movq 8(%rax), %ip1
121
+ movq 16(%rax), %ip2
122
+ movq 24(%rax), %ip3
123
+ movq 32(%rax), %op0
124
+ movq 40(%rax), %op1
125
+ movq 48(%rax), %op2
126
+ movq 56(%rax), %op3
127
+ movq 64(%rax), %bits0
128
+ movq 72(%rax), %bits1
129
+ movq 80(%rax), %bits2
130
+ movq 88(%rax), %bits3
131
+ movq 96(%rax), %dtable
132
+ push %rax /* argument */
133
+ push 104(%rax) /* ilimit */
134
+ push 112(%rax) /* oend */
135
+ push %olimit /* olimit space */
136
+
137
+ subq $24, %rsp
138
+
139
+ .L_4X1_compute_olimit:
140
+ /* Computes how many iterations we can do safely
141
+ * %r15, %rax may be clobbered
142
+ * rbx, rdx must be saved
143
+ * op3 & ip0 mustn't be clobbered
144
+ */
145
+ movq %rbx, 0(%rsp)
146
+ movq %rdx, 8(%rsp)
147
+
148
+ movq 32(%rsp), %rax /* rax = oend */
149
+ subq %op3, %rax /* rax = oend - op3 */
150
+
151
+ /* r15 = (oend - op3) / 5 */
152
+ movabsq $-3689348814741910323, %rdx
153
+ mulq %rdx
154
+ movq %rdx, %r15
155
+ shrq $2, %r15
156
+
157
+ movq %ip0, %rax /* rax = ip0 */
158
+ movq 40(%rsp), %rdx /* rdx = ilimit */
159
+ subq %rdx, %rax /* rax = ip0 - ilimit */
160
+ movq %rax, %rbx /* rbx = ip0 - ilimit */
161
+
162
+ /* rdx = (ip0 - ilimit) / 7 */
163
+ movabsq $2635249153387078803, %rdx
164
+ mulq %rdx
165
+ subq %rdx, %rbx
166
+ shrq %rbx
167
+ addq %rbx, %rdx
168
+ shrq $2, %rdx
169
+
170
+ /* r15 = min(%rdx, %r15) */
171
+ cmpq %rdx, %r15
172
+ cmova %rdx, %r15
173
+
174
+ /* r15 = r15 * 5 */
175
+ leaq (%r15, %r15, 4), %r15
176
+
177
+ /* olimit = op3 + r15 */
178
+ addq %op3, %olimit
179
+
180
+ movq 8(%rsp), %rdx
181
+ movq 0(%rsp), %rbx
182
+
183
+ /* If (op3 + 20 > olimit) */
184
+ movq %op3, %rax /* rax = op3 */
185
+ addq $20, %rax /* rax = op3 + 20 */
186
+ cmpq %rax, %olimit /* op3 + 20 > olimit */
187
+ jb .L_4X1_exit
188
+
189
+ /* If (ip1 < ip0) go to exit */
190
+ cmpq %ip0, %ip1
191
+ jb .L_4X1_exit
192
+
193
+ /* If (ip2 < ip1) go to exit */
194
+ cmpq %ip1, %ip2
195
+ jb .L_4X1_exit
196
+
197
+ /* If (ip3 < ip2) go to exit */
198
+ cmpq %ip2, %ip3
199
+ jb .L_4X1_exit
200
+
201
+ /* Reads top 11 bits from bits[n]
202
+ * Loads dt[bits[n]] into var[n]
203
+ */
204
+ #define GET_NEXT_DELT(n) \
205
+ movq $53, %var##n; \
206
+ shrxq %var##n, %bits##n, %var##n; \
207
+ movzwl (%dtable,%var##n,2),%vard##n
208
+
209
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
210
+ * Moves var[n] to %rax
211
+ * bits[n] <<= var[n] & 63
212
+ * op[n][idx] = %rax >> 8
213
+ * %ah is a way to access bits [8, 16) of %rax
214
+ */
215
+ #define DECODE_FROM_DELT(n, idx) \
216
+ movq %var##n, %rax; \
217
+ shlxq %var##n, %bits##n, %bits##n; \
218
+ movb %ah, idx(%op##n)
219
+
220
+ /* Assumes GET_NEXT_DELT has been called.
221
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
222
+ */
223
+ #define DECODE_AND_GET_NEXT(n, idx) \
224
+ DECODE_FROM_DELT(n, idx); \
225
+ GET_NEXT_DELT(n) \
226
+
227
+ /* // ctz & nbBytes is stored in bits[n]
228
+ * // nbBits is stored in %rax
229
+ * ctz = CTZ[bits[n]]
230
+ * nbBits = ctz & 7
231
+ * nbBytes = ctz >> 3
232
+ * op[n] += 5
233
+ * ip[n] -= nbBytes
234
+ * // Note: x86-64 is little-endian ==> no bswap
235
+ * bits[n] = MEM_readST(ip[n]) | 1
236
+ * bits[n] <<= nbBits
237
+ */
238
+ #define RELOAD_BITS(n) \
239
+ bsfq %bits##n, %bits##n; \
240
+ movq %bits##n, %rax; \
241
+ andq $7, %rax; \
242
+ shrq $3, %bits##n; \
243
+ leaq 5(%op##n), %op##n; \
244
+ subq %bits##n, %ip##n; \
245
+ movq (%ip##n), %bits##n; \
246
+ orq $1, %bits##n; \
247
+ shlx %rax, %bits##n, %bits##n
248
+
249
+ /* Store clobbered variables on the stack */
250
+ movq %olimit, 24(%rsp)
251
+ movq %ip1, 0(%rsp)
252
+ movq %ip2, 8(%rsp)
253
+ movq %ip3, 16(%rsp)
254
+
255
+ /* Call GET_NEXT_DELT for each stream */
256
+ FOR_EACH_STREAM(GET_NEXT_DELT)
257
+
258
+ .p2align 6
259
+
260
+ .L_4X1_loop_body:
261
+ /* Decode 5 symbols in each of the 4 streams (20 total)
262
+ * Must have called GET_NEXT_DELT for each stream
263
+ */
264
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
265
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
266
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
267
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
268
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
269
+
270
+ /* Load ip[1,2,3] from stack (var[] aliases them)
271
+ * ip[] is needed for RELOAD_BITS
272
+ * Each will be stored back to the stack after RELOAD
273
+ */
274
+ movq 0(%rsp), %ip1
275
+ movq 8(%rsp), %ip2
276
+ movq 16(%rsp), %ip3
277
+
278
+ /* Reload each stream & fetch the next table entry
279
+ * to prepare for the next iteration
280
+ */
281
+ RELOAD_BITS(0)
282
+ GET_NEXT_DELT(0)
283
+
284
+ RELOAD_BITS(1)
285
+ movq %ip1, 0(%rsp)
286
+ GET_NEXT_DELT(1)
287
+
288
+ RELOAD_BITS(2)
289
+ movq %ip2, 8(%rsp)
290
+ GET_NEXT_DELT(2)
291
+
292
+ RELOAD_BITS(3)
293
+ movq %ip3, 16(%rsp)
294
+ GET_NEXT_DELT(3)
295
+
296
+ /* If op3 < olimit: continue the loop */
297
+ cmp %op3, 24(%rsp)
298
+ ja .L_4X1_loop_body
299
+
300
+ /* Reload ip[1,2,3] from stack */
301
+ movq 0(%rsp), %ip1
302
+ movq 8(%rsp), %ip2
303
+ movq 16(%rsp), %ip3
304
+
305
+ /* Re-compute olimit */
306
+ jmp .L_4X1_compute_olimit
307
+
308
+ #undef GET_NEXT_DELT
309
+ #undef DECODE_FROM_DELT
310
+ #undef DECODE
311
+ #undef RELOAD_BITS
312
+ .L_4X1_exit:
313
+ addq $24, %rsp
314
+
315
+ /* Restore stack (oend & olimit) */
316
+ pop %rax /* olimit */
317
+ pop %rax /* oend */
318
+ pop %rax /* ilimit */
319
+ pop %rax /* arg */
320
+
321
+ /* Save ip / op / bits */
322
+ movq %ip0, 0(%rax)
323
+ movq %ip1, 8(%rax)
324
+ movq %ip2, 16(%rax)
325
+ movq %ip3, 24(%rax)
326
+ movq %op0, 32(%rax)
327
+ movq %op1, 40(%rax)
328
+ movq %op2, 48(%rax)
329
+ movq %op3, 56(%rax)
330
+ movq %bits0, 64(%rax)
331
+ movq %bits1, 72(%rax)
332
+ movq %bits2, 80(%rax)
333
+ movq %bits3, 88(%rax)
334
+
335
+ /* Restore registers */
336
+ pop %r15
337
+ pop %r14
338
+ pop %r13
339
+ pop %r12
340
+ pop %r11
341
+ pop %r10
342
+ pop %r9
343
+ pop %r8
344
+ pop %rdi
345
+ pop %rsi
346
+ pop %rbp
347
+ pop %rdx
348
+ pop %rcx
349
+ pop %rbx
350
+ pop %rax
351
+ ret
352
+
353
+ _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
354
+ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
355
+ /* Save all registers - even if they are callee saved for simplicity. */
356
+ push %rax
357
+ push %rbx
358
+ push %rcx
359
+ push %rdx
360
+ push %rbp
361
+ push %rsi
362
+ push %rdi
363
+ push %r8
364
+ push %r9
365
+ push %r10
366
+ push %r11
367
+ push %r12
368
+ push %r13
369
+ push %r14
370
+ push %r15
371
+
372
+ movq %rdi, %rax
373
+ movq 0(%rax), %ip0
374
+ movq 8(%rax), %ip1
375
+ movq 16(%rax), %ip2
376
+ movq 24(%rax), %ip3
377
+ movq 32(%rax), %op0
378
+ movq 40(%rax), %op1
379
+ movq 48(%rax), %op2
380
+ movq 56(%rax), %op3
381
+ movq 64(%rax), %bits0
382
+ movq 72(%rax), %bits1
383
+ movq 80(%rax), %bits2
384
+ movq 88(%rax), %bits3
385
+ movq 96(%rax), %dtable
386
+ push %rax /* argument */
387
+ push %rax /* olimit */
388
+ push 104(%rax) /* ilimit */
389
+
390
+ movq 112(%rax), %rax
391
+ push %rax /* oend3 */
392
+
393
+ movq %op3, %rax
394
+ push %rax /* oend2 */
395
+
396
+ movq %op2, %rax
397
+ push %rax /* oend1 */
398
+
399
+ movq %op1, %rax
400
+ push %rax /* oend0 */
401
+
402
+ /* Scratch space */
403
+ subq $8, %rsp
404
+
405
+ .L_4X2_compute_olimit:
406
+ /* Computes how many iterations we can do safely
407
+ * %r15, %rax may be clobbered
408
+ * rdx must be saved
409
+ * op[1,2,3,4] & ip0 mustn't be clobbered
410
+ */
411
+ movq %rdx, 0(%rsp)
412
+
413
+ /* We can consume up to 7 input bytes each iteration. */
414
+ movq %ip0, %rax /* rax = ip0 */
415
+ movq 40(%rsp), %rdx /* rdx = ilimit */
416
+ subq %rdx, %rax /* rax = ip0 - ilimit */
417
+ movq %rax, %r15 /* r15 = ip0 - ilimit */
418
+
419
+ /* rdx = rax / 7 */
420
+ movabsq $2635249153387078803, %rdx
421
+ mulq %rdx
422
+ subq %rdx, %r15
423
+ shrq %r15
424
+ addq %r15, %rdx
425
+ shrq $2, %rdx
426
+
427
+ /* r15 = (ip0 - ilimit) / 7 */
428
+ movq %rdx, %r15
429
+
430
+ movabsq $-3689348814741910323, %rdx
431
+ movq 8(%rsp), %rax /* rax = oend0 */
432
+ subq %op0, %rax /* rax = oend0 - op0 */
433
+ mulq %rdx
434
+ shrq $3, %rdx /* rdx = rax / 10 */
435
+
436
+ /* r15 = min(%rdx, %r15) */
437
+ cmpq %rdx, %r15
438
+ cmova %rdx, %r15
439
+
440
+ movabsq $-3689348814741910323, %rdx
441
+ movq 16(%rsp), %rax /* rax = oend1 */
442
+ subq %op1, %rax /* rax = oend1 - op1 */
443
+ mulq %rdx
444
+ shrq $3, %rdx /* rdx = rax / 10 */
445
+
446
+ /* r15 = min(%rdx, %r15) */
447
+ cmpq %rdx, %r15
448
+ cmova %rdx, %r15
449
+
450
+ movabsq $-3689348814741910323, %rdx
451
+ movq 24(%rsp), %rax /* rax = oend2 */
452
+ subq %op2, %rax /* rax = oend2 - op2 */
453
+ mulq %rdx
454
+ shrq $3, %rdx /* rdx = rax / 10 */
455
+
456
+ /* r15 = min(%rdx, %r15) */
457
+ cmpq %rdx, %r15
458
+ cmova %rdx, %r15
459
+
460
+ movabsq $-3689348814741910323, %rdx
461
+ movq 32(%rsp), %rax /* rax = oend3 */
462
+ subq %op3, %rax /* rax = oend3 - op3 */
463
+ mulq %rdx
464
+ shrq $3, %rdx /* rdx = rax / 10 */
465
+
466
+ /* r15 = min(%rdx, %r15) */
467
+ cmpq %rdx, %r15
468
+ cmova %rdx, %r15
469
+
470
+ /* olimit = op3 + 5 * r15 */
471
+ movq %r15, %rax
472
+ leaq (%op3, %rax, 4), %olimit
473
+ addq %rax, %olimit
474
+
475
+ movq 0(%rsp), %rdx
476
+
477
+ /* If (op3 + 10 > olimit) */
478
+ movq %op3, %rax /* rax = op3 */
479
+ addq $10, %rax /* rax = op3 + 10 */
480
+ cmpq %rax, %olimit /* op3 + 10 > olimit */
481
+ jb .L_4X2_exit
482
+
483
+ /* If (ip1 < ip0) go to exit */
484
+ cmpq %ip0, %ip1
485
+ jb .L_4X2_exit
486
+
487
+ /* If (ip2 < ip1) go to exit */
488
+ cmpq %ip1, %ip2
489
+ jb .L_4X2_exit
490
+
491
+ /* If (ip3 < ip2) go to exit */
492
+ cmpq %ip2, %ip3
493
+ jb .L_4X2_exit
494
+
495
+ #define DECODE(n, idx) \
496
+ movq %bits##n, %rax; \
497
+ shrq $53, %rax; \
498
+ movzwl 0(%dtable,%rax,4),%r8d; \
499
+ movzbl 2(%dtable,%rax,4),%r15d; \
500
+ movzbl 3(%dtable,%rax,4),%eax; \
501
+ movw %r8w, (%op##n); \
502
+ shlxq %r15, %bits##n, %bits##n; \
503
+ addq %rax, %op##n
504
+
505
+ #define RELOAD_BITS(n) \
506
+ bsfq %bits##n, %bits##n; \
507
+ movq %bits##n, %rax; \
508
+ shrq $3, %bits##n; \
509
+ andq $7, %rax; \
510
+ subq %bits##n, %ip##n; \
511
+ movq (%ip##n), %bits##n; \
512
+ orq $1, %bits##n; \
513
+ shlxq %rax, %bits##n, %bits##n
514
+
515
+
516
+ movq %olimit, 48(%rsp)
517
+
518
+ .p2align 6
519
+
520
+ .L_4X2_loop_body:
521
+ /* We clobber r8, so store it on the stack */
522
+ movq %r8, 0(%rsp)
523
+
524
+ /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
525
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
526
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
527
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
528
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
529
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
530
+
531
+ /* Reload r8 */
532
+ movq 0(%rsp), %r8
533
+
534
+ FOR_EACH_STREAM(RELOAD_BITS)
535
+
536
+ cmp %op3, 48(%rsp)
537
+ ja .L_4X2_loop_body
538
+ jmp .L_4X2_compute_olimit
539
+
540
+ #undef DECODE
541
+ #undef RELOAD_BITS
542
+ .L_4X2_exit:
543
+ addq $8, %rsp
544
+ /* Restore stack (oend & olimit) */
545
+ pop %rax /* oend0 */
546
+ pop %rax /* oend1 */
547
+ pop %rax /* oend2 */
548
+ pop %rax /* oend3 */
549
+ pop %rax /* ilimit */
550
+ pop %rax /* olimit */
551
+ pop %rax /* arg */
552
+
553
+ /* Save ip / op / bits */
554
+ movq %ip0, 0(%rax)
555
+ movq %ip1, 8(%rax)
556
+ movq %ip2, 16(%rax)
557
+ movq %ip3, 24(%rax)
558
+ movq %op0, 32(%rax)
559
+ movq %op1, 40(%rax)
560
+ movq %op2, 48(%rax)
561
+ movq %op3, 56(%rax)
562
+ movq %bits0, 64(%rax)
563
+ movq %bits1, 72(%rax)
564
+ movq %bits2, 80(%rax)
565
+ movq %bits3, 88(%rax)
566
+
567
+ /* Restore registers */
568
+ pop %r15
569
+ pop %r14
570
+ pop %r13
571
+ pop %r12
572
+ pop %r11
573
+ pop %r10
574
+ pop %r9
575
+ pop %r8
576
+ pop %rdi
577
+ pop %rsi
578
+ pop %rbp
579
+ pop %rdx
580
+ pop %rcx
581
+ pop %rbx
582
+ pop %rax
583
+ ret
584
+
585
+ #endif