zstd-ruby 1.5.0.0 → 1.5.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +1 -0
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  16. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  17. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  18. data/ext/zstdruby/libzstd/common/zstd_internal.h +92 -88
  19. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  20. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  21. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  22. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  23. data/ext/zstdruby/libzstd/compress/zstd_compress.c +194 -278
  24. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +102 -44
  25. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +5 -4
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +3 -2
  29. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +3 -3
  30. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +289 -114
  31. data/ext/zstdruby/libzstd/compress/zstd_fast.c +302 -123
  32. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +418 -502
  33. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +4 -4
  34. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  36. data/ext/zstdruby/libzstd/compress/zstd_opt.c +186 -108
  37. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +59 -29
  38. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  39. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  40. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  41. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  42. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  44. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  45. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  46. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  47. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +99 -28
  48. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  49. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  50. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  51. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  52. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  55. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  56. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  57. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  58. data/ext/zstdruby/libzstd/zdict.h +4 -4
  59. data/ext/zstdruby/libzstd/zstd.h +179 -136
  60. data/ext/zstdruby/zstdruby.c +2 -2
  61. data/lib/zstd-ruby/version.rb +1 -1
  62. metadata +8 -3
@@ -0,0 +1,571 @@
1
+ #include "../common/portability_macros.h"
2
+
3
+ /* Stack marking
4
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
5
+ * Emitted before the ZSTD_ENABLE_ASM_X86_64_BMI2 guard so the object carries the non-executable-stack note even when the code below is compiled out. */
6
+ #if defined(__linux__) && defined(__ELF__)
7
+ .section .note.GNU-stack,"",%progbits
8
+ #endif
9
+
10
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
11
+
12
+ /* Calling convention:
13
+ *
14
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
15
+ * %rbp isn't maintained as a frame pointer; it is used as a general register (bits0).
16
+ * %rsp contains the stack pointer that grows down.
17
+ * No red-zone is assumed, only addresses >= %rsp are used.
18
+ * All register contents are preserved; results are written back through the argument struct.
19
+ *
20
+ * TODO: Support Windows calling convention.
21
+ */
22
+
23
+ .global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
24
+ .global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
25
+ .global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
26
+ .global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
27
+ .text
28
+
29
+ /* Sets up register mappings for clarity.
30
+ * op[], bits[], dtable & ip[0] each get their own register.
31
+ * ip[1,2,3] & olimit alias var[] (same physical registers: r9, r10, r11, r15).
32
+ * %rax is a scratch register.
33
+ */
34
+
35
+ #define op0 rsi
36
+ #define op1 rbx
37
+ #define op2 rcx
38
+ #define op3 rdi
39
+
40
+ #define ip0 r8
41
+ #define ip1 r9
42
+ #define ip2 r10
43
+ #define ip3 r11
44
+
45
+ #define bits0 rbp
46
+ #define bits1 rdx
47
+ #define bits2 r12
48
+ #define bits3 r13
49
+ #define dtable r14
50
+ #define olimit r15
51
+
52
+ /* var[] aliases ip[1,2,3] & olimit
53
+ * ip[1,2,3] are saved to the stack every iteration of the 4X1 loop.
54
+ * olimit is only held in a register during compute_olimit; the loops compare against its stack slot.
55
+ */
56
+ #define var0 r15
57
+ #define var1 r9
58
+ #define var2 r10
59
+ #define var3 r11
60
+
61
+ /* 32-bit views of the var registers */
62
+ #define vard0 r15d
63
+ #define vard1 r9d
64
+ #define vard2 r10d
65
+ #define vard3 r11d
66
+
67
+ /* Expands X(N) once for each stream N = 0, 1, 2, 3. */
68
+ #define FOR_EACH_STREAM(X) \
69
+ X(0); \
70
+ X(1); \
71
+ X(2); \
72
+ X(3)
73
+
74
+ /* Expands X(N, idx) once for each stream N = 0, 1, 2, 3, with the same idx. */
75
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
76
+ X(0, idx); \
77
+ X(1, idx); \
78
+ X(2, idx); \
79
+ X(3, idx)
80
+
81
+ /* Define both _HUF_* & HUF_* symbols because macOS
82
+ * C symbols are prefixed with '_' & Linux symbols aren't.
83
+ */
84
+ _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
85
+ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
86
+ /* Save all general-purpose registers, caller-saved ones included, for simplicity. */
87
+ push %rax
88
+ push %rbx
89
+ push %rcx
90
+ push %rdx
91
+ push %rbp
92
+ push %rsi
93
+ push %rdi
94
+ push %r8
95
+ push %r9
96
+ push %r10
97
+ push %r11
98
+ push %r12
99
+ push %r13
100
+ push %r14
101
+ push %r15
102
+
103
+ /* Copy the HUF_DecompressAsmArgs* from %rdi to %rax, then load ip[0..3], op[0..3], bits[0..3] and dtable from it */
104
+ movq %rdi, %rax
105
+ movq 0(%rax), %ip0
106
+ movq 8(%rax), %ip1
107
+ movq 16(%rax), %ip2
108
+ movq 24(%rax), %ip3
109
+ movq 32(%rax), %op0
110
+ movq 40(%rax), %op1
111
+ movq 48(%rax), %op2
112
+ movq 56(%rax), %op3
113
+ movq 64(%rax), %bits0
114
+ movq 72(%rax), %bits1
115
+ movq 80(%rax), %bits2
116
+ movq 88(%rax), %bits3
117
+ movq 96(%rax), %dtable
118
+ push %rax /* argument pointer, popped again at exit */
119
+ push 104(%rax) /* ilimit */
120
+ push 112(%rax) /* oend */
121
+ push %olimit /* reserves the olimit slot; it is overwritten before the loop reads it */
122
+
123
+ subq $24, %rsp /* scratch at 0/8/16(%rsp); olimit = 24, oend = 32, ilimit = 40, argument = 48 */
124
+
125
+ .L_4X1_compute_olimit:
126
+ /* Computes how many iterations we can do safely
127
+ * %r15, %rax may be clobbered
128
+ * rbx, rdx must be saved (they hold op1 and bits1)
129
+ * op3 & ip0 mustn't be clobbered
130
+ */
131
+ movq %rbx, 0(%rsp)
132
+ movq %rdx, 8(%rsp)
133
+
134
+ movq 32(%rsp), %rax /* rax = oend */
135
+ subq %op3, %rax /* rax = oend - op3 */
136
+
137
+ /* r15 = (oend - op3) / 5 */
138
+ movabsq $-3689348814741910323, %rdx /* 0xCCCCCCCCCCCCCCCD: multiply, then high half >> 2 gives rax / 5 */
139
+ mulq %rdx
140
+ movq %rdx, %r15
141
+ shrq $2, %r15
142
+
143
+ movq %ip0, %rax /* rax = ip0 */
144
+ movq 40(%rsp), %rdx /* rdx = ilimit */
145
+ subq %rdx, %rax /* rax = ip0 - ilimit */
146
+ movq %rax, %rbx /* rbx = ip0 - ilimit */
147
+
148
+ /* rdx = (ip0 - ilimit) / 7 */
149
+ movabsq $2635249153387078803, %rdx /* 0x2492492492492493: reciprocal multiplier for the unsigned divide by 7 */
150
+ mulq %rdx
151
+ subq %rdx, %rbx
152
+ shrq %rbx
153
+ addq %rbx, %rdx
154
+ shrq $2, %rdx
155
+
156
+ /* r15 = min(%rdx, %r15), unsigned */
157
+ cmpq %rdx, %r15
158
+ cmova %rdx, %r15
159
+
160
+ /* r15 = r15 * 5 */
161
+ leaq (%r15, %r15, 4), %r15
162
+
163
+ /* olimit = op3 + r15 (olimit is %r15 itself) */
164
+ addq %op3, %olimit
165
+
166
+ movq 8(%rsp), %rdx
167
+ movq 0(%rsp), %rbx
168
+
169
+ /* If (op3 + 20 > olimit) go to exit */
170
+ movq %op3, %rax /* rax = op3 */
171
+ addq $20, %rax /* rax = op3 + 20 */
172
+ cmpq %rax, %olimit /* op3 + 20 > olimit */
173
+ jb .L_4X1_exit
174
+
175
+ /* If (ip1 < ip0) go to exit */
176
+ cmpq %ip0, %ip1
177
+ jb .L_4X1_exit
178
+
179
+ /* If (ip2 < ip1) go to exit */
180
+ cmpq %ip1, %ip2
181
+ jb .L_4X1_exit
182
+
183
+ /* If (ip3 < ip2) go to exit */
184
+ cmpq %ip2, %ip3
185
+ jb .L_4X1_exit
186
+
187
+ /* Reads top 11 bits from bits[n] (bits[n] >> 53)
188
+ * Loads the 16-bit entry dt[bits[n] >> 53] into var[n], zero-extended
189
+ */
190
+ #define GET_NEXT_DELT(n) \
191
+ movq $53, %var##n; \
192
+ shrxq %var##n, %bits##n, %var##n; \
193
+ movzwl (%dtable,%var##n,2),%vard##n
194
+
195
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
196
+ * Moves var[n] to %rax
197
+ * bits[n] <<= var[n] & 63
198
+ * op[n][idx] = %rax >> 8
199
+ * %ah is a way to access bits [8, 16) of %rax
200
+ */
201
+ #define DECODE_FROM_DELT(n, idx) \
202
+ movq %var##n, %rax; \
203
+ shlxq %var##n, %bits##n, %bits##n; \
204
+ movb %ah, idx(%op##n)
205
+
206
+ /* Assumes GET_NEXT_DELT has been called.
207
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
208
+ */
209
+ #define DECODE_AND_GET_NEXT(n, idx) \
210
+ DECODE_FROM_DELT(n, idx); \
211
+ GET_NEXT_DELT(n) \
212
+
213
+ /* // ctz & nbBytes is stored in bits[n]
214
+ * // nbBits is stored in %rax
215
+ * ctz = CTZ[bits[n]]
216
+ * nbBits = ctz & 7
217
+ * nbBytes = ctz >> 3
218
+ * op[n] += 5
219
+ * ip[n] -= nbBytes
220
+ * // Note: x86-64 is little-endian ==> no bswap
221
+ * bits[n] = MEM_readST(ip[n]) | 1
222
+ * bits[n] <<= nbBits
223
+ */
224
+ #define RELOAD_BITS(n) \
225
+ bsfq %bits##n, %bits##n; \
226
+ movq %bits##n, %rax; \
227
+ andq $7, %rax; \
228
+ shrq $3, %bits##n; \
229
+ leaq 5(%op##n), %op##n; \
230
+ subq %bits##n, %ip##n; \
231
+ movq (%ip##n), %bits##n; \
232
+ orq $1, %bits##n; \
233
+ shlx %rax, %bits##n, %bits##n
234
+
235
+ /* Store olimit and ip[1,2,3] on the stack: their registers are reused as var[] in the loop */
236
+ movq %olimit, 24(%rsp)
237
+ movq %ip1, 0(%rsp)
238
+ movq %ip2, 8(%rsp)
239
+ movq %ip3, 16(%rsp)
240
+
241
+ /* Call GET_NEXT_DELT for each stream */
242
+ FOR_EACH_STREAM(GET_NEXT_DELT)
243
+
244
+ .p2align 6
245
+
246
+ .L_4X1_loop_body:
247
+ /* Decode 5 symbols in each of the 4 streams (20 total)
248
+ * Must have called GET_NEXT_DELT for each stream
249
+ */
250
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
251
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
252
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
253
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
254
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
255
+
256
+ /* Load ip[1,2,3] from stack (var[] aliases them)
257
+ * ip[] is needed for RELOAD_BITS
258
+ * Each will be stored back to the stack after RELOAD
259
+ */
260
+ movq 0(%rsp), %ip1
261
+ movq 8(%rsp), %ip2
262
+ movq 16(%rsp), %ip3
263
+
264
+ /* Reload each stream & fetch the next table entry
265
+ * to prepare for the next iteration
266
+ */
267
+ RELOAD_BITS(0)
268
+ GET_NEXT_DELT(0)
269
+
270
+ RELOAD_BITS(1)
271
+ movq %ip1, 0(%rsp)
272
+ GET_NEXT_DELT(1)
273
+
274
+ RELOAD_BITS(2)
275
+ movq %ip2, 8(%rsp)
276
+ GET_NEXT_DELT(2)
277
+
278
+ RELOAD_BITS(3)
279
+ movq %ip3, 16(%rsp)
280
+ GET_NEXT_DELT(3)
281
+
282
+ /* If op3 < olimit (stack slot at 24(%rsp)): continue the loop */
283
+ cmp %op3, 24(%rsp)
284
+ ja .L_4X1_loop_body
285
+
286
+ /* Reload ip[1,2,3] from stack */
287
+ movq 0(%rsp), %ip1
288
+ movq 8(%rsp), %ip2
289
+ movq 16(%rsp), %ip3
290
+
291
+ /* Re-compute olimit */
292
+ jmp .L_4X1_compute_olimit
293
+
294
+ #undef GET_NEXT_DELT
295
+ #undef DECODE_FROM_DELT
296
+ #undef DECODE
297
+ #undef RELOAD_BITS
298
+ .L_4X1_exit:
299
+ addq $24, %rsp
300
+
301
+ /* Drop the olimit, oend and ilimit slots; the last pop leaves the argument pointer in %rax */
302
+ pop %rax /* olimit */
303
+ pop %rax /* oend */
304
+ pop %rax /* ilimit */
305
+ pop %rax /* arg */
306
+
307
+ /* Save ip / op / bits back into the argument struct */
308
+ movq %ip0, 0(%rax)
309
+ movq %ip1, 8(%rax)
310
+ movq %ip2, 16(%rax)
311
+ movq %ip3, 24(%rax)
312
+ movq %op0, 32(%rax)
313
+ movq %op1, 40(%rax)
314
+ movq %op2, 48(%rax)
315
+ movq %op3, 56(%rax)
316
+ movq %bits0, 64(%rax)
317
+ movq %bits1, 72(%rax)
318
+ movq %bits2, 80(%rax)
319
+ movq %bits3, 88(%rax)
320
+
321
+ /* Restore registers in the reverse order of the pushes at entry */
322
+ pop %r15
323
+ pop %r14
324
+ pop %r13
325
+ pop %r12
326
+ pop %r11
327
+ pop %r10
328
+ pop %r9
329
+ pop %r8
330
+ pop %rdi
331
+ pop %rsi
332
+ pop %rbp
333
+ pop %rdx
334
+ pop %rcx
335
+ pop %rbx
336
+ pop %rax
337
+ ret
338
+
339
+ _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
340
+ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
341
+ /* Save all general-purpose registers, caller-saved ones included, for simplicity. */
342
+ push %rax
343
+ push %rbx
344
+ push %rcx
345
+ push %rdx
346
+ push %rbp
347
+ push %rsi
348
+ push %rdi
349
+ push %r8
350
+ push %r9
351
+ push %r10
352
+ push %r11
353
+ push %r12
354
+ push %r13
355
+ push %r14
356
+ push %r15
357
+
358
+ movq %rdi, %rax
359
+ movq 0(%rax), %ip0
360
+ movq 8(%rax), %ip1
361
+ movq 16(%rax), %ip2
362
+ movq 24(%rax), %ip3
363
+ movq 32(%rax), %op0
364
+ movq 40(%rax), %op1
365
+ movq 48(%rax), %op2
366
+ movq 56(%rax), %op3
367
+ movq 64(%rax), %bits0
368
+ movq 72(%rax), %bits1
369
+ movq 80(%rax), %bits2
370
+ movq 88(%rax), %bits3
371
+ movq 96(%rax), %dtable
372
+ push %rax /* argument pointer, popped again at exit */
373
+ push %rax /* olimit slot; placeholder value, overwritten before the loop reads it */
374
+ push 104(%rax) /* ilimit */
375
+
376
+ movq 112(%rax), %rax
377
+ push %rax /* oend3 = value loaded from offset 112 of the argument struct */
378
+
379
+ movq %op3, %rax
380
+ push %rax /* oend2 = initial op3 */
381
+
382
+ movq %op2, %rax
383
+ push %rax /* oend1 = initial op2 */
384
+
385
+ movq %op1, %rax
386
+ push %rax /* oend0 = initial op1 */
387
+
388
+ /* Scratch space at 0(%rsp); above it: oend0 = 8, oend1 = 16, oend2 = 24, oend3 = 32, ilimit = 40, olimit = 48, argument = 56 */
389
+ subq $8, %rsp
390
+
391
+ .L_4X2_compute_olimit:
392
+ /* Computes how many iterations we can do safely
393
+ * %r15, %rax may be clobbered
394
+ * rdx must be saved (it holds bits1)
395
+ * op[0,1,2,3] & ip0 mustn't be clobbered
396
+ */
397
+ movq %rdx, 0(%rsp)
398
+
399
+ /* We can consume up to 7 input bytes each iteration. */
400
+ movq %ip0, %rax /* rax = ip0 */
401
+ movq 40(%rsp), %rdx /* rdx = ilimit */
402
+ subq %rdx, %rax /* rax = ip0 - ilimit */
403
+ movq %rax, %r15 /* r15 = ip0 - ilimit */
404
+
405
+ /* rdx = rax / 7 */
406
+ movabsq $2635249153387078803, %rdx /* 0x2492492492492493: reciprocal multiplier for the unsigned divide by 7 */
407
+ mulq %rdx
408
+ subq %rdx, %r15
409
+ shrq %r15
410
+ addq %r15, %rdx
411
+ shrq $2, %rdx
412
+
413
+ /* r15 = (ip0 - ilimit) / 7 */
414
+ movq %rdx, %r15
415
+
416
+ movabsq $-3689348814741910323, %rdx /* 0xCCCCCCCCCCCCCCCD: multiply, then high half >> 3 gives rax / 10 */
417
+ movq 8(%rsp), %rax /* rax = oend0 */
418
+ subq %op0, %rax /* rax = oend0 - op0 */
419
+ mulq %rdx
420
+ shrq $3, %rdx /* rdx = rax / 10 */
421
+
422
+ /* r15 = min(%rdx, %r15), unsigned */
423
+ cmpq %rdx, %r15
424
+ cmova %rdx, %r15
425
+
426
+ movabsq $-3689348814741910323, %rdx
427
+ movq 16(%rsp), %rax /* rax = oend1 */
428
+ subq %op1, %rax /* rax = oend1 - op1 */
429
+ mulq %rdx
430
+ shrq $3, %rdx /* rdx = rax / 10 */
431
+
432
+ /* r15 = min(%rdx, %r15), unsigned */
433
+ cmpq %rdx, %r15
434
+ cmova %rdx, %r15
435
+
436
+ movabsq $-3689348814741910323, %rdx
437
+ movq 24(%rsp), %rax /* rax = oend2 */
438
+ subq %op2, %rax /* rax = oend2 - op2 */
439
+ mulq %rdx
440
+ shrq $3, %rdx /* rdx = rax / 10 */
441
+
442
+ /* r15 = min(%rdx, %r15), unsigned */
443
+ cmpq %rdx, %r15
444
+ cmova %rdx, %r15
445
+
446
+ movabsq $-3689348814741910323, %rdx
447
+ movq 32(%rsp), %rax /* rax = oend3 */
448
+ subq %op3, %rax /* rax = oend3 - op3 */
449
+ mulq %rdx
450
+ shrq $3, %rdx /* rdx = rax / 10 */
451
+
452
+ /* r15 = min(%rdx, %r15), unsigned */
453
+ cmpq %rdx, %r15
454
+ cmova %rdx, %r15
455
+
456
+ /* olimit = op3 + 5 * r15 */
457
+ movq %r15, %rax
458
+ leaq (%op3, %rax, 4), %olimit
459
+ addq %rax, %olimit
460
+
461
+ movq 0(%rsp), %rdx
462
+
463
+ /* If (op3 + 10 > olimit) go to exit */
464
+ movq %op3, %rax /* rax = op3 */
465
+ addq $10, %rax /* rax = op3 + 10 */
466
+ cmpq %rax, %olimit /* op3 + 10 > olimit */
467
+ jb .L_4X2_exit
468
+
469
+ /* If (ip1 < ip0) go to exit */
470
+ cmpq %ip0, %ip1
471
+ jb .L_4X2_exit
472
+
473
+ /* If (ip2 < ip1) go to exit */
474
+ cmpq %ip1, %ip2
475
+ jb .L_4X2_exit
476
+
477
+ /* If (ip3 < ip2) go to exit */
478
+ cmpq %ip2, %ip3
479
+ jb .L_4X2_exit
480
+
481
+ #define DECODE(n, idx) \
482
+ movq %bits##n, %rax; \
483
+ shrq $53, %rax; \
484
+ movzwl 0(%dtable,%rax,4),%r8d; \
485
+ movzbl 2(%dtable,%rax,4),%r15d; \
486
+ movzbl 3(%dtable,%rax,4),%eax; \
487
+ movw %r8w, (%op##n); \
488
+ shlxq %r15, %bits##n, %bits##n; \
489
+ addq %rax, %op##n
490
+
491
+ #define RELOAD_BITS(n) \
492
+ bsfq %bits##n, %bits##n; \
493
+ movq %bits##n, %rax; \
494
+ shrq $3, %bits##n; \
495
+ andq $7, %rax; \
496
+ subq %bits##n, %ip##n; \
497
+ movq (%ip##n), %bits##n; \
498
+ orq $1, %bits##n; \
499
+ shlxq %rax, %bits##n, %bits##n
500
+
501
+
502
+ movq %olimit, 48(%rsp)
503
+
504
+ .p2align 6
505
+
506
+ .L_4X2_loop_body:
507
+ /* DECODE clobbers r8 (ip0), so store it on the stack */
508
+ movq %r8, 0(%rsp)
509
+
510
+ /* Run 5 DECODE steps on each of the 4 streams; each step stores 2 bytes at op[n] and advances op[n] by byte 3 of the table entry. */
511
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
512
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
513
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
514
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
515
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
516
+
517
+ /* Reload r8 (ip0), needed by RELOAD_BITS(0) */
518
+ movq 0(%rsp), %r8
519
+
520
+ FOR_EACH_STREAM(RELOAD_BITS)
521
+
522
+ cmp %op3, 48(%rsp) /* continue while op3 < olimit (stack slot) */
523
+ ja .L_4X2_loop_body
524
+ jmp .L_4X2_compute_olimit
525
+
526
+ #undef DECODE
527
+ #undef RELOAD_BITS
528
+ .L_4X2_exit:
529
+ addq $8, %rsp
530
+ /* Drop the oend[0..3], ilimit and olimit slots; the last pop leaves the argument pointer in %rax */
531
+ pop %rax /* oend0 */
532
+ pop %rax /* oend1 */
533
+ pop %rax /* oend2 */
534
+ pop %rax /* oend3 */
535
+ pop %rax /* ilimit */
536
+ pop %rax /* olimit */
537
+ pop %rax /* arg */
538
+
539
+ /* Save ip / op / bits back into the argument struct */
540
+ movq %ip0, 0(%rax)
541
+ movq %ip1, 8(%rax)
542
+ movq %ip2, 16(%rax)
543
+ movq %ip3, 24(%rax)
544
+ movq %op0, 32(%rax)
545
+ movq %op1, 40(%rax)
546
+ movq %op2, 48(%rax)
547
+ movq %op3, 56(%rax)
548
+ movq %bits0, 64(%rax)
549
+ movq %bits1, 72(%rax)
550
+ movq %bits2, 80(%rax)
551
+ movq %bits3, 88(%rax)
552
+
553
+ /* Restore registers in the reverse order of the pushes at entry */
554
+ pop %r15
555
+ pop %r14
556
+ pop %r13
557
+ pop %r12
558
+ pop %r11
559
+ pop %r10
560
+ pop %r9
561
+ pop %r8
562
+ pop %rdi
563
+ pop %rsi
564
+ pop %rbp
565
+ pop %rdx
566
+ pop %rcx
567
+ pop %rbx
568
+ pop %rax
569
+ ret
570
+
571
+ #endif