zstd-ruby 1.5.0.0 → 1.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ruby.yml +2 -2
  3. data/README.md +1 -1
  4. data/ext/zstdruby/extconf.rb +1 -0
  5. data/ext/zstdruby/libzstd/Makefile +50 -175
  6. data/ext/zstdruby/libzstd/README.md +7 -1
  7. data/ext/zstdruby/libzstd/common/bitstream.h +24 -9
  8. data/ext/zstdruby/libzstd/common/compiler.h +89 -43
  9. data/ext/zstdruby/libzstd/common/entropy_common.c +11 -5
  10. data/ext/zstdruby/libzstd/common/error_private.h +79 -0
  11. data/ext/zstdruby/libzstd/common/fse.h +2 -1
  12. data/ext/zstdruby/libzstd/common/fse_decompress.c +1 -1
  13. data/ext/zstdruby/libzstd/common/huf.h +24 -22
  14. data/ext/zstdruby/libzstd/common/mem.h +18 -0
  15. data/ext/zstdruby/libzstd/common/portability_macros.h +131 -0
  16. data/ext/zstdruby/libzstd/common/xxhash.c +5 -805
  17. data/ext/zstdruby/libzstd/common/xxhash.h +5568 -167
  18. data/ext/zstdruby/libzstd/common/zstd_internal.h +92 -88
  19. data/ext/zstdruby/libzstd/common/zstd_trace.h +12 -3
  20. data/ext/zstdruby/libzstd/compress/clevels.h +134 -0
  21. data/ext/zstdruby/libzstd/compress/fse_compress.c +63 -27
  22. data/ext/zstdruby/libzstd/compress/huf_compress.c +537 -104
  23. data/ext/zstdruby/libzstd/compress/zstd_compress.c +194 -278
  24. data/ext/zstdruby/libzstd/compress/zstd_compress_internal.h +102 -44
  25. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.c +4 -3
  26. data/ext/zstdruby/libzstd/compress/zstd_compress_literals.h +3 -1
  27. data/ext/zstdruby/libzstd/compress/zstd_compress_sequences.c +5 -4
  28. data/ext/zstdruby/libzstd/compress/zstd_compress_superblock.c +3 -2
  29. data/ext/zstdruby/libzstd/compress/zstd_cwksp.h +3 -3
  30. data/ext/zstdruby/libzstd/compress/zstd_double_fast.c +289 -114
  31. data/ext/zstdruby/libzstd/compress/zstd_fast.c +302 -123
  32. data/ext/zstdruby/libzstd/compress/zstd_lazy.c +418 -502
  33. data/ext/zstdruby/libzstd/compress/zstd_ldm.c +4 -4
  34. data/ext/zstdruby/libzstd/compress/zstd_ldm.h +1 -1
  35. data/ext/zstdruby/libzstd/compress/zstd_ldm_geartab.h +4 -1
  36. data/ext/zstdruby/libzstd/compress/zstd_opt.c +186 -108
  37. data/ext/zstdruby/libzstd/compress/zstdmt_compress.c +59 -29
  38. data/ext/zstdruby/libzstd/decompress/huf_decompress.c +727 -189
  39. data/ext/zstdruby/libzstd/decompress/huf_decompress_amd64.S +571 -0
  40. data/ext/zstdruby/libzstd/decompress/zstd_decompress.c +85 -22
  41. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.c +744 -220
  42. data/ext/zstdruby/libzstd/decompress/zstd_decompress_block.h +8 -2
  43. data/ext/zstdruby/libzstd/decompress/zstd_decompress_internal.h +34 -3
  44. data/ext/zstdruby/libzstd/deprecated/zbuff_compress.c +23 -3
  45. data/ext/zstdruby/libzstd/dictBuilder/cover.c +9 -2
  46. data/ext/zstdruby/libzstd/dictBuilder/fastcover.c +11 -4
  47. data/ext/zstdruby/libzstd/dictBuilder/zdict.c +99 -28
  48. data/ext/zstdruby/libzstd/legacy/zstd_v01.c +2 -6
  49. data/ext/zstdruby/libzstd/legacy/zstd_v02.c +3 -7
  50. data/ext/zstdruby/libzstd/legacy/zstd_v03.c +3 -7
  51. data/ext/zstdruby/libzstd/legacy/zstd_v04.c +3 -7
  52. data/ext/zstdruby/libzstd/legacy/zstd_v05.c +3 -7
  53. data/ext/zstdruby/libzstd/legacy/zstd_v06.c +3 -7
  54. data/ext/zstdruby/libzstd/legacy/zstd_v07.c +3 -7
  55. data/ext/zstdruby/libzstd/libzstd.mk +185 -0
  56. data/ext/zstdruby/libzstd/libzstd.pc.in +1 -0
  57. data/ext/zstdruby/libzstd/modulemap/module.modulemap +4 -0
  58. data/ext/zstdruby/libzstd/zdict.h +4 -4
  59. data/ext/zstdruby/libzstd/zstd.h +179 -136
  60. data/ext/zstdruby/zstdruby.c +2 -2
  61. data/lib/zstd-ruby/version.rb +1 -1
  62. metadata +8 -3
@@ -0,0 +1,571 @@
1
+ #include "../common/portability_macros.h"
2
+
3
+ /* Stack marking, kept outside the ZSTD_ENABLE_ASM_X86_64_BMI2 guard so the
4
+ * note is still emitted when the code below is preprocessed away.
5
+ * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart */
6
+ #if defined(__linux__) && defined(__ELF__)
7
+ .section .note.GNU-stack,"",%progbits
8
+ #endif
9
+
10
+ #if ZSTD_ENABLE_ASM_X86_64_BMI2
11
+
12
+ /* Calling convention:
13
+ *
14
+ * %rdi contains the first argument: HUF_DecompressAsmArgs*.
15
+ * %rbp isn't maintained (no frame pointer).
16
+ * %rsp contains the stack pointer that grows down.
17
+ * No red-zone is assumed, only addresses >= %rsp are used.
18
+ * All register contents are preserved.
19
+ *
20
+ * TODO: Support Windows calling convention.
21
+ */
22
+
23
+ .global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
24
+ .global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
25
+ .global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop
26
+ .global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop
27
+ .text
28
+
29
+ /* Sets up register mappings for clarity.
30
+ * op[], bits[], dtable & ip[0] each get their own register.
31
+ * ip[1,2,3] & olimit alias var[].
32
+ * %rax is a scratch register.
33
+ */
34
+
35
+ #define op0 rsi
36
+ #define op1 rbx
37
+ #define op2 rcx
38
+ #define op3 rdi
39
+
40
+ #define ip0 r8
41
+ #define ip1 r9
42
+ #define ip2 r10
43
+ #define ip3 r11
44
+
45
+ #define bits0 rbp
46
+ #define bits1 rdx
47
+ #define bits2 r12
48
+ #define bits3 r13
49
+ #define dtable r14
50
+ #define olimit r15
51
+
52
+ /* var[] aliases ip[1,2,3] & olimit
53
+ * ip[1,2,3] are saved every iteration.
54
+ * olimit is only used in compute_olimit.
55
+ */
56
+ #define var0 r15
57
+ #define var1 r9
58
+ #define var2 r10
59
+ #define var3 r11
60
+
61
+ /* 32-bit var registers */
62
+ #define vard0 r15d
63
+ #define vard1 r9d
64
+ #define vard2 r10d
65
+ #define vard3 r11d
66
+
67
+ /* Calls X(N) for each stream 0, 1, 2, 3. */
68
+ #define FOR_EACH_STREAM(X) \
69
+ X(0); \
70
+ X(1); \
71
+ X(2); \
72
+ X(3)
73
+
74
+ /* Calls X(N, idx) for each stream 0, 1, 2, 3. */
75
+ #define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
76
+ X(0, idx); \
77
+ X(1, idx); \
78
+ X(2, idx); \
79
+ X(3, idx)
80
+
81
+ /* Define both _HUF_* & HUF_* symbols because MacOS
82
+ * C symbols are prefixed with '_' & Linux symbols aren't.
83
+ */
84
+ _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
85
+ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop:
86
+ /* Save all 15 general-purpose registers (everything but %rsp) for simplicity. */
87
+ push %rax
88
+ push %rbx
89
+ push %rcx
90
+ push %rdx
91
+ push %rbp
92
+ push %rsi
93
+ push %rdi
94
+ push %r8
95
+ push %r9
96
+ push %r10
97
+ push %r11
98
+ push %r12
99
+ push %r13
100
+ push %r14
101
+ push %r15
102
+
103
+ /* Copy the args pointer to %rax (%rdi is reused as op3), then load each field */
104
+ movq %rdi, %rax
105
+ movq 0(%rax), %ip0
106
+ movq 8(%rax), %ip1
107
+ movq 16(%rax), %ip2
108
+ movq 24(%rax), %ip3
109
+ movq 32(%rax), %op0
110
+ movq 40(%rax), %op1
111
+ movq 48(%rax), %op2
112
+ movq 56(%rax), %op3
113
+ movq 64(%rax), %bits0
114
+ movq 72(%rax), %bits1
115
+ movq 80(%rax), %bits2
116
+ movq 88(%rax), %bits3
117
+ movq 96(%rax), %dtable
118
+ push %rax /* argument */
119
+ push 104(%rax) /* ilimit */
120
+ push 112(%rax) /* oend */
121
+ push %olimit /* olimit space */
122
+
123
+ subq $24, %rsp /* 3 scratch slots: 0/8/16(%rsp); olimit=24, oend=32, ilimit=40, arg=48 */
124
+
125
+ .L_4X1_compute_olimit:
126
+ /* Computes how many iterations we can do safely
127
+ * %r15, %rax may be clobbered
128
+ * rbx, rdx must be saved
129
+ * op3 & ip0 mustn't be clobbered
130
+ */
131
+ movq %rbx, 0(%rsp)
132
+ movq %rdx, 8(%rsp)
133
+
134
+ movq 32(%rsp), %rax /* rax = oend */
135
+ subq %op3, %rax /* rax = oend - op3 */
136
+
137
+ /* r15 = (oend - op3) / 5 */
138
+ movabsq $-3689348814741910323, %rdx
139
+ mulq %rdx
140
+ movq %rdx, %r15
141
+ shrq $2, %r15
142
+
143
+ movq %ip0, %rax /* rax = ip0 */
144
+ movq 40(%rsp), %rdx /* rdx = ilimit */
145
+ subq %rdx, %rax /* rax = ip0 - ilimit */
146
+ movq %rax, %rbx /* rbx = ip0 - ilimit */
147
+
148
+ /* rdx = (ip0 - ilimit) / 7 */
149
+ movabsq $2635249153387078803, %rdx
150
+ mulq %rdx
151
+ subq %rdx, %rbx
152
+ shrq %rbx
153
+ addq %rbx, %rdx
154
+ shrq $2, %rdx
155
+
156
+ /* r15 = min(%rdx, %r15) */
157
+ cmpq %rdx, %r15
158
+ cmova %rdx, %r15
159
+
160
+ /* r15 = r15 * 5 */
161
+ leaq (%r15, %r15, 4), %r15
162
+
163
+ /* olimit = op3 + r15 */
164
+ addq %op3, %olimit
165
+
166
+ movq 8(%rsp), %rdx
167
+ movq 0(%rsp), %rbx
168
+
169
+ /* If (op3 + 20 > olimit) */
170
+ movq %op3, %rax /* rax = op3 */
171
+ addq $20, %rax /* rax = op3 + 20 */
172
+ cmpq %rax, %olimit /* op3 + 20 > olimit */
173
+ jb .L_4X1_exit
174
+
175
+ /* If (ip1 < ip0) go to exit */
176
+ cmpq %ip0, %ip1
177
+ jb .L_4X1_exit
178
+
179
+ /* If (ip2 < ip1) go to exit */
180
+ cmpq %ip1, %ip2
181
+ jb .L_4X1_exit
182
+
183
+ /* If (ip3 < ip2) go to exit */
184
+ cmpq %ip2, %ip3
185
+ jb .L_4X1_exit
186
+
187
+ /* Reads top 11 bits from bits[n]
188
+ * Loads dt[bits[n]] into var[n]
189
+ */
190
+ #define GET_NEXT_DELT(n) \
191
+ movq $53, %var##n; \
192
+ shrxq %var##n, %bits##n, %var##n; \
193
+ movzwl (%dtable,%var##n,2),%vard##n
194
+
195
+ /* var[n] must contain the DTable entry computed with GET_NEXT_DELT
196
+ * Moves var[n] to %rax
197
+ * bits[n] <<= var[n] & 63
198
+ * op[n][idx] = %rax >> 8
199
+ * %ah is a way to access bits [8, 16) of %rax
200
+ */
201
+ #define DECODE_FROM_DELT(n, idx) \
202
+ movq %var##n, %rax; \
203
+ shlxq %var##n, %bits##n, %bits##n; \
204
+ movb %ah, idx(%op##n)
205
+
206
+ /* Assumes GET_NEXT_DELT has been called.
207
+ * Calls DECODE_FROM_DELT then GET_NEXT_DELT
208
+ */
209
+ #define DECODE_AND_GET_NEXT(n, idx) \
210
+ DECODE_FROM_DELT(n, idx); \
211
+ GET_NEXT_DELT(n)
212
+
213
+ /* // ctz & nbBytes is stored in bits[n]
214
+ * // nbBits is stored in %rax
215
+ * ctz = CTZ[bits[n]]
216
+ * nbBits = ctz & 7
217
+ * nbBytes = ctz >> 3
218
+ * op[n] += 5
219
+ * ip[n] -= nbBytes
220
+ * // Note: x86-64 is little-endian ==> no bswap
221
+ * bits[n] = MEM_readST(ip[n]) | 1
222
+ * bits[n] <<= nbBits
223
+ */
224
+ #define RELOAD_BITS(n) \
225
+ bsfq %bits##n, %bits##n; \
226
+ movq %bits##n, %rax; \
227
+ andq $7, %rax; \
228
+ shrq $3, %bits##n; \
229
+ leaq 5(%op##n), %op##n; \
230
+ subq %bits##n, %ip##n; \
231
+ movq (%ip##n), %bits##n; \
232
+ orq $1, %bits##n; \
233
+ shlx %rax, %bits##n, %bits##n
234
+
235
+ /* Store clobbered variables on the stack */
236
+ movq %olimit, 24(%rsp)
237
+ movq %ip1, 0(%rsp)
238
+ movq %ip2, 8(%rsp)
239
+ movq %ip3, 16(%rsp)
240
+
241
+ /* Call GET_NEXT_DELT for each stream */
242
+ FOR_EACH_STREAM(GET_NEXT_DELT)
243
+
244
+ .p2align 6
245
+
246
+ .L_4X1_loop_body:
247
+ /* Decode 5 symbols in each of the 4 streams (20 total)
248
+ * Must have called GET_NEXT_DELT for each stream
249
+ */
250
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
251
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
252
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
253
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
254
+ FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
255
+
256
+ /* Load ip[1,2,3] from stack (var[] aliases them)
257
+ * ip[] is needed for RELOAD_BITS
258
+ * Each will be stored back to the stack after RELOAD
259
+ */
260
+ movq 0(%rsp), %ip1
261
+ movq 8(%rsp), %ip2
262
+ movq 16(%rsp), %ip3
263
+
264
+ /* Reload each stream & fetch the next table entry
265
+ * to prepare for the next iteration
266
+ */
267
+ RELOAD_BITS(0)
268
+ GET_NEXT_DELT(0)
269
+
270
+ RELOAD_BITS(1)
271
+ movq %ip1, 0(%rsp)
272
+ GET_NEXT_DELT(1)
273
+
274
+ RELOAD_BITS(2)
275
+ movq %ip2, 8(%rsp)
276
+ GET_NEXT_DELT(2)
277
+
278
+ RELOAD_BITS(3)
279
+ movq %ip3, 16(%rsp)
280
+ GET_NEXT_DELT(3)
281
+
282
+ /* If op3 < olimit: continue the loop */
283
+ cmp %op3, 24(%rsp)
284
+ ja .L_4X1_loop_body
285
+
286
+ /* Reload ip[1,2,3] from stack */
287
+ movq 0(%rsp), %ip1
288
+ movq 8(%rsp), %ip2
289
+ movq 16(%rsp), %ip3
290
+
291
+ /* Re-compute olimit */
292
+ jmp .L_4X1_compute_olimit
293
+
294
+ #undef GET_NEXT_DELT
295
+ #undef DECODE_FROM_DELT
296
+ #undef DECODE_AND_GET_NEXT
297
+ #undef RELOAD_BITS
298
+ .L_4X1_exit:
299
+ addq $24, %rsp
300
+
301
+ /* Pop olimit, oend & ilimit; the final pop leaves the args pointer in %rax */
302
+ pop %rax /* olimit */
303
+ pop %rax /* oend */
304
+ pop %rax /* ilimit */
305
+ pop %rax /* arg */
306
+
307
+ /* Save ip / op / bits back into the args struct */
308
+ movq %ip0, 0(%rax)
309
+ movq %ip1, 8(%rax)
310
+ movq %ip2, 16(%rax)
311
+ movq %ip3, 24(%rax)
312
+ movq %op0, 32(%rax)
313
+ movq %op1, 40(%rax)
314
+ movq %op2, 48(%rax)
315
+ movq %op3, 56(%rax)
316
+ movq %bits0, 64(%rax)
317
+ movq %bits1, 72(%rax)
318
+ movq %bits2, 80(%rax)
319
+ movq %bits3, 88(%rax)
320
+
321
+ /* Restore registers */
322
+ pop %r15
323
+ pop %r14
324
+ pop %r13
325
+ pop %r12
326
+ pop %r11
327
+ pop %r10
328
+ pop %r9
329
+ pop %r8
330
+ pop %rdi
331
+ pop %rsi
332
+ pop %rbp
333
+ pop %rdx
334
+ pop %rcx
335
+ pop %rbx
336
+ pop %rax
337
+ ret
338
+
339
+ _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
340
+ HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop:
341
+ /* Save all 15 general-purpose registers (everything but %rsp) for simplicity. */
342
+ push %rax
343
+ push %rbx
344
+ push %rcx
345
+ push %rdx
346
+ push %rbp
347
+ push %rsi
348
+ push %rdi
349
+ push %r8
350
+ push %r9
351
+ push %r10
352
+ push %r11
353
+ push %r12
354
+ push %r13
355
+ push %r14
356
+ push %r15
357
+
358
+ movq %rdi, %rax /* rax = args pointer (%rdi is reused as op3) */
359
+ movq 0(%rax), %ip0
360
+ movq 8(%rax), %ip1
361
+ movq 16(%rax), %ip2
362
+ movq 24(%rax), %ip3
363
+ movq 32(%rax), %op0
364
+ movq 40(%rax), %op1
365
+ movq 48(%rax), %op2
366
+ movq 56(%rax), %op3
367
+ movq 64(%rax), %bits0
368
+ movq 72(%rax), %bits1
369
+ movq 80(%rax), %bits2
370
+ movq 88(%rax), %bits3
371
+ movq 96(%rax), %dtable
372
+ push %rax /* argument */
373
+ push %rax /* olimit */
374
+ push 104(%rax) /* ilimit */
375
+
376
+ movq 112(%rax), %rax
377
+ push %rax /* oend3 */
378
+
379
+ movq %op3, %rax
380
+ push %rax /* oend2 */
381
+
382
+ movq %op2, %rax
383
+ push %rax /* oend1 */
384
+
385
+ movq %op1, %rax
386
+ push %rax /* oend0 */
387
+
388
+ /* Scratch space: 0(%rsp); oend0..3 = 8..32, ilimit = 40, olimit = 48, arg = 56 */
389
+ subq $8, %rsp
390
+
391
+ .L_4X2_compute_olimit:
392
+ /* Computes how many iterations we can do safely
393
+ * %r15, %rax may be clobbered
394
+ * rdx must be saved
395
+ * op[0,1,2,3] & ip0 mustn't be clobbered
396
+ */
397
+ movq %rdx, 0(%rsp)
398
+
399
+ /* We can consume up to 7 input bytes each iteration. */
400
+ movq %ip0, %rax /* rax = ip0 */
401
+ movq 40(%rsp), %rdx /* rdx = ilimit */
402
+ subq %rdx, %rax /* rax = ip0 - ilimit */
403
+ movq %rax, %r15 /* r15 = ip0 - ilimit */
404
+
405
+ /* rdx = rax / 7 */
406
+ movabsq $2635249153387078803, %rdx
407
+ mulq %rdx
408
+ subq %rdx, %r15
409
+ shrq %r15
410
+ addq %r15, %rdx
411
+ shrq $2, %rdx
412
+
413
+ /* r15 = (ip0 - ilimit) / 7 */
414
+ movq %rdx, %r15
415
+
416
+ movabsq $-3689348814741910323, %rdx
417
+ movq 8(%rsp), %rax /* rax = oend0 */
418
+ subq %op0, %rax /* rax = oend0 - op0 */
419
+ mulq %rdx
420
+ shrq $3, %rdx /* rdx = rax / 10 */
421
+
422
+ /* r15 = min(%rdx, %r15) */
423
+ cmpq %rdx, %r15
424
+ cmova %rdx, %r15
425
+
426
+ movabsq $-3689348814741910323, %rdx
427
+ movq 16(%rsp), %rax /* rax = oend1 */
428
+ subq %op1, %rax /* rax = oend1 - op1 */
429
+ mulq %rdx
430
+ shrq $3, %rdx /* rdx = rax / 10 */
431
+
432
+ /* r15 = min(%rdx, %r15) */
433
+ cmpq %rdx, %r15
434
+ cmova %rdx, %r15
435
+
436
+ movabsq $-3689348814741910323, %rdx
437
+ movq 24(%rsp), %rax /* rax = oend2 */
438
+ subq %op2, %rax /* rax = oend2 - op2 */
439
+ mulq %rdx
440
+ shrq $3, %rdx /* rdx = rax / 10 */
441
+
442
+ /* r15 = min(%rdx, %r15) */
443
+ cmpq %rdx, %r15
444
+ cmova %rdx, %r15
445
+
446
+ movabsq $-3689348814741910323, %rdx
447
+ movq 32(%rsp), %rax /* rax = oend3 */
448
+ subq %op3, %rax /* rax = oend3 - op3 */
449
+ mulq %rdx
450
+ shrq $3, %rdx /* rdx = rax / 10 */
451
+
452
+ /* r15 = min(%rdx, %r15) */
453
+ cmpq %rdx, %r15
454
+ cmova %rdx, %r15
455
+
456
+ /* olimit = op3 + 5 * r15 */
457
+ movq %r15, %rax
458
+ leaq (%op3, %rax, 4), %olimit
459
+ addq %rax, %olimit
460
+
461
+ movq 0(%rsp), %rdx /* restore rdx (bits1) saved at the top of compute_olimit */
462
+
463
+ /* If (op3 + 10 > olimit) */
464
+ movq %op3, %rax /* rax = op3 */
465
+ addq $10, %rax /* rax = op3 + 10 */
466
+ cmpq %rax, %olimit /* op3 + 10 > olimit */
467
+ jb .L_4X2_exit
468
+
469
+ /* If (ip1 < ip0) go to exit */
470
+ cmpq %ip0, %ip1
471
+ jb .L_4X2_exit
472
+
473
+ /* If (ip2 < ip1) go to exit */
474
+ cmpq %ip1, %ip2
475
+ jb .L_4X2_exit
476
+
477
+ /* If (ip3 < ip2) go to exit */
478
+ cmpq %ip2, %ip3
479
+ jb .L_4X2_exit
480
+
481
+ #define DECODE(n, idx) \
482
+ movq %bits##n, %rax; \
483
+ shrq $53, %rax; \
484
+ movzwl 0(%dtable,%rax,4),%r8d; \
485
+ movzbl 2(%dtable,%rax,4),%r15d; \
486
+ movzbl 3(%dtable,%rax,4),%eax; \
487
+ movw %r8w, (%op##n); \
488
+ shlxq %r15, %bits##n, %bits##n; \
489
+ addq %rax, %op##n
490
+
491
+ #define RELOAD_BITS(n) \
492
+ bsfq %bits##n, %bits##n; \
493
+ movq %bits##n, %rax; \
494
+ shrq $3, %bits##n; \
495
+ andq $7, %rax; \
496
+ subq %bits##n, %ip##n; \
497
+ movq (%ip##n), %bits##n; \
498
+ orq $1, %bits##n; \
499
+ shlxq %rax, %bits##n, %bits##n
500
+
501
+
502
+ movq %olimit, 48(%rsp) /* spill olimit: DECODE overwrites r15 */
503
+
504
+ .p2align 6
505
+
506
+ .L_4X2_loop_body:
507
+ /* DECODE clobbers r8 (ip0), so store it on the stack */
508
+ movq %r8, 0(%rsp)
509
+
510
+ /* Run DECODE 5 times on each of the 4 streams (20 table lookups total). */
511
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
512
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
513
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
514
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
515
+ FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
516
+
517
+ /* Reload r8 (ip0) */
518
+ movq 0(%rsp), %r8
519
+
520
+ FOR_EACH_STREAM(RELOAD_BITS)
521
+
522
+ cmp %op3, 48(%rsp) /* if op3 < olimit: continue the loop */
523
+ ja .L_4X2_loop_body
524
+ jmp .L_4X2_compute_olimit
525
+
526
+ #undef DECODE
527
+ #undef RELOAD_BITS
528
+ .L_4X2_exit:
529
+ addq $8, %rsp
530
+ /* Pop oend[0-3], ilimit & olimit; the final pop leaves the args pointer in %rax */
531
+ pop %rax /* oend0 */
532
+ pop %rax /* oend1 */
533
+ pop %rax /* oend2 */
534
+ pop %rax /* oend3 */
535
+ pop %rax /* ilimit */
536
+ pop %rax /* olimit */
537
+ pop %rax /* arg */
538
+
539
+ /* Save ip / op / bits back into the args struct */
540
+ movq %ip0, 0(%rax)
541
+ movq %ip1, 8(%rax)
542
+ movq %ip2, 16(%rax)
543
+ movq %ip3, 24(%rax)
544
+ movq %op0, 32(%rax)
545
+ movq %op1, 40(%rax)
546
+ movq %op2, 48(%rax)
547
+ movq %op3, 56(%rax)
548
+ movq %bits0, 64(%rax)
549
+ movq %bits1, 72(%rax)
550
+ movq %bits2, 80(%rax)
551
+ movq %bits3, 88(%rax)
552
+
553
+ /* Restore registers */
554
+ pop %r15
555
+ pop %r14
556
+ pop %r13
557
+ pop %r12
558
+ pop %r11
559
+ pop %r10
560
+ pop %r9
561
+ pop %r8
562
+ pop %rdi
563
+ pop %rsi
564
+ pop %rbp
565
+ pop %rdx
566
+ pop %rcx
567
+ pop %rbx
568
+ pop %rax
569
+ ret
570
+
571
+ #endif