rugged 1.3.2.1 → 1.3.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,10 @@
1
1
  /* crc32.c -- compute the CRC-32 of a data stream
2
- * Copyright (C) 1995-2006, 2010, 2011, 2012, 2016 Mark Adler
2
+ * Copyright (C) 1995-2022 Mark Adler
3
3
  * For conditions of distribution and use, see copyright notice in zlib.h
4
4
  *
5
- * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
6
- * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
7
- * tables for updating the shift register in one step with three exclusive-ors
8
- * instead of four steps with four exclusive-ors. This results in about a
9
- * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
5
+ * This interleaved implementation of a CRC makes use of pipelined multiple
6
+ * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7
+ * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
10
8
  */
11
9
 
12
10
  /* @(#) $Id$ */
@@ -14,11 +12,12 @@
14
12
  /*
15
13
  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
16
14
  protection on the static variables used to control the first-use generation
17
- of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
15
+ of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
18
16
  first call get_crc_table() to initialize the tables before allowing more than
19
17
  one thread to use crc32().
20
18
 
21
- DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h.
19
+ MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20
+ produced, so that this one source file can be compiled to an executable.
22
21
  */
23
22
 
24
23
  #ifdef MAKECRCH
@@ -28,408 +27,1096 @@
28
27
  # endif /* !DYNAMIC_CRC_TABLE */
29
28
  #endif /* MAKECRCH */
30
29
 
31
- #include "zutil.h" /* for STDC and FAR definitions */
30
+ #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
32
31
 
33
- /* Definitions for doing the crc four data bytes at a time. */
34
- #if !defined(NOBYFOUR) && defined(Z_U4)
35
- # define BYFOUR
32
+ /*
33
+ A CRC of a message is computed on N braids of words in the message, where
34
+ each word consists of W bytes (4 or 8). If N is 3, for example, then three
35
+ running sparse CRCs are calculated respectively on each braid, at these
36
+ indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
37
+ This is done starting at a word boundary, and continues until as many blocks
38
+ of N * W bytes as are available have been processed. The results are combined
39
+ into a single CRC at the end. For this code, N must be in the range 1..6 and
40
+ W must be 4 or 8. The upper limit on N can be increased if desired by adding
41
+ more #if blocks, extending the patterns apparent in the code. In addition,
42
+ crc32.h would need to be regenerated, if the maximum N value is increased.
43
+
44
+ N and W are chosen empirically by benchmarking the execution time on a given
45
+ processor. The choices for N and W below were based on testing on Intel Kaby
46
+ Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
47
+ Octeon II processors. The Intel, AMD, and ARM processors were all fastest
48
+ with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
49
+ They were all tested with either gcc or clang, all using the -O3 optimization
50
+ level. Your mileage may vary.
51
+ */
52
+
53
+ /* Define N */
54
+ #ifdef Z_TESTN
55
+ # define N Z_TESTN
56
+ #else
57
+ # define N 5
58
+ #endif
59
+ #if N < 1 || N > 6
60
+ # error N must be in 1..6
36
61
  #endif
37
- #ifdef BYFOUR
38
- local unsigned long crc32_little OF((unsigned long,
39
- const unsigned char FAR *, z_size_t));
40
- local unsigned long crc32_big OF((unsigned long,
41
- const unsigned char FAR *, z_size_t));
42
- # define TBLS 8
62
+
63
+ /*
64
+ z_crc_t must be at least 32 bits. z_word_t must be at least as long as
65
+ z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
66
+ that bytes are eight bits.
67
+ */
68
+
69
+ /*
70
+ Define W and the associated z_word_t type. If W is not defined, then a
71
+ braided calculation is not used, and the associated tables and code are not
72
+ compiled.
73
+ */
74
+ #ifdef Z_TESTW
75
+ # if Z_TESTW-1 != -1
76
+ # define W Z_TESTW
77
+ # endif
43
78
  #else
44
- # define TBLS 1
45
- #endif /* BYFOUR */
79
+ # ifdef MAKECRCH
80
+ # define W 8 /* required for MAKECRCH */
81
+ # else
82
+ # if defined(__x86_64__) || defined(__aarch64__)
83
+ # define W 8
84
+ # else
85
+ # define W 4
86
+ # endif
87
+ # endif
88
+ #endif
89
+ #ifdef W
90
+ # if W == 8 && defined(Z_U8)
91
+ typedef Z_U8 z_word_t;
92
+ # elif defined(Z_U4)
93
+ # undef W
94
+ # define W 4
95
+ typedef Z_U4 z_word_t;
96
+ # else
97
+ # undef W
98
+ # endif
99
+ #endif
100
+
101
+ /* Local functions. */
102
+ local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
103
+ local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
104
+
105
+ /* If available, use the ARM processor CRC32 instruction. */
106
+ #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
107
+ # define ARMCRC32
108
+ #endif
109
+
110
+ #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
111
+ /*
112
+ Swap the bytes in a z_word_t to convert between little and big endian. Any
113
+ self-respecting compiler will optimize this to a single machine byte-swap
114
+ instruction, if one is available. This assumes that z_word_t is either 32 bits
115
+ or 64 bits.
116
+ */
46
117
 
47
- /* Local functions for crc concatenation */
48
- local unsigned long gf2_matrix_times OF((unsigned long *mat,
49
- unsigned long vec));
50
- local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat));
51
- local uLong crc32_combine_ OF((uLong crc1, uLong crc2, z_off64_t len2));
118
+ local z_word_t byte_swap(z_word_t word);
52
119
 
120
+ local z_word_t byte_swap(word)
121
+ z_word_t word;
122
+ {
123
+ # if W == 8
124
+ return
125
+ (word & 0xff00000000000000) >> 56 |
126
+ (word & 0xff000000000000) >> 40 |
127
+ (word & 0xff0000000000) >> 24 |
128
+ (word & 0xff00000000) >> 8 |
129
+ (word & 0xff000000) << 8 |
130
+ (word & 0xff0000) << 24 |
131
+ (word & 0xff00) << 40 |
132
+ (word & 0xff) << 56;
133
+ # else /* W == 4 */
134
+ return
135
+ (word & 0xff000000) >> 24 |
136
+ (word & 0xff0000) >> 8 |
137
+ (word & 0xff00) << 8 |
138
+ (word & 0xff) << 24;
139
+ # endif
140
+ }
141
+ #endif
142
+
143
+ /* CRC polynomial. */
144
+ #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
53
145
 
54
146
  #ifdef DYNAMIC_CRC_TABLE
55
147
 
56
- local volatile int crc_table_empty = 1;
57
- local z_crc_t FAR crc_table[TBLS][256];
148
+ local z_crc_t FAR crc_table[256];
149
+ local z_crc_t FAR x2n_table[32];
58
150
  local void make_crc_table OF((void));
151
+ #ifdef W
152
+ local z_word_t FAR crc_big_table[256];
153
+ local z_crc_t FAR crc_braid_table[W][256];
154
+ local z_word_t FAR crc_braid_big_table[W][256];
155
+ local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
156
+ #endif
59
157
  #ifdef MAKECRCH
60
- local void write_table OF((FILE *, const z_crc_t FAR *));
158
+ local void write_table OF((FILE *, const z_crc_t FAR *, int));
159
+ local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
160
+ local void write_table64 OF((FILE *, const z_word_t FAR *, int));
61
161
  #endif /* MAKECRCH */
162
+
163
+ /*
164
+ Define a once() function depending on the availability of atomics. If this is
165
+ compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
166
+ multiple threads, and if atomics are not available, then get_crc_table() must
167
+ be called to initialize the tables and must return before any threads are
168
+ allowed to compute or combine CRCs.
169
+ */
170
+
171
+ /* Definition of once functionality. */
172
+ typedef struct once_s once_t;
173
+ local void once OF((once_t *, void (*)(void)));
174
+
175
+ /* Check for the availability of atomics. */
176
+ #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
177
+ !defined(__STDC_NO_ATOMICS__)
178
+
179
+ #include <stdatomic.h>
180
+
181
+ /* Structure for once(), which must be initialized with ONCE_INIT. */
182
+ struct once_s {
183
+ atomic_flag begun;
184
+ atomic_int done;
185
+ };
186
+ #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
187
+
188
+ /*
189
+ Run the provided init() function exactly once, even if multiple threads
190
+ invoke once() at the same time. The state must be a once_t initialized with
191
+ ONCE_INIT.
192
+ */
193
+ local void once(state, init)
194
+ once_t *state;
195
+ void (*init)(void);
196
+ {
197
+ if (!atomic_load(&state->done)) {
198
+ if (atomic_flag_test_and_set(&state->begun))
199
+ while (!atomic_load(&state->done))
200
+ ;
201
+ else {
202
+ init();
203
+ atomic_store(&state->done, 1);
204
+ }
205
+ }
206
+ }
207
+
208
+ #else /* no atomics */
209
+
210
+ /* Structure for once(), which must be initialized with ONCE_INIT. */
211
+ struct once_s {
212
+ volatile int begun;
213
+ volatile int done;
214
+ };
215
+ #define ONCE_INIT {0, 0}
216
+
217
+ /* Test and set. Alas, not atomic, but tries to minimize the period of
218
+ vulnerability. */
219
+ local int test_and_set OF((int volatile *));
220
+ local int test_and_set(flag)
221
+ int volatile *flag;
222
+ {
223
+ int was;
224
+
225
+ was = *flag;
226
+ *flag = 1;
227
+ return was;
228
+ }
229
+
230
+ /* Run the provided init() function once. This is not thread-safe. */
231
+ local void once(state, init)
232
+ once_t *state;
233
+ void (*init)(void);
234
+ {
235
+ if (!state->done) {
236
+ if (test_and_set(&state->begun))
237
+ while (!state->done)
238
+ ;
239
+ else {
240
+ init();
241
+ state->done = 1;
242
+ }
243
+ }
244
+ }
245
+
246
+ #endif
247
+
248
+ /* State for once(). */
249
+ local once_t made = ONCE_INIT;
250
+
62
251
  /*
63
252
  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
64
253
  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
65
254
 
66
255
  Polynomials over GF(2) are represented in binary, one bit per coefficient,
67
- with the lowest powers in the most significant bit. Then adding polynomials
256
+ with the lowest powers in the most significant bit. Then adding polynomials
68
257
  is just exclusive-or, and multiplying a polynomial by x is a right shift by
69
- one. If we call the above polynomial p, and represent a byte as the
258
+ one. If we call the above polynomial p, and represent a byte as the
70
259
  polynomial q, also with the lowest power in the most significant bit (so the
71
- byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
260
+ byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
72
261
  where a mod b means the remainder after dividing a by b.
73
262
 
74
263
  This calculation is done using the shift-register method of multiplying and
75
- taking the remainder. The register is initialized to zero, and for each
264
+ taking the remainder. The register is initialized to zero, and for each
76
265
  incoming bit, x^32 is added mod p to the register if the bit is a one (where
77
- x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
78
- x (which is shifting right by one and adding x^32 mod p if the bit shifted
79
- out is a one). We start with the highest power (least significant bit) of
80
- q and repeat for all eight bits of q.
81
-
82
- The first table is simply the CRC of all possible eight bit values. This is
83
- all the information needed to generate CRCs on data a byte at a time for all
84
- combinations of CRC register values and incoming bytes. The remaining tables
85
- allow for word-at-a-time CRC calculation for both big-endian and little-
86
- endian machines, where a word is four bytes.
87
- */
266
+ x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
267
+ (which is shifting right by one and adding x^32 mod p if the bit shifted out
268
+ is a one). We start with the highest power (least significant bit) of q and
269
+ repeat for all eight bits of q.
270
+
271
+ The table is simply the CRC of all possible eight bit values. This is all the
272
+ information needed to generate CRCs on data a byte at a time for all
273
+ combinations of CRC register values and incoming bytes.
274
+ */
275
+
88
276
  local void make_crc_table()
89
277
  {
90
- z_crc_t c;
91
- int n, k;
92
- z_crc_t poly; /* polynomial exclusive-or pattern */
93
- /* terms of polynomial defining this crc (except x^32): */
94
- static volatile int first = 1; /* flag to limit concurrent making */
95
- static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
96
-
97
- /* See if another task is already doing this (not thread-safe, but better
98
- than nothing -- significantly reduces duration of vulnerability in
99
- case the advice about DYNAMIC_CRC_TABLE is ignored) */
100
- if (first) {
101
- first = 0;
102
-
103
- /* make exclusive-or pattern from polynomial (0xedb88320UL) */
104
- poly = 0;
105
- for (n = 0; n < (int)(sizeof(p)/sizeof(unsigned char)); n++)
106
- poly |= (z_crc_t)1 << (31 - p[n]);
107
-
108
- /* generate a crc for every 8-bit value */
109
- for (n = 0; n < 256; n++) {
110
- c = (z_crc_t)n;
111
- for (k = 0; k < 8; k++)
112
- c = c & 1 ? poly ^ (c >> 1) : c >> 1;
113
- crc_table[0][n] = c;
114
- }
278
+ unsigned i, j, n;
279
+ z_crc_t p;
115
280
 
116
- #ifdef BYFOUR
117
- /* generate crc for each value followed by one, two, and three zeros,
118
- and then the byte reversal of those as well as the first table */
119
- for (n = 0; n < 256; n++) {
120
- c = crc_table[0][n];
121
- crc_table[4][n] = ZSWAP32(c);
122
- for (k = 1; k < 4; k++) {
123
- c = crc_table[0][c & 0xff] ^ (c >> 8);
124
- crc_table[k][n] = c;
125
- crc_table[k + 4][n] = ZSWAP32(c);
126
- }
127
- }
128
- #endif /* BYFOUR */
129
-
130
- crc_table_empty = 0;
131
- }
132
- else { /* not first */
133
- /* wait for the other guy to finish (not efficient, but rare) */
134
- while (crc_table_empty)
135
- ;
281
+ /* initialize the CRC of bytes tables */
282
+ for (i = 0; i < 256; i++) {
283
+ p = i;
284
+ for (j = 0; j < 8; j++)
285
+ p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
286
+ crc_table[i] = p;
287
+ #ifdef W
288
+ crc_big_table[i] = byte_swap(p);
289
+ #endif
136
290
  }
137
291
 
292
+ /* initialize the x^2^n mod p(x) table */
293
+ p = (z_crc_t)1 << 30; /* x^1 */
294
+ x2n_table[0] = p;
295
+ for (n = 1; n < 32; n++)
296
+ x2n_table[n] = p = multmodp(p, p);
297
+
298
+ #ifdef W
299
+ /* initialize the braiding tables -- needs x2n_table[] */
300
+ braid(crc_braid_table, crc_braid_big_table, N, W);
301
+ #endif
302
+
138
303
  #ifdef MAKECRCH
139
- /* write out CRC tables to crc32.h */
140
304
  {
305
+ /*
306
+ The crc32.h header file contains tables for both 32-bit and 64-bit
307
+ z_word_t's, and so requires a 64-bit type be available. In that case,
308
+ z_word_t must be defined to be 64-bits. This code then also generates
309
+ and writes out the tables for the case that z_word_t is 32 bits.
310
+ */
311
+ #if !defined(W) || W != 8
312
+ # error Need a 64-bit integer type in order to generate crc32.h.
313
+ #endif
141
314
  FILE *out;
315
+ int k, n;
316
+ z_crc_t ltl[8][256];
317
+ z_word_t big[8][256];
142
318
 
143
319
  out = fopen("crc32.h", "w");
144
320
  if (out == NULL) return;
145
- fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
146
- fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
147
- fprintf(out, "local const z_crc_t FAR ");
148
- fprintf(out, "crc_table[TBLS][256] =\n{\n {\n");
149
- write_table(out, crc_table[0]);
150
- # ifdef BYFOUR
151
- fprintf(out, "#ifdef BYFOUR\n");
152
- for (k = 1; k < 8; k++) {
153
- fprintf(out, " },\n {\n");
154
- write_table(out, crc_table[k]);
321
+
322
+ /* write out little-endian CRC table to crc32.h */
323
+ fprintf(out,
324
+ "/* crc32.h -- tables for rapid CRC calculation\n"
325
+ " * Generated automatically by crc32.c\n */\n"
326
+ "\n"
327
+ "local const z_crc_t FAR crc_table[] = {\n"
328
+ " ");
329
+ write_table(out, crc_table, 256);
330
+ fprintf(out,
331
+ "};\n");
332
+
333
+ /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
334
+ fprintf(out,
335
+ "\n"
336
+ "#ifdef W\n"
337
+ "\n"
338
+ "#if W == 8\n"
339
+ "\n"
340
+ "local const z_word_t FAR crc_big_table[] = {\n"
341
+ " ");
342
+ write_table64(out, crc_big_table, 256);
343
+ fprintf(out,
344
+ "};\n");
345
+
346
+ /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
347
+ fprintf(out,
348
+ "\n"
349
+ "#else /* W == 4 */\n"
350
+ "\n"
351
+ "local const z_word_t FAR crc_big_table[] = {\n"
352
+ " ");
353
+ write_table32hi(out, crc_big_table, 256);
354
+ fprintf(out,
355
+ "};\n"
356
+ "\n"
357
+ "#endif\n");
358
+
359
+ /* write out braid tables for each value of N */
360
+ for (n = 1; n <= 6; n++) {
361
+ fprintf(out,
362
+ "\n"
363
+ "#if N == %d\n", n);
364
+
365
+ /* compute braid tables for this N and 64-bit z_word_t */
366
+ braid(ltl, big, n, 8);
367
+
368
+ /* write out braid tables for 64-bit z_word_t to crc32.h */
369
+ fprintf(out,
370
+ "\n"
371
+ "#if W == 8\n"
372
+ "\n"
373
+ "local const z_crc_t FAR crc_braid_table[][256] = {\n");
374
+ for (k = 0; k < 8; k++) {
375
+ fprintf(out, " {");
376
+ write_table(out, ltl[k], 256);
377
+ fprintf(out, "}%s", k < 7 ? ",\n" : "");
378
+ }
379
+ fprintf(out,
380
+ "};\n"
381
+ "\n"
382
+ "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
383
+ for (k = 0; k < 8; k++) {
384
+ fprintf(out, " {");
385
+ write_table64(out, big[k], 256);
386
+ fprintf(out, "}%s", k < 7 ? ",\n" : "");
387
+ }
388
+ fprintf(out,
389
+ "};\n");
390
+
391
+ /* compute braid tables for this N and 32-bit z_word_t */
392
+ braid(ltl, big, n, 4);
393
+
394
+ /* write out braid tables for 32-bit z_word_t to crc32.h */
395
+ fprintf(out,
396
+ "\n"
397
+ "#else /* W == 4 */\n"
398
+ "\n"
399
+ "local const z_crc_t FAR crc_braid_table[][256] = {\n");
400
+ for (k = 0; k < 4; k++) {
401
+ fprintf(out, " {");
402
+ write_table(out, ltl[k], 256);
403
+ fprintf(out, "}%s", k < 3 ? ",\n" : "");
404
+ }
405
+ fprintf(out,
406
+ "};\n"
407
+ "\n"
408
+ "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
409
+ for (k = 0; k < 4; k++) {
410
+ fprintf(out, " {");
411
+ write_table32hi(out, big[k], 256);
412
+ fprintf(out, "}%s", k < 3 ? ",\n" : "");
413
+ }
414
+ fprintf(out,
415
+ "};\n"
416
+ "\n"
417
+ "#endif\n"
418
+ "\n"
419
+ "#endif\n");
155
420
  }
156
- fprintf(out, "#endif\n");
157
- # endif /* BYFOUR */
158
- fprintf(out, " }\n};\n");
421
+ fprintf(out,
422
+ "\n"
423
+ "#endif\n");
424
+
425
+ /* write out zeros operator table to crc32.h */
426
+ fprintf(out,
427
+ "\n"
428
+ "local const z_crc_t FAR x2n_table[] = {\n"
429
+ " ");
430
+ write_table(out, x2n_table, 32);
431
+ fprintf(out,
432
+ "};\n");
159
433
  fclose(out);
160
434
  }
161
435
  #endif /* MAKECRCH */
162
436
  }
163
437
 
164
438
  #ifdef MAKECRCH
165
- local void write_table(out, table)
439
+
440
+ /*
441
+ Write the 32-bit values in table[0..k-1] to out, five per line in
442
+ hexadecimal separated by commas.
443
+ */
444
+ local void write_table(out, table, k)
166
445
  FILE *out;
167
446
  const z_crc_t FAR *table;
447
+ int k;
168
448
  {
169
449
  int n;
170
450
 
171
- for (n = 0; n < 256; n++)
172
- fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ",
451
+ for (n = 0; n < k; n++)
452
+ fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
173
453
  (unsigned long)(table[n]),
174
- n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
454
+ n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
175
455
  }
456
+
457
+ /*
458
+ Write the high 32-bits of each value in table[0..k-1] to out, five per line
459
+ in hexadecimal separated by commas.
460
+ */
461
+ local void write_table32hi(out, table, k)
462
+ FILE *out;
463
+ const z_word_t FAR *table;
464
+ int k;
465
+ {
466
+ int n;
467
+
468
+ for (n = 0; n < k; n++)
469
+ fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
470
+ (unsigned long)(table[n] >> 32),
471
+ n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
472
+ }
473
+
474
+ /*
475
+ Write the 64-bit values in table[0..k-1] to out, three per line in
476
+ hexadecimal separated by commas. This assumes that if there is a 64-bit
477
+ type, then there is also a long long integer type, and it is at least 64
478
+ bits. If not, then the type cast and format string can be adjusted
479
+ accordingly.
480
+ */
481
+ local void write_table64(out, table, k)
482
+ FILE *out;
483
+ const z_word_t FAR *table;
484
+ int k;
485
+ {
486
+ int n;
487
+
488
+ for (n = 0; n < k; n++)
489
+ fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : " ",
490
+ (unsigned long long)(table[n]),
491
+ n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
492
+ }
493
+
494
+ /* Actually do the deed. */
495
+ int main()
496
+ {
497
+ make_crc_table();
498
+ return 0;
499
+ }
500
+
176
501
  #endif /* MAKECRCH */
177
502
 
503
+ #ifdef W
504
+ /*
505
+ Generate the little and big-endian braid tables for the given n and z_word_t
506
+ size w. Each array must have room for w blocks of 256 elements.
507
+ */
508
+ local void braid(ltl, big, n, w)
509
+ z_crc_t ltl[][256];
510
+ z_word_t big[][256];
511
+ int n;
512
+ int w;
513
+ {
514
+ int k;
515
+ z_crc_t i, p, q;
516
+ for (k = 0; k < w; k++) {
517
+ p = x2nmodp((n * w + 3 - k) << 3, 0);
518
+ ltl[k][0] = 0;
519
+ big[w - 1 - k][0] = 0;
520
+ for (i = 1; i < 256; i++) {
521
+ ltl[k][i] = q = multmodp(i << 24, p);
522
+ big[w - 1 - k][i] = byte_swap(q);
523
+ }
524
+ }
525
+ }
526
+ #endif
527
+
178
528
  #else /* !DYNAMIC_CRC_TABLE */
179
529
  /* ========================================================================
180
- * Tables of CRC-32s of all single-byte values, made by make_crc_table().
530
+ * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
531
+ * of x for combining CRC-32s, all made by make_crc_table().
181
532
  */
182
533
  #include "crc32.h"
183
534
  #endif /* DYNAMIC_CRC_TABLE */
184
535
 
536
+ /* ========================================================================
537
+ * Routines used for CRC calculation. Some are also required for the table
538
+ * generation above.
539
+ */
540
+
541
+ /*
542
+ Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
543
+ reflected. For speed, this requires that a not be zero.
544
+ */
545
+ local z_crc_t multmodp(a, b)
546
+ z_crc_t a;
547
+ z_crc_t b;
548
+ {
549
+ z_crc_t m, p;
550
+
551
+ m = (z_crc_t)1 << 31;
552
+ p = 0;
553
+ for (;;) {
554
+ if (a & m) {
555
+ p ^= b;
556
+ if ((a & (m - 1)) == 0)
557
+ break;
558
+ }
559
+ m >>= 1;
560
+ b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
561
+ }
562
+ return p;
563
+ }
564
+
565
+ /*
566
+ Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
567
+ initialized.
568
+ */
569
+ local z_crc_t x2nmodp(n, k)
570
+ z_off64_t n;
571
+ unsigned k;
572
+ {
573
+ z_crc_t p;
574
+
575
+ p = (z_crc_t)1 << 31; /* x^0 == 1 */
576
+ while (n) {
577
+ if (n & 1)
578
+ p = multmodp(x2n_table[k & 31], p);
579
+ n >>= 1;
580
+ k++;
581
+ }
582
+ return p;
583
+ }
584
+
185
585
  /* =========================================================================
186
- * This function can be used by asm versions of crc32()
586
+ * This function can be used by asm versions of crc32(), and to force the
587
+ * generation of the CRC tables in a threaded application.
187
588
  */
188
589
  const z_crc_t FAR * ZEXPORT get_crc_table()
189
590
  {
190
591
  #ifdef DYNAMIC_CRC_TABLE
191
- if (crc_table_empty)
192
- make_crc_table();
592
+ once(&made, make_crc_table);
193
593
  #endif /* DYNAMIC_CRC_TABLE */
194
594
  return (const z_crc_t FAR *)crc_table;
195
595
  }
196
596
 
197
- /* ========================================================================= */
198
- #define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
199
- #define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
597
+ /* =========================================================================
598
+ * Use ARM machine instructions if available. This will compute the CRC about
599
+ * ten times faster than the braided calculation. This code does not check for
600
+ * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
601
+ * only be defined if the compilation specifies an ARM processor architecture
602
+ * that has the instructions. For example, compiling with -march=armv8.1-a or
603
+ * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
604
+ * instructions.
605
+ */
606
+ #ifdef ARMCRC32
607
+
608
+ /*
609
+ Constants empirically determined to maximize speed. These values are from
610
+ measurements on a Cortex-A57. Your mileage may vary.
611
+ */
612
+ #define Z_BATCH 3990 /* number of words in a batch */
613
+ #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
614
+ #define Z_BATCH_MIN 800 /* fewest words in a final batch */
200
615
 
201
- /* ========================================================================= */
202
616
  unsigned long ZEXPORT crc32_z(crc, buf, len)
203
617
  unsigned long crc;
204
618
  const unsigned char FAR *buf;
205
619
  z_size_t len;
206
620
  {
207
- if (buf == Z_NULL) return 0UL;
621
+ z_crc_t val;
622
+ z_word_t crc1, crc2;
623
+ const z_word_t *word;
624
+ z_word_t val0, val1, val2;
625
+ z_size_t last, last2, i;
626
+ z_size_t num;
627
+
628
+ /* Return initial CRC, if requested. */
629
+ if (buf == Z_NULL) return 0;
208
630
 
209
631
  #ifdef DYNAMIC_CRC_TABLE
210
- if (crc_table_empty)
211
- make_crc_table();
632
+ once(&made, make_crc_table);
212
633
  #endif /* DYNAMIC_CRC_TABLE */
213
634
 
214
- #ifdef BYFOUR
215
- if (sizeof(void *) == sizeof(ptrdiff_t)) {
216
- z_crc_t endian;
635
+ /* Pre-condition the CRC */
636
+ crc ^= 0xffffffff;
217
637
 
218
- endian = 1;
219
- if (*((unsigned char *)(&endian)))
220
- return crc32_little(crc, buf, len);
221
- else
222
- return crc32_big(crc, buf, len);
638
+ /* Compute the CRC up to a word boundary. */
639
+ while (len && ((z_size_t)buf & 7) != 0) {
640
+ len--;
641
+ val = *buf++;
642
+ __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
223
643
  }
224
- #endif /* BYFOUR */
225
- crc = crc ^ 0xffffffffUL;
226
- while (len >= 8) {
227
- DO8;
228
- len -= 8;
644
+
645
+ /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
646
+ word = (z_word_t const *)buf;
647
+ num = len >> 3;
648
+ len &= 7;
649
+
650
+ /* Do three interleaved CRCs to realize the throughput of one crc32x
651
+ instruction per cycle. Each CRC is calculated on Z_BATCH words. The three
652
+ CRCs are combined into a single CRC after each set of batches. */
653
+ while (num >= 3 * Z_BATCH) {
654
+ crc1 = 0;
655
+ crc2 = 0;
656
+ for (i = 0; i < Z_BATCH; i++) {
657
+ val0 = word[i];
658
+ val1 = word[i + Z_BATCH];
659
+ val2 = word[i + 2 * Z_BATCH];
660
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
661
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
662
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
663
+ }
664
+ word += 3 * Z_BATCH;
665
+ num -= 3 * Z_BATCH;
666
+ crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
667
+ crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
229
668
  }
230
- if (len) do {
231
- DO1;
232
- } while (--len);
233
- return crc ^ 0xffffffffUL;
234
- }
235
669
 
236
- /* ========================================================================= */
237
- unsigned long ZEXPORT crc32(crc, buf, len)
238
- unsigned long crc;
239
- const unsigned char FAR *buf;
240
- uInt len;
241
- {
242
- return crc32_z(crc, buf, len);
670
+ /* Do one last smaller batch with the remaining words, if there are enough
671
+ to pay for the combination of CRCs. */
672
+ last = num / 3;
673
+ if (last >= Z_BATCH_MIN) {
674
+ last2 = last << 1;
675
+ crc1 = 0;
676
+ crc2 = 0;
677
+ for (i = 0; i < last; i++) {
678
+ val0 = word[i];
679
+ val1 = word[i + last];
680
+ val2 = word[i + last2];
681
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
682
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
683
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
684
+ }
685
+ word += 3 * last;
686
+ num -= 3 * last;
687
+ val = x2nmodp(last, 6);
688
+ crc = multmodp(val, crc) ^ crc1;
689
+ crc = multmodp(val, crc) ^ crc2;
690
+ }
691
+
692
+ /* Compute the CRC on any remaining words. */
693
+ for (i = 0; i < num; i++) {
694
+ val0 = word[i];
695
+ __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
696
+ }
697
+ word += num;
698
+
699
+ /* Complete the CRC on any remaining bytes. */
700
+ buf = (const unsigned char FAR *)word;
701
+ while (len) {
702
+ len--;
703
+ val = *buf++;
704
+ __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
705
+ }
706
+
707
+ /* Return the CRC, post-conditioned. */
708
+ return crc ^ 0xffffffff;
243
709
  }
244
710
 
245
- #ifdef BYFOUR
711
+ #else
712
+
713
+ #ifdef W
714
+
715
+ local z_crc_t crc_word(z_word_t data);
716
+ local z_word_t crc_word_big(z_word_t data);
246
717
 
247
718
  /*
248
- This BYFOUR code accesses the passed unsigned char * buffer with a 32-bit
249
- integer pointer type. This violates the strict aliasing rule, where a
250
- compiler can assume, for optimization purposes, that two pointers to
251
- fundamentally different types won't ever point to the same memory. This can
252
- manifest as a problem only if one of the pointers is written to. This code
253
- only reads from those pointers. So long as this code remains isolated in
254
- this compilation unit, there won't be a problem. For this reason, this code
255
- should not be copied and pasted into a compilation unit in which other code
256
- writes to the buffer that is passed to these routines.
719
+ Return the CRC of the W bytes in the z_word_t data, taking the
720
+ least-significant byte of the word as the first byte of data, without any pre
721
+ or post conditioning. This is used to combine the CRCs of each braid.
257
722
  */
723
+ local z_crc_t crc_word(data)
724
+ z_word_t data;
725
+ {
726
+ int k;
727
+ for (k = 0; k < W; k++)
728
+ data = (data >> 8) ^ crc_table[data & 0xff];
729
+ return (z_crc_t)data;
730
+ }
258
731
 
259
- /* ========================================================================= */
260
- #define DOLIT4 c ^= *buf4++; \
261
- c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
262
- crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
263
- #define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
732
+ local z_word_t crc_word_big(data)
733
+ z_word_t data;
734
+ {
735
+ int k;
736
+ for (k = 0; k < W; k++)
737
+ data = (data << 8) ^
738
+ crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
739
+ return data;
740
+ }
741
+
742
+ #endif
264
743
 
265
744
  /* ========================================================================= */
266
- local unsigned long crc32_little(crc, buf, len)
745
+ unsigned long ZEXPORT crc32_z(crc, buf, len)
267
746
  unsigned long crc;
268
747
  const unsigned char FAR *buf;
269
748
  z_size_t len;
270
749
  {
271
- register z_crc_t c;
272
- register const z_crc_t FAR *buf4;
750
+ /* Return initial CRC, if requested. */
751
+ if (buf == Z_NULL) return 0;
273
752
 
274
- c = (z_crc_t)crc;
275
- c = ~c;
276
- while (len && ((ptrdiff_t)buf & 3)) {
277
- c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
278
- len--;
279
- }
753
+ #ifdef DYNAMIC_CRC_TABLE
754
+ once(&made, make_crc_table);
755
+ #endif /* DYNAMIC_CRC_TABLE */
280
756
 
281
- buf4 = (const z_crc_t FAR *)(const void FAR *)buf;
282
- while (len >= 32) {
283
- DOLIT32;
284
- len -= 32;
285
- }
286
- while (len >= 4) {
287
- DOLIT4;
288
- len -= 4;
289
- }
290
- buf = (const unsigned char FAR *)buf4;
757
+ /* Pre-condition the CRC */
758
+ crc ^= 0xffffffff;
291
759
 
292
- if (len) do {
293
- c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
294
- } while (--len);
295
- c = ~c;
296
- return (unsigned long)c;
297
- }
760
+ #ifdef W
298
761
 
299
- /* ========================================================================= */
300
- #define DOBIG4 c ^= *buf4++; \
301
- c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
302
- crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
303
- #define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
762
+ /* If provided enough bytes, do a braided CRC calculation. */
763
+ if (len >= N * W + W - 1) {
764
+ z_size_t blks;
765
+ z_word_t const *words;
766
+ unsigned endian;
767
+ int k;
304
768
 
305
- /* ========================================================================= */
306
- local unsigned long crc32_big(crc, buf, len)
307
- unsigned long crc;
308
- const unsigned char FAR *buf;
309
- z_size_t len;
310
- {
311
- register z_crc_t c;
312
- register const z_crc_t FAR *buf4;
769
+ /* Compute the CRC up to a z_word_t boundary. */
770
+ while (len && ((z_size_t)buf & (W - 1)) != 0) {
771
+ len--;
772
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
773
+ }
313
774
 
314
- c = ZSWAP32((z_crc_t)crc);
315
- c = ~c;
316
- while (len && ((ptrdiff_t)buf & 3)) {
317
- c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
318
- len--;
775
+ /* Compute the CRC on as many N z_word_t blocks as are available. */
776
+ blks = len / (N * W);
777
+ len -= blks * N * W;
778
+ words = (z_word_t const *)buf;
779
+
780
+ /* Do endian check at execution time instead of compile time, since ARM
781
+ processors can change the endianess at execution time. If the
782
+ compiler knows what the endianess will be, it can optimize out the
783
+ check and the unused branch. */
784
+ endian = 1;
785
+ if (*(unsigned char *)&endian) {
786
+ /* Little endian. */
787
+
788
+ z_crc_t crc0;
789
+ z_word_t word0;
790
+ #if N > 1
791
+ z_crc_t crc1;
792
+ z_word_t word1;
793
+ #if N > 2
794
+ z_crc_t crc2;
795
+ z_word_t word2;
796
+ #if N > 3
797
+ z_crc_t crc3;
798
+ z_word_t word3;
799
+ #if N > 4
800
+ z_crc_t crc4;
801
+ z_word_t word4;
802
+ #if N > 5
803
+ z_crc_t crc5;
804
+ z_word_t word5;
805
+ #endif
806
+ #endif
807
+ #endif
808
+ #endif
809
+ #endif
810
+
811
+ /* Initialize the CRC for each braid. */
812
+ crc0 = crc;
813
+ #if N > 1
814
+ crc1 = 0;
815
+ #if N > 2
816
+ crc2 = 0;
817
+ #if N > 3
818
+ crc3 = 0;
819
+ #if N > 4
820
+ crc4 = 0;
821
+ #if N > 5
822
+ crc5 = 0;
823
+ #endif
824
+ #endif
825
+ #endif
826
+ #endif
827
+ #endif
828
+
829
+ /*
830
+ Process the first blks-1 blocks, computing the CRCs on each braid
831
+ independently.
832
+ */
833
+ while (--blks) {
834
+ /* Load the word for each braid into registers. */
835
+ word0 = crc0 ^ words[0];
836
+ #if N > 1
837
+ word1 = crc1 ^ words[1];
838
+ #if N > 2
839
+ word2 = crc2 ^ words[2];
840
+ #if N > 3
841
+ word3 = crc3 ^ words[3];
842
+ #if N > 4
843
+ word4 = crc4 ^ words[4];
844
+ #if N > 5
845
+ word5 = crc5 ^ words[5];
846
+ #endif
847
+ #endif
848
+ #endif
849
+ #endif
850
+ #endif
851
+ words += N;
852
+
853
+ /* Compute and update the CRC for each word. The loop should
854
+ get unrolled. */
855
+ crc0 = crc_braid_table[0][word0 & 0xff];
856
+ #if N > 1
857
+ crc1 = crc_braid_table[0][word1 & 0xff];
858
+ #if N > 2
859
+ crc2 = crc_braid_table[0][word2 & 0xff];
860
+ #if N > 3
861
+ crc3 = crc_braid_table[0][word3 & 0xff];
862
+ #if N > 4
863
+ crc4 = crc_braid_table[0][word4 & 0xff];
864
+ #if N > 5
865
+ crc5 = crc_braid_table[0][word5 & 0xff];
866
+ #endif
867
+ #endif
868
+ #endif
869
+ #endif
870
+ #endif
871
+ for (k = 1; k < W; k++) {
872
+ crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
873
+ #if N > 1
874
+ crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
875
+ #if N > 2
876
+ crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
877
+ #if N > 3
878
+ crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
879
+ #if N > 4
880
+ crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
881
+ #if N > 5
882
+ crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
883
+ #endif
884
+ #endif
885
+ #endif
886
+ #endif
887
+ #endif
888
+ }
889
+ }
890
+
891
+ /*
892
+ Process the last block, combining the CRCs of the N braids at the
893
+ same time.
894
+ */
895
+ crc = crc_word(crc0 ^ words[0]);
896
+ #if N > 1
897
+ crc = crc_word(crc1 ^ words[1] ^ crc);
898
+ #if N > 2
899
+ crc = crc_word(crc2 ^ words[2] ^ crc);
900
+ #if N > 3
901
+ crc = crc_word(crc3 ^ words[3] ^ crc);
902
+ #if N > 4
903
+ crc = crc_word(crc4 ^ words[4] ^ crc);
904
+ #if N > 5
905
+ crc = crc_word(crc5 ^ words[5] ^ crc);
906
+ #endif
907
+ #endif
908
+ #endif
909
+ #endif
910
+ #endif
911
+ words += N;
912
+ }
913
+ else {
914
+ /* Big endian. */
915
+
916
+ z_word_t crc0, word0, comb;
917
+ #if N > 1
918
+ z_word_t crc1, word1;
919
+ #if N > 2
920
+ z_word_t crc2, word2;
921
+ #if N > 3
922
+ z_word_t crc3, word3;
923
+ #if N > 4
924
+ z_word_t crc4, word4;
925
+ #if N > 5
926
+ z_word_t crc5, word5;
927
+ #endif
928
+ #endif
929
+ #endif
930
+ #endif
931
+ #endif
932
+
933
+ /* Initialize the CRC for each braid. */
934
+ crc0 = byte_swap(crc);
935
+ #if N > 1
936
+ crc1 = 0;
937
+ #if N > 2
938
+ crc2 = 0;
939
+ #if N > 3
940
+ crc3 = 0;
941
+ #if N > 4
942
+ crc4 = 0;
943
+ #if N > 5
944
+ crc5 = 0;
945
+ #endif
946
+ #endif
947
+ #endif
948
+ #endif
949
+ #endif
950
+
951
+ /*
952
+ Process the first blks-1 blocks, computing the CRCs on each braid
953
+ independently.
954
+ */
955
+ while (--blks) {
956
+ /* Load the word for each braid into registers. */
957
+ word0 = crc0 ^ words[0];
958
+ #if N > 1
959
+ word1 = crc1 ^ words[1];
960
+ #if N > 2
961
+ word2 = crc2 ^ words[2];
962
+ #if N > 3
963
+ word3 = crc3 ^ words[3];
964
+ #if N > 4
965
+ word4 = crc4 ^ words[4];
966
+ #if N > 5
967
+ word5 = crc5 ^ words[5];
968
+ #endif
969
+ #endif
970
+ #endif
971
+ #endif
972
+ #endif
973
+ words += N;
974
+
975
+ /* Compute and update the CRC for each word. The loop should
976
+ get unrolled. */
977
+ crc0 = crc_braid_big_table[0][word0 & 0xff];
978
+ #if N > 1
979
+ crc1 = crc_braid_big_table[0][word1 & 0xff];
980
+ #if N > 2
981
+ crc2 = crc_braid_big_table[0][word2 & 0xff];
982
+ #if N > 3
983
+ crc3 = crc_braid_big_table[0][word3 & 0xff];
984
+ #if N > 4
985
+ crc4 = crc_braid_big_table[0][word4 & 0xff];
986
+ #if N > 5
987
+ crc5 = crc_braid_big_table[0][word5 & 0xff];
988
+ #endif
989
+ #endif
990
+ #endif
991
+ #endif
992
+ #endif
993
+ for (k = 1; k < W; k++) {
994
+ crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
995
+ #if N > 1
996
+ crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
997
+ #if N > 2
998
+ crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
999
+ #if N > 3
1000
+ crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1001
+ #if N > 4
1002
+ crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1003
+ #if N > 5
1004
+ crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1005
+ #endif
1006
+ #endif
1007
+ #endif
1008
+ #endif
1009
+ #endif
1010
+ }
1011
+ }
1012
+
1013
+ /*
1014
+ Process the last block, combining the CRCs of the N braids at the
1015
+ same time.
1016
+ */
1017
+ comb = crc_word_big(crc0 ^ words[0]);
1018
+ #if N > 1
1019
+ comb = crc_word_big(crc1 ^ words[1] ^ comb);
1020
+ #if N > 2
1021
+ comb = crc_word_big(crc2 ^ words[2] ^ comb);
1022
+ #if N > 3
1023
+ comb = crc_word_big(crc3 ^ words[3] ^ comb);
1024
+ #if N > 4
1025
+ comb = crc_word_big(crc4 ^ words[4] ^ comb);
1026
+ #if N > 5
1027
+ comb = crc_word_big(crc5 ^ words[5] ^ comb);
1028
+ #endif
1029
+ #endif
1030
+ #endif
1031
+ #endif
1032
+ #endif
1033
+ words += N;
1034
+ crc = byte_swap(comb);
1035
+ }
1036
+
1037
+ /*
1038
+ Update the pointer to the remaining bytes to process.
1039
+ */
1040
+ buf = (unsigned char const *)words;
319
1041
  }
320
1042
 
321
- buf4 = (const z_crc_t FAR *)(const void FAR *)buf;
322
- while (len >= 32) {
323
- DOBIG32;
324
- len -= 32;
1043
+ #endif /* W */
1044
+
1045
+ /* Complete the computation of the CRC on any remaining bytes. */
1046
+ while (len >= 8) {
1047
+ len -= 8;
1048
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1049
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1050
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1051
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1052
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1053
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1055
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
325
1056
  }
326
- while (len >= 4) {
327
- DOBIG4;
328
- len -= 4;
1057
+ while (len) {
1058
+ len--;
1059
+ crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
329
1060
  }
330
- buf = (const unsigned char FAR *)buf4;
331
1061
 
332
- if (len) do {
333
- c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
334
- } while (--len);
335
- c = ~c;
336
- return (unsigned long)(ZSWAP32(c));
1062
+ /* Return the CRC, post-conditioned. */
1063
+ return crc ^ 0xffffffff;
337
1064
  }
338
1065
 
339
- #endif /* BYFOUR */
340
-
341
- #define GF2_DIM 32 /* dimension of GF(2) vectors (length of CRC) */
1066
+ #endif
342
1067
 
343
1068
  /* ========================================================================= */
344
- local unsigned long gf2_matrix_times(mat, vec)
345
- unsigned long *mat;
346
- unsigned long vec;
1069
+ unsigned long ZEXPORT crc32(crc, buf, len)
1070
+ unsigned long crc;
1071
+ const unsigned char FAR *buf;
1072
+ uInt len;
347
1073
  {
348
- unsigned long sum;
349
-
350
- sum = 0;
351
- while (vec) {
352
- if (vec & 1)
353
- sum ^= *mat;
354
- vec >>= 1;
355
- mat++;
356
- }
357
- return sum;
1074
+ return crc32_z(crc, buf, len);
358
1075
  }
359
1076
 
360
1077
  /* ========================================================================= */
361
- local void gf2_matrix_square(square, mat)
362
- unsigned long *square;
363
- unsigned long *mat;
1078
+ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1079
+ uLong crc1;
1080
+ uLong crc2;
1081
+ z_off64_t len2;
364
1082
  {
365
- int n;
366
-
367
- for (n = 0; n < GF2_DIM; n++)
368
- square[n] = gf2_matrix_times(mat, mat[n]);
1083
+ #ifdef DYNAMIC_CRC_TABLE
1084
+ once(&made, make_crc_table);
1085
+ #endif /* DYNAMIC_CRC_TABLE */
1086
+ return multmodp(x2nmodp(len2, 3), crc1) ^ crc2;
369
1087
  }
370
1088
 
371
1089
  /* ========================================================================= */
372
- local uLong crc32_combine_(crc1, crc2, len2)
1090
+ uLong ZEXPORT crc32_combine(crc1, crc2, len2)
373
1091
  uLong crc1;
374
1092
  uLong crc2;
1093
+ z_off_t len2;
1094
+ {
1095
+ return crc32_combine64(crc1, crc2, len2);
1096
+ }
1097
+
1098
+ /* ========================================================================= */
1099
+ uLong ZEXPORT crc32_combine_gen64(len2)
375
1100
  z_off64_t len2;
376
1101
  {
377
- int n;
378
- unsigned long row;
379
- unsigned long even[GF2_DIM]; /* even-power-of-two zeros operator */
380
- unsigned long odd[GF2_DIM]; /* odd-power-of-two zeros operator */
381
-
382
- /* degenerate case (also disallow negative lengths) */
383
- if (len2 <= 0)
384
- return crc1;
385
-
386
- /* put operator for one zero bit in odd */
387
- odd[0] = 0xedb88320UL; /* CRC-32 polynomial */
388
- row = 1;
389
- for (n = 1; n < GF2_DIM; n++) {
390
- odd[n] = row;
391
- row <<= 1;
392
- }
1102
+ #ifdef DYNAMIC_CRC_TABLE
1103
+ once(&made, make_crc_table);
1104
+ #endif /* DYNAMIC_CRC_TABLE */
1105
+ return x2nmodp(len2, 3);
1106
+ }
393
1107
 
394
- /* put operator for two zero bits in even */
395
- gf2_matrix_square(even, odd);
396
-
397
- /* put operator for four zero bits in odd */
398
- gf2_matrix_square(odd, even);
399
-
400
- /* apply len2 zeros to crc1 (first square will put the operator for one
401
- zero byte, eight zero bits, in even) */
402
- do {
403
- /* apply zeros operator for this bit of len2 */
404
- gf2_matrix_square(even, odd);
405
- if (len2 & 1)
406
- crc1 = gf2_matrix_times(even, crc1);
407
- len2 >>= 1;
408
-
409
- /* if no more bits set, then done */
410
- if (len2 == 0)
411
- break;
412
-
413
- /* another iteration of the loop with odd and even swapped */
414
- gf2_matrix_square(odd, even);
415
- if (len2 & 1)
416
- crc1 = gf2_matrix_times(odd, crc1);
417
- len2 >>= 1;
418
-
419
- /* if no more bits set, then done */
420
- } while (len2 != 0);
421
-
422
- /* return combined crc */
423
- crc1 ^= crc2;
424
- return crc1;
1108
+ /* ========================================================================= */
1109
+ uLong ZEXPORT crc32_combine_gen(len2)
1110
+ z_off_t len2;
1111
+ {
1112
+ return crc32_combine_gen64(len2);
425
1113
  }
426
1114
 
427
1115
  /* ========================================================================= */
428
- uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1116
+ uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
429
1117
  uLong crc1;
430
1118
  uLong crc2;
431
- z_off_t len2;
1119
+ uLong op;
432
1120
  {
433
- return crc32_combine_(crc1, crc2, len2);
1121
+ return multmodp(op, crc1) ^ crc2;
434
1122
  }
435
-