numo-random 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,750 @@
1
+ /*
2
+ * PCG Random Number Generation for C++
3
+ *
4
+ * Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
5
+ *
6
+ * Licensed under the Apache License, Version 2.0 (the "License");
7
+ * you may not use this file except in compliance with the License.
8
+ * You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing, software
13
+ * distributed under the License is distributed on an "AS IS" BASIS,
14
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ * See the License for the specific language governing permissions and
16
+ * limitations under the License.
17
+ *
18
+ * For additional information about the PCG random number generation scheme,
19
+ * including its license and other licensing options, visit
20
+ *
21
+ * http://www.pcg-random.org
22
+ */
23
+
24
+ /*
25
+ * This code provides a C++ class that can provide 128-bit (or higher)
26
+ * integers. To produce 2K-bit integers, it uses two K-bit integers,
27
+ * placed in a union that allows the code to also see them as four K/2-bit
28
+ * integers (and access them either directly by name, or by index).
29
+ *
30
+ * It may seem like we're reinventing the wheel here, because several
31
+ * libraries already exist that support large integers, but most existing
32
+ * libraries provide a very generic multiprecision code, but here we're
33
+ * operating at a fixed size. Also, most other libraries are fairly
34
+ * heavyweight. So we use a direct implementation. Sadly, it's much slower
35
+ * than hand-coded assembly or direct CPU support.
36
+ */
37
+
38
+ #ifndef PCG_UINT128_HPP_INCLUDED
39
+ #define PCG_UINT128_HPP_INCLUDED 1
40
+
41
#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <initializer_list>
#include <type_traits>
#include <utility>
48
+
49
+ /*
50
+ * We want to lay the type out the same way that a native type would be laid
51
+ * out, which means we must know the machine's endian, at compile time.
52
+ * This ugliness attempts to do so.
53
+ */
54
+
55
#ifndef PCG_LITTLE_ENDIAN
    /*
     * Decide the byte order at preprocessing time.  A value supplied by the
     * build (e.g. -DPCG_LITTLE_ENDIAN=1) always wins.  Otherwise try, in
     * order: the GCC/Clang __BYTE_ORDER__ macro, the generic
     * *_LITTLE_ENDIAN / *_BIG_ENDIAN flags, and finally a short list of
     * architecture macros.  The architecture list includes the MSVC
     * spellings (_M_X64, _M_IX86, _M_ARM, _M_ARM64) so that compilers which
     * define none of the earlier macros still get an answer.
     */
    #if defined(__BYTE_ORDER__)
        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
            #define PCG_LITTLE_ENDIAN 1
        #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
            #define PCG_LITTLE_ENDIAN 0
        #else
            #error __BYTE_ORDER__ does not match a standard endian, pick a side
        #endif
    #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN
        #define PCG_LITTLE_ENDIAN 1
    #elif __BIG_ENDIAN__ || _BIG_ENDIAN
        #define PCG_LITTLE_ENDIAN 0
    #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86 \
          || _M_ARM || _M_ARM64
        #define PCG_LITTLE_ENDIAN 1
    #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \
          || __m68k__ || __mc68000__
        #define PCG_LITTLE_ENDIAN 0
    #else
        #error Unable to determine target endianness
    #endif
#endif
77
+
78
+ namespace pcg_extras {
79
+
80
+ // Recent versions of GCC have intrinsics we can use to quickly calculate
81
+ // the number of leading and trailing zeros in a number. If possible, we
82
+ // use them, otherwise we fall back to old-fashioned bit twiddling to figure
83
+ // them out.
84
+
85
// Type used for bit counts and shift amounts.  The build may substitute its
// own type by defining PCG_BITCOUNT_T; the default is uint8_t.
#ifdef PCG_BITCOUNT_T
typedef PCG_BITCOUNT_T bitcount_t;
#else
typedef uint8_t bitcount_t;
#endif

/*
 * Scalar bit-scanning helpers, overloaded for uint32_t and uint64_t:
 *
 *      flog2(v)          floor(log2(v)), i.e. index of the highest set bit
 *      trailingzeros(v)  number of zero bits below the lowest set bit
 */

#ifdef __GNUC__         // GNU-compatible compilers: use the bit-scan builtins

inline bitcount_t flog2(uint32_t v)
{
    const int leading_zeros = __builtin_clz(v);
    return 31 - leading_zeros;
}

inline bitcount_t trailingzeros(uint32_t v)
{
    const int low_zeros = __builtin_ctz(v);
    return low_zeros;
}

inline bitcount_t flog2(uint64_t v)
{
    // Pick whichever builtin takes an argument as wide as uint64_t.
#if UINT64_MAX == ULONG_MAX
    const int leading_zeros = __builtin_clzl(v);
#elif UINT64_MAX == ULLONG_MAX
    const int leading_zeros = __builtin_clzll(v);
#else
    #error Cannot find a function for uint64_t
#endif
    return 63 - leading_zeros;
}

inline bitcount_t trailingzeros(uint64_t v)
{
    // Pick whichever builtin takes an argument as wide as uint64_t.
#if UINT64_MAX == ULONG_MAX
    const int low_zeros = __builtin_ctzl(v);
#elif UINT64_MAX == ULLONG_MAX
    const int low_zeros = __builtin_ctzll(v);
#else
    #error Cannot find a function for uint64_t
#endif
    return low_zeros;
}

#else                   // No builtins: portable table-driven versions

inline bitcount_t flog2(uint32_t v)
{
    // De Bruijn lookup, after code by Eric Cole and Mark Dickinson, see
    // https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
    static const uint8_t bit_position[32] = {
         0,  9,  1, 10, 13, 21,  2, 29, 11, 14, 16, 18, 22, 25,  3, 30,
         8, 12, 20, 28, 15, 17, 24,  7, 19, 27, 23,  6, 26,  5,  4, 31
    };

    // Smear the top set bit into every lower position, leaving a value
    // of the form 2^k - 1.
    v |= v >> 1;
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;

    const uint32_t hashed = (uint32_t)(v * 0x07C4ACDDU);
    return bit_position[hashed >> 27];
}

inline bitcount_t trailingzeros(uint32_t v)
{
    static const uint8_t bit_position[32] = {
         0,  1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
        31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
    };

    // (v & -v) keeps only the lowest set bit.
    const uint32_t hashed = (uint32_t)((v & -v) * 0x077CB531U);
    return bit_position[hashed >> 27];
}

inline bitcount_t flog2(uint64_t v)
{
    const uint32_t upper = v >> 32;
    const uint32_t lower = uint32_t(v);

    if (upper)
        return 32 + flog2(upper);
    return flog2(lower);
}

inline bitcount_t trailingzeros(uint64_t v)
{
    const uint32_t upper = v >> 32;
    const uint32_t lower = uint32_t(v);

    if (lower)
        return trailingzeros(lower);
    return trailingzeros(upper) + 32;
}

#endif
181
+
182
+ template <typename UInt>
183
+ inline bitcount_t clog2(UInt v)
184
+ {
185
+ return flog2(v) + ((v & (-v)) != v);
186
+ }
187
+
188
// Computes x + y + carryin in UInt arithmetic (wrapping on overflow).
// *carryout is set to true when either the carry-in step or the main
// addition wrapped around, false otherwise.
template <typename UInt>
inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
{
    const UInt y_with_carry = y + carryin;
    const UInt sum = x + y_with_carry;

    const bool carry_step_wrapped = y_with_carry < y;
    const bool sum_wrapped = sum < x;
    *carryout = carry_step_wrapped || sum_wrapped;

    return sum;
}
196
+
197
// Computes x - y - carryin in UInt arithmetic (wrapping on underflow).
// *carryout is set to true when a borrow is needed: either adding the
// incoming borrow to y wrapped, or the subtraction itself wrapped.
template <typename UInt>
inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
{
    const UInt y_with_borrow = y + carryin;
    const UInt difference = x - y_with_borrow;

    const bool borrow_step_wrapped = y_with_borrow < y;
    const bool difference_wrapped = difference > x;
    *carryout = borrow_step_wrapped || difference_wrapped;

    return difference;
}
205
+
206
+
207
+ template <typename UInt, typename UIntX2>
208
+ class uint_x4 {
209
+ // private:
210
+ public:
211
+ union {
212
+ #if PCG_LITTLE_ENDIAN
213
+ struct {
214
+ UInt v0, v1, v2, v3;
215
+ } w;
216
+ struct {
217
+ UIntX2 v01, v23;
218
+ } d;
219
+ #else
220
+ struct {
221
+ UInt v3, v2, v1, v0;
222
+ } w;
223
+ struct {
224
+ UIntX2 v23, v01;
225
+ } d;
226
+ #endif
227
+ // For the array access versions, the code that uses the array
228
+ // must handle endian itself. Yuck.
229
+ UInt wa[4];
230
+ UIntX2 da[2];
231
+ };
232
+
233
+ public:
234
+ uint_x4() = default;
235
+
236
+ constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0)
237
+ #if PCG_LITTLE_ENDIAN
238
+ : w{v0, v1, v2, v3}
239
+ #else
240
+ : w{v3, v2, v1, v0}
241
+ #endif
242
+ {
243
+ // Nothing (else) to do
244
+ }
245
+
246
+ constexpr uint_x4(UIntX2 v23, UIntX2 v01)
247
+ #if PCG_LITTLE_ENDIAN
248
+ : d{v01,v23}
249
+ #else
250
+ : d{v23,v01}
251
+ #endif
252
+ {
253
+ // Nothing (else) to do
254
+ }
255
+
256
+ template<class Integral,
257
+ typename std::enable_if<(std::is_integral<Integral>::value
258
+ && sizeof(Integral) <= sizeof(UIntX2))
259
+ >::type* = nullptr>
260
+ constexpr uint_x4(Integral v01)
261
+ #if PCG_LITTLE_ENDIAN
262
+ : d{UIntX2(v01),0UL}
263
+ #else
264
+ : d{0UL,UIntX2(v01)}
265
+ #endif
266
+ {
267
+ // Nothing (else) to do
268
+ }
269
+
270
+ explicit constexpr operator uint64_t() const
271
+ {
272
+ return d.v01;
273
+ }
274
+
275
+ explicit constexpr operator uint32_t() const
276
+ {
277
+ return w.v0;
278
+ }
279
+
280
+ explicit constexpr operator int() const
281
+ {
282
+ return w.v0;
283
+ }
284
+
285
+ explicit constexpr operator uint16_t() const
286
+ {
287
+ return w.v0;
288
+ }
289
+
290
+ explicit constexpr operator uint8_t() const
291
+ {
292
+ return w.v0;
293
+ }
294
+
295
+ typedef typename std::conditional<std::is_same<uint64_t,
296
+ unsigned long>::value,
297
+ unsigned long long,
298
+ unsigned long>::type
299
+ uint_missing_t;
300
+
301
+ explicit constexpr operator uint_missing_t() const
302
+ {
303
+ return d.v01;
304
+ }
305
+
306
+ explicit constexpr operator bool() const
307
+ {
308
+ return d.v01 || d.v23;
309
+ }
310
+
311
+ template<typename U, typename V>
312
+ friend uint_x4<U,V> operator*(const uint_x4<U,V>&, const uint_x4<U,V>&);
313
+
314
+ template<typename U, typename V>
315
+ friend std::pair< uint_x4<U,V>,uint_x4<U,V> >
316
+ divmod(const uint_x4<U,V>&, const uint_x4<U,V>&);
317
+
318
+ template<typename U, typename V>
319
+ friend uint_x4<U,V> operator+(const uint_x4<U,V>&, const uint_x4<U,V>&);
320
+
321
+ template<typename U, typename V>
322
+ friend uint_x4<U,V> operator-(const uint_x4<U,V>&, const uint_x4<U,V>&);
323
+
324
+ template<typename U, typename V>
325
+ friend uint_x4<U,V> operator<<(const uint_x4<U,V>&, const uint_x4<U,V>&);
326
+
327
+ template<typename U, typename V>
328
+ friend uint_x4<U,V> operator>>(const uint_x4<U,V>&, const uint_x4<U,V>&);
329
+
330
+ template<typename U, typename V>
331
+ friend uint_x4<U,V> operator&(const uint_x4<U,V>&, const uint_x4<U,V>&);
332
+
333
+ template<typename U, typename V>
334
+ friend uint_x4<U,V> operator|(const uint_x4<U,V>&, const uint_x4<U,V>&);
335
+
336
+ template<typename U, typename V>
337
+ friend uint_x4<U,V> operator^(const uint_x4<U,V>&, const uint_x4<U,V>&);
338
+
339
+ template<typename U, typename V>
340
+ friend bool operator==(const uint_x4<U,V>&, const uint_x4<U,V>&);
341
+
342
+ template<typename U, typename V>
343
+ friend bool operator!=(const uint_x4<U,V>&, const uint_x4<U,V>&);
344
+
345
+ template<typename U, typename V>
346
+ friend bool operator<(const uint_x4<U,V>&, const uint_x4<U,V>&);
347
+
348
+ template<typename U, typename V>
349
+ friend bool operator<=(const uint_x4<U,V>&, const uint_x4<U,V>&);
350
+
351
+ template<typename U, typename V>
352
+ friend bool operator>(const uint_x4<U,V>&, const uint_x4<U,V>&);
353
+
354
+ template<typename U, typename V>
355
+ friend bool operator>=(const uint_x4<U,V>&, const uint_x4<U,V>&);
356
+
357
+ template<typename U, typename V>
358
+ friend uint_x4<U,V> operator~(const uint_x4<U,V>&);
359
+
360
+ template<typename U, typename V>
361
+ friend uint_x4<U,V> operator-(const uint_x4<U,V>&);
362
+
363
+ template<typename U, typename V>
364
+ friend bitcount_t flog2(const uint_x4<U,V>&);
365
+
366
+ template<typename U, typename V>
367
+ friend bitcount_t trailingzeros(const uint_x4<U,V>&);
368
+
369
+ uint_x4& operator*=(const uint_x4& rhs)
370
+ {
371
+ uint_x4 result = *this * rhs;
372
+ return *this = result;
373
+ }
374
+
375
+ uint_x4& operator/=(const uint_x4& rhs)
376
+ {
377
+ uint_x4 result = *this / rhs;
378
+ return *this = result;
379
+ }
380
+
381
+ uint_x4& operator%=(const uint_x4& rhs)
382
+ {
383
+ uint_x4 result = *this % rhs;
384
+ return *this = result;
385
+ }
386
+
387
+ uint_x4& operator+=(const uint_x4& rhs)
388
+ {
389
+ uint_x4 result = *this + rhs;
390
+ return *this = result;
391
+ }
392
+
393
+ uint_x4& operator-=(const uint_x4& rhs)
394
+ {
395
+ uint_x4 result = *this - rhs;
396
+ return *this = result;
397
+ }
398
+
399
+ uint_x4& operator&=(const uint_x4& rhs)
400
+ {
401
+ uint_x4 result = *this & rhs;
402
+ return *this = result;
403
+ }
404
+
405
+ uint_x4& operator|=(const uint_x4& rhs)
406
+ {
407
+ uint_x4 result = *this | rhs;
408
+ return *this = result;
409
+ }
410
+
411
+ uint_x4& operator^=(const uint_x4& rhs)
412
+ {
413
+ uint_x4 result = *this ^ rhs;
414
+ return *this = result;
415
+ }
416
+
417
+ uint_x4& operator>>=(bitcount_t shift)
418
+ {
419
+ uint_x4 result = *this >> shift;
420
+ return *this = result;
421
+ }
422
+
423
+ uint_x4& operator<<=(bitcount_t shift)
424
+ {
425
+ uint_x4 result = *this << shift;
426
+ return *this = result;
427
+ }
428
+
429
+ };
430
+
431
+ template<typename U, typename V>
432
+ bitcount_t flog2(const uint_x4<U,V>& v)
433
+ {
434
+ #if PCG_LITTLE_ENDIAN
435
+ for (uint8_t i = 4; i !=0; /* dec in loop */) {
436
+ --i;
437
+ #else
438
+ for (uint8_t i = 0; i < 4; ++i) {
439
+ #endif
440
+ if (v.wa[i] == 0)
441
+ continue;
442
+ return flog2(v.wa[i]) + (sizeof(U)*CHAR_BIT)*i;
443
+ }
444
+ abort();
445
+ }
446
+
447
+ template<typename U, typename V>
448
+ bitcount_t trailingzeros(const uint_x4<U,V>& v)
449
+ {
450
+ #if PCG_LITTLE_ENDIAN
451
+ for (uint8_t i = 0; i < 4; ++i) {
452
+ #else
453
+ for (uint8_t i = 4; i !=0; /* dec in loop */) {
454
+ --i;
455
+ #endif
456
+ if (v.wa[i] != 0)
457
+ return trailingzeros(v.wa[i]) + (sizeof(U)*CHAR_BIT)*i;
458
+ }
459
+ return (sizeof(U)*CHAR_BIT)*4;
460
+ }
461
+
462
+ template <typename UInt, typename UIntX2>
463
+ std::pair< uint_x4<UInt,UIntX2>, uint_x4<UInt,UIntX2> >
464
+ divmod(const uint_x4<UInt,UIntX2>& orig_dividend,
465
+ const uint_x4<UInt,UIntX2>& divisor)
466
+ {
467
+ // If the dividend is less than the divisor, the answer is always zero.
468
+ // This takes care of boundary cases like 0/x (which would otherwise be
469
+ // problematic because we can't take the log of zero. (The boundary case
470
+ // of division by zero is undefined.)
471
+ if (orig_dividend < divisor)
472
+ return { uint_x4<UInt,UIntX2>(0UL), orig_dividend };
473
+
474
+ auto dividend = orig_dividend;
475
+
476
+ auto log2_divisor = flog2(divisor);
477
+ auto log2_dividend = flog2(dividend);
478
+ // assert(log2_dividend >= log2_divisor);
479
+ bitcount_t logdiff = log2_dividend - log2_divisor;
480
+
481
+ constexpr uint_x4<UInt,UIntX2> ONE(1UL);
482
+ if (logdiff == 0)
483
+ return { ONE, dividend - divisor };
484
+
485
+ // Now we change the log difference to
486
+ // floor(log2(divisor)) - ceil(log2(dividend))
487
+ // to ensure that we *underestimate* the result.
488
+ logdiff -= 1;
489
+
490
+ uint_x4<UInt,UIntX2> quotient(0UL);
491
+
492
+ auto qfactor = ONE << logdiff;
493
+ auto factor = divisor << logdiff;
494
+
495
+ do {
496
+ dividend -= factor;
497
+ quotient += qfactor;
498
+ while (dividend < factor) {
499
+ factor >>= 1;
500
+ qfactor >>= 1;
501
+ }
502
+ } while (dividend >= divisor);
503
+
504
+ return { quotient, dividend };
505
+ }
506
+
507
+ template <typename UInt, typename UIntX2>
508
+ uint_x4<UInt,UIntX2> operator/(const uint_x4<UInt,UIntX2>& dividend,
509
+ const uint_x4<UInt,UIntX2>& divisor)
510
+ {
511
+ return divmod(dividend, divisor).first;
512
+ }
513
+
514
+ template <typename UInt, typename UIntX2>
515
+ uint_x4<UInt,UIntX2> operator%(const uint_x4<UInt,UIntX2>& dividend,
516
+ const uint_x4<UInt,UIntX2>& divisor)
517
+ {
518
+ return divmod(dividend, divisor).second;
519
+ }
520
+
521
+
522
+ template <typename UInt, typename UIntX2>
523
+ uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
524
+ const uint_x4<UInt,UIntX2>& b)
525
+ {
526
+ uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
527
+ bool carryin = false;
528
+ bool carryout;
529
+ UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0);
530
+ r.w.v0 = UInt(a0b0);
531
+ r.w.v1 = UInt(a0b0 >> 32);
532
+
533
+ UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0);
534
+ r.w.v2 = UInt(a1b0 >> 32);
535
+ r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
536
+ carryin = carryout;
537
+ r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
538
+ carryin = carryout;
539
+ r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
540
+
541
+ UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1);
542
+ carryin = false;
543
+ r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> 32), carryin, &carryout);
544
+ carryin = carryout;
545
+ r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
546
+
547
+ carryin = false;
548
+ r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
549
+ carryin = carryout;
550
+ r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
551
+ carryin = carryout;
552
+ r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
553
+
554
+ UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1);
555
+ carryin = false;
556
+ r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
557
+ carryin = carryout;
558
+ r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> 32), carryin, &carryout);
559
+
560
+ r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01;
561
+
562
+ return r;
563
+ }
564
+
565
+
566
+ template <typename UInt, typename UIntX2>
567
+ uint_x4<UInt,UIntX2> operator+(const uint_x4<UInt,UIntX2>& a,
568
+ const uint_x4<UInt,UIntX2>& b)
569
+ {
570
+ uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
571
+
572
+ bool carryin = false;
573
+ bool carryout;
574
+ r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
575
+ carryin = carryout;
576
+ r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
577
+ carryin = carryout;
578
+ r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
579
+ carryin = carryout;
580
+ r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
581
+
582
+ return r;
583
+ }
584
+
585
+ template <typename UInt, typename UIntX2>
586
+ uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& a,
587
+ const uint_x4<UInt,UIntX2>& b)
588
+ {
589
+ uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
590
+
591
+ bool carryin = false;
592
+ bool carryout;
593
+ r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
594
+ carryin = carryout;
595
+ r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
596
+ carryin = carryout;
597
+ r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
598
+ carryin = carryout;
599
+ r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
600
+
601
+ return r;
602
+ }
603
+
604
+
605
+ template <typename UInt, typename UIntX2>
606
+ uint_x4<UInt,UIntX2> operator&(const uint_x4<UInt,UIntX2>& a,
607
+ const uint_x4<UInt,UIntX2>& b)
608
+ {
609
+ return uint_x4<UInt,UIntX2>(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01);
610
+ }
611
+
612
+ template <typename UInt, typename UIntX2>
613
+ uint_x4<UInt,UIntX2> operator|(const uint_x4<UInt,UIntX2>& a,
614
+ const uint_x4<UInt,UIntX2>& b)
615
+ {
616
+ return uint_x4<UInt,UIntX2>(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01);
617
+ }
618
+
619
+ template <typename UInt, typename UIntX2>
620
+ uint_x4<UInt,UIntX2> operator^(const uint_x4<UInt,UIntX2>& a,
621
+ const uint_x4<UInt,UIntX2>& b)
622
+ {
623
+ return uint_x4<UInt,UIntX2>(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01);
624
+ }
625
+
626
+ template <typename UInt, typename UIntX2>
627
+ uint_x4<UInt,UIntX2> operator~(const uint_x4<UInt,UIntX2>& v)
628
+ {
629
+ return uint_x4<UInt,UIntX2>(~v.d.v23, ~v.d.v01);
630
+ }
631
+
632
+ template <typename UInt, typename UIntX2>
633
+ uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& v)
634
+ {
635
+ return uint_x4<UInt,UIntX2>(0UL,0UL) - v;
636
+ }
637
+
638
+ template <typename UInt, typename UIntX2>
639
+ bool operator==(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
640
+ {
641
+ return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23);
642
+ }
643
+
644
+ template <typename UInt, typename UIntX2>
645
+ bool operator!=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
646
+ {
647
+ return !operator==(a,b);
648
+ }
649
+
650
+
651
+ template <typename UInt, typename UIntX2>
652
+ bool operator<(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
653
+ {
654
+ return (a.d.v23 < b.d.v23)
655
+ || ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01));
656
+ }
657
+
658
+ template <typename UInt, typename UIntX2>
659
+ bool operator>(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
660
+ {
661
+ return operator<(b,a);
662
+ }
663
+
664
+ template <typename UInt, typename UIntX2>
665
+ bool operator<=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
666
+ {
667
+ return !(operator<(b,a));
668
+ }
669
+
670
+ template <typename UInt, typename UIntX2>
671
+ bool operator>=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
672
+ {
673
+ return !(operator<(a,b));
674
+ }
675
+
676
+
677
+
678
+ template <typename UInt, typename UIntX2>
679
+ uint_x4<UInt,UIntX2> operator<<(const uint_x4<UInt,UIntX2>& v,
680
+ const bitcount_t shift)
681
+ {
682
+ uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
683
+ const bitcount_t bits = sizeof(UInt) * CHAR_BIT;
684
+ const bitcount_t bitmask = bits - 1;
685
+ const bitcount_t shiftdiv = shift / bits;
686
+ const bitcount_t shiftmod = shift & bitmask;
687
+
688
+ if (shiftmod) {
689
+ UInt carryover = 0;
690
+ #if PCG_LITTLE_ENDIAN
691
+ for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
692
+ #else
693
+ for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
694
+ --out, --in;
695
+ #endif
696
+ r.wa[out] = (v.wa[in] << shiftmod) | carryover;
697
+ carryover = (v.wa[in] >> (bits - shiftmod));
698
+ }
699
+ } else {
700
+ #if PCG_LITTLE_ENDIAN
701
+ for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
702
+ #else
703
+ for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
704
+ --out, --in;
705
+ #endif
706
+ r.wa[out] = v.wa[in];
707
+ }
708
+ }
709
+
710
+ return r;
711
+ }
712
+
713
+ template <typename UInt, typename UIntX2>
714
+ uint_x4<UInt,UIntX2> operator>>(const uint_x4<UInt,UIntX2>& v,
715
+ const bitcount_t shift)
716
+ {
717
+ uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
718
+ const bitcount_t bits = sizeof(UInt) * CHAR_BIT;
719
+ const bitcount_t bitmask = bits - 1;
720
+ const bitcount_t shiftdiv = shift / bits;
721
+ const bitcount_t shiftmod = shift & bitmask;
722
+
723
+ if (shiftmod) {
724
+ UInt carryover = 0;
725
+ #if PCG_LITTLE_ENDIAN
726
+ for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
727
+ --out, --in;
728
+ #else
729
+ for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
730
+ #endif
731
+ r.wa[out] = (v.wa[in] >> shiftmod) | carryover;
732
+ carryover = (v.wa[in] << (bits - shiftmod));
733
+ }
734
+ } else {
735
+ #if PCG_LITTLE_ENDIAN
736
+ for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
737
+ --out, --in;
738
+ #else
739
+ for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
740
+ #endif
741
+ r.wa[out] = v.wa[in];
742
+ }
743
+ }
744
+
745
+ return r;
746
+ }
747
+
748
+ } // namespace pcg_extras
749
+
750
+ #endif // PCG_UINT128_HPP_INCLUDED