numo-random 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +7 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/LICENSE.txt +176 -0
- data/README.md +64 -0
- data/ext/numo/random/extconf.rb +29 -0
- data/ext/numo/random/randomext.cpp +26 -0
- data/ext/numo/random/randomext.hpp +410 -0
- data/ext/numo/random/src/LICENSE.txt +201 -0
- data/ext/numo/random/src/pcg_extras.hpp +637 -0
- data/ext/numo/random/src/pcg_random.hpp +1751 -0
- data/ext/numo/random/src/pcg_uint128.hpp +750 -0
- data/lib/numo/random/version.rb +8 -0
- data/lib/numo/random.rb +184 -0
- metadata +78 -0
@@ -0,0 +1,750 @@
|
|
1
|
+
/*
|
2
|
+
* PCG Random Number Generation for C++
|
3
|
+
*
|
4
|
+
* Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
|
5
|
+
*
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
7
|
+
* you may not use this file except in compliance with the License.
|
8
|
+
* You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
* See the License for the specific language governing permissions and
|
16
|
+
* limitations under the License.
|
17
|
+
*
|
18
|
+
* For additional information about the PCG random number generation scheme,
|
19
|
+
* including its license and other licensing options, visit
|
20
|
+
*
|
21
|
+
* http://www.pcg-random.org
|
22
|
+
*/
|
23
|
+
|
24
|
+
/*
|
25
|
+
* This code provides a C++ class that can provide 128-bit (or higher)
|
26
|
+
* integers. To produce 2K-bit integers, it uses two K-bit integers,
|
27
|
+
* placed in a union that allows the code to also see them as four K/2 bit
|
28
|
+
* integers (and access them either directly by name, or by index).
|
29
|
+
*
|
30
|
+
* It may seem like we're reinventing the wheel here, because several
|
31
|
+
* libraries already exist that support large integers, but most existing
|
32
|
+
* libraries provide a very generic multiprecision code, but here we're
|
33
|
+
* operating at a fixed size. Also, most other libraries are fairly
|
34
|
+
* heavyweight. So we use a direct implementation. Sadly, it's much slower
|
35
|
+
* than hand-coded assembly or direct CPU support.
|
36
|
+
*/
|
37
|
+
|
38
|
+
#ifndef PCG_UINT128_HPP_INCLUDED
|
39
|
+
#define PCG_UINT128_HPP_INCLUDED 1
|
40
|
+
|
41
|
+
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <climits>
#include <utility>
#include <initializer_list>
#include <type_traits>
|
48
|
+
|
49
|
+
/*
|
50
|
+
* We want to lay the type out the same way that a native type would be laid
|
51
|
+
* out, which means we must know the machine's endian, at compile time.
|
52
|
+
* This ugliness attempts to do so.
|
53
|
+
*/
|
54
|
+
|
55
|
+
#ifndef PCG_LITTLE_ENDIAN
|
56
|
+
#if defined(__BYTE_ORDER__)
|
57
|
+
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
58
|
+
#define PCG_LITTLE_ENDIAN 1
|
59
|
+
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
60
|
+
#define PCG_LITTLE_ENDIAN 0
|
61
|
+
#else
|
62
|
+
#error __BYTE_ORDER__ does not match a standard endian, pick a side
|
63
|
+
#endif
|
64
|
+
#elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN
|
65
|
+
#define PCG_LITTLE_ENDIAN 1
|
66
|
+
#elif __BIG_ENDIAN__ || _BIG_ENDIAN
|
67
|
+
#define PCG_LITTLE_ENDIAN 0
|
68
|
+
#elif __x86_64 || __x86_64__ || __i386 || __i386__
|
69
|
+
#define PCG_LITTLE_ENDIAN 1
|
70
|
+
#elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \
|
71
|
+
|| __m68k__ || __mc68000__
|
72
|
+
#define PCG_LITTLE_ENDIAN 0
|
73
|
+
#else
|
74
|
+
#error Unable to determine target endianness
|
75
|
+
#endif
|
76
|
+
#endif
|
77
|
+
|
78
|
+
namespace pcg_extras {
|
79
|
+
|
80
|
+
// Recent versions of GCC have intrinsics we can use to quickly calculate
|
81
|
+
// the number of leading and trailing zeros in a number. If possible, we
|
82
|
+
// use them, otherwise we fall back to old-fashioned bit twiddling to figure
|
83
|
+
// them out.
|
84
|
+
|
85
|
+
#ifndef PCG_BITCOUNT_T
|
86
|
+
typedef uint8_t bitcount_t;
|
87
|
+
#else
|
88
|
+
typedef PCG_BITCOUNT_T bitcount_t;
|
89
|
+
#endif
|
90
|
+
|
91
|
+
/*
|
92
|
+
* Provide some useful helper functions
|
93
|
+
* * flog2 floor(log2(x))
|
94
|
+
* * trailingzeros number of trailing zero bits
|
95
|
+
*/
|
96
|
+
|
97
|
+
#ifdef __GNUC__ // Any GNU-compatible compiler supporting C++11 has
|
98
|
+
// some useful intrinsics we can use.
|
99
|
+
|
100
|
+
// floor(log2(v)) for a 32-bit value, computed from the count of leading
// zero bits.  __builtin_clz is documented by GCC as undefined for 0, so
// callers must pass a non-zero value.
inline bitcount_t flog2(uint32_t v)
{
    const int leading_zeros = __builtin_clz(v);
    return 31 - leading_zeros;
}
|
104
|
+
|
105
|
+
// Number of zero bits below the lowest set bit of a 32-bit value.
// __builtin_ctz is documented by GCC as undefined for 0.
inline bitcount_t trailingzeros(uint32_t v)
{
    const int zero_run = __builtin_ctz(v);
    return zero_run;
}
|
109
|
+
|
110
|
+
// floor(log2(v)) for a 64-bit value.  The builtin is chosen at compile time
// according to which of unsigned long / unsigned long long is 64 bits wide.
// The builtins are documented by GCC as undefined for 0.
inline bitcount_t flog2(uint64_t v)
{
#if UINT64_MAX == ULONG_MAX
    const int leading_zeros = __builtin_clzl(v);
#elif UINT64_MAX == ULLONG_MAX
    const int leading_zeros = __builtin_clzll(v);
#else
    #error Cannot find a function for uint64_t
#endif
    return 63 - leading_zeros;
}
|
120
|
+
|
121
|
+
// Number of zero bits below the lowest set bit of a 64-bit value.  The
// builtin is chosen at compile time according to which of unsigned long /
// unsigned long long is 64 bits wide.  Undefined for 0 (per GCC docs).
inline bitcount_t trailingzeros(uint64_t v)
{
#if UINT64_MAX == ULONG_MAX
    const int zero_run = __builtin_ctzl(v);
#elif UINT64_MAX == ULLONG_MAX
    const int zero_run = __builtin_ctzll(v);
#else
    #error Cannot find a function for uint64_t
#endif
    return zero_run;
}
|
131
|
+
|
132
|
+
#else // Otherwise, we fall back to bit twiddling
|
133
|
+
// implementations
|
134
|
+
|
135
|
+
// Portable floor(log2(v)) for a 32-bit value using a De Bruijn lookup.
// Based on code by Eric Cole and Mark Dickinson, which appears at
// https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
inline bitcount_t flog2(uint32_t v)
{
    static const uint8_t log2_by_hash[32] = {
        0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
        8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    // Smear the highest set bit downwards so v becomes one less than a
    // power of two (shift amounts 1, 2, 4, 8, 16).
    for (unsigned step = 1; step <= 16; step <<= 1)
        v |= v >> step;

    // The top five bits of the product index the table.
    const uint32_t slot = (uint32_t)(v * 0x07C4ACDDU) >> 27;
    return log2_by_hash[slot];
}
|
153
|
+
|
154
|
+
// Portable count of trailing zero bits for a 32-bit value using a
// De Bruijn lookup on the isolated lowest set bit.
inline bitcount_t trailingzeros(uint32_t v)
{
    static const uint8_t ctz_by_hash[32] = {
        0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
        31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    };

    // v & -v keeps only the lowest set bit (unsigned negation wraps).
    const uint32_t lowest_set = v & -v;
    const uint32_t slot = ((uint32_t)(lowest_set * 0x077CB531U)) >> 27;
    return ctz_by_hash[slot];
}
|
163
|
+
|
164
|
+
// Portable floor(log2(v)) for a 64-bit value: defer to the 32-bit version
// on the upper half when it is non-zero, otherwise on the lower half.
inline bitcount_t flog2(uint64_t v)
{
    const uint32_t upper = uint32_t(v >> 32);
    if (upper != 0)
        return 32 + flog2(upper);

    const uint32_t lower = uint32_t(v);
    return flog2(lower);
}
|
171
|
+
|
172
|
+
// Portable count of trailing zero bits for a 64-bit value: use the lower
// half when it has a set bit, otherwise the upper half plus 32.
inline bitcount_t trailingzeros(uint64_t v)
{
    const uint32_t lower = uint32_t(v);
    if (lower != 0)
        return trailingzeros(lower);

    const uint32_t upper = uint32_t(v >> 32);
    return trailingzeros(upper) + 32;
}
|
179
|
+
|
180
|
+
#endif
|
181
|
+
|
182
|
+
template <typename UInt>
|
183
|
+
inline bitcount_t clog2(UInt v)
|
184
|
+
{
|
185
|
+
return flog2(v) + ((v & (-v)) != v);
|
186
|
+
}
|
187
|
+
|
188
|
+
// Adds x + y + carryin in UInt arithmetic and returns the (wrapped) sum.
// *carryout is set to true when either partial addition wrapped around.
template <typename UInt>
inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
{
    const UInt addend = y + carryin;   // may wrap when y is all ones
    const UInt sum = x + addend;

    bool wrapped = (addend < y);
    if (!wrapped)
        wrapped = (sum < x);

    *carryout = wrapped;
    return sum;
}
|
196
|
+
|
197
|
+
// Computes x - (y + carryin) in UInt arithmetic and returns the (wrapped)
// difference.  *carryout is set to true when a borrow occurred, i.e. when
// y + carryin wrapped or the subtraction wrapped below zero.
template <typename UInt>
inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
{
    const UInt subtrahend = y + carryin;   // may wrap when y is all ones
    const UInt difference = x - subtrahend;

    *carryout = (subtrahend < y) ? true : (difference > x);
    return difference;
}
|
205
|
+
|
206
|
+
|
207
|
+
template <typename UInt, typename UIntX2>
|
208
|
+
class uint_x4 {
|
209
|
+
// private:
|
210
|
+
public:
|
211
|
+
union {
|
212
|
+
#if PCG_LITTLE_ENDIAN
|
213
|
+
struct {
|
214
|
+
UInt v0, v1, v2, v3;
|
215
|
+
} w;
|
216
|
+
struct {
|
217
|
+
UIntX2 v01, v23;
|
218
|
+
} d;
|
219
|
+
#else
|
220
|
+
struct {
|
221
|
+
UInt v3, v2, v1, v0;
|
222
|
+
} w;
|
223
|
+
struct {
|
224
|
+
UIntX2 v23, v01;
|
225
|
+
} d;
|
226
|
+
#endif
|
227
|
+
// For the array access versions, the code that uses the array
|
228
|
+
// must handle endian itself. Yuck.
|
229
|
+
UInt wa[4];
|
230
|
+
UIntX2 da[2];
|
231
|
+
};
|
232
|
+
|
233
|
+
public:
|
234
|
+
uint_x4() = default;
|
235
|
+
|
236
|
+
constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0)
|
237
|
+
#if PCG_LITTLE_ENDIAN
|
238
|
+
: w{v0, v1, v2, v3}
|
239
|
+
#else
|
240
|
+
: w{v3, v2, v1, v0}
|
241
|
+
#endif
|
242
|
+
{
|
243
|
+
// Nothing (else) to do
|
244
|
+
}
|
245
|
+
|
246
|
+
constexpr uint_x4(UIntX2 v23, UIntX2 v01)
|
247
|
+
#if PCG_LITTLE_ENDIAN
|
248
|
+
: d{v01,v23}
|
249
|
+
#else
|
250
|
+
: d{v23,v01}
|
251
|
+
#endif
|
252
|
+
{
|
253
|
+
// Nothing (else) to do
|
254
|
+
}
|
255
|
+
|
256
|
+
template<class Integral,
|
257
|
+
typename std::enable_if<(std::is_integral<Integral>::value
|
258
|
+
&& sizeof(Integral) <= sizeof(UIntX2))
|
259
|
+
>::type* = nullptr>
|
260
|
+
constexpr uint_x4(Integral v01)
|
261
|
+
#if PCG_LITTLE_ENDIAN
|
262
|
+
: d{UIntX2(v01),0UL}
|
263
|
+
#else
|
264
|
+
: d{0UL,UIntX2(v01)}
|
265
|
+
#endif
|
266
|
+
{
|
267
|
+
// Nothing (else) to do
|
268
|
+
}
|
269
|
+
|
270
|
+
explicit constexpr operator uint64_t() const
|
271
|
+
{
|
272
|
+
return d.v01;
|
273
|
+
}
|
274
|
+
|
275
|
+
explicit constexpr operator uint32_t() const
|
276
|
+
{
|
277
|
+
return w.v0;
|
278
|
+
}
|
279
|
+
|
280
|
+
explicit constexpr operator int() const
|
281
|
+
{
|
282
|
+
return w.v0;
|
283
|
+
}
|
284
|
+
|
285
|
+
explicit constexpr operator uint16_t() const
|
286
|
+
{
|
287
|
+
return w.v0;
|
288
|
+
}
|
289
|
+
|
290
|
+
explicit constexpr operator uint8_t() const
|
291
|
+
{
|
292
|
+
return w.v0;
|
293
|
+
}
|
294
|
+
|
295
|
+
typedef typename std::conditional<std::is_same<uint64_t,
|
296
|
+
unsigned long>::value,
|
297
|
+
unsigned long long,
|
298
|
+
unsigned long>::type
|
299
|
+
uint_missing_t;
|
300
|
+
|
301
|
+
explicit constexpr operator uint_missing_t() const
|
302
|
+
{
|
303
|
+
return d.v01;
|
304
|
+
}
|
305
|
+
|
306
|
+
explicit constexpr operator bool() const
|
307
|
+
{
|
308
|
+
return d.v01 || d.v23;
|
309
|
+
}
|
310
|
+
|
311
|
+
template<typename U, typename V>
|
312
|
+
friend uint_x4<U,V> operator*(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
313
|
+
|
314
|
+
template<typename U, typename V>
|
315
|
+
friend std::pair< uint_x4<U,V>,uint_x4<U,V> >
|
316
|
+
divmod(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
317
|
+
|
318
|
+
template<typename U, typename V>
|
319
|
+
friend uint_x4<U,V> operator+(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
320
|
+
|
321
|
+
template<typename U, typename V>
|
322
|
+
friend uint_x4<U,V> operator-(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
323
|
+
|
324
|
+
template<typename U, typename V>
|
325
|
+
friend uint_x4<U,V> operator<<(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
326
|
+
|
327
|
+
template<typename U, typename V>
|
328
|
+
friend uint_x4<U,V> operator>>(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
329
|
+
|
330
|
+
template<typename U, typename V>
|
331
|
+
friend uint_x4<U,V> operator&(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
332
|
+
|
333
|
+
template<typename U, typename V>
|
334
|
+
friend uint_x4<U,V> operator|(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
335
|
+
|
336
|
+
template<typename U, typename V>
|
337
|
+
friend uint_x4<U,V> operator^(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
338
|
+
|
339
|
+
template<typename U, typename V>
|
340
|
+
friend bool operator==(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
341
|
+
|
342
|
+
template<typename U, typename V>
|
343
|
+
friend bool operator!=(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
344
|
+
|
345
|
+
template<typename U, typename V>
|
346
|
+
friend bool operator<(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
347
|
+
|
348
|
+
template<typename U, typename V>
|
349
|
+
friend bool operator<=(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
350
|
+
|
351
|
+
template<typename U, typename V>
|
352
|
+
friend bool operator>(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
353
|
+
|
354
|
+
template<typename U, typename V>
|
355
|
+
friend bool operator>=(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
356
|
+
|
357
|
+
template<typename U, typename V>
|
358
|
+
friend uint_x4<U,V> operator~(const uint_x4<U,V>&);
|
359
|
+
|
360
|
+
template<typename U, typename V>
|
361
|
+
friend uint_x4<U,V> operator-(const uint_x4<U,V>&);
|
362
|
+
|
363
|
+
template<typename U, typename V>
|
364
|
+
friend bitcount_t flog2(const uint_x4<U,V>&);
|
365
|
+
|
366
|
+
template<typename U, typename V>
|
367
|
+
friend bitcount_t trailingzeros(const uint_x4<U,V>&);
|
368
|
+
|
369
|
+
uint_x4& operator*=(const uint_x4& rhs)
|
370
|
+
{
|
371
|
+
uint_x4 result = *this * rhs;
|
372
|
+
return *this = result;
|
373
|
+
}
|
374
|
+
|
375
|
+
uint_x4& operator/=(const uint_x4& rhs)
|
376
|
+
{
|
377
|
+
uint_x4 result = *this / rhs;
|
378
|
+
return *this = result;
|
379
|
+
}
|
380
|
+
|
381
|
+
uint_x4& operator%=(const uint_x4& rhs)
|
382
|
+
{
|
383
|
+
uint_x4 result = *this % rhs;
|
384
|
+
return *this = result;
|
385
|
+
}
|
386
|
+
|
387
|
+
uint_x4& operator+=(const uint_x4& rhs)
|
388
|
+
{
|
389
|
+
uint_x4 result = *this + rhs;
|
390
|
+
return *this = result;
|
391
|
+
}
|
392
|
+
|
393
|
+
uint_x4& operator-=(const uint_x4& rhs)
|
394
|
+
{
|
395
|
+
uint_x4 result = *this - rhs;
|
396
|
+
return *this = result;
|
397
|
+
}
|
398
|
+
|
399
|
+
uint_x4& operator&=(const uint_x4& rhs)
|
400
|
+
{
|
401
|
+
uint_x4 result = *this & rhs;
|
402
|
+
return *this = result;
|
403
|
+
}
|
404
|
+
|
405
|
+
uint_x4& operator|=(const uint_x4& rhs)
|
406
|
+
{
|
407
|
+
uint_x4 result = *this | rhs;
|
408
|
+
return *this = result;
|
409
|
+
}
|
410
|
+
|
411
|
+
uint_x4& operator^=(const uint_x4& rhs)
|
412
|
+
{
|
413
|
+
uint_x4 result = *this ^ rhs;
|
414
|
+
return *this = result;
|
415
|
+
}
|
416
|
+
|
417
|
+
uint_x4& operator>>=(bitcount_t shift)
|
418
|
+
{
|
419
|
+
uint_x4 result = *this >> shift;
|
420
|
+
return *this = result;
|
421
|
+
}
|
422
|
+
|
423
|
+
uint_x4& operator<<=(bitcount_t shift)
|
424
|
+
{
|
425
|
+
uint_x4 result = *this << shift;
|
426
|
+
return *this = result;
|
427
|
+
}
|
428
|
+
|
429
|
+
};
|
430
|
+
|
431
|
+
template<typename U, typename V>
|
432
|
+
bitcount_t flog2(const uint_x4<U,V>& v)
|
433
|
+
{
|
434
|
+
#if PCG_LITTLE_ENDIAN
|
435
|
+
for (uint8_t i = 4; i !=0; /* dec in loop */) {
|
436
|
+
--i;
|
437
|
+
#else
|
438
|
+
for (uint8_t i = 0; i < 4; ++i) {
|
439
|
+
#endif
|
440
|
+
if (v.wa[i] == 0)
|
441
|
+
continue;
|
442
|
+
return flog2(v.wa[i]) + (sizeof(U)*CHAR_BIT)*i;
|
443
|
+
}
|
444
|
+
abort();
|
445
|
+
}
|
446
|
+
|
447
|
+
template<typename U, typename V>
|
448
|
+
bitcount_t trailingzeros(const uint_x4<U,V>& v)
|
449
|
+
{
|
450
|
+
#if PCG_LITTLE_ENDIAN
|
451
|
+
for (uint8_t i = 0; i < 4; ++i) {
|
452
|
+
#else
|
453
|
+
for (uint8_t i = 4; i !=0; /* dec in loop */) {
|
454
|
+
--i;
|
455
|
+
#endif
|
456
|
+
if (v.wa[i] != 0)
|
457
|
+
return trailingzeros(v.wa[i]) + (sizeof(U)*CHAR_BIT)*i;
|
458
|
+
}
|
459
|
+
return (sizeof(U)*CHAR_BIT)*4;
|
460
|
+
}
|
461
|
+
|
462
|
+
template <typename UInt, typename UIntX2>
|
463
|
+
std::pair< uint_x4<UInt,UIntX2>, uint_x4<UInt,UIntX2> >
|
464
|
+
divmod(const uint_x4<UInt,UIntX2>& orig_dividend,
|
465
|
+
const uint_x4<UInt,UIntX2>& divisor)
|
466
|
+
{
|
467
|
+
// If the dividend is less than the divisor, the answer is always zero.
|
468
|
+
// This takes care of boundary cases like 0/x (which would otherwise be
|
469
|
+
// problematic because we can't take the log of zero. (The boundary case
|
470
|
+
// of division by zero is undefined.)
|
471
|
+
if (orig_dividend < divisor)
|
472
|
+
return { uint_x4<UInt,UIntX2>(0UL), orig_dividend };
|
473
|
+
|
474
|
+
auto dividend = orig_dividend;
|
475
|
+
|
476
|
+
auto log2_divisor = flog2(divisor);
|
477
|
+
auto log2_dividend = flog2(dividend);
|
478
|
+
// assert(log2_dividend >= log2_divisor);
|
479
|
+
bitcount_t logdiff = log2_dividend - log2_divisor;
|
480
|
+
|
481
|
+
constexpr uint_x4<UInt,UIntX2> ONE(1UL);
|
482
|
+
if (logdiff == 0)
|
483
|
+
return { ONE, dividend - divisor };
|
484
|
+
|
485
|
+
// Now we change the log difference to
|
486
|
+
// floor(log2(divisor)) - ceil(log2(dividend))
|
487
|
+
// to ensure that we *underestimate* the result.
|
488
|
+
logdiff -= 1;
|
489
|
+
|
490
|
+
uint_x4<UInt,UIntX2> quotient(0UL);
|
491
|
+
|
492
|
+
auto qfactor = ONE << logdiff;
|
493
|
+
auto factor = divisor << logdiff;
|
494
|
+
|
495
|
+
do {
|
496
|
+
dividend -= factor;
|
497
|
+
quotient += qfactor;
|
498
|
+
while (dividend < factor) {
|
499
|
+
factor >>= 1;
|
500
|
+
qfactor >>= 1;
|
501
|
+
}
|
502
|
+
} while (dividend >= divisor);
|
503
|
+
|
504
|
+
return { quotient, dividend };
|
505
|
+
}
|
506
|
+
|
507
|
+
template <typename UInt, typename UIntX2>
|
508
|
+
uint_x4<UInt,UIntX2> operator/(const uint_x4<UInt,UIntX2>& dividend,
|
509
|
+
const uint_x4<UInt,UIntX2>& divisor)
|
510
|
+
{
|
511
|
+
return divmod(dividend, divisor).first;
|
512
|
+
}
|
513
|
+
|
514
|
+
template <typename UInt, typename UIntX2>
|
515
|
+
uint_x4<UInt,UIntX2> operator%(const uint_x4<UInt,UIntX2>& dividend,
|
516
|
+
const uint_x4<UInt,UIntX2>& divisor)
|
517
|
+
{
|
518
|
+
return divmod(dividend, divisor).second;
|
519
|
+
}
|
520
|
+
|
521
|
+
|
522
|
+
template <typename UInt, typename UIntX2>
|
523
|
+
uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
|
524
|
+
const uint_x4<UInt,UIntX2>& b)
|
525
|
+
{
|
526
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
527
|
+
bool carryin = false;
|
528
|
+
bool carryout;
|
529
|
+
UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0);
|
530
|
+
r.w.v0 = UInt(a0b0);
|
531
|
+
r.w.v1 = UInt(a0b0 >> 32);
|
532
|
+
|
533
|
+
UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0);
|
534
|
+
r.w.v2 = UInt(a1b0 >> 32);
|
535
|
+
r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
|
536
|
+
carryin = carryout;
|
537
|
+
r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
|
538
|
+
carryin = carryout;
|
539
|
+
r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
|
540
|
+
|
541
|
+
UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1);
|
542
|
+
carryin = false;
|
543
|
+
r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> 32), carryin, &carryout);
|
544
|
+
carryin = carryout;
|
545
|
+
r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
|
546
|
+
|
547
|
+
carryin = false;
|
548
|
+
r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
|
549
|
+
carryin = carryout;
|
550
|
+
r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
|
551
|
+
carryin = carryout;
|
552
|
+
r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
|
553
|
+
|
554
|
+
UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1);
|
555
|
+
carryin = false;
|
556
|
+
r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
|
557
|
+
carryin = carryout;
|
558
|
+
r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> 32), carryin, &carryout);
|
559
|
+
|
560
|
+
r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01;
|
561
|
+
|
562
|
+
return r;
|
563
|
+
}
|
564
|
+
|
565
|
+
|
566
|
+
template <typename UInt, typename UIntX2>
|
567
|
+
uint_x4<UInt,UIntX2> operator+(const uint_x4<UInt,UIntX2>& a,
|
568
|
+
const uint_x4<UInt,UIntX2>& b)
|
569
|
+
{
|
570
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
571
|
+
|
572
|
+
bool carryin = false;
|
573
|
+
bool carryout;
|
574
|
+
r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
|
575
|
+
carryin = carryout;
|
576
|
+
r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
|
577
|
+
carryin = carryout;
|
578
|
+
r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
|
579
|
+
carryin = carryout;
|
580
|
+
r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
|
581
|
+
|
582
|
+
return r;
|
583
|
+
}
|
584
|
+
|
585
|
+
template <typename UInt, typename UIntX2>
|
586
|
+
uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& a,
|
587
|
+
const uint_x4<UInt,UIntX2>& b)
|
588
|
+
{
|
589
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
590
|
+
|
591
|
+
bool carryin = false;
|
592
|
+
bool carryout;
|
593
|
+
r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
|
594
|
+
carryin = carryout;
|
595
|
+
r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
|
596
|
+
carryin = carryout;
|
597
|
+
r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
|
598
|
+
carryin = carryout;
|
599
|
+
r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
|
600
|
+
|
601
|
+
return r;
|
602
|
+
}
|
603
|
+
|
604
|
+
|
605
|
+
template <typename UInt, typename UIntX2>
|
606
|
+
uint_x4<UInt,UIntX2> operator&(const uint_x4<UInt,UIntX2>& a,
|
607
|
+
const uint_x4<UInt,UIntX2>& b)
|
608
|
+
{
|
609
|
+
return uint_x4<UInt,UIntX2>(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01);
|
610
|
+
}
|
611
|
+
|
612
|
+
template <typename UInt, typename UIntX2>
|
613
|
+
uint_x4<UInt,UIntX2> operator|(const uint_x4<UInt,UIntX2>& a,
|
614
|
+
const uint_x4<UInt,UIntX2>& b)
|
615
|
+
{
|
616
|
+
return uint_x4<UInt,UIntX2>(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01);
|
617
|
+
}
|
618
|
+
|
619
|
+
template <typename UInt, typename UIntX2>
|
620
|
+
uint_x4<UInt,UIntX2> operator^(const uint_x4<UInt,UIntX2>& a,
|
621
|
+
const uint_x4<UInt,UIntX2>& b)
|
622
|
+
{
|
623
|
+
return uint_x4<UInt,UIntX2>(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01);
|
624
|
+
}
|
625
|
+
|
626
|
+
template <typename UInt, typename UIntX2>
|
627
|
+
uint_x4<UInt,UIntX2> operator~(const uint_x4<UInt,UIntX2>& v)
|
628
|
+
{
|
629
|
+
return uint_x4<UInt,UIntX2>(~v.d.v23, ~v.d.v01);
|
630
|
+
}
|
631
|
+
|
632
|
+
template <typename UInt, typename UIntX2>
|
633
|
+
uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& v)
|
634
|
+
{
|
635
|
+
return uint_x4<UInt,UIntX2>(0UL,0UL) - v;
|
636
|
+
}
|
637
|
+
|
638
|
+
template <typename UInt, typename UIntX2>
|
639
|
+
bool operator==(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
640
|
+
{
|
641
|
+
return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23);
|
642
|
+
}
|
643
|
+
|
644
|
+
template <typename UInt, typename UIntX2>
|
645
|
+
bool operator!=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
646
|
+
{
|
647
|
+
return !operator==(a,b);
|
648
|
+
}
|
649
|
+
|
650
|
+
|
651
|
+
template <typename UInt, typename UIntX2>
|
652
|
+
bool operator<(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
653
|
+
{
|
654
|
+
return (a.d.v23 < b.d.v23)
|
655
|
+
|| ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01));
|
656
|
+
}
|
657
|
+
|
658
|
+
template <typename UInt, typename UIntX2>
|
659
|
+
bool operator>(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
660
|
+
{
|
661
|
+
return operator<(b,a);
|
662
|
+
}
|
663
|
+
|
664
|
+
template <typename UInt, typename UIntX2>
|
665
|
+
bool operator<=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
666
|
+
{
|
667
|
+
return !(operator<(b,a));
|
668
|
+
}
|
669
|
+
|
670
|
+
template <typename UInt, typename UIntX2>
|
671
|
+
bool operator>=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
672
|
+
{
|
673
|
+
return !(operator<(a,b));
|
674
|
+
}
|
675
|
+
|
676
|
+
|
677
|
+
|
678
|
+
template <typename UInt, typename UIntX2>
|
679
|
+
uint_x4<UInt,UIntX2> operator<<(const uint_x4<UInt,UIntX2>& v,
|
680
|
+
const bitcount_t shift)
|
681
|
+
{
|
682
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
683
|
+
const bitcount_t bits = sizeof(UInt) * CHAR_BIT;
|
684
|
+
const bitcount_t bitmask = bits - 1;
|
685
|
+
const bitcount_t shiftdiv = shift / bits;
|
686
|
+
const bitcount_t shiftmod = shift & bitmask;
|
687
|
+
|
688
|
+
if (shiftmod) {
|
689
|
+
UInt carryover = 0;
|
690
|
+
#if PCG_LITTLE_ENDIAN
|
691
|
+
for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
|
692
|
+
#else
|
693
|
+
for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
|
694
|
+
--out, --in;
|
695
|
+
#endif
|
696
|
+
r.wa[out] = (v.wa[in] << shiftmod) | carryover;
|
697
|
+
carryover = (v.wa[in] >> (bits - shiftmod));
|
698
|
+
}
|
699
|
+
} else {
|
700
|
+
#if PCG_LITTLE_ENDIAN
|
701
|
+
for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
|
702
|
+
#else
|
703
|
+
for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
|
704
|
+
--out, --in;
|
705
|
+
#endif
|
706
|
+
r.wa[out] = v.wa[in];
|
707
|
+
}
|
708
|
+
}
|
709
|
+
|
710
|
+
return r;
|
711
|
+
}
|
712
|
+
|
713
|
+
template <typename UInt, typename UIntX2>
|
714
|
+
uint_x4<UInt,UIntX2> operator>>(const uint_x4<UInt,UIntX2>& v,
|
715
|
+
const bitcount_t shift)
|
716
|
+
{
|
717
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
718
|
+
const bitcount_t bits = sizeof(UInt) * CHAR_BIT;
|
719
|
+
const bitcount_t bitmask = bits - 1;
|
720
|
+
const bitcount_t shiftdiv = shift / bits;
|
721
|
+
const bitcount_t shiftmod = shift & bitmask;
|
722
|
+
|
723
|
+
if (shiftmod) {
|
724
|
+
UInt carryover = 0;
|
725
|
+
#if PCG_LITTLE_ENDIAN
|
726
|
+
for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
|
727
|
+
--out, --in;
|
728
|
+
#else
|
729
|
+
for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
|
730
|
+
#endif
|
731
|
+
r.wa[out] = (v.wa[in] >> shiftmod) | carryover;
|
732
|
+
carryover = (v.wa[in] << (bits - shiftmod));
|
733
|
+
}
|
734
|
+
} else {
|
735
|
+
#if PCG_LITTLE_ENDIAN
|
736
|
+
for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
|
737
|
+
--out, --in;
|
738
|
+
#else
|
739
|
+
for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
|
740
|
+
#endif
|
741
|
+
r.wa[out] = v.wa[in];
|
742
|
+
}
|
743
|
+
}
|
744
|
+
|
745
|
+
return r;
|
746
|
+
}
|
747
|
+
|
748
|
+
} // namespace pcg_extras
|
749
|
+
|
750
|
+
#endif // PCG_UINT128_HPP_INCLUDED
|