numo-random 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +7 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/LICENSE.txt +176 -0
- data/README.md +64 -0
- data/ext/numo/random/extconf.rb +29 -0
- data/ext/numo/random/randomext.cpp +26 -0
- data/ext/numo/random/randomext.hpp +410 -0
- data/ext/numo/random/src/LICENSE.txt +201 -0
- data/ext/numo/random/src/pcg_extras.hpp +637 -0
- data/ext/numo/random/src/pcg_random.hpp +1751 -0
- data/ext/numo/random/src/pcg_uint128.hpp +750 -0
- data/lib/numo/random/version.rb +8 -0
- data/lib/numo/random.rb +184 -0
- metadata +78 -0
@@ -0,0 +1,750 @@
|
|
1
|
+
/*
|
2
|
+
* PCG Random Number Generation for C++
|
3
|
+
*
|
4
|
+
* Copyright 2014 Melissa O'Neill <oneill@pcg-random.org>
|
5
|
+
*
|
6
|
+
* Licensed under the Apache License, Version 2.0 (the "License");
|
7
|
+
* you may not use this file except in compliance with the License.
|
8
|
+
* You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing, software
|
13
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
* See the License for the specific language governing permissions and
|
16
|
+
* limitations under the License.
|
17
|
+
*
|
18
|
+
* For additional information about the PCG random number generation scheme,
|
19
|
+
* including its license and other licensing options, visit
|
20
|
+
*
|
21
|
+
* http://www.pcg-random.org
|
22
|
+
*/
|
23
|
+
|
24
|
+
/*
|
25
|
+
 * This code provides a C++ class that can provide 128-bit (or higher)
|
26
|
+
* integers. To produce 2K-bit integers, it uses two K-bit integers,
|
27
|
+
 * placed in a union that allows the code to also see them as four K/2 bit
|
28
|
+
 * integers (and access them either directly by name, or by index).
|
29
|
+
*
|
30
|
+
* It may seem like we're reinventing the wheel here, because several
|
31
|
+
* libraries already exist that support large integers, but most existing
|
32
|
+
* libraries provide a very generic multiprecision code, but here we're
|
33
|
+
* operating at a fixed size. Also, most other libraries are fairly
|
34
|
+
* heavyweight. So we use a direct implementation. Sadly, it's much slower
|
35
|
+
* than hand-coded assembly or direct CPU support.
|
36
|
+
*/
|
37
|
+
|
38
|
+
#ifndef PCG_UINT128_HPP_INCLUDED
|
39
|
+
#define PCG_UINT128_HPP_INCLUDED 1
|
40
|
+
|
41
|
+
#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <initializer_list>
#include <type_traits>
#include <utility>
|
48
|
+
|
49
|
+
/*
|
50
|
+
* We want to lay the type out the same way that a native type would be laid
|
51
|
+
* out, which means we must know the machine's endian, at compile time.
|
52
|
+
* This ugliness attempts to do so.
|
53
|
+
*/
|
54
|
+
|
55
|
+
/*
 * PCG_LITTLE_ENDIAN ends up as 1 on little-endian targets and 0 on
 * big-endian targets.  A value supplied by the build (e.g. via -D) is
 * respected and skips detection altogether.
 *
 * Detection order:
 *   1. the __BYTE_ORDER__ macro, when the compiler provides it;
 *   2. the __LITTLE_ENDIAN__/_LITTLE_ENDIAN and __BIG_ENDIAN__/_BIG_ENDIAN
 *      macros;
 *   3. well-known architecture macros, including the _M_X64 / _M_IX86
 *      spellings used by Microsoft's compiler (which provides none of the
 *      macros tested above for x86 targets).
 */
#ifndef PCG_LITTLE_ENDIAN
    #if defined(__BYTE_ORDER__)
        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
            #define PCG_LITTLE_ENDIAN 1
        #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
            #define PCG_LITTLE_ENDIAN 0
        #else
            #error __BYTE_ORDER__ does not match a standard endian, pick a side
        #endif
    #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN
        #define PCG_LITTLE_ENDIAN 1
    #elif __BIG_ENDIAN__ || _BIG_ENDIAN
        #define PCG_LITTLE_ENDIAN 0
    #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86
        #define PCG_LITTLE_ENDIAN 1
    #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \
          || __m68k__ || __mc68000__
        #define PCG_LITTLE_ENDIAN 0
    #else
        #error Unable to determine target endianness
    #endif
#endif
|
77
|
+
|
78
|
+
namespace pcg_extras {
|
79
|
+
|
80
|
+
// Recent versions of GCC have intrinsics we can use to quickly calculate
|
81
|
+
// the number of leading and trailing zeros in a number. If possible, we
|
82
|
+
// use them, otherwise we fall back to old-fashioned bit twiddling to figure
|
83
|
+
// them out.
|
84
|
+
|
85
|
+
// Type used for bit counts and shift amounts.  Defaults to uint8_t; define
// PCG_BITCOUNT_T before including this header to substitute another type.
#ifdef PCG_BITCOUNT_T
    using bitcount_t = PCG_BITCOUNT_T;
#else
    using bitcount_t = uint8_t;
#endif
|
90
|
+
|
91
|
+
/*
|
92
|
+
* Provide some useful helper functions
|
93
|
+
* * flog2 floor(log2(x))
|
94
|
+
* * trailingzeros number of trailing zero bits
|
95
|
+
*/
|
96
|
+
|
97
|
+
#ifdef __GNUC__ // Any GNU-compatible compiler supporting C++11 has
|
98
|
+
// some useful intrinsics we can use.
|
99
|
+
|
100
|
+
// floor(log2(v)) for a 32-bit value, via the compiler's count-leading-zeros
// intrinsic: the index of the highest set bit is 31 minus the number of
// leading zero bits.
inline bitcount_t flog2(uint32_t v)
{
    const int leading_zeros = __builtin_clz(v);
    return 31 - leading_zeros;
}
|
104
|
+
|
105
|
+
// Number of trailing zero bits in a 32-bit value, via the compiler's
// count-trailing-zeros intrinsic.
inline bitcount_t trailingzeros(uint32_t v)
{
    const int zero_run = __builtin_ctz(v);
    return zero_run;
}
|
109
|
+
|
110
|
+
// floor(log2(v)) for a 64-bit value.  The intrinsic is picked according to
// which built-in unsigned type has the same range as uint64_t.
inline bitcount_t flog2(uint64_t v)
{
#if UINT64_MAX == ULONG_MAX
    const int leading_zeros = __builtin_clzl(v);
#elif UINT64_MAX == ULLONG_MAX
    const int leading_zeros = __builtin_clzll(v);
#else
    #error Cannot find a function for uint64_t
#endif
    return 63 - leading_zeros;
}
|
120
|
+
|
121
|
+
// Number of trailing zero bits in a 64-bit value.  The intrinsic is picked
// according to which built-in unsigned type has the same range as uint64_t.
inline bitcount_t trailingzeros(uint64_t v)
{
#if UINT64_MAX == ULONG_MAX
    const int zero_run = __builtin_ctzl(v);
#elif UINT64_MAX == ULLONG_MAX
    const int zero_run = __builtin_ctzll(v);
#else
    #error Cannot find a function for uint64_t
#endif
    return zero_run;
}
|
131
|
+
|
132
|
+
#else // Otherwise, we fall back to bit twiddling
|
133
|
+
// implementations
|
134
|
+
|
135
|
+
// Portable floor(log2(v)) for a 32-bit value.
//
// Based on code by Eric Cole and Mark Dickinson, which appears at
// https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
inline bitcount_t flog2(uint32_t v)
{
    static const uint8_t log2_lookup[32] = {
      0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
      8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    // Smear the highest set bit into every lower position, which rounds v
    // down to one less than a power of 2.
    for (unsigned step = 1; step <= 16; step <<= 1)
        v |= v >> step;

    // The top five bits of the De Bruijn product select the table entry.
    const uint32_t product = (uint32_t)(v * 0x07C4ACDDU);
    return log2_lookup[product >> 27];
}
|
153
|
+
|
154
|
+
// Portable count of trailing zero bits in a 32-bit value, using a De Bruijn
// multiplication on the isolated lowest set bit.
inline bitcount_t trailingzeros(uint32_t v)
{
    static const uint8_t ctz_lookup[32] = {
      0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
    };

    // Two's-complement negation written out as (~v + 1); ANDing with v
    // keeps only the lowest set bit.
    const uint32_t lowest_set = v & (~v + 1U);
    const uint32_t product = (uint32_t)(lowest_set * 0x077CB531U);
    return ctz_lookup[product >> 27];
}
|
163
|
+
|
164
|
+
// Portable floor(log2(v)) for a 64-bit value, built on the 32-bit version:
// a non-zero upper half decides the answer, otherwise the lower half does.
inline bitcount_t flog2(uint64_t v)
{
    const uint32_t upper = v >> 32;
    if (upper)
        return 32+flog2(upper);

    const uint32_t lower = uint32_t(v);
    return flog2(lower);
}
|
171
|
+
|
172
|
+
// Portable count of trailing zero bits in a 64-bit value, built on the
// 32-bit version: a non-zero lower half decides the answer, otherwise the
// upper half's count is offset by 32.
inline bitcount_t trailingzeros(uint64_t v)
{
    const uint32_t lower = uint32_t(v);
    if (lower)
        return trailingzeros(lower);

    const uint32_t upper = v >> 32;
    return trailingzeros(upper)+32;
}
|
179
|
+
|
180
|
+
#endif
|
181
|
+
|
182
|
+
template <typename UInt>
|
183
|
+
inline bitcount_t clog2(UInt v)
|
184
|
+
{
|
185
|
+
return flog2(v) + ((v & (-v)) != v);
|
186
|
+
}
|
187
|
+
|
188
|
+
// Computes x + y + carryin in UInt arithmetic and returns the (wrapped)
// sum.  *carryout is set to true when either of the two additions wrapped
// around, false otherwise.
template <typename UInt>
inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
{
    const UInt y_with_carry = y + carryin;
    const UInt sum = x + y_with_carry;

    const bool carry_step_wrapped = y_with_carry < y;
    const bool add_step_wrapped = sum < x;
    *carryout = carry_step_wrapped || add_step_wrapped;

    return sum;
}
|
196
|
+
|
197
|
+
// Computes x - (y + carryin) in UInt arithmetic and returns the (wrapped)
// difference.  *carryout is set to true when a borrow occurred, i.e. when
// adding the incoming borrow to y wrapped, or when the subtraction wrapped.
template <typename UInt>
inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
{
    const UInt y_with_borrow = y + carryin;
    const UInt difference = x - y_with_borrow;

    const bool borrow_step_wrapped = y_with_borrow < y;
    const bool sub_step_wrapped = difference > x;
    *carryout = borrow_step_wrapped || sub_step_wrapped;

    return difference;
}
|
205
|
+
|
206
|
+
|
207
|
+
template <typename UInt, typename UIntX2>
|
208
|
+
class uint_x4 {
|
209
|
+
// private:
|
210
|
+
public:
|
211
|
+
union {
|
212
|
+
#if PCG_LITTLE_ENDIAN
|
213
|
+
struct {
|
214
|
+
UInt v0, v1, v2, v3;
|
215
|
+
} w;
|
216
|
+
struct {
|
217
|
+
UIntX2 v01, v23;
|
218
|
+
} d;
|
219
|
+
#else
|
220
|
+
struct {
|
221
|
+
UInt v3, v2, v1, v0;
|
222
|
+
} w;
|
223
|
+
struct {
|
224
|
+
UIntX2 v23, v01;
|
225
|
+
} d;
|
226
|
+
#endif
|
227
|
+
// For the array access versions, the code that uses the array
|
228
|
+
// must handle endian itself. Yuck.
|
229
|
+
UInt wa[4];
|
230
|
+
UIntX2 da[2];
|
231
|
+
};
|
232
|
+
|
233
|
+
public:
|
234
|
+
uint_x4() = default;
|
235
|
+
|
236
|
+
constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0)
|
237
|
+
#if PCG_LITTLE_ENDIAN
|
238
|
+
: w{v0, v1, v2, v3}
|
239
|
+
#else
|
240
|
+
: w{v3, v2, v1, v0}
|
241
|
+
#endif
|
242
|
+
{
|
243
|
+
// Nothing (else) to do
|
244
|
+
}
|
245
|
+
|
246
|
+
constexpr uint_x4(UIntX2 v23, UIntX2 v01)
|
247
|
+
#if PCG_LITTLE_ENDIAN
|
248
|
+
: d{v01,v23}
|
249
|
+
#else
|
250
|
+
: d{v23,v01}
|
251
|
+
#endif
|
252
|
+
{
|
253
|
+
// Nothing (else) to do
|
254
|
+
}
|
255
|
+
|
256
|
+
template<class Integral,
|
257
|
+
typename std::enable_if<(std::is_integral<Integral>::value
|
258
|
+
&& sizeof(Integral) <= sizeof(UIntX2))
|
259
|
+
>::type* = nullptr>
|
260
|
+
constexpr uint_x4(Integral v01)
|
261
|
+
#if PCG_LITTLE_ENDIAN
|
262
|
+
: d{UIntX2(v01),0UL}
|
263
|
+
#else
|
264
|
+
: d{0UL,UIntX2(v01)}
|
265
|
+
#endif
|
266
|
+
{
|
267
|
+
// Nothing (else) to do
|
268
|
+
}
|
269
|
+
|
270
|
+
explicit constexpr operator uint64_t() const
|
271
|
+
{
|
272
|
+
return d.v01;
|
273
|
+
}
|
274
|
+
|
275
|
+
explicit constexpr operator uint32_t() const
|
276
|
+
{
|
277
|
+
return w.v0;
|
278
|
+
}
|
279
|
+
|
280
|
+
explicit constexpr operator int() const
|
281
|
+
{
|
282
|
+
return w.v0;
|
283
|
+
}
|
284
|
+
|
285
|
+
explicit constexpr operator uint16_t() const
|
286
|
+
{
|
287
|
+
return w.v0;
|
288
|
+
}
|
289
|
+
|
290
|
+
explicit constexpr operator uint8_t() const
|
291
|
+
{
|
292
|
+
return w.v0;
|
293
|
+
}
|
294
|
+
|
295
|
+
typedef typename std::conditional<std::is_same<uint64_t,
|
296
|
+
unsigned long>::value,
|
297
|
+
unsigned long long,
|
298
|
+
unsigned long>::type
|
299
|
+
uint_missing_t;
|
300
|
+
|
301
|
+
explicit constexpr operator uint_missing_t() const
|
302
|
+
{
|
303
|
+
return d.v01;
|
304
|
+
}
|
305
|
+
|
306
|
+
explicit constexpr operator bool() const
|
307
|
+
{
|
308
|
+
return d.v01 || d.v23;
|
309
|
+
}
|
310
|
+
|
311
|
+
template<typename U, typename V>
|
312
|
+
friend uint_x4<U,V> operator*(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
313
|
+
|
314
|
+
template<typename U, typename V>
|
315
|
+
friend std::pair< uint_x4<U,V>,uint_x4<U,V> >
|
316
|
+
divmod(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
317
|
+
|
318
|
+
template<typename U, typename V>
|
319
|
+
friend uint_x4<U,V> operator+(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
320
|
+
|
321
|
+
template<typename U, typename V>
|
322
|
+
friend uint_x4<U,V> operator-(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
323
|
+
|
324
|
+
template<typename U, typename V>
|
325
|
+
friend uint_x4<U,V> operator<<(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
326
|
+
|
327
|
+
template<typename U, typename V>
|
328
|
+
friend uint_x4<U,V> operator>>(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
329
|
+
|
330
|
+
template<typename U, typename V>
|
331
|
+
friend uint_x4<U,V> operator&(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
332
|
+
|
333
|
+
template<typename U, typename V>
|
334
|
+
friend uint_x4<U,V> operator|(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
335
|
+
|
336
|
+
template<typename U, typename V>
|
337
|
+
friend uint_x4<U,V> operator^(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
338
|
+
|
339
|
+
template<typename U, typename V>
|
340
|
+
friend bool operator==(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
341
|
+
|
342
|
+
template<typename U, typename V>
|
343
|
+
friend bool operator!=(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
344
|
+
|
345
|
+
template<typename U, typename V>
|
346
|
+
friend bool operator<(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
347
|
+
|
348
|
+
template<typename U, typename V>
|
349
|
+
friend bool operator<=(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
350
|
+
|
351
|
+
template<typename U, typename V>
|
352
|
+
friend bool operator>(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
353
|
+
|
354
|
+
template<typename U, typename V>
|
355
|
+
friend bool operator>=(const uint_x4<U,V>&, const uint_x4<U,V>&);
|
356
|
+
|
357
|
+
template<typename U, typename V>
|
358
|
+
friend uint_x4<U,V> operator~(const uint_x4<U,V>&);
|
359
|
+
|
360
|
+
template<typename U, typename V>
|
361
|
+
friend uint_x4<U,V> operator-(const uint_x4<U,V>&);
|
362
|
+
|
363
|
+
template<typename U, typename V>
|
364
|
+
friend bitcount_t flog2(const uint_x4<U,V>&);
|
365
|
+
|
366
|
+
template<typename U, typename V>
|
367
|
+
friend bitcount_t trailingzeros(const uint_x4<U,V>&);
|
368
|
+
|
369
|
+
uint_x4& operator*=(const uint_x4& rhs)
|
370
|
+
{
|
371
|
+
uint_x4 result = *this * rhs;
|
372
|
+
return *this = result;
|
373
|
+
}
|
374
|
+
|
375
|
+
uint_x4& operator/=(const uint_x4& rhs)
|
376
|
+
{
|
377
|
+
uint_x4 result = *this / rhs;
|
378
|
+
return *this = result;
|
379
|
+
}
|
380
|
+
|
381
|
+
uint_x4& operator%=(const uint_x4& rhs)
|
382
|
+
{
|
383
|
+
uint_x4 result = *this % rhs;
|
384
|
+
return *this = result;
|
385
|
+
}
|
386
|
+
|
387
|
+
uint_x4& operator+=(const uint_x4& rhs)
|
388
|
+
{
|
389
|
+
uint_x4 result = *this + rhs;
|
390
|
+
return *this = result;
|
391
|
+
}
|
392
|
+
|
393
|
+
uint_x4& operator-=(const uint_x4& rhs)
|
394
|
+
{
|
395
|
+
uint_x4 result = *this - rhs;
|
396
|
+
return *this = result;
|
397
|
+
}
|
398
|
+
|
399
|
+
uint_x4& operator&=(const uint_x4& rhs)
|
400
|
+
{
|
401
|
+
uint_x4 result = *this & rhs;
|
402
|
+
return *this = result;
|
403
|
+
}
|
404
|
+
|
405
|
+
uint_x4& operator|=(const uint_x4& rhs)
|
406
|
+
{
|
407
|
+
uint_x4 result = *this | rhs;
|
408
|
+
return *this = result;
|
409
|
+
}
|
410
|
+
|
411
|
+
uint_x4& operator^=(const uint_x4& rhs)
|
412
|
+
{
|
413
|
+
uint_x4 result = *this ^ rhs;
|
414
|
+
return *this = result;
|
415
|
+
}
|
416
|
+
|
417
|
+
uint_x4& operator>>=(bitcount_t shift)
|
418
|
+
{
|
419
|
+
uint_x4 result = *this >> shift;
|
420
|
+
return *this = result;
|
421
|
+
}
|
422
|
+
|
423
|
+
uint_x4& operator<<=(bitcount_t shift)
|
424
|
+
{
|
425
|
+
uint_x4 result = *this << shift;
|
426
|
+
return *this = result;
|
427
|
+
}
|
428
|
+
|
429
|
+
};
|
430
|
+
|
431
|
+
template<typename U, typename V>
|
432
|
+
bitcount_t flog2(const uint_x4<U,V>& v)
|
433
|
+
{
|
434
|
+
#if PCG_LITTLE_ENDIAN
|
435
|
+
for (uint8_t i = 4; i !=0; /* dec in loop */) {
|
436
|
+
--i;
|
437
|
+
#else
|
438
|
+
for (uint8_t i = 0; i < 4; ++i) {
|
439
|
+
#endif
|
440
|
+
if (v.wa[i] == 0)
|
441
|
+
continue;
|
442
|
+
return flog2(v.wa[i]) + (sizeof(U)*CHAR_BIT)*i;
|
443
|
+
}
|
444
|
+
abort();
|
445
|
+
}
|
446
|
+
|
447
|
+
template<typename U, typename V>
|
448
|
+
bitcount_t trailingzeros(const uint_x4<U,V>& v)
|
449
|
+
{
|
450
|
+
#if PCG_LITTLE_ENDIAN
|
451
|
+
for (uint8_t i = 0; i < 4; ++i) {
|
452
|
+
#else
|
453
|
+
for (uint8_t i = 4; i !=0; /* dec in loop */) {
|
454
|
+
--i;
|
455
|
+
#endif
|
456
|
+
if (v.wa[i] != 0)
|
457
|
+
return trailingzeros(v.wa[i]) + (sizeof(U)*CHAR_BIT)*i;
|
458
|
+
}
|
459
|
+
return (sizeof(U)*CHAR_BIT)*4;
|
460
|
+
}
|
461
|
+
|
462
|
+
template <typename UInt, typename UIntX2>
|
463
|
+
std::pair< uint_x4<UInt,UIntX2>, uint_x4<UInt,UIntX2> >
|
464
|
+
divmod(const uint_x4<UInt,UIntX2>& orig_dividend,
|
465
|
+
const uint_x4<UInt,UIntX2>& divisor)
|
466
|
+
{
|
467
|
+
// If the dividend is less than the divisor, the answer is always zero.
|
468
|
+
// This takes care of boundary cases like 0/x (which would otherwise be
|
469
|
+
// problematic because we can't take the log of zero. (The boundary case
|
470
|
+
// of division by zero is undefined.)
|
471
|
+
if (orig_dividend < divisor)
|
472
|
+
return { uint_x4<UInt,UIntX2>(0UL), orig_dividend };
|
473
|
+
|
474
|
+
auto dividend = orig_dividend;
|
475
|
+
|
476
|
+
auto log2_divisor = flog2(divisor);
|
477
|
+
auto log2_dividend = flog2(dividend);
|
478
|
+
// assert(log2_dividend >= log2_divisor);
|
479
|
+
bitcount_t logdiff = log2_dividend - log2_divisor;
|
480
|
+
|
481
|
+
constexpr uint_x4<UInt,UIntX2> ONE(1UL);
|
482
|
+
if (logdiff == 0)
|
483
|
+
return { ONE, dividend - divisor };
|
484
|
+
|
485
|
+
// Now we change the log difference to
|
486
|
+
// floor(log2(divisor)) - ceil(log2(dividend))
|
487
|
+
// to ensure that we *underestimate* the result.
|
488
|
+
logdiff -= 1;
|
489
|
+
|
490
|
+
uint_x4<UInt,UIntX2> quotient(0UL);
|
491
|
+
|
492
|
+
auto qfactor = ONE << logdiff;
|
493
|
+
auto factor = divisor << logdiff;
|
494
|
+
|
495
|
+
do {
|
496
|
+
dividend -= factor;
|
497
|
+
quotient += qfactor;
|
498
|
+
while (dividend < factor) {
|
499
|
+
factor >>= 1;
|
500
|
+
qfactor >>= 1;
|
501
|
+
}
|
502
|
+
} while (dividend >= divisor);
|
503
|
+
|
504
|
+
return { quotient, dividend };
|
505
|
+
}
|
506
|
+
|
507
|
+
template <typename UInt, typename UIntX2>
|
508
|
+
uint_x4<UInt,UIntX2> operator/(const uint_x4<UInt,UIntX2>& dividend,
|
509
|
+
const uint_x4<UInt,UIntX2>& divisor)
|
510
|
+
{
|
511
|
+
return divmod(dividend, divisor).first;
|
512
|
+
}
|
513
|
+
|
514
|
+
template <typename UInt, typename UIntX2>
|
515
|
+
uint_x4<UInt,UIntX2> operator%(const uint_x4<UInt,UIntX2>& dividend,
|
516
|
+
const uint_x4<UInt,UIntX2>& divisor)
|
517
|
+
{
|
518
|
+
return divmod(dividend, divisor).second;
|
519
|
+
}
|
520
|
+
|
521
|
+
|
522
|
+
template <typename UInt, typename UIntX2>
|
523
|
+
uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
|
524
|
+
const uint_x4<UInt,UIntX2>& b)
|
525
|
+
{
|
526
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
527
|
+
bool carryin = false;
|
528
|
+
bool carryout;
|
529
|
+
UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0);
|
530
|
+
r.w.v0 = UInt(a0b0);
|
531
|
+
r.w.v1 = UInt(a0b0 >> 32);
|
532
|
+
|
533
|
+
UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0);
|
534
|
+
r.w.v2 = UInt(a1b0 >> 32);
|
535
|
+
r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
|
536
|
+
carryin = carryout;
|
537
|
+
r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
|
538
|
+
carryin = carryout;
|
539
|
+
r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
|
540
|
+
|
541
|
+
UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1);
|
542
|
+
carryin = false;
|
543
|
+
r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> 32), carryin, &carryout);
|
544
|
+
carryin = carryout;
|
545
|
+
r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
|
546
|
+
|
547
|
+
carryin = false;
|
548
|
+
r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
|
549
|
+
carryin = carryout;
|
550
|
+
r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
|
551
|
+
carryin = carryout;
|
552
|
+
r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
|
553
|
+
|
554
|
+
UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1);
|
555
|
+
carryin = false;
|
556
|
+
r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
|
557
|
+
carryin = carryout;
|
558
|
+
r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> 32), carryin, &carryout);
|
559
|
+
|
560
|
+
r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01;
|
561
|
+
|
562
|
+
return r;
|
563
|
+
}
|
564
|
+
|
565
|
+
|
566
|
+
template <typename UInt, typename UIntX2>
|
567
|
+
uint_x4<UInt,UIntX2> operator+(const uint_x4<UInt,UIntX2>& a,
|
568
|
+
const uint_x4<UInt,UIntX2>& b)
|
569
|
+
{
|
570
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
571
|
+
|
572
|
+
bool carryin = false;
|
573
|
+
bool carryout;
|
574
|
+
r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
|
575
|
+
carryin = carryout;
|
576
|
+
r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
|
577
|
+
carryin = carryout;
|
578
|
+
r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
|
579
|
+
carryin = carryout;
|
580
|
+
r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
|
581
|
+
|
582
|
+
return r;
|
583
|
+
}
|
584
|
+
|
585
|
+
template <typename UInt, typename UIntX2>
|
586
|
+
uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& a,
|
587
|
+
const uint_x4<UInt,UIntX2>& b)
|
588
|
+
{
|
589
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
590
|
+
|
591
|
+
bool carryin = false;
|
592
|
+
bool carryout;
|
593
|
+
r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
|
594
|
+
carryin = carryout;
|
595
|
+
r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
|
596
|
+
carryin = carryout;
|
597
|
+
r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
|
598
|
+
carryin = carryout;
|
599
|
+
r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
|
600
|
+
|
601
|
+
return r;
|
602
|
+
}
|
603
|
+
|
604
|
+
|
605
|
+
template <typename UInt, typename UIntX2>
|
606
|
+
uint_x4<UInt,UIntX2> operator&(const uint_x4<UInt,UIntX2>& a,
|
607
|
+
const uint_x4<UInt,UIntX2>& b)
|
608
|
+
{
|
609
|
+
return uint_x4<UInt,UIntX2>(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01);
|
610
|
+
}
|
611
|
+
|
612
|
+
template <typename UInt, typename UIntX2>
|
613
|
+
uint_x4<UInt,UIntX2> operator|(const uint_x4<UInt,UIntX2>& a,
|
614
|
+
const uint_x4<UInt,UIntX2>& b)
|
615
|
+
{
|
616
|
+
return uint_x4<UInt,UIntX2>(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01);
|
617
|
+
}
|
618
|
+
|
619
|
+
template <typename UInt, typename UIntX2>
|
620
|
+
uint_x4<UInt,UIntX2> operator^(const uint_x4<UInt,UIntX2>& a,
|
621
|
+
const uint_x4<UInt,UIntX2>& b)
|
622
|
+
{
|
623
|
+
return uint_x4<UInt,UIntX2>(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01);
|
624
|
+
}
|
625
|
+
|
626
|
+
template <typename UInt, typename UIntX2>
|
627
|
+
uint_x4<UInt,UIntX2> operator~(const uint_x4<UInt,UIntX2>& v)
|
628
|
+
{
|
629
|
+
return uint_x4<UInt,UIntX2>(~v.d.v23, ~v.d.v01);
|
630
|
+
}
|
631
|
+
|
632
|
+
template <typename UInt, typename UIntX2>
|
633
|
+
uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& v)
|
634
|
+
{
|
635
|
+
return uint_x4<UInt,UIntX2>(0UL,0UL) - v;
|
636
|
+
}
|
637
|
+
|
638
|
+
template <typename UInt, typename UIntX2>
|
639
|
+
bool operator==(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
640
|
+
{
|
641
|
+
return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23);
|
642
|
+
}
|
643
|
+
|
644
|
+
template <typename UInt, typename UIntX2>
|
645
|
+
bool operator!=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
646
|
+
{
|
647
|
+
return !operator==(a,b);
|
648
|
+
}
|
649
|
+
|
650
|
+
|
651
|
+
template <typename UInt, typename UIntX2>
|
652
|
+
bool operator<(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
653
|
+
{
|
654
|
+
return (a.d.v23 < b.d.v23)
|
655
|
+
|| ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01));
|
656
|
+
}
|
657
|
+
|
658
|
+
template <typename UInt, typename UIntX2>
|
659
|
+
bool operator>(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
660
|
+
{
|
661
|
+
return operator<(b,a);
|
662
|
+
}
|
663
|
+
|
664
|
+
template <typename UInt, typename UIntX2>
|
665
|
+
bool operator<=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
666
|
+
{
|
667
|
+
return !(operator<(b,a));
|
668
|
+
}
|
669
|
+
|
670
|
+
template <typename UInt, typename UIntX2>
|
671
|
+
bool operator>=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
|
672
|
+
{
|
673
|
+
return !(operator<(a,b));
|
674
|
+
}
|
675
|
+
|
676
|
+
|
677
|
+
|
678
|
+
template <typename UInt, typename UIntX2>
|
679
|
+
uint_x4<UInt,UIntX2> operator<<(const uint_x4<UInt,UIntX2>& v,
|
680
|
+
const bitcount_t shift)
|
681
|
+
{
|
682
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
683
|
+
const bitcount_t bits = sizeof(UInt) * CHAR_BIT;
|
684
|
+
const bitcount_t bitmask = bits - 1;
|
685
|
+
const bitcount_t shiftdiv = shift / bits;
|
686
|
+
const bitcount_t shiftmod = shift & bitmask;
|
687
|
+
|
688
|
+
if (shiftmod) {
|
689
|
+
UInt carryover = 0;
|
690
|
+
#if PCG_LITTLE_ENDIAN
|
691
|
+
for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
|
692
|
+
#else
|
693
|
+
for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
|
694
|
+
--out, --in;
|
695
|
+
#endif
|
696
|
+
r.wa[out] = (v.wa[in] << shiftmod) | carryover;
|
697
|
+
carryover = (v.wa[in] >> (bits - shiftmod));
|
698
|
+
}
|
699
|
+
} else {
|
700
|
+
#if PCG_LITTLE_ENDIAN
|
701
|
+
for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
|
702
|
+
#else
|
703
|
+
for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
|
704
|
+
--out, --in;
|
705
|
+
#endif
|
706
|
+
r.wa[out] = v.wa[in];
|
707
|
+
}
|
708
|
+
}
|
709
|
+
|
710
|
+
return r;
|
711
|
+
}
|
712
|
+
|
713
|
+
template <typename UInt, typename UIntX2>
|
714
|
+
uint_x4<UInt,UIntX2> operator>>(const uint_x4<UInt,UIntX2>& v,
|
715
|
+
const bitcount_t shift)
|
716
|
+
{
|
717
|
+
uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
|
718
|
+
const bitcount_t bits = sizeof(UInt) * CHAR_BIT;
|
719
|
+
const bitcount_t bitmask = bits - 1;
|
720
|
+
const bitcount_t shiftdiv = shift / bits;
|
721
|
+
const bitcount_t shiftmod = shift & bitmask;
|
722
|
+
|
723
|
+
if (shiftmod) {
|
724
|
+
UInt carryover = 0;
|
725
|
+
#if PCG_LITTLE_ENDIAN
|
726
|
+
for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
|
727
|
+
--out, --in;
|
728
|
+
#else
|
729
|
+
for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
|
730
|
+
#endif
|
731
|
+
r.wa[out] = (v.wa[in] >> shiftmod) | carryover;
|
732
|
+
carryover = (v.wa[in] << (bits - shiftmod));
|
733
|
+
}
|
734
|
+
} else {
|
735
|
+
#if PCG_LITTLE_ENDIAN
|
736
|
+
for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
|
737
|
+
--out, --in;
|
738
|
+
#else
|
739
|
+
for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
|
740
|
+
#endif
|
741
|
+
r.wa[out] = v.wa[in];
|
742
|
+
}
|
743
|
+
}
|
744
|
+
|
745
|
+
return r;
|
746
|
+
}
|
747
|
+
|
748
|
+
} // namespace pcg_extras
|
749
|
+
|
750
|
+
#endif // PCG_UINT128_HPP_INCLUDED
|