fastremap 1.16.0__cp39-cp39-win32.whl → 1.17.0__cp39-cp39-win32.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fastremap/ipt.hpp ADDED
@@ -0,0 +1,354 @@
1
+ /* ipt.hpp - In-Place Transposition
2
+ *
3
+ * When transitioning between different media,
4
+ * e.g. CPU to GPU, CPU to Network, CPU to disk,
5
+ * it's often necessary to physically transpose
6
+ * multi-dimensional arrays to reformat as C or
7
+ * Fortran order. Transposing matrices is also
8
+ * a common action in linear algebra, but often
9
+ * you can get away with just changing the strides.
10
+ *
11
+ * An out-of-place transposition is easy to write,
12
+ * often faster, but will spike peak memory consumption.
13
+ *
14
+ * This library grants the user the option of performing
15
+ * an in-place transposition which trades CPU time for
16
+ * peak memory usage.
17
+ *
18
+ * Author: William Silversmith
19
+ * Date: Feb. 2019
20
+ */
21
+
22
+ #include <algorithm>
23
+ #include <cstdint>
24
+ #include <vector>
25
+
26
+ #ifndef IN_PLACE_TRANSPOSE_H
27
+ #define IN_PLACE_TRANSPOSE_H
28
+
29
+ // ipt = in-place transpose
30
+ // call as:
31
+ // 2d: ipt::ipt<T>(arr, sx, sy);
32
+ // 3d: ipt::ipt<T>(arr, sx, sy, sz);
33
+ // 4d: ipt::ipt<T>(arr, sx, sy, sz, sw);
34
+
35
+ namespace ipt {
36
+
37
template <typename T>
void square_ipt(T* arr, const size_t sx, const size_t sy) {
  // In-place transpose of a square matrix (requires sx == sy).
  // Sweeps the upper triangle and exchanges each element with its
  // mirror across the main diagonal. Diagonal elements (x == y)
  // exchange with themselves, a harmless no-op.
  for (size_t y = 0; y < sy; y++) {
    const size_t row_start = sx * y; // hoisted row offset
    for (size_t x = y; x < sx; x++) {
      const size_t a = x + row_start; // C-order index of (x, y)
      const size_t b = y + sy * x;    // its transposed position
      const T held = arr[a];
      arr[a] = arr[b];
      arr[b] = held;
    }
  }
}
58
+
59
+ /* A permutation, P(k), is a mapping of
60
+ * one arrangement of numbers to another.
61
+ * For an m x n array, the permutation
62
+ * mapping from C to Fortran order is:
63
+ *
64
+ * P(k) := mk mod mn - 1
65
+ * iP(k) := nk mod mn - 1 (the inverse)
66
+ *
67
+ * Where does this come from? Assume we are
68
+ * going from C to Fortran order (it doesn't
69
+ * matter either way). The indices are defined
70
+ * as:
71
+ *
72
+ * k = C(x,y) = x + sx * y
73
+ * F(x,y) = y + sy * x
74
+ *
75
+ * The permutation P(k) is the transformation:
76
+ *
77
+ * P(C(x,y)) = F(x,y)
78
+ *
79
+ * 1. P(x + sx * y) = y + sy * x
80
+ * 2. sy (x + sx y) = sy x + sx sy y
81
+ * 3. Let q = (sx sy - 1)
82
+ * 4. sy x + sx sy y % q
83
+ * 5. ((sy x % q) + (sx sy y % q)) % q by distributive identity
84
+ * 6. sy x is identical b/c q is always bigger
85
+ * 7. sx sy y reduces to y
86
+ * 8 q is always bigger than sy x + y so it disappears
87
+ *
88
+ * ==> P(k) = y + sy * x = F(x,y)
89
+ * ==> P(k) = sy * k % (sx sy - 1)
90
+ *
91
+ * Note that P(0) and P(q) are always 0 and q respectively.
92
+ *
93
+ * Now we need a way to implement this insight.
94
+ * How can we move the data around without using too
95
+ * much extra space? A simple algorithm is
96
+ * "follow-the-cycles". Each time you try moving a
97
+ * k to P(k), it displaces the resident tile. Eventually,
98
+ * this forms a cycle. When you reach the end of a cycle,
99
+ * you can stop processing and move to unvisited parts of
100
+ * the array. This requires storing a packed bit representation
101
+ * of where we've visited to make sure we get everything.
102
+ * This means we need to store between 2.0x and 1.016x
103
+ * memory in the size of the original array depending on its
104
+ * data type (2.0x would be a transpose of another bit packed
105
+ * array and 1.016x would be 64-bit data types).
106
+ *
107
+ * There are fancier algorithms that use divide-and-conquer,
108
+ * and SIMD tricks, and near zero extra memory, but
109
+ * this is a good place to start. Fwiw, the bit vector
110
+ * has an O(nm) time complexity (really 2nm) while the
111
+ * sans-bit vector algorithms are O(nm log nm).
112
+ */
113
template <typename T>
void rect_ipt(T* arr, const size_t sx, const size_t sy) {
  // In-place transpose of a rectangular sx x sy array using the
  // follow-the-cycles algorithm described above: move each element
  // to its permuted index, chase the displaced element until the
  // cycle closes, and use a bit vector to visit each cycle once.
  const size_t sxy = sx * sy;

  // Guard: zero or one element needs no work; without this check an
  // empty array would index visited[sxy - 1] out of bounds (UB).
  if (sxy <= 1) {
    return;
  }

  std::vector<bool> visited(sxy);

  // Index 0 and index sxy - 1 are fixed points of the permutation.
  visited[0] = true;
  visited[sxy - 1] = true;

  const size_t q = sxy - 1;

  size_t k, next_k;
  T tmp1, tmp2;

  for (size_t i = 1; i < q; i++) {
    if (visited[i]) {
      continue;
    }

    k = i;
    tmp1 = arr[k];
    // P(k) = sy * k mod q, computed without a modulo:
    // for valid k, sy*k / q == k / sx, so subtracting q*(k / sx)
    // performs the reduction.
    next_k = sy * k - q * (k / sx);

    while (!visited[next_k]) {
      tmp2 = arr[next_k];
      arr[next_k] = tmp1;
      tmp1 = tmp2;
      visited[next_k] = true;
      k = next_k;
      next_k = sy * k - q * (k / sx); // P(k)
    }
  }
}
147
+
148
+ // note: sx == sy == sz... find better convention?
149
+ // still good for multiple-dispatch.
150
template <typename T>
void square_ipt(
  T* arr,
  const size_t sx, const size_t sy, const size_t sz
) {
  // In-place transpose of a cubic volume (requires sx == sy == sz).
  // Element (x, y, z) trades places with element (z, y, x). Starting
  // x at z visits each off-diagonal pair exactly once; when x == z
  // an element is paired with itself (no-op swap).
  const size_t sxy = sx * sy;
  const size_t syz = sy * sz;

  for (size_t z = 0; z < sz; z++) {
    for (size_t y = 0; y < sy; y++) {
      const size_t slab = sx * y + sxy * z; // hoisted row/slice offset
      for (size_t x = z; x < sx; x++) {
        const size_t a = x + slab;             // C-order index of (x, y, z)
        const size_t b = z + sz * y + syz * x; // index of (z, y, x)
        const T held = arr[a];
        arr[a] = arr[b];
        arr[b] = held;
      }
    }
  }
}
178
+
179
inline size_t P_3d(
  const size_t k,
  const size_t sx, const size_t sy, const size_t sz
) {
  // Maps a C-order (row-major) index k = x + sx*y + sx*sy*z to the
  // corresponding Fortran-order (column-major) index
  // z + sz*y + sz*sy*x.
  const size_t sxy = sx * sy;

  const size_t z = k / sxy;
  const size_t rem = k - z * sxy; // k with the z component stripped
  const size_t y = rem / sx;
  const size_t x = rem - y * sx;

  return z + sz * (y + sy * x);
}
192
+
193
+ template <typename T>
194
+ void rect_ipt(
195
+ T* arr,
196
+ const size_t sx, const size_t sy, const size_t sz
197
+ ) {
198
+ const size_t sxy = sx * sy;
199
+ const size_t N = sxy * sz;
200
+
201
+ std::vector<bool> visited;
202
+ visited.resize(N);
203
+
204
+ visited[0] = true;
205
+ visited[N - 1] = true;
206
+
207
+ size_t k, next_k;
208
+ T tmp1 = 0, tmp2 = 0;
209
+
210
+ for (size_t i = 1; i < (N - 1); i++) {
211
+ if (visited[i]) {
212
+ continue;
213
+ }
214
+
215
+ k = i;
216
+ tmp1 = arr[k];
217
+ next_k = P_3d(k, sx, sy, sz);
218
+ while (!visited[next_k]) {
219
+ tmp2 = arr[next_k];
220
+ arr[next_k] = tmp1;
221
+ tmp1 = tmp2;
222
+ visited[next_k] = true;
223
+ k = next_k;
224
+ next_k = P_3d(k, sx, sy, sz);
225
+ }
226
+ }
227
+ }
228
+
229
inline size_t P_4d(
  const size_t k,
  const size_t sx, const size_t sy, const size_t sz, const size_t sw
) {
  // Maps a C-order index k = x + sx*y + sx*sy*z + sx*sy*sz*w to the
  // corresponding Fortran-order index w + sw*z + sw*sz*y + sw*sz*sy*x.
  const size_t sxy = sx * sy;
  const size_t sxyz = sxy * sz;

  // Peel off coordinates from the most-significant stride downward,
  // keeping a running remainder.
  size_t rem = k;
  const size_t w = rem / sxyz;
  rem -= w * sxyz;
  const size_t z = rem / sxy;
  rem -= z * sxy;
  const size_t y = rem / sx;
  const size_t x = rem - y * sx;

  return w + sw * (z + sz * (y + sy * x));
}
245
+
246
+ template <typename T>
247
+ void rect_ipt(
248
+ T* arr,
249
+ const size_t sx, const size_t sy, const size_t sz, const size_t sw
250
+ ) {
251
+
252
+ const size_t N = sx * sy * sz * sw;
253
+
254
+ std::vector<bool> visited;
255
+ visited.resize(N);
256
+
257
+ visited[0] = true;
258
+ visited[N - 1] = true;
259
+
260
+ size_t k, next_k;
261
+ T tmp1 = 0, tmp2 = 0;
262
+
263
+ for (size_t i = 1; i < (N - 1); i++) {
264
+ if (visited[i]) {
265
+ continue;
266
+ }
267
+
268
+ k = i;
269
+ tmp1 = arr[k];
270
+ next_k = P_4d(k, sx, sy, sz, sw);
271
+ while (!visited[next_k]) {
272
+ tmp2 = arr[next_k];
273
+ arr[next_k] = tmp1;
274
+ tmp1 = tmp2;
275
+ visited[next_k] = true;
276
+ k = next_k;
277
+ next_k = P_4d(k, sx, sy, sz, sw);
278
+ }
279
+ }
280
+ }
281
+
282
template <typename T>
void ipt(T* arr, const size_t sx) {
  // A 1D array is its own transpose, so there is nothing to do.
  // Kept for a uniform interface across dimensionalities.
}
286
+
287
template <typename T>
void ipt(T* arr, const size_t sx, const size_t sy) {
  // 2D dispatcher: empty and single-element arrays need no work;
  // square matrices take the cheaper pairwise-swap routine, all
  // others the follow-the-cycles permutation.
  if (sx * sy <= 1) {
    return;
  }

  if (sx == sy) {
    square_ipt(arr, sx, sy);
    return;
  }

  rect_ipt(arr, sx, sy);
}
300
+
301
template <typename T>
void ipt(T* arr, const size_t sx, const size_t sy, const size_t sz) {
  // 3D dispatcher: empty and single-element volumes need no work;
  // cubes take the cheaper pairwise-swap routine, all others the
  // follow-the-cycles permutation.
  if (sx * sy * sz <= 1) {
    return;
  }

  if (sx == sy && sy == sz) {
    square_ipt(arr, sx, sy, sz);
    return;
  }

  rect_ipt(arr, sx, sy, sz);
}
314
+
315
template <typename T>
void ipt(
  T* arr,
  const size_t sx, const size_t sy,
  const size_t sz, const size_t sw
) {
  // 4D dispatcher. No square specialization exists for 4D, so every
  // non-trivial array takes the follow-the-cycles path.
  if (sx * sy * sz * sw <= 1) {
    return;
  }

  rect_ipt(arr, sx, sy, sz, sw);
}
327
+
328
+ };
329
+
330
+ namespace pyipt {
331
+
332
+ template <typename T>
333
+ void _ipt2d(T* arr, const size_t sx, const size_t sy) {
334
+ ipt::ipt(arr, sx, sy);
335
+ }
336
+
337
+ template <typename T>
338
+ void _ipt3d(T* arr, const size_t sx, const size_t sy, const size_t sz) {
339
+ ipt::ipt(arr, sx, sy, sz);
340
+ }
341
+
342
+ template <typename T>
343
+ void _ipt4d(
344
+ T* arr,
345
+ const size_t sx, const size_t sy,
346
+ const size_t sz, const size_t sw
347
+ ) {
348
+
349
+ ipt::ipt(arr, sx, sy, sz, sw);
350
+ }
351
+
352
+ };
353
+
354
+ #endif
fastremap/py.typed ADDED
File without changes