halide 19.0.0__cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. halide/__init__.py +39 -0
  2. halide/_generator_helpers.py +835 -0
  3. halide/bin/adams2019_retrain_cost_model +0 -0
  4. halide/bin/adams2019_weightsdir_to_weightsfile +0 -0
  5. halide/bin/anderson2021_retrain_cost_model +0 -0
  6. halide/bin/anderson2021_weightsdir_to_weightsfile +0 -0
  7. halide/bin/featurization_to_sample +0 -0
  8. halide/bin/gengen +0 -0
  9. halide/bin/get_host_target +0 -0
  10. halide/halide_.cpython-39-x86_64-linux-gnu.so +0 -0
  11. halide/imageio.py +60 -0
  12. halide/include/Halide.h +35293 -0
  13. halide/include/HalideBuffer.h +2618 -0
  14. halide/include/HalidePyTorchCudaHelpers.h +64 -0
  15. halide/include/HalidePyTorchHelpers.h +120 -0
  16. halide/include/HalideRuntime.h +2221 -0
  17. halide/include/HalideRuntimeCuda.h +89 -0
  18. halide/include/HalideRuntimeD3D12Compute.h +91 -0
  19. halide/include/HalideRuntimeHexagonDma.h +104 -0
  20. halide/include/HalideRuntimeHexagonHost.h +157 -0
  21. halide/include/HalideRuntimeMetal.h +112 -0
  22. halide/include/HalideRuntimeOpenCL.h +119 -0
  23. halide/include/HalideRuntimeQurt.h +32 -0
  24. halide/include/HalideRuntimeVulkan.h +137 -0
  25. halide/include/HalideRuntimeWebGPU.h +44 -0
  26. halide/lib64/cmake/Halide/FindHalide_LLVM.cmake +152 -0
  27. halide/lib64/cmake/Halide/FindV8.cmake +33 -0
  28. halide/lib64/cmake/Halide/Halide-shared-deps.cmake +0 -0
  29. halide/lib64/cmake/Halide/Halide-shared-targets-release.cmake +29 -0
  30. halide/lib64/cmake/Halide/Halide-shared-targets.cmake +154 -0
  31. halide/lib64/cmake/Halide/HalideConfig.cmake +162 -0
  32. halide/lib64/cmake/Halide/HalideConfigVersion.cmake +65 -0
  33. halide/lib64/cmake/HalideHelpers/FindHalide_WebGPU.cmake +27 -0
  34. halide/lib64/cmake/HalideHelpers/Halide-Interfaces-release.cmake +116 -0
  35. halide/lib64/cmake/HalideHelpers/Halide-Interfaces.cmake +236 -0
  36. halide/lib64/cmake/HalideHelpers/HalideGeneratorHelpers.cmake +1056 -0
  37. halide/lib64/cmake/HalideHelpers/HalideHelpersConfig.cmake +28 -0
  38. halide/lib64/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +54 -0
  39. halide/lib64/cmake/HalideHelpers/HalideTargetHelpers.cmake +99 -0
  40. halide/lib64/cmake/HalideHelpers/MutexCopy.ps1 +31 -0
  41. halide/lib64/cmake/HalideHelpers/TargetExportScript.cmake +55 -0
  42. halide/lib64/cmake/Halide_Python/Halide_Python-targets-release.cmake +30 -0
  43. halide/lib64/cmake/Halide_Python/Halide_Python-targets.cmake +125 -0
  44. halide/lib64/cmake/Halide_Python/Halide_PythonConfig.cmake +26 -0
  45. halide/lib64/cmake/Halide_Python/Halide_PythonConfigVersion.cmake +65 -0
  46. halide/lib64/libHalide.so +0 -0
  47. halide/lib64/libHalidePyStubs.a +0 -0
  48. halide/lib64/libHalide_GenGen.a +0 -0
  49. halide/lib64/libautoschedule_adams2019.so +0 -0
  50. halide/lib64/libautoschedule_anderson2021.so +0 -0
  51. halide/lib64/libautoschedule_li2018.so +0 -0
  52. halide/lib64/libautoschedule_mullapudi2016.so +0 -0
  53. halide/share/doc/Halide/LICENSE.txt +233 -0
  54. halide/share/doc/Halide/README.md +439 -0
  55. halide/share/doc/Halide/doc/BuildingHalideWithCMake.md +626 -0
  56. halide/share/doc/Halide/doc/CodeStyleCMake.md +393 -0
  57. halide/share/doc/Halide/doc/FuzzTesting.md +104 -0
  58. halide/share/doc/Halide/doc/HalideCMakePackage.md +812 -0
  59. halide/share/doc/Halide/doc/Hexagon.md +73 -0
  60. halide/share/doc/Halide/doc/Python.md +844 -0
  61. halide/share/doc/Halide/doc/RunGen.md +283 -0
  62. halide/share/doc/Halide/doc/Testing.md +125 -0
  63. halide/share/doc/Halide/doc/Vulkan.md +287 -0
  64. halide/share/doc/Halide/doc/WebAssembly.md +228 -0
  65. halide/share/doc/Halide/doc/WebGPU.md +128 -0
  66. halide/share/tools/RunGen.h +1470 -0
  67. halide/share/tools/RunGenMain.cpp +642 -0
  68. halide/share/tools/adams2019_autotune_loop.sh +227 -0
  69. halide/share/tools/anderson2021_autotune_loop.sh +591 -0
  70. halide/share/tools/halide_benchmark.h +240 -0
  71. halide/share/tools/halide_image.h +31 -0
  72. halide/share/tools/halide_image_info.h +318 -0
  73. halide/share/tools/halide_image_io.h +2794 -0
  74. halide/share/tools/halide_malloc_trace.h +102 -0
  75. halide/share/tools/halide_thread_pool.h +161 -0
  76. halide/share/tools/halide_trace_config.h +559 -0
  77. halide-19.0.0.data/data/share/cmake/Halide/HalideConfig.cmake +6 -0
  78. halide-19.0.0.data/data/share/cmake/Halide/HalideConfigVersion.cmake +65 -0
  79. halide-19.0.0.data/data/share/cmake/HalideHelpers/HalideHelpersConfig.cmake +6 -0
  80. halide-19.0.0.data/data/share/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +54 -0
  81. halide-19.0.0.dist-info/METADATA +301 -0
  82. halide-19.0.0.dist-info/RECORD +84 -0
  83. halide-19.0.0.dist-info/WHEEL +6 -0
  84. halide-19.0.0.dist-info/licenses/LICENSE.txt +233 -0
halide/include/HalideBuffer.h
@@ -0,0 +1,2618 @@
1
+ /** \file
2
+ * Defines a Buffer type that wraps halide_buffer_t and adds
3
+ * functionality, and methods for more conveniently iterating over the
4
+ * samples in a halide_buffer_t outside of Halide code. */
5
+
6
+ #ifndef HALIDE_RUNTIME_BUFFER_H
7
+ #define HALIDE_RUNTIME_BUFFER_H
8
+
9
+ #include <algorithm>
10
+ #include <atomic>
11
+ #include <cassert>
12
+ #include <cstdint>
13
+ #include <cstdlib>
14
+ #include <cstring>
15
+ #include <limits>
16
+ #include <memory>
17
+ #include <type_traits>
18
+ #include <vector>
19
+
20
+ #ifdef __APPLE__
21
+ #include <AvailabilityVersions.h>
22
+ #include <TargetConditionals.h>
23
+ #endif
24
+
25
+ #if defined(__has_feature)
26
+ #if __has_feature(memory_sanitizer)
27
+ #include <sanitizer/msan_interface.h>
28
+ #endif
29
+ #endif
30
+
31
+ #include "HalideRuntime.h"
32
+
33
+ #ifdef _MSC_VER
34
+ #include <malloc.h>
35
+ #define HALIDE_ALLOCA _alloca
36
+ #else
37
+ #define HALIDE_ALLOCA __builtin_alloca
38
+ #endif
39
+
40
+ // gcc 5.1 has a false positive warning on this code
41
+ #if __GNUC__ == 5 && __GNUC_MINOR__ == 1
42
+ #pragma GCC diagnostic ignored "-Warray-bounds"
43
+ #endif
44
+
45
+ #ifndef HALIDE_RUNTIME_BUFFER_CHECK_INDICES
46
+ #define HALIDE_RUNTIME_BUFFER_CHECK_INDICES 0
47
+ #endif
48
+
49
+ #ifndef HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT
50
+ // Conservatively align buffer allocations to 128 bytes by default.
51
+ // This is enough alignment for all the platforms currently in use.
52
+ // Redefine this in your compiler settings if you desire more/less alignment.
53
+ #define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT 128
54
+ #endif
55
+
56
+ static_assert(((HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT & (HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT - 1)) == 0),
57
+ "HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT must be a power of 2.");
58
+
59
+ // Unfortunately, not all C++17 runtimes support aligned_alloc
60
+ // (it may depend on OS/SDK version); this is provided as an opt-out
61
+ // if you are compiling on a platform that doesn't provide a (good)
62
+ // implementation. (Note that we actually use the C11 `::aligned_alloc()`
63
+ // rather than the C++17 `std::aligned_alloc()` because at least one platform
64
+ // we found supports the former but not the latter.)
65
+ #ifndef HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
66
+
67
+ // clang-format off
68
+ #ifdef _MSC_VER
69
+
70
+ // MSVC doesn't implement aligned_alloc(), even in C++17 mode, and
71
+ // has stated they probably never will, so, always default it off here.
72
+ #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
73
+
74
+ #elif defined(__ANDROID_API__) && __ANDROID_API__ < 28
75
+
76
+ // Android doesn't provide aligned_alloc until API 28
77
+ #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
78
+
79
+ #elif defined(__APPLE__)
80
+
81
+ #if TARGET_OS_OSX && (__MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_15)
82
+
83
+ // macOS doesn't provide aligned_alloc until 10.15
84
+ #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
85
+
86
+ #elif TARGET_OS_IPHONE && (__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_14_0)
87
+
88
+ // iOS doesn't provide aligned_alloc until 14.0
89
+ #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
90
+
91
+ #else
92
+
93
+ // Assume it's ok on all other Apple targets
94
+ #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1
95
+
96
+ #endif
97
+
98
+ #else
99
+
100
+ #if defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)
101
+
102
+ // ARM GNU-A baremetal compiler doesn't provide aligned_alloc as of 12.2
103
+ #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
104
+
105
+ #else
106
+
107
+ // Not Windows, Android, or Apple: just assume it's ok
108
+ #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1
109
+
110
+ #endif
111
+
112
+ #endif
113
+ // clang-format on
114
+
115
+ #endif // HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
116
+
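Both macros above are only defaults guarded by #ifndef, so a build that knows better can override them before this header is included. A minimal sketch (the values and the decision to override are illustrative, not taken from this package):

    // Opt out of aligned_alloc() and request stricter alignment, e.g. because the
    // target libc ships a broken aligned_alloc. Define these before the #include,
    // or pass -DHALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC=0 etc. on the compile line.
    #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
    #define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT 256   // must stay a power of 2
    #include "HalideBuffer.h"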
117
+ namespace Halide {
118
+ namespace Runtime {
119
+
120
+ // Forward-declare our Buffer class
121
+ template<typename T, int Dims, int InClassDimStorage>
122
+ class Buffer;
123
+
124
+ // A helper to check if a parameter pack is entirely implicitly
125
+ // int-convertible to use with std::enable_if
126
+ template<typename... Args>
127
+ struct AllInts : std::false_type {};
128
+
129
+ template<>
130
+ struct AllInts<> : std::true_type {};
131
+
132
+ template<typename T, typename... Args>
133
+ struct AllInts<T, Args...> {
134
+ static const bool value = std::is_convertible<T, int>::value && AllInts<Args...>::value;
135
+ };
136
+
137
+ // Floats and doubles are technically implicitly int-convertible, but
138
+ // doing so produces a warning we treat as an error, so just disallow
139
+ // it here.
140
+ template<typename... Args>
141
+ struct AllInts<float, Args...> : std::false_type {};
142
+
143
+ template<typename... Args>
144
+ struct AllInts<double, Args...> : std::false_type {};
145
+
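As a quick illustration (user code, not part of the shipped header), the trait accepts packs of int-convertible types and rejects any pack containing a float or double:

    #include "HalideBuffer.h"

    static_assert(Halide::Runtime::AllInts<int, short, char>::value,
                  "integral packs are accepted");
    static_assert(!Halide::Runtime::AllInts<int, float, int>::value,
                  "a float anywhere in the pack is rejected");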
146
+ namespace Internal {
147
+ // A helper to detect if there are any zeros in a container
148
+ template<typename Container>
149
+ bool any_zero(const Container &c) {
150
+ for (int i : c) {
151
+ if (i == 0) {
152
+ return true;
153
+ }
154
+ }
155
+ return false;
156
+ }
157
+
158
+ struct DefaultAllocatorFns {
159
+ static inline void *(*default_allocate_fn)(size_t) = nullptr;
160
+ static inline void (*default_deallocate_fn)(void *) = nullptr;
161
+ };
162
+ } // namespace Internal
163
+
164
+ /** A struct acting as a header for allocations owned by the Buffer
165
+ * class itself. */
166
+ struct AllocationHeader {
167
+ void (*deallocate_fn)(void *);
168
+ std::atomic<int> ref_count;
169
+
170
+ // Note that ref_count always starts at 1
171
+ explicit AllocationHeader(void (*deallocate_fn)(void *))
172
+ : deallocate_fn(deallocate_fn), ref_count(1) {
173
+ }
174
+ };
175
+
176
+ /** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */
177
+ enum struct BufferDeviceOwnership : int {
178
+ Allocated, ///< halide_device_free will be called when device ref count goes to zero
179
+ WrappedNative, ///< halide_device_detach_native will be called when device ref count goes to zero
180
+ Unmanaged, ///< No free routine will be called when device ref count goes to zero
181
+ AllocatedDeviceAndHost, ///< Call device_and_host_free when DevRefCount goes to zero.
182
+ Cropped, ///< Call halide_device_release_crop when DevRefCount goes to zero.
183
+ };
184
+
185
+ /** A similar struct for managing device allocations. */
186
+ struct DeviceRefCount {
187
+ // This is only ever constructed when there's something to manage,
188
+ // so start at one.
189
+ std::atomic<int> count{1};
190
+ BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated};
191
+ };
192
+
193
+ constexpr int AnyDims = -1;
194
+
195
+ /** A templated Buffer class that wraps halide_buffer_t and adds
196
+ * functionality. When using Halide from C++, this is the preferred
197
+ * way to create input and output buffers. The overhead of using this
198
+ * class relative to a naked halide_buffer_t is minimal - it uses another
199
+ * ~16 bytes on the stack, and does no dynamic allocations when using
200
+ * it to represent existing memory of a known maximum dimensionality.
201
+ *
202
+ * The template parameter T is the element type. For buffers where the
203
+ * element type is unknown, or may vary, use void or const void.
204
+ *
205
+ * The template parameter Dims is the number of dimensions. For buffers where
206
+ * the dimensionality is not known at compile time, or may vary, use AnyDims.
207
+ *
208
+ * InClassDimStorage is the maximum number of dimensions that can be represented
209
+ * using space inside the class itself. Set it to the maximum dimensionality
210
+ * you expect this buffer to be. If the actual dimensionality exceeds
211
+ * this, heap storage is allocated to track the shape of the buffer.
212
+ * InClassDimStorage defaults to 4, which should cover nearly all usage.
213
+ *
214
+ * The class optionally allocates and owns memory for the image using
215
+ * a shared pointer allocated with the provided allocator. If they are
216
+ * null, malloc and free are used. Any device-side allocation is
217
+ * considered as owned if and only if the host-side allocation is
218
+ * owned. */
219
+ template<typename T = void,
220
+ int Dims = AnyDims,
221
+ int InClassDimStorage = (Dims == AnyDims ? 4 : std::max(Dims, 1))>
222
+ class Buffer {
223
+ /** The underlying halide_buffer_t */
224
+ halide_buffer_t buf = {};
225
+
226
+ /** Some in-class storage for shape of the dimensions. */
227
+ halide_dimension_t shape[InClassDimStorage];
228
+
229
+ /** The allocation owned by this Buffer. NULL if the Buffer does not
230
+ * own the memory. */
231
+ AllocationHeader *alloc = nullptr;
232
+
233
+ /** A reference count for the device allocation owned by this
234
+ * buffer. */
235
+ mutable DeviceRefCount *dev_ref_count = nullptr;
236
+
237
+ /** True if T is of type void or const void */
238
+ static const bool T_is_void = std::is_same<typename std::remove_const<T>::type, void>::value;
239
+
240
+ /** A type function that adds a const qualifier if T is a const type. */
241
+ template<typename T2>
242
+ using add_const_if_T_is_const = typename std::conditional<std::is_const<T>::value, const T2, T2>::type;
243
+
244
+ /** T unless T is (const) void, in which case (const)
245
+ * uint8_t. Useful for providing return types for operator() */
246
+ using not_void_T = typename std::conditional<T_is_void,
247
+ add_const_if_T_is_const<uint8_t>,
248
+ T>::type;
249
+
250
+ /** T with constness removed. Useful for return type of copy(). */
251
+ using not_const_T = typename std::remove_const<T>::type;
252
+
253
+ /** The type the elements are stored as. Equal to not_void_T
254
+ * unless T is a pointer, in which case uint64_t. Halide stores
255
+ * all pointer types as uint64s internally, even on 32-bit
256
+ * systems. */
257
+ using storage_T = typename std::conditional<std::is_pointer<T>::value, uint64_t, not_void_T>::type;
258
+
259
+ public:
260
+ /** True if the Halide type is not void (or const void). */
261
+ static constexpr bool has_static_halide_type = !T_is_void;
262
+
263
+ /** Get the Halide type of T. Callers should not use the result if
264
+ * has_static_halide_type is false. */
265
+ static constexpr halide_type_t static_halide_type() {
266
+ return halide_type_of<typename std::remove_cv<not_void_T>::type>();
267
+ }
268
+
269
+ /** Does this Buffer own the host memory it refers to? */
270
+ bool owns_host_memory() const {
271
+ return alloc != nullptr;
272
+ }
273
+
274
+ static constexpr bool has_static_dimensions = (Dims != AnyDims);
275
+
276
+ /** Callers should not use the result if
277
+ * has_static_dimensions is false. */
278
+ static constexpr int static_dimensions() {
279
+ return Dims;
280
+ }
281
+
282
+ static_assert(!has_static_dimensions || static_dimensions() >= 0);
283
+
284
+ private:
285
+ /** Increment the reference count of any owned allocation */
286
+ void incref() const {
287
+ if (owns_host_memory()) {
288
+ alloc->ref_count++;
289
+ }
290
+ if (buf.device) {
291
+ if (!dev_ref_count) {
292
+ // I seem to have a non-zero dev field but no
293
+ // reference count for it. I must have been given a
294
+ // device allocation by a Halide pipeline, and have
295
+ // never been copied from since. Take sole ownership
296
+ // of it.
297
+ dev_ref_count = new DeviceRefCount;
298
+ }
299
+ dev_ref_count->count++;
300
+ }
301
+ }
302
+
303
+ // Note that this is called "cropped" but can also encompass a slice/embed
304
+ // operation as well.
305
+ struct DevRefCountCropped : DeviceRefCount {
306
+ // We will only store Buffers that have a dynamic number of dimensions.
307
+ // Buffers that are cropped or sliced from need to be converted first to
308
+ // one with a variable number of dimensions. This is required because we cannot possibly
309
+ // know what the actual dimensionality is of the buffer this is a
310
+ // crop or slice from. Since cropping a sliced buffer is also possible,
311
+ // no optimizations can be made for cropped buffers either.
312
+ Buffer<T, AnyDims> cropped_from;
313
+ explicit DevRefCountCropped(const Buffer<T, AnyDims> &cropped_from)
314
+ : cropped_from(cropped_from) {
315
+ ownership = BufferDeviceOwnership::Cropped;
316
+ }
317
+ };
318
+
319
+ /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
320
+ void crop_from(const Buffer<T, AnyDims> &cropped_from) {
321
+ assert(dev_ref_count == nullptr);
322
+ dev_ref_count = new DevRefCountCropped(cropped_from);
323
+ }
324
+
325
+ /** Decrement the reference count of any owned allocation and free host
326
+ * and device memory if it hits zero. Sets alloc to nullptr. */
327
+ void decref(bool device_only = false) {
328
+ if (owns_host_memory() && !device_only) {
329
+ int new_count = --(alloc->ref_count);
330
+ if (new_count == 0) {
331
+ void (*fn)(void *) = alloc->deallocate_fn;
332
+ alloc->~AllocationHeader();
333
+ fn(alloc);
334
+ }
335
+ buf.host = nullptr;
336
+ alloc = nullptr;
337
+ set_host_dirty(false);
338
+ }
339
+ int new_count = 0;
340
+ if (dev_ref_count) {
341
+ new_count = --(dev_ref_count->count);
342
+ }
343
+ if (new_count == 0) {
344
+ if (buf.device) {
345
+ assert(!(alloc && device_dirty()) &&
346
+ "Implicitly freeing a dirty device allocation while a host allocation still lives. "
347
+ "Call device_free explicitly if you want to drop dirty device-side data. "
348
+ "Call copy_to_host explicitly if you want the data copied to the host allocation "
349
+ "before the device allocation is freed.");
350
+ int result = halide_error_code_success;
351
+ if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
352
+ result = buf.device_interface->detach_native(nullptr, &buf);
353
+ } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
354
+ result = buf.device_interface->device_and_host_free(nullptr, &buf);
355
+ } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
356
+ result = buf.device_interface->device_release_crop(nullptr, &buf);
357
+ } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
358
+ result = buf.device_interface->device_free(nullptr, &buf);
359
+ }
360
+ // No reasonable way to return the error, but we can at least assert-fail in debug builds.
361
+ assert((result == halide_error_code_success) && "device_interface call returned a nonzero result in Buffer::decref()");
362
+ (void)result;
363
+ }
364
+ if (dev_ref_count) {
365
+ if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
366
+ delete (DevRefCountCropped *)dev_ref_count;
367
+ } else {
368
+ delete dev_ref_count;
369
+ }
370
+ }
371
+ }
372
+ dev_ref_count = nullptr;
373
+ buf.device = 0;
374
+ buf.device_interface = nullptr;
375
+ }
376
+
377
+ void free_shape_storage() {
378
+ if (buf.dim != shape) {
379
+ delete[] buf.dim;
380
+ buf.dim = nullptr;
381
+ }
382
+ }
383
+
384
+ template<int DimsSpecified>
385
+ void make_static_shape_storage() {
386
+ static_assert(Dims == AnyDims || Dims == DimsSpecified,
387
+ "Number of arguments to Buffer() does not match static dimensionality");
388
+ buf.dimensions = DimsSpecified;
389
+ if constexpr (Dims == AnyDims) {
390
+ if constexpr (DimsSpecified <= InClassDimStorage) {
391
+ buf.dim = shape;
392
+ } else {
393
+ static_assert(DimsSpecified >= 1);
394
+ buf.dim = new halide_dimension_t[DimsSpecified];
395
+ }
396
+ } else {
397
+ static_assert(InClassDimStorage >= Dims);
398
+ buf.dim = shape;
399
+ }
400
+ }
401
+
402
+ void make_shape_storage(const int dimensions) {
403
+ if (Dims != AnyDims && Dims != dimensions) {
404
+ assert(false && "Number of arguments to Buffer() does not match static dimensionality");
405
+ }
406
+ // This should usually be inlined, so if dimensions is statically known,
407
+ // we can skip the call to new
408
+ buf.dimensions = dimensions;
409
+ buf.dim = (dimensions <= InClassDimStorage) ? shape : new halide_dimension_t[dimensions];
410
+ }
411
+
412
+ void copy_shape_from(const halide_buffer_t &other) {
413
+ // All callers of this ensure that buf.dimensions == other.dimensions.
414
+ make_shape_storage(other.dimensions);
415
+ std::copy(other.dim, other.dim + other.dimensions, buf.dim);
416
+ }
417
+
418
+ template<typename T2, int D2, int S2>
419
+ void move_shape_from(Buffer<T2, D2, S2> &&other) {
420
+ if (other.shape == other.buf.dim) {
421
+ copy_shape_from(other.buf);
422
+ } else {
423
+ buf.dim = other.buf.dim;
424
+ other.buf.dim = nullptr;
425
+ }
426
+ other.buf = halide_buffer_t();
427
+ }
428
+
429
+ /** Initialize the shape from a halide_buffer_t. */
430
+ void initialize_from_buffer(const halide_buffer_t &b,
431
+ BufferDeviceOwnership ownership) {
432
+ memcpy(&buf, &b, sizeof(halide_buffer_t));
433
+ copy_shape_from(b);
434
+ if (b.device) {
435
+ dev_ref_count = new DeviceRefCount;
436
+ dev_ref_count->ownership = ownership;
437
+ }
438
+ }
439
+
440
+ /** Initialize the shape from an array of ints */
441
+ void initialize_shape(const int *sizes) {
442
+ for (int i = 0; i < buf.dimensions; i++) {
443
+ buf.dim[i].min = 0;
444
+ buf.dim[i].extent = sizes[i];
445
+ if (i == 0) {
446
+ buf.dim[i].stride = 1;
447
+ } else {
448
+ buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
449
+ }
450
+ }
451
+ }
452
+
453
+ /** Initialize the shape from a vector of extents */
454
+ void initialize_shape(const std::vector<int> &sizes) {
455
+ assert(buf.dimensions == (int)sizes.size());
456
+ initialize_shape(sizes.data());
457
+ }
458
+
459
+ /** Initialize the shape from the static shape of an array */
460
+ template<typename Array, size_t N>
461
+ void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
462
+ buf.dim[next].min = 0;
463
+ buf.dim[next].extent = (int)N;
464
+ if (next == 0) {
465
+ buf.dim[next].stride = 1;
466
+ } else {
467
+ initialize_shape_from_array_shape(next - 1, vals[0]);
468
+ buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
469
+ }
470
+ }
471
+
472
+ /** Base case for the template recursion above. */
473
+ template<typename T2>
474
+ void initialize_shape_from_array_shape(int, const T2 &) {
475
+ }
476
+
477
+ /** Get the dimensionality of a multi-dimensional C array */
478
+ template<typename Array, size_t N>
479
+ static int dimensionality_of_array(Array (&vals)[N]) {
480
+ return dimensionality_of_array(vals[0]) + 1;
481
+ }
482
+
483
+ template<typename T2>
484
+ static int dimensionality_of_array(const T2 &) {
485
+ return 0;
486
+ }
487
+
488
+ /** Get the underlying halide_type_t of an array's element type. */
489
+ template<typename Array, size_t N>
490
+ static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
491
+ return scalar_type_of_array(vals[0]);
492
+ }
493
+
494
+ template<typename T2>
495
+ static halide_type_t scalar_type_of_array(const T2 &) {
496
+ return halide_type_of<typename std::remove_cv<T2>::type>();
497
+ }
498
+
499
+ /** Crop a single dimension without handling device allocation. */
500
+ void crop_host(int d, int min, int extent) {
501
+ assert(dim(d).min() <= min);
502
+ assert(dim(d).max() >= min + extent - 1);
503
+ ptrdiff_t shift = min - dim(d).min();
504
+ if (buf.host != nullptr) {
505
+ buf.host += (shift * dim(d).stride()) * type().bytes();
506
+ }
507
+ buf.dim[d].min = min;
508
+ buf.dim[d].extent = extent;
509
+ }
510
+
511
+ /** Crop as many dimensions as are in rect, without handling device allocation. */
512
+ void crop_host(const std::vector<std::pair<int, int>> &rect) {
513
+ assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
514
+ int limit = (int)rect.size();
515
+ assert(limit <= dimensions());
516
+ for (int i = 0; i < limit; i++) {
517
+ crop_host(i, rect[i].first, rect[i].second);
518
+ }
519
+ }
520
+
521
+ void complete_device_crop(Buffer<T, Dims, InClassDimStorage> &result_host_cropped) const {
522
+ assert(buf.device_interface != nullptr);
523
+ if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == halide_error_code_success) {
524
+ // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
525
+ // Is it possible to get to this point without incref having run at least once since
526
+ // the device field was set? (I.e. in the internal logic of crop. incref might have been
527
+ // called.)
528
+ if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
529
+ result_host_cropped.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
530
+ } else {
531
+ result_host_cropped.crop_from(*this);
532
+ }
533
+ }
534
+ }
535
+
536
+ /** slice a single dimension without handling device allocation. */
537
+ void slice_host(int d, int pos) {
538
+ static_assert(Dims == AnyDims);
539
+ assert(dimensions() > 0);
540
+ assert(d >= 0 && d < dimensions());
541
+ assert(pos >= dim(d).min() && pos <= dim(d).max());
542
+ buf.dimensions--;
543
+ ptrdiff_t shift = pos - buf.dim[d].min;
544
+ if (buf.host != nullptr) {
545
+ buf.host += (shift * buf.dim[d].stride) * type().bytes();
546
+ }
547
+ for (int i = d; i < buf.dimensions; i++) {
548
+ buf.dim[i] = buf.dim[i + 1];
549
+ }
550
+ buf.dim[buf.dimensions] = {0, 0, 0};
551
+ }
552
+
553
+ void complete_device_slice(Buffer<T, AnyDims, InClassDimStorage> &result_host_sliced, int d, int pos) const {
554
+ assert(buf.device_interface != nullptr);
555
+ if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == halide_error_code_success) {
556
+ // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
557
+ // Is it possible to get to this point without incref having run at least once since
558
+ // the device field was set? (I.e. in the internal logic of slice. incref might have been
559
+ // called.)
560
+ if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
561
+ // crop_from() is correct here, despite the fact that we are slicing.
562
+ result_host_sliced.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
563
+ } else {
564
+ // crop_from() is correct here, despite the fact that we are slicing.
565
+ result_host_sliced.crop_from(*this);
566
+ }
567
+ }
568
+ }
569
+
570
+ public:
571
+ typedef T ElemType;
572
+
573
+ /** Read-only access to the shape */
574
+ class Dimension {
575
+ const halide_dimension_t &d;
576
+
577
+ public:
578
+ /** The lowest coordinate in this dimension */
579
+ HALIDE_ALWAYS_INLINE int min() const {
580
+ return d.min;
581
+ }
582
+
583
+ /** The number of elements in memory you have to step over to
584
+ * increment this coordinate by one. */
585
+ HALIDE_ALWAYS_INLINE int stride() const {
586
+ return d.stride;
587
+ }
588
+
589
+ /** The extent of the image along this dimension */
590
+ HALIDE_ALWAYS_INLINE int extent() const {
591
+ return d.extent;
592
+ }
593
+
594
+ /** The highest coordinate in this dimension */
595
+ HALIDE_ALWAYS_INLINE int max() const {
596
+ return min() + extent() - 1;
597
+ }
598
+
599
+ /** An iterator class, so that you can iterate over
600
+ * coordinates in a dimension using a range-based for loop. */
601
+ struct iterator {
602
+ int val;
603
+ int operator*() const {
604
+ return val;
605
+ }
606
+ bool operator!=(const iterator &other) const {
607
+ return val != other.val;
608
+ }
609
+ iterator &operator++() {
610
+ val++;
611
+ return *this;
612
+ }
613
+ };
614
+
615
+ /** An iterator that points to the min coordinate */
616
+ HALIDE_ALWAYS_INLINE iterator begin() const {
617
+ return {min()};
618
+ }
619
+
620
+ /** An iterator that points to one past the max coordinate */
621
+ HALIDE_ALWAYS_INLINE iterator end() const {
622
+ return {min() + extent()};
623
+ }
624
+
625
+ explicit Dimension(const halide_dimension_t &dim)
626
+ : d(dim) {
627
+ }
628
+ };
629
+
630
+ /** Access the shape of the buffer */
631
+ HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
632
+ assert(i >= 0 && i < this->dimensions());
633
+ return Dimension(buf.dim[i]);
634
+ }
635
+
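A usage sketch (buffer name and extents invented here): because Dimension exposes begin() and end(), each dimension can drive a range-based for loop directly, covering min() through max() inclusive.

    Halide::Runtime::Buffer<float, 2> img(640, 480);
    for (int y : img.dim(1)) {       // y runs over [min, max] of dimension 1
        for (int x : img.dim(0)) {   // x runs over [min, max] of dimension 0
            // visit coordinate (x, y), e.g. via img(x, y)
        }
    }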
636
+ /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
637
+ // @{
638
+ int min(int i) const {
639
+ return dim(i).min();
640
+ }
641
+ int extent(int i) const {
642
+ return dim(i).extent();
643
+ }
644
+ int stride(int i) const {
645
+ return dim(i).stride();
646
+ }
647
+ // @}
648
+
649
+ /** The total number of elements this buffer represents. Equal to
650
+ * the product of the extents */
651
+ size_t number_of_elements() const {
652
+ return buf.number_of_elements();
653
+ }
654
+
655
+ /** Get the dimensionality of the buffer. */
656
+ int dimensions() const {
657
+ if constexpr (has_static_dimensions) {
658
+ return Dims;
659
+ } else {
660
+ return buf.dimensions;
661
+ }
662
+ }
663
+
664
+ /** Get the type of the elements. */
665
+ halide_type_t type() const {
666
+ return buf.type;
667
+ }
668
+
669
+ /** A pointer to the element with the lowest address. If all
670
+ * strides are positive, equal to the host pointer. */
671
+ T *begin() const {
672
+ assert(buf.host != nullptr); // Cannot call begin() on an unallocated Buffer.
673
+ return (T *)buf.begin();
674
+ }
675
+
676
+ /** A pointer to one beyond the element with the highest address. */
677
+ T *end() const {
678
+ assert(buf.host != nullptr); // Cannot call end() on an unallocated Buffer.
679
+ return (T *)buf.end();
680
+ }
681
+
682
+ /** The total number of bytes spanned by the data in memory. */
683
+ size_t size_in_bytes() const {
684
+ return buf.size_in_bytes();
685
+ }
686
+
687
+ /** Reset the Buffer to be equivalent to a default-constructed Buffer
688
+ * of the same static type (if any); Buffer<void> will have its runtime
689
+ * type reset to uint8. */
690
+ void reset() {
691
+ *this = Buffer();
692
+ }
693
+
694
+ Buffer()
695
+ : shape() {
696
+ buf.type = static_halide_type();
697
+ // If Dims is statically known, create storage for that many dimensions;
698
+ // otherwise, make a zero-dimensional buffer.
699
+ constexpr int buf_dimensions = (Dims == AnyDims) ? 0 : Dims;
700
+ make_static_shape_storage<buf_dimensions>();
701
+ }
702
+
703
+ /** Make a Buffer from a halide_buffer_t */
704
+ explicit Buffer(const halide_buffer_t &buf,
705
+ BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) {
706
+ assert(T_is_void || buf.type == static_halide_type());
707
+ initialize_from_buffer(buf, ownership);
708
+ }
709
+
710
+ /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
711
+ template<typename T2, int D2, int S2>
712
+ friend class Buffer;
713
+
714
+ private:
715
+ template<typename T2, int D2, int S2>
716
+ static void static_assert_can_convert_from() {
717
+ static_assert((!std::is_const<T2>::value || std::is_const<T>::value),
718
+ "Can't convert from a Buffer<const T> to a Buffer<T>");
719
+ static_assert(std::is_same<typename std::remove_const<T>::type,
720
+ typename std::remove_const<T2>::type>::value ||
721
+ T_is_void || Buffer<T2, D2, S2>::T_is_void,
722
+ "type mismatch constructing Buffer");
723
+ static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2,
724
+ "Can't convert from a Buffer with static dimensionality to a Buffer with different static dimensionality");
725
+ }
726
+
727
+ public:
728
+ static void set_default_allocate_fn(void *(*allocate_fn)(size_t)) {
729
+ Internal::DefaultAllocatorFns::default_allocate_fn = allocate_fn;
730
+ }
731
+ static void set_default_deallocate_fn(void (*deallocate_fn)(void *)) {
732
+ Internal::DefaultAllocatorFns::default_deallocate_fn = deallocate_fn;
733
+ }
734
+
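These setters install process-wide fallbacks that allocate() uses when no per-call functions are supplied. A hedged sketch (my_tracked_malloc and my_tracked_free are hypothetical user functions):

    void *my_tracked_malloc(size_t bytes);   // hypothetical user allocator
    void my_tracked_free(void *ptr);         // hypothetical matching deallocator

    void install_buffer_allocators() {
        // The setters are static members, so any Buffer instantiation reaches
        // the same shared defaults.
        Halide::Runtime::Buffer<void>::set_default_allocate_fn(my_tracked_malloc);
        Halide::Runtime::Buffer<void>::set_default_deallocate_fn(my_tracked_free);
    }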
735
+ /** Determine if a Buffer<T, Dims, InClassDimStorage> can be constructed from some other Buffer type.
736
+ * If this can be determined at compile time, fail with a static assert; otherwise
737
+ * return a boolean based on runtime typing. */
738
+ template<typename T2, int D2, int S2>
739
+ static bool can_convert_from(const Buffer<T2, D2, S2> &other) {
740
+ static_assert_can_convert_from<T2, D2, S2>();
741
+ if (Buffer<T2, D2, S2>::T_is_void && !T_is_void) {
742
+ if (other.type() != static_halide_type()) {
743
+ return false;
744
+ }
745
+ }
746
+ if (Dims != AnyDims) {
747
+ if (other.dimensions() != Dims) {
748
+ return false;
749
+ }
750
+ }
751
+ return true;
752
+ }
753
+
754
+ /** Fail an assertion at runtime or compile-time if a Buffer<T, Dims, InClassDimStorage>
755
+ * cannot be constructed from some other Buffer type. */
756
+ template<typename T2, int D2, int S2>
757
+ static void assert_can_convert_from(const Buffer<T2, D2, S2> &other) {
758
+ // Explicitly call static_assert_can_convert_from() here so
759
+ // that we always get compile-time checking, even if compiling with
760
+ // assertions disabled.
761
+ static_assert_can_convert_from<T2, D2, S2>();
762
+ assert(can_convert_from(other));
763
+ }
764
+
765
+ /** Copy constructor. Does not copy underlying data. */
766
+ Buffer(const Buffer<T, Dims, InClassDimStorage> &other)
767
+ : buf(other.buf),
768
+ alloc(other.alloc) {
769
+ other.incref();
770
+ dev_ref_count = other.dev_ref_count;
771
+ copy_shape_from(other.buf);
772
+ }
773
+
774
+ /** Construct a Buffer from a Buffer of different dimensionality
775
+ * and type. Asserts that the type and dimensionality matches (at runtime,
776
+ * if one of the types is void). Note that this constructor is
777
+ * implicit. This, for example, lets you pass things like
778
+ * Buffer<T> or Buffer<const void> to functions expecting
779
+ * Buffer<const T>. */
780
+ template<typename T2, int D2, int S2>
781
+ Buffer(const Buffer<T2, D2, S2> &other)
782
+ : buf(other.buf),
783
+ alloc(other.alloc) {
784
+ assert_can_convert_from(other);
785
+ other.incref();
786
+ dev_ref_count = other.dev_ref_count;
787
+ copy_shape_from(other.buf);
788
+ }
789
+
790
+ /** Move constructor */
791
+ Buffer(Buffer<T, Dims, InClassDimStorage> &&other) noexcept
792
+ : buf(other.buf),
793
+ alloc(other.alloc),
794
+ dev_ref_count(other.dev_ref_count) {
795
+ other.dev_ref_count = nullptr;
796
+ other.alloc = nullptr;
797
+ move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
798
+ }
799
+
800
+ /** Move-construct a Buffer from a Buffer of different
801
+ * dimensionality and type. Asserts that the types match (at
802
+ * runtime if one of the types is void). */
803
+ template<typename T2, int D2, int S2>
804
+ Buffer(Buffer<T2, D2, S2> &&other)
805
+ : buf(other.buf),
806
+ alloc(other.alloc),
807
+ dev_ref_count(other.dev_ref_count) {
808
+ assert_can_convert_from(other);
809
+ other.dev_ref_count = nullptr;
810
+ other.alloc = nullptr;
811
+ move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
812
+ }
813
+
814
+ /** Assign from another Buffer of possibly-different
815
+ * dimensionality and type. Asserts that the types match (at
816
+ * runtime if one of the types is void). */
817
+ template<typename T2, int D2, int S2>
818
+ Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T2, D2, S2> &other) {
819
+ if ((const void *)this == (const void *)&other) {
820
+ return *this;
821
+ }
822
+ assert_can_convert_from(other);
823
+ other.incref();
824
+ decref();
825
+ dev_ref_count = other.dev_ref_count;
826
+ alloc = other.alloc;
827
+ free_shape_storage();
828
+ buf = other.buf;
829
+ copy_shape_from(other.buf);
830
+ return *this;
831
+ }
832
+
833
+ /** Standard assignment operator */
834
+ Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T, Dims, InClassDimStorage> &other) {
835
+ // The cast to void* here is just to satisfy clang-tidy
836
+ if ((const void *)this == (const void *)&other) {
837
+ return *this;
838
+ }
839
+ other.incref();
840
+ decref();
841
+ dev_ref_count = other.dev_ref_count;
842
+ alloc = other.alloc;
843
+ free_shape_storage();
844
+ buf = other.buf;
845
+ copy_shape_from(other.buf);
846
+ return *this;
847
+ }
848
+
849
+ /** Move from another Buffer of possibly-different
850
+ * dimensionality and type. Asserts that the types match (at
851
+ * runtime if one of the types is void). */
852
+ template<typename T2, int D2, int S2>
853
+ Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T2, D2, S2> &&other) {
854
+ assert_can_convert_from(other);
855
+ decref();
856
+ alloc = other.alloc;
857
+ other.alloc = nullptr;
858
+ dev_ref_count = other.dev_ref_count;
859
+ other.dev_ref_count = nullptr;
860
+ free_shape_storage();
861
+ buf = other.buf;
862
+ move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
863
+ return *this;
864
+ }
865
+
866
+ /** Standard move-assignment operator */
867
+ Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T, Dims, InClassDimStorage> &&other) noexcept {
868
+ decref();
869
+ alloc = other.alloc;
870
+ other.alloc = nullptr;
871
+ dev_ref_count = other.dev_ref_count;
872
+ other.dev_ref_count = nullptr;
873
+ free_shape_storage();
874
+ buf = other.buf;
875
+ move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
876
+ return *this;
877
+ }
878
+
879
+ /** Check that the product of the extents fits in memory. */
880
+ void check_overflow() {
881
+ size_t size = type().bytes();
882
+ for (int i = 0; i < dimensions(); i++) {
883
+ size *= dim(i).extent();
884
+ }
885
+ // We allow 2^31 or 2^63 bytes, so drop the top bit.
886
+ size = (size << 1) >> 1;
887
+ for (int i = 0; i < dimensions(); i++) {
888
+ size /= dim(i).extent();
889
+ }
890
+ assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
891
+ }
892
+
893
+ /** Allocate memory for this Buffer. Drops the reference to any
894
+ * owned memory. */
895
+ void allocate(void *(*allocate_fn)(size_t) = nullptr,
896
+ void (*deallocate_fn)(void *) = nullptr) {
897
+ // Drop any existing allocation
898
+ deallocate();
899
+
900
+ // Conservatively align images to (usually) 128 bytes. This is enough
901
+ // alignment for all the platforms we might use. Also ensure that the allocation
902
+ // is such that the logical size is an integral multiple of 128 bytes (or a bit more).
903
+ constexpr size_t alignment = HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT;
904
+
905
+ const auto align_up = [=](size_t value) -> size_t {
906
+ return (value + alignment - 1) & ~(alignment - 1);
907
+ };
908
+
909
+ size_t size = size_in_bytes();
910
+
911
+ #if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
912
+ // Only use aligned_alloc() if no custom allocators are specified.
913
+ if (!allocate_fn && !deallocate_fn && !Internal::DefaultAllocatorFns::default_allocate_fn && !Internal::DefaultAllocatorFns::default_deallocate_fn) {
914
+ // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes
915
+ // on any supported platform, so we will just overallocate by 'alignment'
916
+ // so that the user storage also starts at an aligned point. This is a bit
917
+ // wasteful, but probably not a big deal.
918
+ static_assert(sizeof(AllocationHeader) <= alignment);
919
+ void *alloc_storage = ::aligned_alloc(alignment, align_up(size) + alignment);
920
+ assert((uintptr_t)alloc_storage == align_up((uintptr_t)alloc_storage));
921
+ alloc = new (alloc_storage) AllocationHeader(free);
922
+ buf.host = (uint8_t *)((uintptr_t)alloc_storage + alignment);
923
+ return;
924
+ }
925
+ // else fall thru
926
+ #endif
927
+ if (!allocate_fn) {
928
+ allocate_fn = Internal::DefaultAllocatorFns::default_allocate_fn;
929
+ if (!allocate_fn) {
930
+ allocate_fn = malloc;
931
+ }
932
+ }
933
+ if (!deallocate_fn) {
934
+ deallocate_fn = Internal::DefaultAllocatorFns::default_deallocate_fn;
935
+ if (!deallocate_fn) {
936
+ deallocate_fn = free;
937
+ }
938
+ }
939
+
940
+ static_assert(sizeof(AllocationHeader) <= alignment);
941
+
942
+ // malloc() and friends must return a pointer aligned to at least alignof(std::max_align_t);
943
+ // make sure this is OK for AllocationHeader, since it always goes at the start
944
+ static_assert(alignof(AllocationHeader) <= alignof(std::max_align_t));
945
+
946
+ const size_t requested_size = align_up(size + alignment +
947
+ std::max(0, (int)sizeof(AllocationHeader) -
948
+ (int)sizeof(std::max_align_t)));
949
+ void *alloc_storage = allocate_fn(requested_size);
950
+ alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
951
+ uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
952
+ buf.host = (uint8_t *)align_up((uintptr_t)unaligned_ptr);
953
+ }
954
+
955
+ /** Drop reference to any owned host or device memory, possibly
956
+ * freeing it, if this buffer held the last reference to
957
+ * it. Retains the shape of the buffer. Does nothing if this
958
+ * buffer did not allocate its own memory. */
959
+ void deallocate() {
960
+ decref();
961
+ }
962
+
963
+ /** Drop reference to any owned device memory, possibly freeing it
964
+ * if this buffer held the last reference to it. Asserts that
965
+ * device_dirty is false. */
966
+ void device_deallocate() {
967
+ decref(true);
968
+ }
969
+
970
+ /** Allocate a new image of the given size with a runtime
971
+ * type. Only used when you do know what size you want but you
972
+ * don't know statically what type the elements are. Pass zeroes
973
+ * to make a buffer suitable for bounds query calls. */
974
+ template<typename... Args,
975
+ typename = typename std::enable_if<AllInts<Args...>::value>::type>
976
+ Buffer(halide_type_t t, int first, Args... rest) {
977
+ if (!T_is_void) {
978
+ assert(static_halide_type() == t);
979
+ }
980
+ int extents[] = {first, (int)rest...};
981
+ buf.type = t;
982
+ constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
983
+ make_static_shape_storage<buf_dimensions>();
984
+ initialize_shape(extents);
985
+ if (!Internal::any_zero(extents)) {
986
+ check_overflow();
987
+ allocate();
988
+ }
989
+ }
990
+
991
+ /** Allocate a new image of the given size. Pass zeroes to make a
992
+ * buffer suitable for bounds query calls. */
993
+ // @{
994
+
995
+ // The overload with one argument is 'explicit', so that
996
+ // (say) int is not implicitly convertible to Buffer<int>
997
+ explicit Buffer(int first) {
998
+ static_assert(!T_is_void,
999
+ "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
1000
+ int extents[] = {first};
1001
+ buf.type = static_halide_type();
1002
+ constexpr int buf_dimensions = 1;
1003
+ make_static_shape_storage<buf_dimensions>();
1004
+ initialize_shape(extents);
1005
+ if (first != 0) {
1006
+ check_overflow();
1007
+ allocate();
1008
+ }
1009
+ }
1010
+
1011
+ template<typename... Args,
1012
+ typename = typename std::enable_if<AllInts<Args...>::value>::type>
1013
+ Buffer(int first, int second, Args... rest) {
1014
+ static_assert(!T_is_void,
1015
+ "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
1016
+ int extents[] = {first, second, (int)rest...};
1017
+ buf.type = static_halide_type();
1018
+ constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
1019
+ make_static_shape_storage<buf_dimensions>();
1020
+ initialize_shape(extents);
1021
+ if (!Internal::any_zero(extents)) {
1022
+ check_overflow();
1023
+ allocate();
1024
+ }
1025
+ }
1026
+ // @}
1027
+
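For instance (sizes chosen arbitrarily), a fully allocated image next to a shape-only buffer for bounds queries:

    Halide::Runtime::Buffer<uint8_t> rgb(1920, 1080, 3);  // allocates host memory
    Halide::Runtime::Buffer<uint8_t> query(0, 0, 0);      // zero extents: no allocation,
                                                          // suitable for bounds queries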
1028
+ /** Allocate a new image of unknown type using a vector of ints as the size. */
1029
+ Buffer(halide_type_t t, const std::vector<int> &sizes) {
1030
+ if (!T_is_void) {
1031
+ assert(static_halide_type() == t);
1032
+ }
1033
+ buf.type = t;
1034
+ // make_shape_storage() will do a runtime check that dimensionality matches.
1035
+ make_shape_storage((int)sizes.size());
1036
+ initialize_shape(sizes);
1037
+ if (!Internal::any_zero(sizes)) {
1038
+ check_overflow();
1039
+ allocate();
1040
+ }
1041
+ }
1042
+
1043
+ /** Allocate a new image of known type using a vector of ints as the size. */
1044
+ explicit Buffer(const std::vector<int> &sizes)
1045
+ : Buffer(static_halide_type(), sizes) {
1046
+ }
1047
+
1048
+ private:
1049
+ // Create a copy of the sizes vector, ordered as specified by order.
1050
+ static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
1051
+ assert(order.size() == sizes.size());
1052
+ std::vector<int> ordered_sizes(sizes.size());
1053
+ for (size_t i = 0; i < sizes.size(); ++i) {
1054
+ ordered_sizes[i] = sizes.at(order[i]);
1055
+ }
1056
+ return ordered_sizes;
1057
+ }
1058
+
1059
+ public:
1060
+ /** Allocate a new image of unknown type using a vector of ints as the size and
1061
+ * a vector of indices indicating the storage order for each dimension. The
1062
+ * length of the sizes vector and the storage-order vector must match. For instance,
1063
+ * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
1064
+ Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
1065
+ : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
1066
+ transpose(storage_order);
1067
+ }
1068
+
1069
+ Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
1070
+ : Buffer(static_halide_type(), sizes, storage_order) {
1071
+ }
1072
+
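Following the comment above, an interleaved RGB allocation could be written as below (extents are illustrative); the storage order {2, 0, 1} makes the channel dimension innermost in memory while (x, y, c) remains the logical dimension order:

    Halide::Runtime::Buffer<uint8_t> rgb({640, 480, 3}, {2, 0, 1});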
1073
+ /** Make a Buffer that refers to a statically sized array. Does not
1074
+ * take ownership of the data, and does not set the host_dirty flag. */
1075
+ template<typename Array, size_t N>
1076
+ explicit Buffer(Array (&vals)[N]) {
1077
+ const int buf_dimensions = dimensionality_of_array(vals);
1078
+ buf.type = scalar_type_of_array(vals);
1079
+ buf.host = (uint8_t *)vals;
1080
+ make_shape_storage(buf_dimensions);
1081
+ initialize_shape_from_array_shape(buf.dimensions - 1, vals);
1082
+ }
1083
+
1084
+ /** Initialize a Buffer of runtime type from a pointer and some
1085
+ * sizes. Assumes dense row-major packing and a min coordinate of
1086
+ * zero. Does not take ownership of the data and does not set the
1087
+ * host_dirty flag. */
1088
+ template<typename... Args,
1089
+ typename = typename std::enable_if<AllInts<Args...>::value>::type>
1090
+ explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
1091
+ if (!T_is_void) {
1092
+ assert(static_halide_type() == t);
1093
+ }
1094
+ int extents[] = {first, (int)rest...};
1095
+ buf.type = t;
1096
+ buf.host = (uint8_t *)const_cast<void *>(data);
1097
+ constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1098
+ make_static_shape_storage<buf_dimensions>();
1099
+ initialize_shape(extents);
1100
+ }
1101
+
1102
+ /** Initialize a Buffer from a pointer and some sizes. Assumes
1103
+ * dense row-major packing and a min coordinate of zero. Does not
1104
+ * take ownership of the data and does not set the host_dirty flag. */
1105
+ template<typename... Args,
1106
+ typename = typename std::enable_if<AllInts<Args...>::value>::type>
1107
+ explicit Buffer(T *data, int first, Args &&...rest) {
1108
+ int extents[] = {first, (int)rest...};
1109
+ buf.type = static_halide_type();
1110
+ buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1111
+ constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
1112
+ make_static_shape_storage<buf_dimensions>();
1113
+ initialize_shape(extents);
1114
+ }
1115
+
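A brief sketch of wrapping memory you already own (the std::vector is illustrative); the Buffer records the shape but never frees the pointer:

    std::vector<float> pixels(640 * 480);
    Halide::Runtime::Buffer<float> wrapped(pixels.data(), 640, 480);
    // wrapped.owns_host_memory() is false; the vector still owns the storage.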
1116
+ /** Initialize a Buffer from a pointer and a vector of
1117
+ * sizes. Assumes dense row-major packing and a min coordinate of
1118
+ * zero. Does not take ownership of the data and does not set the
1119
+ * host_dirty flag. */
1120
+ explicit Buffer(T *data, const std::vector<int> &sizes) {
1121
+ buf.type = static_halide_type();
1122
+ buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1123
+ make_shape_storage((int)sizes.size());
1124
+ initialize_shape(sizes);
1125
+ }
1126
+
1127
+ /** Initialize a Buffer of runtime type from a pointer and a
1128
+ * vector of sizes. Assumes dense row-major packing and a min
1129
+ * coordinate of zero. Does not take ownership of the data and
1130
+ * does not set the host_dirty flag. */
1131
+ explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
1132
+ if (!T_is_void) {
1133
+ assert(static_halide_type() == t);
1134
+ }
1135
+ buf.type = t;
1136
+ buf.host = (uint8_t *)const_cast<void *>(data);
1137
+ make_shape_storage((int)sizes.size());
1138
+ initialize_shape(sizes);
1139
+ }
1140
+
1141
+ /** Initialize an Buffer from a pointer to the min coordinate and
1142
+ * an array describing the shape. Does not take ownership of the
1143
+ * data, and does not set the host_dirty flag. */
1144
+ explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
1145
+ if (!T_is_void) {
1146
+ assert(static_halide_type() == t);
1147
+ }
1148
+ buf.type = t;
1149
+ buf.host = (uint8_t *)const_cast<void *>(data);
1150
+ make_shape_storage(d);
1151
+ for (int i = 0; i < d; i++) {
1152
+ buf.dim[i] = shape[i];
1153
+ }
1154
+ }
1155
+
1156
+ /** Initialize a Buffer from a pointer to the min coordinate and
1157
+ * a vector describing the shape. Does not take ownership of the
1158
+ * data, and does not set the host_dirty flag. */
1159
+ explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
1160
+ const std::vector<halide_dimension_t> &shape)
1161
+ : Buffer(t, data, (int)shape.size(), shape.data()) {
1162
+ }
1163
+
1164
+ /** Initialize a Buffer from a pointer to the min coordinate and
1165
+ * an array describing the shape. Does not take ownership of the
1166
+ * data and does not set the host_dirty flag. */
1167
+ explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
1168
+ buf.type = static_halide_type();
1169
+ buf.host = (uint8_t *)const_cast<typename std::remove_const<T>::type *>(data);
1170
+ make_shape_storage(d);
1171
+ for (int i = 0; i < d; i++) {
1172
+ buf.dim[i] = shape[i];
1173
+ }
1174
+ }
1175
+
1176
+ /** Initialize a Buffer from a pointer to the min coordinate and
1177
+ * a vector describing the shape. Does not take ownership of the
1178
+ * data, and does not set the host_dirty flag. */
1179
+ explicit Buffer(T *data, const std::vector<halide_dimension_t> &shape)
1180
+ : Buffer(data, (int)shape.size(), shape.data()) {
1181
+ }
1182
+
1183
+ /** Destructor. Will release any underlying owned allocation if
1184
+ * this is the last reference to it. Will assert fail if there are
1185
+ * weak references to this Buffer outstanding. */
1186
+ ~Buffer() {
1187
+ decref();
1188
+ free_shape_storage();
1189
+ }
1190
+
1191
+ /** Get a pointer to the raw halide_buffer_t this wraps. */
1192
+ // @{
1193
+ halide_buffer_t *raw_buffer() {
1194
+ return &buf;
1195
+ }
1196
+
1197
+ const halide_buffer_t *raw_buffer() const {
1198
+ return &buf;
1199
+ }
1200
+ // @}
1201
+
1202
+ /** Provide a cast operator to halide_buffer_t *, so that
1203
+ * instances can be passed directly to Halide filters. */
1204
+ operator halide_buffer_t *() {
1205
+ return &buf;
1206
+ }
1207
+
1208
+ /** Return a typed reference to this Buffer. Useful for converting
1209
+ * a reference to a Buffer<void> to a reference to, for example, a
1210
+ * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1211
+ * You can also optionally specify a new value for Dims; this is useful
1212
+ * mainly for removing the dimensionality constraint on a Buffer with
1213
+ * explicit dimensionality. Does a runtime assert if the source buffer type
1214
+ * is void or the new dimensionality is incompatible. */
1215
+ template<typename T2, int D2 = Dims>
1216
+ HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> &as() & {
1217
+ Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1218
+ return *((Buffer<T2, D2, InClassDimStorage> *)this);
1219
+ }
1220
+
1221
+ /** Return a const typed reference to this Buffer. Useful for converting
1222
+ * a reference to a Buffer<void> to a reference to, for example, a
1223
+ * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1224
+ * You can also optionally specify a new value for Dims; this is useful
1225
+ * mainly for removing the dimensionality constraint on a Buffer with
1226
+ * explicit dimensionality. Does a runtime assert if the source buffer type
1227
+ * is void or the new dimensionality is incompatible. */
1228
+ template<typename T2, int D2 = Dims>
1229
+ HALIDE_ALWAYS_INLINE const Buffer<T2, D2, InClassDimStorage> &as() const & {
1230
+ Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1231
+ return *((const Buffer<T2, D2, InClassDimStorage> *)this);
1232
+ }
1233
+
1234
+ /** Return an rval reference to this Buffer. Useful for converting
1235
+ * a reference to a Buffer<void> to a reference to, for example, a
1236
+ * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
1237
+ * You can also optionally specify a new value for Dims; this is useful
1238
+ * mainly for removing the dimensionality constraint on a Buffer with
1239
+ * explicit dimensionality. Does a runtime assert if the source buffer type
1240
+ * is void or the new dimensionality is incompatible. */
1241
+ template<typename T2, int D2 = Dims>
1242
+ HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> as() && {
1243
+ Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
1244
+ return *((Buffer<T2, D2, InClassDimStorage> *)this);
1245
+ }
1246
+
1247
+ /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
1248
+ * to recapitulate the type argument. */
1249
+ // @{
1250
+ HALIDE_ALWAYS_INLINE
1251
+ Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> &as_const() & {
1252
+ // Note that we can skip the assert_can_convert_from(), since T -> const T
1253
+ // conversion is always legal.
1254
+ return *((Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1255
+ }
1256
+
1257
+ HALIDE_ALWAYS_INLINE
1258
+ const Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> &as_const() const & {
1259
+ return *((const Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1260
+ }
1261
+
1262
+ HALIDE_ALWAYS_INLINE
1263
+ Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> as_const() && {
1264
+ return *((Buffer<typename std::add_const<T>::type, Dims, InClassDimStorage> *)this);
1265
+ }
1266
+ // @}
1267
+
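A usage sketch of the conversions above: a type-erased Buffer<void> can be viewed through a typed reference once the runtime element type is known (as() asserts that it matches), and as_const() adds constness for free.

    Halide::Runtime::Buffer<void> erased(halide_type_of<float>(), 128, 128);
    Halide::Runtime::Buffer<float> &typed = erased.as<float>();  // runtime-checked view
    const auto &read_only = typed.as_const();                    // Buffer<const float> &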
1268
+ /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<const T>& when
1269
+ * passing arguments */
1270
+ template<typename T2 = T, typename = typename std::enable_if<!std::is_const<T2>::value>::type>
1271
+ operator Buffer<typename std::add_const<T2>::type, Dims, InClassDimStorage> &() & {
1272
+ return as_const();
1273
+ }
1274
+
1275
+ /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<void>& when
1276
+ * passing arguments */
1277
+ template<typename TVoid,
1278
+ typename T2 = T,
1279
+ typename = typename std::enable_if<std::is_same<TVoid, void>::value &&
1280
+ !std::is_void<T2>::value &&
1281
+ !std::is_const<T2>::value>::type>
1282
+ operator Buffer<TVoid, Dims, InClassDimStorage> &() & {
1283
+ return as<TVoid, Dims>();
1284
+ }
1285
+
1286
+ /** Add some syntactic sugar to allow autoconversion from Buffer<const T> to Buffer<const void>& when
1287
+ * passing arguments */
1288
+ template<typename TVoid,
1289
+ typename T2 = T,
1290
+ typename = typename std::enable_if<std::is_same<TVoid, void>::value &&
1291
+ !std::is_void<T2>::value &&
1292
+ std::is_const<T2>::value>::type>
1293
+ operator Buffer<const TVoid, Dims, InClassDimStorage> &() & {
1294
+ return as<const TVoid, Dims>();
1295
+ }
1296
+
1297
+ /** Conventional names for the first three dimensions. */
1298
+ // @{
1299
+ int width() const {
1300
+ return (dimensions() > 0) ? dim(0).extent() : 1;
1301
+ }
1302
+ int height() const {
1303
+ return (dimensions() > 1) ? dim(1).extent() : 1;
1304
+ }
1305
+ int channels() const {
1306
+ return (dimensions() > 2) ? dim(2).extent() : 1;
1307
+ }
1308
+ // @}
1309
+
1310
+ /** Conventional names for the min and max value of each dimension */
1311
+ // @{
1312
+ int left() const {
1313
+ return dim(0).min();
1314
+ }
1315
+
1316
+ int right() const {
1317
+ return dim(0).max();
1318
+ }
1319
+
1320
+ int top() const {
1321
+ return dim(1).min();
1322
+ }
1323
+
1324
+ int bottom() const {
1325
+ return dim(1).max();
1326
+ }
1327
+ // @}
1328
+
1329
+ /** Make a new image which is a deep copy of this image. Use crop
1330
+ * or slice followed by copy to make a copy of only a portion of
1331
+ * the image. The new image has the same nesting order of dimensions
1332
+ * (e.g. channels innermost), but resets the strides to the default
1333
+ * (each stride is the product of the extents of the inner dimensions).
1334
+ * Note that this means any strides of zero get broadcast into a non-zero stride.
1335
+ *
1336
+ * Note that the returned Buffer is always of a non-const type T (i.e.,
1337
+ *
1338
+ * Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
1339
+ *
1340
+ * which is always safe, since we are making a deep copy. (The caller
1341
+ * can easily cast it back to Buffer<const T> if desired, which is
1342
+ * always safe and free.)
1343
+ */
1344
+ Buffer<not_const_T, Dims, InClassDimStorage> copy(void *(*allocate_fn)(size_t) = nullptr,
1345
+ void (*deallocate_fn)(void *) = nullptr) const {
1346
+ Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_with_shape_of(*this, allocate_fn, deallocate_fn);
1347
+ dst.copy_from(*this);
1348
+ return dst;
1349
+ }
1350
+
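+ // Minimal sketch of copy() (illustrative, not from the original header): the
+ // result owns fresh storage with default dense strides, and is non-const even
+ // if the source was Buffer<const T>. acquire_image() is a hypothetical source:
+ //
+ //     Buffer<const float> src = acquire_image();   // some 2-D source image
+ //     Buffer<float> dup = src.copy();              // deep copy, strides reset
+ //     dup(0, 0) = 1.0f;                            // safe: dup owns its data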
1351
+ /** Like copy(), but the copy is created in interleaved memory layout
1352
+ * (vs. keeping the same memory layout as the original). Requires that 'this'
1353
+ * has exactly 3 dimensions.
1354
+ */
1355
+ Buffer<not_const_T, Dims, InClassDimStorage> copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr,
1356
+ void (*deallocate_fn)(void *) = nullptr) const {
1357
+ static_assert(Dims == AnyDims || Dims == 3);
1358
+ assert(dimensions() == 3);
1359
+ Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_interleaved(nullptr, width(), height(), channels());
1360
+ dst.set_min(min(0), min(1), min(2));
1361
+ dst.allocate(allocate_fn, deallocate_fn);
1362
+ dst.copy_from(*this);
1363
+ return dst;
1364
+ }
1365
+
1366
+ /** Like copy(), but the copy is created in planar memory layout
1367
+ * (vs. keeping the same memory layout as the original).
1368
+ */
1369
+ Buffer<not_const_T, Dims, InClassDimStorage> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
1370
+ void (*deallocate_fn)(void *) = nullptr) const {
1371
+ std::vector<int> mins, extents;
1372
+ const int dims = dimensions();
1373
+ mins.reserve(dims);
1374
+ extents.reserve(dims);
1375
+ for (int d = 0; d < dims; ++d) {
1376
+ mins.push_back(dim(d).min());
1377
+ extents.push_back(dim(d).extent());
1378
+ }
1379
+ Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>(nullptr, extents);
1380
+ dst.set_min(mins);
1381
+ dst.allocate(allocate_fn, deallocate_fn);
1382
+ dst.copy_from(*this);
1383
+ return dst;
1384
+ }
1385
+
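+ // Illustrative sketch, assuming a 640x480 RGB image: the interleaved copy puts
+ // the channel dimension innermost in memory, the planar copy resets to default
+ // dense strides:
+ //
+ //     Buffer<uint8_t> planar(640, 480, 3);
+ //     Buffer<uint8_t> packed = planar.copy_to_interleaved();
+ //     // packed.dim(2).stride() == 1, packed.dim(0).stride() == 3
+ //     Buffer<uint8_t> back = packed.copy_to_planar();
+ //     // back.dim(0).stride() == 1 again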
1386
+ /** Make a copy of the Buffer which shares the underlying host and/or device
1387
+ * allocations as the existing Buffer. This is purely syntactic sugar for
1388
+ * cases where you have a const reference to a Buffer but need a temporary
1389
+ * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
1390
+ * inline way to create a temporary. \code
1391
+ * void call_my_func(const Buffer<const uint8_t>& input) {
1392
+ * my_func(input.alias(), output);
1393
+ * }\endcode
1394
+ */
1395
+ Buffer<T, Dims, InClassDimStorage> alias() const {
1396
+ return *this;
1397
+ }
1398
+
1399
+ /** Fill a Buffer with the values at the same coordinates in
1400
+ * another Buffer. Restricts itself to coordinates contained
1401
+ * within the intersection of the two buffers. If the two Buffers
1402
+ * are not in the same coordinate system, you will need to
1403
+ * translate the argument Buffer first. E.g. if you're blitting a
1404
+ * sprite onto a framebuffer, you'll want to translate the sprite
1405
+ * to the correct location first like so: \code
1406
+ * framebuffer.copy_from(sprite.translated({x, y})); \endcode
1407
+ */
1408
+ template<typename T2, int D2, int S2>
1409
+ void copy_from(Buffer<T2, D2, S2> src) {
1410
+ static_assert(!std::is_const<T>::value, "Cannot call copy_from() on a Buffer<const T>");
1411
+ assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
1412
+ assert(!src.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");
1413
+
1414
+ Buffer<T, Dims, InClassDimStorage> dst(*this);
1415
+
1416
+ static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2);
1417
+ assert(src.dimensions() == dst.dimensions());
1418
+
1419
+ // Trim the copy to the region in common
1420
+ const int d = dimensions();
1421
+ for (int i = 0; i < d; i++) {
1422
+ int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
1423
+ int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
1424
+ if (max_coord < min_coord) {
1425
+ // The buffers do not overlap.
1426
+ return;
1427
+ }
1428
+ dst.crop(i, min_coord, max_coord - min_coord + 1);
1429
+ src.crop(i, min_coord, max_coord - min_coord + 1);
1430
+ }
1431
+
1432
+ // If T is void, we need to do runtime dispatch to an
1433
+ // appropriately-typed lambda. We're copying, so we only care
1434
+ // about the element size. (If not, this should optimize away
1435
+ // into a static dispatch to the right-sized copy.)
1436
+ if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
1437
+ using MemType = uint8_t;
1438
+ auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1439
+ auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1440
+ typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1441
+ } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
1442
+ using MemType = uint16_t;
1443
+ auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1444
+ auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1445
+ typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1446
+ } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
1447
+ using MemType = uint32_t;
1448
+ auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1449
+ auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1450
+ typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1451
+ } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
1452
+ using MemType = uint64_t;
1453
+ auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
1454
+ auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
1455
+ typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
1456
+ } else {
1457
+ assert(false && "type().bytes() must be 1, 2, 4, or 8");
1458
+ }
1459
+ set_host_dirty();
1460
+ }
1461
+
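+ // Illustrative blit sketch (mirrors the comment above): the copy is clipped to
+ // the overlap of the two buffers, and translated() places the source in the
+ // destination's coordinate system first:
+ //
+ //     Buffer<uint8_t> framebuffer(1024, 768);
+ //     Buffer<uint8_t> sprite(16, 16);
+ //     framebuffer.copy_from(sprite.translated({100, 200}));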
1462
+ /** Make an image that refers to a sub-range of this image along
1463
+ * the given dimension. Asserts that the crop region is within
1464
+ * the existing bounds: you cannot "crop outwards", even if you know there
1465
+ * is valid Buffer storage (e.g. because you already cropped inwards). */
1466
+ Buffer<T, Dims, InClassDimStorage> cropped(int d, int min, int extent) const {
1467
+ // Make a fresh copy of the underlying buffer (but not a fresh
1468
+ // copy of the allocation, if there is one).
1469
+ Buffer<T, Dims, InClassDimStorage> im = *this;
1470
+
1471
+ // This guarantees the preexisting device ref is dropped if the
1472
+ // device_crop call fails and maintains the buffer in a consistent
1473
+ // state.
1474
+ im.device_deallocate();
1475
+
1476
+ im.crop_host(d, min, extent);
1477
+ if (buf.device_interface != nullptr) {
1478
+ complete_device_crop(im);
1479
+ }
1480
+ return im;
1481
+ }
1482
+
1483
+ /** Crop an image in-place along the given dimension. This does
1484
+ * not move any data around in memory - it just changes the min
1485
+ * and extent of the given dimension. */
1486
+ void crop(int d, int min, int extent) {
1487
+ // An optimization for non-device buffers. For the device case,
1488
+ // a temp buffer is required, so reuse the not-in-place version.
1489
+ // TODO(zalman|abadams): Are nop crops common enough to special
1490
+ // case the device part of the if to do nothing?
1491
+ if (buf.device_interface != nullptr) {
1492
+ *this = cropped(d, min, extent);
1493
+ } else {
1494
+ crop_host(d, min, extent);
1495
+ }
1496
+ }
1497
+
1498
+ /** Make an image that refers to a sub-rectangle of this image along
1499
+ * the first N dimensions. Asserts that the crop region is within
1500
+ * the existing bounds. The cropped image may drop any device handle
1501
+ * if the device_interface cannot accomplish the crop in-place. */
1502
+ Buffer<T, Dims, InClassDimStorage> cropped(const std::vector<std::pair<int, int>> &rect) const {
1503
+ // Make a fresh copy of the underlying buffer (but not a fresh
1504
+ // copy of the allocation, if there is one).
1505
+ Buffer<T, Dims, InClassDimStorage> im = *this;
1506
+
1507
+ // This guarantees the preexisting device ref is dropped if the
1508
+ // device_crop call fails and maintains the buffer in a consistent
1509
+ // state.
1510
+ im.device_deallocate();
1511
+
1512
+ im.crop_host(rect);
1513
+ if (buf.device_interface != nullptr) {
1514
+ complete_device_crop(im);
1515
+ }
1516
+ return im;
1517
+ }
1518
+
1519
+ /** Crop an image in-place along the first N dimensions. This does
1520
+ * not move any data around in memory, nor does it free memory. It
1521
+ * just rewrites the min/extent of each dimension to refer to a
1522
+ * subregion of the same allocation. */
1523
+ void crop(const std::vector<std::pair<int, int>> &rect) {
1524
+ // An optimization for non-device buffers. For the device case,
1525
+ // a temp buffer is required, so reuse the not-in-place version.
1526
+ // TODO(zalman|abadams): Are nop crops common enough to special
1527
+ // case the device part of the if to do nothing?
1528
+ if (buf.device_interface != nullptr) {
1529
+ *this = cropped(rect);
1530
+ } else {
1531
+ crop_host(rect);
1532
+ }
1533
+ }
1534
+
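+ // Illustrative sketch: both forms view a 128x128 window of the same storage;
+ // the in-place crop() only rewrites min/extent metadata:
+ //
+ //     Buffer<float> im(256, 256);
+ //     Buffer<float> window = im.cropped({{64, 128}, {64, 128}});  // shares im's data
+ //     im.crop(0, 64, 128);                                        // in-place, dim 0 only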
1535
+ /** Make an image which refers to the same data using
1536
+ * translated coordinates in the given dimension. Positive values
1537
+ * move the image data to the right or down relative to the
1538
+ * coordinate system. Drops any device handle. */
1539
+ Buffer<T, Dims, InClassDimStorage> translated(int d, int dx) const {
1540
+ Buffer<T, Dims, InClassDimStorage> im = *this;
1541
+ im.translate(d, dx);
1542
+ return im;
1543
+ }
1544
+
1545
+ /** Translate an image in-place along one dimension by changing
1546
+ * how it is indexed. Does not move any data around in memory. */
1547
+ void translate(int d, int delta) {
1548
+ assert(d >= 0 && d < this->dimensions());
1549
+ device_deallocate();
1550
+ buf.dim[d].min += delta;
1551
+ }
1552
+
1553
+ /** Make an image which refers to the same data translated along
1554
+ * the first N dimensions. */
1555
+ Buffer<T, Dims, InClassDimStorage> translated(const std::vector<int> &delta) const {
1556
+ Buffer<T, Dims, InClassDimStorage> im = *this;
1557
+ im.translate(delta);
1558
+ return im;
1559
+ }
1560
+
1561
+ /** Translate an image along the first N dimensions by changing
1562
+ * how it is indexed. Does not move any data around in memory. */
1563
+ void translate(const std::vector<int> &delta) {
1564
+ device_deallocate();
1565
+ assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
1566
+ int limit = (int)delta.size();
1567
+ assert(limit <= dimensions());
1568
+ for (int i = 0; i < limit; i++) {
1569
+ translate(i, delta[i]);
1570
+ }
1571
+ }
1572
+
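+ // Illustrative sketch: translation only changes how the data is indexed, so
+ // the translated view addresses the same memory at shifted coordinates:
+ //
+ //     Buffer<int> tile(32, 32);                        // mins default to (0, 0)
+ //     Buffer<int> shifted = tile.translated({10, 20});
+ //     // &shifted(10, 20) == &tile(0, 0)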
1573
+ /** Set the min coordinate of an image in the first N dimensions. */
1574
+ // @{
1575
+ void set_min(const std::vector<int> &mins) {
1576
+ assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
1577
+ device_deallocate();
1578
+ for (size_t i = 0; i < mins.size(); i++) {
1579
+ buf.dim[i].min = mins[i];
1580
+ }
1581
+ }
1582
+
1583
+ template<typename... Args>
1584
+ void set_min(Args... args) {
1585
+ set_min(std::vector<int>{args...});
1586
+ }
1587
+ // @}
1588
+
1589
+ /** Test if a given coordinate is within the bounds of an image. */
1590
+ // @{
1591
+ bool contains(const std::vector<int> &coords) const {
1592
+ assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
1593
+ for (size_t i = 0; i < coords.size(); i++) {
1594
+ if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
1595
+ return false;
1596
+ }
1597
+ }
1598
+ return true;
1599
+ }
1600
+
1601
+ template<typename... Args>
1602
+ bool contains(Args... args) const {
1603
+ return contains(std::vector<int>{args...});
1604
+ }
1605
+ // @}
1606
+
1607
+ /** Make a buffer which refers to the same data in the same layout
1608
+ * using a swapped indexing order for the dimensions given. So
1609
+ * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
1610
+ * strongly that A.address_of(i, j) == B.address_of(j, i). */
1611
+ Buffer<T, Dims, InClassDimStorage> transposed(int d1, int d2) const {
1612
+ Buffer<T, Dims, InClassDimStorage> im = *this;
1613
+ im.transpose(d1, d2);
1614
+ return im;
1615
+ }
1616
+
1617
+ /** Transpose a buffer in-place by changing how it is indexed. For
1618
+ * example, transpose(0, 1) on a two-dimensional buffer means that
1619
+ * the value referred to by coordinates (i, j) is now reached at
1620
+ * the coordinates (j, i), and vice versa. This is done by
1621
+ * reordering the per-dimension metadata rather than by moving
1622
+ * data around in memory, so other views of the same memory will
1623
+ * not see the data as having been transposed. */
1624
+ void transpose(int d1, int d2) {
1625
+ assert(d1 >= 0 && d1 < this->dimensions());
1626
+ assert(d2 >= 0 && d2 < this->dimensions());
1627
+ std::swap(buf.dim[d1], buf.dim[d2]);
1628
+ }
1629
+
1630
+ /** A generalized transpose: instead of swapping two dimensions,
1631
+ * pass a vector that lists each dimension index exactly once, in
1632
+ * the desired order. This does not move any data around in memory
1633
+ * - it just permutes how it is indexed. */
1634
+ void transpose(const std::vector<int> &order) {
1635
+ assert((int)order.size() == dimensions());
1636
+ if (dimensions() < 2) {
1637
+ // My, that was easy
1638
+ return;
1639
+ }
1640
+
1641
+ std::vector<int> order_sorted = order;
1642
+ for (size_t i = 1; i < order_sorted.size(); i++) {
1643
+ for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
1644
+ std::swap(order_sorted[j], order_sorted[j - 1]);
1645
+ transpose(j, j - 1);
1646
+ }
1647
+ }
1648
+ }
1649
+
1650
+ /** Make a buffer which refers to the same data in the same
1651
+ * layout using a different ordering of the dimensions. */
1652
+ Buffer<T, Dims, InClassDimStorage> transposed(const std::vector<int> &order) const {
1653
+ Buffer<T, Dims, InClassDimStorage> im = *this;
1654
+ im.transpose(order);
1655
+ return im;
1656
+ }
1657
+
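+ // Illustrative sketch (mirrors the comment above): transposing swaps how
+ // dimensions are indexed without moving data, so the two views alias each other:
+ //
+ //     Buffer<float> a(3, 4);
+ //     Buffer<float> b = a.transposed(0, 1);
+ //     // &b(i, j) == &a(j, i) for all in-range i, j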
1658
+ /** Make a lower-dimensional buffer that refers to one slice of
1659
+ * this buffer. */
1660
+ Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1661
+ sliced(int d, int pos) const {
1662
+ static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1663
+ assert(dimensions() > 0);
1664
+
1665
+ Buffer<T, AnyDims, InClassDimStorage> im = *this;
1666
+
1667
+ // This guarantees the preexisting device ref is dropped if the
1668
+ // device_slice call fails and maintains the buffer in a consistent
1669
+ // state.
1670
+ im.device_deallocate();
1671
+
1672
+ im.slice_host(d, pos);
1673
+ if (buf.device_interface != nullptr) {
1674
+ complete_device_slice(im, d, pos);
1675
+ }
1676
+ return im;
1677
+ }
1678
+
1679
+ /** Make a lower-dimensional buffer that refers to one slice of this
1680
+ * buffer at the dimension's minimum. */
1681
+ Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
1682
+ sliced(int d) const {
1683
+ static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
1684
+ assert(dimensions() > 0);
1685
+
1686
+ return sliced(d, dim(d).min());
1687
+ }
1688
+
1689
+ /** Rewrite the buffer to refer to a single lower-dimensional
1690
+ * slice of itself along the given dimension at the given
1691
+ * coordinate. Does not move any data around or free the original
1692
+ * memory, so other views of the same data are unaffected. Can
1693
+ * only be called on a Buffer with dynamic dimensionality. */
1694
+ void slice(int d, int pos) {
1695
+ static_assert(Dims == AnyDims, "Cannot call slice() on a Buffer with static dimensionality.");
1696
+ assert(dimensions() > 0);
1697
+
1698
+ // An optimization for non-device buffers. For the device case,
1699
+ // a temp buffer is required, so reuse the not-in-place version.
1700
+ // TODO(zalman|abadams): Are nop slices common enough to special
1701
+ // case the device part of the if to do nothing?
1702
+ if (buf.device_interface != nullptr) {
1703
+ *this = sliced(d, pos);
1704
+ } else {
1705
+ slice_host(d, pos);
1706
+ }
1707
+ }
1708
+
1709
+ /** Slice a buffer in-place at the dimension's minimum. */
1710
+ void slice(int d) {
1711
+ slice(d, dim(d).min());
1712
+ }
1713
+
1714
+ /** Make a new buffer that views this buffer as a single slice in a
1715
+ * higher-dimensional space. The new dimension has extent one and
1716
+ * the given min. This operation is the opposite of slice. As an
1717
+ * example, the following condition is true:
1718
+ *
1719
+ \code
1720
+ im2 = im.embedded(1, 17);
1721
+ &im(x, y, c) == &im2(x, 17, y, c);
1722
+ \endcode
1723
+ */
1724
+ Buffer<T, (Dims == AnyDims ? AnyDims : Dims + 1)>
1725
+ embedded(int d, int pos = 0) const {
1726
+ Buffer<T, AnyDims, InClassDimStorage> im(*this);
1727
+ im.embed(d, pos);
1728
+ return im;
1729
+ }
1730
+
1731
+ /** Embed a buffer in-place, increasing the
1732
+ * dimensionality. */
1733
+ void embed(int d, int pos = 0) {
1734
+ static_assert(Dims == AnyDims, "Cannot call embed() on a Buffer with static dimensionality.");
1735
+ assert(d >= 0 && d <= dimensions());
1736
+ add_dimension();
1737
+ translate(dimensions() - 1, pos);
1738
+ for (int i = dimensions() - 1; i > d; i--) {
1739
+ transpose(i, i - 1);
1740
+ }
1741
+ }
1742
+
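+ // Illustrative sketch, assuming an (x, y, c) planar image: slicing drops a
+ // dimension at a fixed coordinate, embedding adds a unit-extent dimension back;
+ // both are metadata-only views of the same storage:
+ //
+ //     Buffer<uint8_t> rgb(640, 480, 3);
+ //     auto green = rgb.sliced(2, 1);        // 2-D view of channel 1
+ //     auto again = green.embedded(2, 1);    // 3-D again, extent 1 in dim 2, min 1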
1743
+ /** Add a new dimension with a min of zero and an extent of
1744
+ * one. The stride is the extent of the outermost dimension times
1745
+ * its stride. The new dimension is the last dimension. This is a
1746
+ * special case of embed. */
1747
+ void add_dimension() {
1748
+ static_assert(Dims == AnyDims, "Cannot call add_dimension() on a Buffer with static dimensionality.");
1749
+ const int dims = buf.dimensions;
1750
+ buf.dimensions++;
1751
+ if (buf.dim != shape) {
1752
+ // We're already on the heap. Reallocate.
1753
+ halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
1754
+ for (int i = 0; i < dims; i++) {
1755
+ new_shape[i] = buf.dim[i];
1756
+ }
1757
+ delete[] buf.dim;
1758
+ buf.dim = new_shape;
1759
+ } else if (dims == InClassDimStorage) {
1760
+ // Transition from the in-class storage to the heap
1761
+ make_shape_storage(buf.dimensions);
1762
+ for (int i = 0; i < dims; i++) {
1763
+ buf.dim[i] = shape[i];
1764
+ }
1765
+ } else {
1766
+ // We still fit in the class
1767
+ }
1768
+ buf.dim[dims] = {0, 1, 0};
1769
+ if (dims == 0) {
1770
+ buf.dim[dims].stride = 1;
1771
+ } else {
1772
+ buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
1773
+ }
1774
+ }
1775
+
1776
+ /** Add a new dimension with a min of zero, an extent of one, and
1777
+ * the specified stride. The new dimension is the last
1778
+ * dimension. This is a special case of embed. */
1779
+ void add_dimension_with_stride(int s) {
1780
+ add_dimension();
1781
+ buf.dim[buf.dimensions - 1].stride = s;
1782
+ }
1783
+
1784
+ /** Methods for managing any GPU allocation. */
1785
+ // @{
1786
+ // Set the host dirty flag. Called by every operator()
1787
+ // access. Must be inlined so it can be hoisted out of loops.
1788
+ HALIDE_ALWAYS_INLINE
1789
+ void set_host_dirty(bool v = true) {
1790
+ assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
1791
+ buf.set_host_dirty(v);
1792
+ }
1793
+
1794
+ // Check if the device allocation is dirty. Called by
1795
+ // set_host_dirty, which is called by every accessor. Must be
1796
+ // inlined so it can be hoisted out of loops.
1797
+ HALIDE_ALWAYS_INLINE
1798
+ bool device_dirty() const {
1799
+ return buf.device_dirty();
1800
+ }
1801
+
1802
+ bool host_dirty() const {
1803
+ return buf.host_dirty();
1804
+ }
1805
+
1806
+ void set_device_dirty(bool v = true) {
1807
+ assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
1808
+ buf.set_device_dirty(v);
1809
+ }
1810
+
1811
+ int copy_to_host(void *ctx = nullptr) {
1812
+ if (device_dirty()) {
1813
+ return buf.device_interface->copy_to_host(ctx, &buf);
1814
+ }
1815
+ return halide_error_code_success;
1816
+ }
1817
+
1818
+ int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1819
+ if (host_dirty()) {
1820
+ return device_interface->copy_to_device(ctx, &buf, device_interface);
1821
+ }
1822
+ return halide_error_code_success;
1823
+ }
1824
+
1825
+ int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1826
+ return device_interface->device_malloc(ctx, &buf, device_interface);
1827
+ }
1828
+
1829
+ int device_free(void *ctx = nullptr) {
1830
+ if (dev_ref_count) {
1831
+ assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated &&
1832
+ "Can't call device_free on an unmanaged or wrapped native device handle. "
1833
+ "Free the source allocation or call device_detach_native instead.");
1834
+ // Multiple people may be holding onto this dev field
1835
+ assert(dev_ref_count->count == 1 &&
1836
+ "Multiple Halide::Runtime::Buffer objects share this device "
1837
+ "allocation. Freeing it would create dangling references. "
1838
+ "Don't call device_free on Halide buffers that you have copied or "
1839
+ "passed by value.");
1840
+ }
1841
+ int ret = halide_error_code_success;
1842
+ if (buf.device_interface) {
1843
+ ret = buf.device_interface->device_free(ctx, &buf);
1844
+ }
1845
+ if (dev_ref_count) {
1846
+ delete dev_ref_count;
1847
+ dev_ref_count = nullptr;
1848
+ }
1849
+ return ret;
1850
+ }
1851
+
1852
+ int device_wrap_native(const struct halide_device_interface_t *device_interface,
1853
+ uint64_t handle, void *ctx = nullptr) {
1854
+ assert(device_interface);
1855
+ dev_ref_count = new DeviceRefCount;
1856
+ dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative;
1857
+ return device_interface->wrap_native(ctx, &buf, handle, device_interface);
1858
+ }
1859
+
1860
+ int device_detach_native(void *ctx = nullptr) {
1861
+ assert(dev_ref_count &&
1862
+ dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative &&
1863
+ "Only call device_detach_native on buffers wrapping a native "
1864
+ "device handle via device_wrap_native. This buffer was allocated "
1865
+ "using device_malloc, or is unmanaged. "
1866
+ "Call device_free or free the original allocation instead.");
1867
+ // Multiple people may be holding onto this dev field
1868
+ assert(dev_ref_count->count == 1 &&
1869
+ "Multiple Halide::Runtime::Buffer objects share this device "
1870
+ "allocation. Freeing it could create dangling references. "
1871
+ "Don't call device_detach_native on Halide buffers that you "
1872
+ "have copied or passed by value.");
1873
+ int ret = halide_error_code_success;
1874
+ if (buf.device_interface) {
1875
+ ret = buf.device_interface->detach_native(ctx, &buf);
1876
+ }
1877
+ delete dev_ref_count;
1878
+ dev_ref_count = nullptr;
1879
+ return ret;
1880
+ }
1881
+
1882
+ int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1883
+ return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
1884
+ }
1885
+
1886
+ int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
1887
+ if (dev_ref_count) {
1888
+ assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost &&
1889
+ "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
1890
+ "Free the source allocation or call device_detach_native instead.");
1891
+ // Multiple people may be holding onto this dev field
1892
+ assert(dev_ref_count->count == 1 &&
1893
+ "Multiple Halide::Runtime::Buffer objects share this device "
1894
+ "allocation. Freeing it would create dangling references. "
1895
+ "Don't call device_and_host_free on Halide buffers that you have copied or "
1896
+ "passed by value.");
1897
+ }
1898
+ int ret = halide_error_code_success;
1899
+ if (buf.device_interface) {
1900
+ ret = buf.device_interface->device_and_host_free(ctx, &buf);
1901
+ }
1902
+ if (dev_ref_count) {
1903
+ delete dev_ref_count;
1904
+ dev_ref_count = nullptr;
1905
+ }
1906
+ return ret;
1907
+ }
1908
+
1909
+ int device_sync(void *ctx = nullptr) {
1910
+ return buf.device_sync(ctx);
1911
+ }
1912
+
1913
+ bool has_device_allocation() const {
1914
+ return buf.device != 0;
1915
+ }
1916
+
1917
+ /** Return the method by which the device field is managed. */
1918
+ BufferDeviceOwnership device_ownership() const {
1919
+ if (dev_ref_count == nullptr) {
1920
+ return BufferDeviceOwnership::Allocated;
1921
+ }
1922
+ return dev_ref_count->ownership;
1923
+ }
1924
+ // @}
1925
+
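+ // Hypothetical sketch of a host/device round trip (illustrative only; assumes
+ // a CUDA-capable runtime is linked so that halide_cuda_device_interface(),
+ // declared in HalideRuntimeCuda.h, is available). The dirty bits make the
+ // copies no-ops when nothing changed on the other side:
+ //
+ //     Buffer<float> im(1024, 1024);
+ //     im.fill(0.0f);                                      // sets host_dirty
+ //     im.copy_to_device(halide_cuda_device_interface());  // uploads (host was dirty)
+ //     // ... run a pipeline that writes im on the GPU, setting device_dirty ...
+ //     im.copy_to_host();                                  // downloads only if device_dirty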
1926
+ /** If you use the (x, y, c) indexing convention, then Halide
1927
+ * Buffers are stored planar by default. This function constructs
1928
+ * an interleaved RGB or RGBA image that can still be indexed
1929
+ * using (x, y, c). Passing it to a generator requires that the
1930
+ * generator has been compiled with support for interleaved (also
1931
+ * known as packed or chunky) memory layouts. */
1932
+ static Buffer<void, Dims, InClassDimStorage> make_interleaved(halide_type_t t, int width, int height, int channels) {
1933
+ static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1934
+ Buffer<void, Dims, InClassDimStorage> im(t, channels, width, height);
1935
+ // Note that this is equivalent to calling transpose({2, 0, 1}),
1936
+ // but slightly more efficient.
1937
+ im.transpose(0, 1);
1938
+ im.transpose(1, 2);
1939
+ return im;
1940
+ }
1941
+
1942
+ /** If you use the (x, y, c) indexing convention, then Halide
1943
+ * Buffers are stored planar by default. This function constructs
1944
+ * an interleaved RGB or RGBA image that can still be indexed
1945
+ * using (x, y, c). Passing it to a generator requires that the
1946
+ * generator has been compiled with support for interleaved (also
1947
+ * known as packed or chunky) memory layouts. */
1948
+ static Buffer<T, Dims, InClassDimStorage> make_interleaved(int width, int height, int channels) {
1949
+ return make_interleaved(static_halide_type(), width, height, channels);
1950
+ }
1951
+
1952
+ /** Wrap an existing interleaved image. */
1953
+ static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage>
1954
+ make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
1955
+ static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
1956
+ Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> im(t, data, channels, width, height);
1957
+ im.transpose(0, 1);
1958
+ im.transpose(1, 2);
1959
+ return im;
1960
+ }
1961
+
1962
+ /** Wrap an existing interleaved image. */
1963
+ static Buffer<T, Dims, InClassDimStorage> make_interleaved(T *data, int width, int height, int channels) {
1964
+ return make_interleaved(static_halide_type(), data, width, height, channels);
1965
+ }
1966
+
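+ // Illustrative sketch of wrapping externally-owned packed RGB data (the
+ // pixels pointer, width, and height here are hypothetical, e.g. from a decoder
+ // that produces width*height*3 contiguous bytes):
+ //
+ //     Buffer<uint8_t> im =
+ //         Buffer<uint8_t>::make_interleaved(pixels, width, height, 3);
+ //     // im(x, y, c) indexing works as usual; im.dim(2).stride() == 1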
1967
+ /** Make a zero-dimensional Buffer */
1968
+ static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> make_scalar(halide_type_t t) {
1969
+ static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1970
+ Buffer<add_const_if_T_is_const<void>, AnyDims, InClassDimStorage> buf(t, 1);
1971
+ buf.slice(0, 0);
1972
+ return buf;
1973
+ }
1974
+
1975
+ /** Make a zero-dimensional Buffer */
1976
+ static Buffer<T, Dims, InClassDimStorage> make_scalar() {
1977
+ static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1978
+ Buffer<T, AnyDims, InClassDimStorage> buf(1);
1979
+ buf.slice(0, 0);
1980
+ return buf;
1981
+ }
1982
+
1983
+ /** Make a zero-dimensional Buffer that points to non-owned, existing data */
1984
+ static Buffer<T, Dims, InClassDimStorage> make_scalar(T *data) {
1985
+ static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
1986
+ Buffer<T, AnyDims, InClassDimStorage> buf(data, 1);
1987
+ buf.slice(0, 0);
1988
+ return buf;
1989
+ }
1990
+
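+ // Illustrative sketch: a zero-dimensional Buffer holds a single value and is
+ // accessed with an empty operator():
+ //
+ //     Buffer<float> s = Buffer<float>::make_scalar();
+ //     s() = 3.0f;
+ //     float v = s();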
1991
+ /** Make a buffer with the same shape and memory nesting order as
1992
+ * another buffer. It may have a different type. */
1993
+ template<typename T2, int D2, int S2>
1994
+ static Buffer<T, Dims, InClassDimStorage> make_with_shape_of(Buffer<T2, D2, S2> src,
1995
+ void *(*allocate_fn)(size_t) = nullptr,
1996
+ void (*deallocate_fn)(void *) = nullptr) {
1997
+ static_assert(Dims == D2 || Dims == AnyDims);
1998
+ const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
1999
+ return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
2000
+ allocate_fn, deallocate_fn);
2001
+ }
2002
+
2003
+ private:
2004
+ static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
2005
+ int dimensions,
2006
+ halide_dimension_t *shape,
2007
+ void *(*allocate_fn)(size_t),
2008
+ void (*deallocate_fn)(void *)) {
2009
+ // Reorder the dimensions of src to have strides in increasing order
2010
+ std::vector<int> swaps;
2011
+ for (int i = dimensions - 1; i > 0; i--) {
2012
+ for (int j = i; j > 0; j--) {
2013
+ if (shape[j - 1].stride > shape[j].stride) {
2014
+ std::swap(shape[j - 1], shape[j]);
2015
+ swaps.push_back(j);
2016
+ }
2017
+ }
2018
+ }
2019
+
2020
+ // Rewrite the strides to be dense (this messes up src, which
2021
+ // is why we took it by value).
2022
+ for (int i = 0; i < dimensions; i++) {
2023
+ if (i == 0) {
2024
+ shape[i].stride = 1;
2025
+ } else {
2026
+ shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
2027
+ }
2028
+ }
2029
+
2030
+ // Undo the dimension reordering
2031
+ while (!swaps.empty()) {
2032
+ int j = swaps.back();
2033
+ std::swap(shape[j - 1], shape[j]);
2034
+ swaps.pop_back();
2035
+ }
2036
+
2037
+ // Use an explicit runtime type, and make dst a Buffer<void>, to allow
2038
+ // using this method with Buffer<void> for either src or dst.
2039
+ Buffer<> dst(dst_type, nullptr, dimensions, shape);
2040
+ dst.allocate(allocate_fn, deallocate_fn);
2041
+
2042
+ return dst;
2043
+ }
2044
+
2045
+ template<typename... Args>
2046
+ HALIDE_ALWAYS_INLINE
2047
+ ptrdiff_t
2048
+ offset_of(int d, int first, Args... rest) const {
2049
+ #if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2050
+ assert(first >= this->buf.dim[d].min);
2051
+ assert(first < this->buf.dim[d].min + this->buf.dim[d].extent);
2052
+ #endif
2053
+ return offset_of(d + 1, rest...) + (ptrdiff_t)this->buf.dim[d].stride * (first - this->buf.dim[d].min);
2054
+ }
2055
+
2056
+ HALIDE_ALWAYS_INLINE
2057
+ ptrdiff_t offset_of(int d) const {
2058
+ return 0;
2059
+ }
2060
+
2061
+ template<typename... Args>
2062
+ HALIDE_ALWAYS_INLINE
2063
+ storage_T *
2064
+ address_of(Args... args) const {
2065
+ if (T_is_void) {
2066
+ return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
2067
+ } else {
2068
+ return (storage_T *)(this->buf.host) + offset_of(0, args...);
2069
+ }
2070
+ }
2071
+
2072
+ HALIDE_ALWAYS_INLINE
2073
+ ptrdiff_t offset_of(const int *pos) const {
2074
+ ptrdiff_t offset = 0;
2075
+ for (int i = this->dimensions() - 1; i >= 0; i--) {
2076
+ #if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
2077
+ assert(pos[i] >= this->buf.dim[i].min);
2078
+ assert(pos[i] < this->buf.dim[i].min + this->buf.dim[i].extent);
2079
+ #endif
2080
+ offset += (ptrdiff_t)this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
2081
+ }
2082
+ return offset;
2083
+ }
2084
+
2085
+ HALIDE_ALWAYS_INLINE
2086
+ storage_T *address_of(const int *pos) const {
2087
+ if (T_is_void) {
2088
+ return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
2089
+ } else {
2090
+ return (storage_T *)this->buf.host + offset_of(pos);
2091
+ }
2092
+ }
2093
+
2094
+ public:
2095
+ /** Get a pointer to the address of the min coordinate. */
2096
+ T *data() const {
2097
+ return (T *)(this->buf.host);
2098
+ }
2099
+
2100
+ /** Access elements. Use im(...) to get a reference to an element,
2101
+ * and use &im(...) to get the address of an element. If you pass
2102
+ * fewer arguments than the buffer has dimensions, the rest are
2103
+ * treated as their min coordinate. The non-const versions set the
2104
+ * host_dirty flag to true.
2105
+ */
2106
+ //@{
2107
+ template<typename... Args,
2108
+ typename = typename std::enable_if<AllInts<Args...>::value>::type>
2109
+ HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
2110
+ static_assert(!T_is_void,
2111
+ "Cannot use operator() on Buffer<void> types");
2112
+ constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2113
+ static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2114
+ assert(!device_dirty());
2115
+ return *((const not_void_T *)(address_of(first, rest...)));
2116
+ }
2117
+
2118
+ HALIDE_ALWAYS_INLINE
2119
+ const not_void_T &
2120
+ operator()() const {
2121
+ static_assert(!T_is_void,
2122
+ "Cannot use operator() on Buffer<void> types");
2123
+ constexpr int expected_dims = 0;
2124
+ static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2125
+ assert(!device_dirty());
2126
+ return *((const not_void_T *)(data()));
2127
+ }
2128
+
2129
+ HALIDE_ALWAYS_INLINE
2130
+ const not_void_T &
2131
+ operator()(const int *pos) const {
2132
+ static_assert(!T_is_void,
2133
+ "Cannot use operator() on Buffer<void> types");
2134
+ assert(!device_dirty());
2135
+ return *((const not_void_T *)(address_of(pos)));
2136
+ }
2137
+
2138
+ template<typename... Args,
2139
+ typename = typename std::enable_if<AllInts<Args...>::value>::type>
2140
+ HALIDE_ALWAYS_INLINE
2141
+ not_void_T &
2142
+ operator()(int first, Args... rest) {
2143
+ static_assert(!T_is_void,
2144
+ "Cannot use operator() on Buffer<void> types");
2145
+ constexpr int expected_dims = 1 + (int)(sizeof...(rest));
2146
+ static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2147
+ set_host_dirty();
2148
+ return *((not_void_T *)(address_of(first, rest...)));
2149
+ }
2150
+
2151
+ HALIDE_ALWAYS_INLINE
2152
+ not_void_T &
2153
+ operator()() {
2154
+ static_assert(!T_is_void,
2155
+ "Cannot use operator() on Buffer<void> types");
2156
+ constexpr int expected_dims = 0;
2157
+ static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
2158
+ set_host_dirty();
2159
+ return *((not_void_T *)(data()));
2160
+ }
2161
+
2162
+ HALIDE_ALWAYS_INLINE
2163
+ not_void_T &
2164
+ operator()(const int *pos) {
2165
+ static_assert(!T_is_void,
2166
+ "Cannot use operator() on Buffer<void> types");
2167
+ set_host_dirty();
2168
+ return *((not_void_T *)(address_of(pos)));
2169
+ }
2170
+ // @}
2171
+
2172
+ /** Tests that all values in this buffer are equal to val. */
2173
+ bool all_equal(not_void_T val) const {
2174
+ bool all_equal = true;
2175
+ for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
2176
+ return all_equal;
2177
+ }
2178
+
2179
+ Buffer<T, Dims, InClassDimStorage> &fill(not_void_T val) {
2180
+ set_host_dirty();
2181
+ for_each_value([=](T &v) { v = val; });
2182
+ return *this;
2183
+ }
2184
+
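+ // Illustrative sketch pairing fill() with all_equal():
+ //
+ //     Buffer<int> im(8, 8);
+ //     im.fill(42);
+ //     assert(im.all_equal(42));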
2185
+ private:
2186
+ /** Helper functions for for_each_value. */
2187
+ // @{
2188
+ template<int N>
2189
+ struct for_each_value_task_dim {
2190
+ std::ptrdiff_t extent;
2191
+ std::ptrdiff_t stride[N];
2192
+ };
2193
+
2194
+ // Given an array of strides, and a bunch of pointers to pointers
2195
+ // (all of different types), advance the pointers using the
2196
+ // strides.
2197
+ template<typename Ptr, typename... Ptrs>
2198
+ HALIDE_ALWAYS_INLINE static void advance_ptrs(const std::ptrdiff_t *stride, Ptr &ptr, Ptrs &...ptrs) {
2199
+ ptr += *stride;
2200
+ advance_ptrs(stride + 1, ptrs...);
2201
+ }
2202
+
2203
+ HALIDE_ALWAYS_INLINE
2204
+ static void advance_ptrs(const std::ptrdiff_t *) {
2205
+ }
2206
+
2207
+ template<typename Fn, typename Ptr, typename... Ptrs>
2208
+ HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
2209
+ const for_each_value_task_dim<sizeof...(Ptrs) + 1> *t, Ptr ptr, Ptrs... ptrs) {
2210
+ if (d == 0) {
2211
+ if (innermost_strides_are_one) {
2212
+ Ptr end = ptr + t[0].extent;
2213
+ while (ptr != end) {
2214
+ f(*ptr++, (*ptrs++)...);
2215
+ }
2216
+ } else {
2217
+ for (std::ptrdiff_t i = t[0].extent; i != 0; i--) {
2218
+ f(*ptr, (*ptrs)...);
2219
+ advance_ptrs(t[0].stride, ptr, ptrs...);
2220
+ }
2221
+ }
2222
+ } else {
2223
+ for (std::ptrdiff_t i = t[d].extent; i != 0; i--) {
2224
+ for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptr, ptrs...);
2225
+ advance_ptrs(t[d].stride, ptr, ptrs...);
2226
+ }
2227
+ }
2228
+ }
2229
+
2230
+ // Return pair is <new_dimensions, innermost_strides_are_one>
2231
+ template<int N>
2232
+ HALIDE_NEVER_INLINE static std::pair<int, bool> for_each_value_prep(for_each_value_task_dim<N> *t,
2233
+ const halide_buffer_t **buffers) {
2234
+ const int dimensions = buffers[0]->dimensions;
2235
+ assert(dimensions > 0);
2236
+
2237
+ // Check the buffers all have clean host allocations
2238
+ for (int i = 0; i < N; i++) {
2239
+ if (buffers[i]->device) {
2240
+ assert(buffers[i]->host &&
2241
+ "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
2242
+ assert(!buffers[i]->device_dirty() &&
2243
+ "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
2244
+ } else {
2245
+ assert(buffers[i]->host &&
2246
+ "Buffer passed to for_each_value has no host or device allocation");
2247
+ }
2248
+ }
2249
+
2250
+ // Extract the strides in all the dimensions
2251
+ for (int i = 0; i < dimensions; i++) {
2252
+ for (int j = 0; j < N; j++) {
2253
+ assert(buffers[j]->dimensions == dimensions);
2254
+ assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
2255
+ buffers[j]->dim[i].min == buffers[0]->dim[i].min);
2256
+ const int s = buffers[j]->dim[i].stride;
2257
+ t[i].stride[j] = s;
2258
+ }
2259
+ t[i].extent = buffers[0]->dim[i].extent;
2260
+
2261
+ // Order the dimensions by stride, so that the traversal is cache-coherent.
2262
+ // Use the last dimension for this, because this is the source in copies.
2263
+ // It appears to be better to optimize read order than write order.
2264
+ for (int j = i; j > 0 && t[j].stride[N - 1] < t[j - 1].stride[N - 1]; j--) {
2265
+ std::swap(t[j], t[j - 1]);
2266
+ }
2267
+ }
2268
+
2269
+ // flatten dimensions where possible to make a larger inner
2270
+ // loop for autovectorization.
2271
+ int d = dimensions;
2272
+ for (int i = 1; i < d; i++) {
2273
+ bool flat = true;
2274
+ for (int j = 0; j < N; j++) {
2275
+ flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
2276
+ }
2277
+ if (flat) {
2278
+ t[i - 1].extent *= t[i].extent;
2279
+ for (int j = i; j < d - 1; j++) {
2280
+ t[j] = t[j + 1];
2281
+ }
2282
+ i--;
2283
+ d--;
2284
+ }
2285
+ }
2286
+
2287
+ // Note that we assert() that dimensions > 0 above
2288
+ // (our one-and-only caller will only call us that way)
2289
+ // so the unchecked access to t[0] should be safe.
2290
+ bool innermost_strides_are_one = true;
2291
+ for (int i = 0; i < N; i++) {
2292
+ innermost_strides_are_one &= (t[0].stride[i] == 1);
2293
+ }
2294
+
2295
+ return {d, innermost_strides_are_one};
2296
+ }
2297
+
2298
+ template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2299
+ void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
2300
+ if (dimensions() > 0) {
2301
+ const size_t alloc_size = dimensions() * sizeof(for_each_value_task_dim<N>);
2302
+ Buffer<>::for_each_value_task_dim<N> *t =
2303
+ (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA(alloc_size);
2304
+ // Move the preparatory code into a non-templated helper to
2305
+ // save code size.
2306
+ const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
2307
+ auto [new_dims, innermost_strides_are_one] = Buffer<>::for_each_value_prep(t, buffers);
2308
+ if (new_dims > 0) {
2309
+ Buffer<>::for_each_value_helper(f, new_dims - 1,
2310
+ innermost_strides_are_one,
2311
+ t,
2312
+ data(), (other_buffers.data())...);
2313
+ return;
2314
+ }
2315
+ // else fall thru
2316
+ }
2317
+
2318
+ // zero-dimensional case
2319
+ f(*data(), (*other_buffers.data())...);
2320
+ }
2321
+ // @}
2322
+
2323
+ public:
2324
+ /** Call a function on every value in the buffer, and the
2325
+ * corresponding values in some number of other buffers of the
2326
+ * same size. The function should take a reference, const
2327
+ * reference, or value of the correct type for each buffer. This
2328
+ * effectively lifts a function of scalars to an element-wise
2329
+ * function of buffers. This produces code that the compiler can
2330
+ * autovectorize. This is slightly cheaper than for_each_element,
2331
+ * because it does not need to track the coordinates.
2332
+ *
2333
+ * Note that constness of Buffers is preserved: a const Buffer<T> (for either
2334
+ * 'this' or the other-buffers arguments) will allow mutation of the
2335
+ * buffer contents, while a Buffer<const T> will not. Attempting to specify
2336
+ * a mutable reference for the lambda argument of a Buffer<const T>
2337
+ * will result in a compilation error. */
2338
+ // @{
2339
+ template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2340
+ HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_value(Fn &&f, Args &&...other_buffers) const {
2341
+ for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2342
+ return *this;
2343
+ }
2344
+
2345
+ template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
2346
+ HALIDE_ALWAYS_INLINE
2347
+ Buffer<T, Dims, InClassDimStorage> &
2348
+ for_each_value(Fn &&f, Args &&...other_buffers) {
2349
+ for_each_value_impl(f, std::forward<Args>(other_buffers)...);
2350
+ return *this;
2351
+ }
2352
+ // @}
2353
+
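+ // Illustrative sketch: lifting a scalar function to an elementwise operation
+ // over same-shaped buffers; the first lambda parameter corresponds to 'this':
+ //
+ //     Buffer<float> a(100, 100), b(100, 100), dst(100, 100);
+ //     dst.for_each_value([](float &d, float x, float y) { d = x + y; }, a, b);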
2354
+ private:
2355
+ // Helper functions for for_each_element
2356
+ struct for_each_element_task_dim {
2357
+ int min, max;
2358
+ };
2359
+
2360
+ /** If f is callable with this many args, call it. The first
2361
+ * argument is just to make the overloads distinct. Actual
2362
+ * overload selection is done using the enable_if. */
2363
+ template<typename Fn,
2364
+ typename... Args,
2365
+ typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2366
+ HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
2367
+ f(args...);
2368
+ }
2369
+
2370
+ /** If the above overload is impossible, we add an outer loop over
2371
+ * an additional argument and try again. */
2372
+ template<typename Fn,
2373
+ typename... Args>
2374
+ HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
2375
+ for (int i = t[d].min; i <= t[d].max; i++) {
2376
+ for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
2377
+ }
2378
+ }
2379
+
2380
+ /** Determine the minimum number of arguments a callable can take
2381
+ * using the same trick. */
2382
+ template<typename Fn,
2383
+ typename... Args,
2384
+ typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2385
+ HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
2386
+ return (int)(sizeof...(Args));
2387
+ }
2388
+
2389
+ /** The recursive version is only enabled up to a recursion limit
2390
+ * of 256. This catches callables that aren't callable with any
2391
+ * number of ints. */
2392
+ template<typename Fn,
2393
+ typename... Args>
2394
+ HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
2395
+ static_assert(sizeof...(args) <= 256,
2396
+ "Callable passed to for_each_element must accept either a const int *,"
2397
+ " or up to 256 ints. No such operator found. Expect infinite template recursion.");
2398
+ return num_args(0, std::forward<Fn>(f), 0, args...);
2399
+ }
2400
+
2401
+ /** A version where the callable takes a position array instead,
2402
+ * with compile-time recursion on the dimensionality. This
2403
+ * overload is preferred to the one below using the same int vs
2404
+ * double trick as above, but is impossible once d hits -1 using
2405
+ * std::enable_if. */
2406
+ template<int d,
2407
+ typename Fn,
2408
+ typename = typename std::enable_if<(d >= 0)>::type>
2409
+ HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2410
+ for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2411
+ for_each_element_array_helper<d - 1>(0, t, std::forward<Fn>(f), pos);
2412
+ }
2413
+ }
2414
+
2415
+ /** Base case for recursion above. */
2416
+ template<int d,
2417
+ typename Fn,
2418
+ typename = typename std::enable_if<(d < 0)>::type>
2419
+ HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2420
+ f(pos);
2421
+ }
2422
+
2423
+ /** A run-time-recursive version (instead of
2424
+ * compile-time-recursive) that requires the callable to take a
2425
+ * pointer to a position array instead. Dispatches to the
2426
+ * compile-time-recursive version once the dimensionality gets
2427
+ * small. */
2428
+ template<typename Fn>
2429
+ static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
2430
+ if (d == -1) {
2431
+ f(pos);
2432
+ } else if (d == 0) {
2433
+ // Once the dimensionality gets small enough, dispatch to
2434
+ // a compile-time-recursive version for better codegen of
2435
+ // the inner loops.
2436
+ for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
2437
+ } else if (d == 1) {
2438
+ for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
2439
+ } else if (d == 2) {
2440
+ for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
2441
+ } else if (d == 3) {
2442
+ for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
2443
+ } else {
2444
+ for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
2445
+ for_each_element_array(d - 1, t, std::forward<Fn>(f), pos);
2446
+ }
2447
+ }
2448
+ }
2449
+
2450
+ /** We now have two overloads for for_each_element. This one
2451
+ * triggers if the callable takes a const int *.
2452
+ */
2453
+ template<typename Fn,
2454
+ typename = decltype(std::declval<Fn>()((const int *)nullptr))>
2455
+ static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
2456
+ const int size = dims * sizeof(int);
2457
+ int *pos = (int *)HALIDE_ALLOCA(size);
2458
+ // At least one version of GCC will (incorrectly) report that pos "may be used uninitialized".
2459
+ // Add this memset to silence it.
2460
+ memset(pos, 0, size);
2461
+ for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
2462
+ }
2463
+
2464
+ /** This one triggers otherwise. It treats the callable as
2465
+ * something that takes some number of ints. */
2466
+ template<typename Fn>
2467
+ HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
2468
+ int args = num_args(0, std::forward<Fn>(f));
2469
+ assert(dims >= args);
2470
+ for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
2471
+ }
2472
+
2473
+ template<typename Fn>
2474
+ void for_each_element_impl(Fn &&f) const {
2475
+ for_each_element_task_dim *t =
2476
+ (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
2477
+ for (int i = 0; i < dimensions(); i++) {
2478
+ t[i].min = dim(i).min();
2479
+ t[i].max = dim(i).max();
2480
+ }
2481
+ for_each_element(0, dimensions(), t, std::forward<Fn>(f));
2482
+ }
2483
+
2484
+ public:
2485
+ /** Call a function at each site in a buffer. This is likely to be
2486
+ * much slower than using Halide code to populate a buffer, but is
2487
+ * convenient for tests. If the function has more arguments than the
2488
+ * buffer has dimensions, the remaining arguments will be zero. If it
2489
+ * has fewer arguments than the buffer has dimensions then the last
2490
+ * few dimensions of the buffer are not iterated over. For example,
2491
+ * the following code exploits this to set a floating point RGB image
2492
+ * to red:
2493
+
2494
+ \code
2495
+ Buffer<float, 3> im(100, 100, 3);
2496
+ im.for_each_element([&](int x, int y) {
2497
+ im(x, y, 0) = 1.0f;
2498
+ im(x, y, 1) = 0.0f;
2499
+ im(x, y, 2) = 0.0f;
2500
+ });
2501
+ \endcode
2502
+
2503
+ * The compiled code is equivalent to writing a nested for loop,
2504
+ * and compilers are capable of optimizing it in the same way.
2505
+ *
2506
+ * If the callable can be called with an int * as the sole argument,
2507
+ * that version is called instead. Each location in the buffer is
2508
+ * passed to it in a coordinate array. This version is higher-overhead
2509
+ * than the variadic version, but is useful for writing generic code
2510
+ * that accepts buffers of arbitrary dimensionality. For example, the
2511
+ * following sets the value at all sites in an arbitrary-dimensional
2512
+ * buffer to their first coordinate:
2513
+
2514
+ \code
2515
+ im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
2516
+ \endcode
2517
+
2518
+ * It is also possible to use for_each_element to iterate over entire
2519
+ * rows or columns by cropping the buffer to a single column or row
2520
+ * respectively and iterating over elements of the result. For example,
2521
+ * to set the diagonal of the image to 1 by iterating over the columns:
2522
+
2523
+ \code
2524
+ Buffer<float, 3> im(100, 100, 3);
2525
+ im.sliced(1, 0).for_each_element([&](int x, int c) {
2526
+ im(x, x, c) = 1.0f;
2527
+ });
2528
+ \endcode
2529
+
2530
+ * Or, assuming the memory layout is known to be dense per row, one can
2531
+ * memset each row of an image like so:
2532
+
2533
+ \code
2534
+ Buffer<float, 3> im(100, 100, 3);
2535
+ im.sliced(0, 0).for_each_element([&](int y, int c) {
2536
+ memset(&im(0, y, c), 0, sizeof(float) * im.width());
2537
+ });
2538
+ \endcode
2539
+
2540
+ */
2541
+ // @{
2542
+ template<typename Fn>
2543
+ HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_element(Fn &&f) const {
2544
+ for_each_element_impl(f);
2545
+ return *this;
2546
+ }
2547
+
2548
+ template<typename Fn>
2549
+ HALIDE_ALWAYS_INLINE
2550
+ Buffer<T, Dims, InClassDimStorage> &
2551
+ for_each_element(Fn &&f) {
2552
+ for_each_element_impl(f);
2553
+ return *this;
2554
+ }
2555
+ // @}
2556
+
2557
+ private:
2558
+ template<typename Fn>
2559
+ struct FillHelper {
2560
+ Fn f;
2561
+ Buffer<T, Dims, InClassDimStorage> *buf;
2562
+
2563
+ template<typename... Args,
2564
+ typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
2565
+ void operator()(Args... args) {
2566
+ (*buf)(args...) = f(args...);
2567
+ }
2568
+
2569
+ FillHelper(Fn &&f, Buffer<T, Dims, InClassDimStorage> *buf)
2570
+ : f(std::forward<Fn>(f)), buf(buf) {
2571
+ }
2572
+ };
2573
+
2574
+ public:
2575
+ /** Fill a buffer by evaluating a callable at every site. The
2576
+ * callable should look much like a callable passed to
2577
+ * for_each_element, but it should return the value that should be
2578
+ * stored to the coordinate corresponding to the arguments. */
2579
+ template<typename Fn,
2580
+ typename = typename std::enable_if<!std::is_arithmetic<typename std::decay<Fn>::type>::value>::type>
2581
+ Buffer<T, Dims, InClassDimStorage> &fill(Fn &&f) {
2582
+ // We'll go via for_each_element. We need a variadic wrapper lambda.
2583
+ FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
2584
+ return for_each_element(wrapper);
2585
+ }
2586
+
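+ // Illustrative sketch: the callable's return value is stored at the
+ // coordinates it was called with:
+ //
+ //     Buffer<float> ramp(256, 256);
+ //     ramp.fill([](int x, int y) { return float(x + y); });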
2587
+ /** Check if an input buffer passed to an extern stage is querying
2588
+ * bounds. Compared to doing the host pointer check directly,
2589
+ * this both adds clarity to the code and will facilitate moving to
2590
+ * another representation for bounds query arguments. */
2591
+ bool is_bounds_query() const {
2592
+ return buf.is_bounds_query();
2593
+ }
2594
+
2595
+ /** Convenient check to verify that all of the interesting bytes in the Buffer
2596
+ * are initialized under MSAN. Note that by default, we use for_each_value() here so that
2597
+ * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
2598
+ * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
2599
+ * the entire Buffer storage.) */
2600
+ void msan_check_mem_is_initialized(bool entire = false) const {
2601
+ #if defined(__has_feature)
2602
+ #if __has_feature(memory_sanitizer)
2603
+ if (entire) {
2604
+ __msan_check_mem_is_initialized(data(), size_in_bytes());
2605
+ } else {
2606
+ for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); });
2607
+ }
2608
+ #endif
2609
+ #endif
2610
+ }
2611
+ };
2612
+
2613
+ } // namespace Runtime
2614
+ } // namespace Halide
2615
+
2616
+ #undef HALIDE_ALLOCA
2617
+
2618
+ #endif // HALIDE_RUNTIME_IMAGE_H