PyPI - halide - Versions diffs - 19.0.0__cp38-cp38-macosx_11_0_x86_64.whl - Mend

halide 19.0.0__cp38-cp38-macosx_11_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84) hide show

halide/__init__.py +39 -0
halide/_generator_helpers.py +835 -0
halide/bin/adams2019_retrain_cost_model +0 -0
halide/bin/adams2019_weightsdir_to_weightsfile +0 -0
halide/bin/anderson2021_retrain_cost_model +0 -0
halide/bin/anderson2021_weightsdir_to_weightsfile +0 -0
halide/bin/featurization_to_sample +0 -0
halide/bin/gengen +0 -0
halide/bin/get_host_target +0 -0
halide/halide_.cpython-38-darwin.so +0 -0
halide/imageio.py +60 -0
halide/include/Halide.h +35293 -0
halide/include/HalideBuffer.h +2618 -0
halide/include/HalidePyTorchCudaHelpers.h +64 -0
halide/include/HalidePyTorchHelpers.h +120 -0
halide/include/HalideRuntime.h +2221 -0
halide/include/HalideRuntimeCuda.h +89 -0
halide/include/HalideRuntimeD3D12Compute.h +91 -0
halide/include/HalideRuntimeHexagonDma.h +104 -0
halide/include/HalideRuntimeHexagonHost.h +157 -0
halide/include/HalideRuntimeMetal.h +112 -0
halide/include/HalideRuntimeOpenCL.h +119 -0
halide/include/HalideRuntimeQurt.h +32 -0
halide/include/HalideRuntimeVulkan.h +137 -0
halide/include/HalideRuntimeWebGPU.h +44 -0
halide/lib/cmake/Halide/FindHalide_LLVM.cmake +152 -0
halide/lib/cmake/Halide/FindV8.cmake +33 -0
halide/lib/cmake/Halide/Halide-shared-deps.cmake +0 -0
halide/lib/cmake/Halide/Halide-shared-targets-release.cmake +29 -0
halide/lib/cmake/Halide/Halide-shared-targets.cmake +154 -0
halide/lib/cmake/Halide/HalideConfig.cmake +162 -0
halide/lib/cmake/Halide/HalideConfigVersion.cmake +65 -0
halide/lib/cmake/HalideHelpers/FindHalide_WebGPU.cmake +27 -0
halide/lib/cmake/HalideHelpers/Halide-Interfaces-release.cmake +116 -0
halide/lib/cmake/HalideHelpers/Halide-Interfaces.cmake +236 -0
halide/lib/cmake/HalideHelpers/HalideGeneratorHelpers.cmake +1056 -0
halide/lib/cmake/HalideHelpers/HalideHelpersConfig.cmake +28 -0
halide/lib/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +54 -0
halide/lib/cmake/HalideHelpers/HalideTargetHelpers.cmake +99 -0
halide/lib/cmake/HalideHelpers/MutexCopy.ps1 +31 -0
halide/lib/cmake/HalideHelpers/TargetExportScript.cmake +55 -0
halide/lib/cmake/Halide_Python/Halide_Python-targets-release.cmake +30 -0
halide/lib/cmake/Halide_Python/Halide_Python-targets.cmake +125 -0
halide/lib/cmake/Halide_Python/Halide_PythonConfig.cmake +26 -0
halide/lib/cmake/Halide_Python/Halide_PythonConfigVersion.cmake +65 -0
halide/lib/libHalide.dylib +0 -0
halide/lib/libHalidePyStubs.a +0 -0
halide/lib/libHalide_GenGen.a +0 -0
halide/lib/libautoschedule_adams2019.so +0 -0
halide/lib/libautoschedule_anderson2021.so +0 -0
halide/lib/libautoschedule_li2018.so +0 -0
halide/lib/libautoschedule_mullapudi2016.so +0 -0
halide/share/doc/Halide/LICENSE.txt +233 -0
halide/share/doc/Halide/README.md +439 -0
halide/share/doc/Halide/doc/BuildingHalideWithCMake.md +626 -0
halide/share/doc/Halide/doc/CodeStyleCMake.md +393 -0
halide/share/doc/Halide/doc/FuzzTesting.md +104 -0
halide/share/doc/Halide/doc/HalideCMakePackage.md +812 -0
halide/share/doc/Halide/doc/Hexagon.md +73 -0
halide/share/doc/Halide/doc/Python.md +844 -0
halide/share/doc/Halide/doc/RunGen.md +283 -0
halide/share/doc/Halide/doc/Testing.md +125 -0
halide/share/doc/Halide/doc/Vulkan.md +287 -0
halide/share/doc/Halide/doc/WebAssembly.md +228 -0
halide/share/doc/Halide/doc/WebGPU.md +128 -0
halide/share/tools/RunGen.h +1470 -0
halide/share/tools/RunGenMain.cpp +642 -0
halide/share/tools/adams2019_autotune_loop.sh +227 -0
halide/share/tools/anderson2021_autotune_loop.sh +591 -0
halide/share/tools/halide_benchmark.h +240 -0
halide/share/tools/halide_image.h +31 -0
halide/share/tools/halide_image_info.h +318 -0
halide/share/tools/halide_image_io.h +2794 -0
halide/share/tools/halide_malloc_trace.h +102 -0
halide/share/tools/halide_thread_pool.h +161 -0
halide/share/tools/halide_trace_config.h +559 -0
halide-19.0.0.data/data/share/cmake/Halide/HalideConfig.cmake +6 -0
halide-19.0.0.data/data/share/cmake/Halide/HalideConfigVersion.cmake +65 -0
halide-19.0.0.data/data/share/cmake/HalideHelpers/HalideHelpersConfig.cmake +6 -0
halide-19.0.0.data/data/share/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +54 -0
halide-19.0.0.dist-info/METADATA +301 -0
halide-19.0.0.dist-info/RECORD +84 -0
halide-19.0.0.dist-info/WHEEL +5 -0
halide-19.0.0.dist-info/licenses/LICENSE.txt +233 -0

halide/share/tools/halide_image_io.h ADDED Viewed

@@ -0,0 +1,2794 @@
+// This simple IO library works with the Halide::Buffer<T> type or any
+// other image type with the same API.
+#ifndef HALIDE_IMAGE_IO_H
+#define HALIDE_IMAGE_IO_H
+#include <algorithm>
+#include <cctype>
+#include <cmath>
+#include <cstdarg>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <functional>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#ifndef HALIDE_NO_PNG
+#include "png.h"
+#endif
+#ifndef HALIDE_NO_JPEG
+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+#include "jpeglib.h"
+#endif
+#include "HalideRuntime.h"  // for halide_type_t
+namespace Halide {
+namespace Tools {
+// An element type paired with a dimension count. The ordering below lets
+// FormatInfo values be kept in ordered containers such as std::set.
+struct FormatInfo {
+    halide_type_t type;
+    int dimensions;
+    // Lexicographic comparison: type code first, then bit width, then lane
+    // count, and finally the number of dimensions.
+    bool operator<(const FormatInfo &other) const {
+        if (type.code != other.type.code) {
+            return type.code < other.type.code;
+        }
+        if (type.bits != other.type.bits) {
+            return type.bits < other.type.bits;
+        }
+        if (type.lanes != other.type.lanes) {
+            return type.lanes < other.type.lanes;
+        }
+        return dimensions < other.dimensions;
+    }
+};
+namespace Internal {
+typedef bool (*CheckFunc)(bool condition, const char *msg);
+inline bool CheckFail(bool condition, const char *msg) {
+    if (!condition) {
+        fprintf(stderr, "%s\n", msg);
+        abort();
+    }
+    return condition;
+}
+// Non-fatal policy: nothing is printed; the condition is handed straight back
+// so the caller can decide how to bail out.
+inline bool CheckReturn(bool condition, const char *msg) {
+    (void)msg;  // the message is intentionally unused by this policy
+    return condition;
+}
+// Converts a sample from one element type to another. The primary template is
+// only declared here; the explicit specializations supply the definitions.
+template<typename To, typename From>
+To convert(const From &from);
+// Convert to bool: zero maps to false, any other value maps to true.
+template<>
+inline bool convert(const bool &in) {
+    return in;
+}
+template<>
+inline bool convert(const uint8_t &in) {
+    return static_cast<bool>(in);
+}
+template<>
+inline bool convert(const uint16_t &in) {
+    return static_cast<bool>(in);
+}
+template<>
+inline bool convert(const uint32_t &in) {
+    return static_cast<bool>(in);
+}
+template<>
+inline bool convert(const uint64_t &in) {
+    return static_cast<bool>(in);
+}
+template<>
+inline bool convert(const int8_t &in) {
+    return static_cast<bool>(in);
+}
+template<>
+inline bool convert(const int16_t &in) {
+    return static_cast<bool>(in);
+}
+template<>
+inline bool convert(const int32_t &in) {
+    return static_cast<bool>(in);
+}
+template<>
+inline bool convert(const int64_t &in) {
+    return static_cast<bool>(in);
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline bool convert(const _Float16 &in) {
+    return static_cast<float>(in) != 0.0f;
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline bool convert(const float &in) {
+    return in != 0.0f;
+}
+template<>
+inline bool convert(const double &in) {
+    return in != 0.0;
+}
+// Convert to u8. Wider unsigned inputs are rescaled down to [0, 255]; signed
+// inputs are first converted to the unsigned type of the same width; floating
+// point inputs are multiplied by 255 and rounded.
+template<>
+inline uint8_t convert(const bool &in) {
+    return static_cast<uint8_t>(in);
+}
+template<>
+inline uint8_t convert(const uint8_t &in) {
+    return in;
+}
+template<>
+inline uint8_t convert(const uint16_t &in) {
+    // Fast approximation of div-by-257: see http://research.swtch.com/divmult
+    const uint32_t biased = static_cast<uint32_t>(in) + 0x80;
+    return static_cast<uint8_t>((biased * 255 + 255) >> 16);
+}
+template<>
+inline uint8_t convert(const uint32_t &in) {
+    // Widen to 64 bits so adding the rounding bias cannot wrap around.
+    const uint64_t biased = static_cast<uint64_t>(in) + 0x00808080;
+    return static_cast<uint8_t>(biased / 0x01010101);
+}
+// uint64 -> 8 only looks at the upper 32 bits; the lower 32 bits are discarded.
+template<>
+inline uint8_t convert(const uint64_t &in) {
+    return convert<uint8_t, uint32_t>(static_cast<uint32_t>(in >> 32));
+}
+template<>
+inline uint8_t convert(const int8_t &in) {
+    return convert<uint8_t, uint8_t>(static_cast<uint8_t>(in));
+}
+template<>
+inline uint8_t convert(const int16_t &in) {
+    return convert<uint8_t, uint16_t>(static_cast<uint16_t>(in));
+}
+template<>
+inline uint8_t convert(const int32_t &in) {
+    return convert<uint8_t, uint32_t>(static_cast<uint32_t>(in));
+}
+template<>
+inline uint8_t convert(const int64_t &in) {
+    return convert<uint8_t, uint64_t>(static_cast<uint64_t>(in));
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline uint8_t convert(const _Float16 &in) {
+    return static_cast<uint8_t>(std::lround(static_cast<float>(in) * 255.0f));
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline uint8_t convert(const float &in) {
+    return static_cast<uint8_t>(std::lround(in * 255.0f));
+}
+template<>
+inline uint8_t convert(const double &in) {
+    return static_cast<uint8_t>(std::lround(in * 255.0));
+}
+// Convert to u16. Narrower unsigned inputs are expanded by byte replication,
+// wider ones keep their top 16 bits; signed inputs are first converted to the
+// unsigned type of the same width; floating point inputs are multiplied by
+// 65535 and rounded.
+template<>
+inline uint16_t convert(const bool &in) {
+    return static_cast<uint16_t>(in);
+}
+template<>
+inline uint16_t convert(const uint8_t &in) {
+    return static_cast<uint16_t>(static_cast<uint16_t>(in) * 0x0101);
+}
+template<>
+inline uint16_t convert(const uint16_t &in) {
+    return in;
+}
+template<>
+inline uint16_t convert(const uint32_t &in) {
+    return static_cast<uint16_t>(in >> 16);
+}
+template<>
+inline uint16_t convert(const uint64_t &in) {
+    return static_cast<uint16_t>(in >> 48);
+}
+template<>
+inline uint16_t convert(const int8_t &in) {
+    return convert<uint16_t, uint8_t>(static_cast<uint8_t>(in));
+}
+template<>
+inline uint16_t convert(const int16_t &in) {
+    return convert<uint16_t, uint16_t>(static_cast<uint16_t>(in));
+}
+template<>
+inline uint16_t convert(const int32_t &in) {
+    return convert<uint16_t, uint32_t>(static_cast<uint32_t>(in));
+}
+template<>
+inline uint16_t convert(const int64_t &in) {
+    return convert<uint16_t, uint64_t>(static_cast<uint64_t>(in));
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline uint16_t convert(const _Float16 &in) {
+    return static_cast<uint16_t>(std::lround(static_cast<float>(in) * 65535.0f));
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline uint16_t convert(const float &in) {
+    return static_cast<uint16_t>(std::lround(in * 65535.0f));
+}
+template<>
+inline uint16_t convert(const double &in) {
+    return static_cast<uint16_t>(std::lround(in * 65535.0));
+}
+// Convert to u32. Narrower unsigned inputs are expanded by replicating their
+// bit pattern, uint64 keeps its top 32 bits; signed inputs are first converted
+// to the unsigned type of the same width; floating point inputs are multiplied
+// by 4294967295 (in double precision) and rounded.
+template<>
+inline uint32_t convert(const bool &in) {
+    return static_cast<uint32_t>(in);
+}
+template<>
+inline uint32_t convert(const uint8_t &in) {
+    return static_cast<uint32_t>(in) * 0x01010101;
+}
+template<>
+inline uint32_t convert(const uint16_t &in) {
+    return static_cast<uint32_t>(in) * 0x00010001;
+}
+template<>
+inline uint32_t convert(const uint32_t &in) {
+    return in;
+}
+template<>
+inline uint32_t convert(const uint64_t &in) {
+    return static_cast<uint32_t>(in >> 32);
+}
+template<>
+inline uint32_t convert(const int8_t &in) {
+    return convert<uint32_t, uint8_t>(static_cast<uint8_t>(in));
+}
+template<>
+inline uint32_t convert(const int16_t &in) {
+    return convert<uint32_t, uint16_t>(static_cast<uint16_t>(in));
+}
+template<>
+inline uint32_t convert(const int32_t &in) {
+    return convert<uint32_t, uint32_t>(static_cast<uint32_t>(in));
+}
+template<>
+inline uint32_t convert(const int64_t &in) {
+    return convert<uint32_t, uint64_t>(static_cast<uint64_t>(in));
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline uint32_t convert(const _Float16 &in) {
+    return static_cast<uint32_t>(std::llround(static_cast<float>(in) * 4294967295.0));
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline uint32_t convert(const float &in) {
+    return static_cast<uint32_t>(std::llround(in * 4294967295.0));
+}
+template<>
+inline uint32_t convert(const double &in) {
+    return static_cast<uint32_t>(std::llround(in * 4294967295.0));
+}
+// Convert to u64. Narrower unsigned inputs are expanded by replicating their
+// bit pattern across all 64 bits; signed inputs are first converted to the
+// unsigned type of the same width; floating point inputs are converted to
+// uint32 first and then expanded.
+template<>
+inline uint64_t convert(const bool &in) {
+    return in;
+}
+template<>
+inline uint64_t convert(const uint8_t &in) {
+    return uint64_t(in) * 0x0101010101010101LL;
+}
+template<>
+inline uint64_t convert(const uint16_t &in) {
+    return uint64_t(in) * 0x0001000100010001LL;
+}
+template<>
+inline uint64_t convert(const uint32_t &in) {
+    return uint64_t(in) * 0x0000000100000001LL;
+}
+template<>
+inline uint64_t convert(const uint64_t &in) {
+    return in;
+}
+template<>
+inline uint64_t convert(const int8_t &in) {
+    return convert<uint64_t, uint8_t>(in);
+}
+template<>
+inline uint64_t convert(const int16_t &in) {
+    return convert<uint64_t, uint16_t>(in);
+}
+template<>
+inline uint64_t convert(const int32_t &in) {
+    // Go through the 32-bit unsigned conversion, mirroring what the int8 and
+    // int16 overloads do for their widths. Using the uint64_t identity
+    // conversion here would only sign-extend the value instead of expanding
+    // it to the full 64-bit range.
+    return convert<uint64_t, uint32_t>(static_cast<uint32_t>(in));
+}
+template<>
+inline uint64_t convert(const int64_t &in) {
+    return convert<uint64_t, uint64_t>(in);
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline uint64_t convert(const _Float16 &in) {
+    return convert<uint64_t, uint32_t>((uint32_t)std::llround((float)in * 4294967295.0));
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline uint64_t convert(const float &in) {
+    return convert<uint64_t, uint32_t>((uint32_t)std::llround(in * 4294967295.0));
+}
+template<>
+inline uint64_t convert(const double &in) {
+    return convert<uint64_t, uint32_t>((uint32_t)std::llround(in * 4294967295.0));
+}
+// Convert to i8: each overload computes the u8 conversion of the same input
+// and converts that result to int8_t.
+template<>
+inline int8_t convert(const bool &in) {
+    return static_cast<int8_t>(in);
+}
+template<>
+inline int8_t convert(const uint8_t &in) {
+    return static_cast<int8_t>(convert<uint8_t, uint8_t>(in));
+}
+template<>
+inline int8_t convert(const uint16_t &in) {
+    return static_cast<int8_t>(convert<uint8_t, uint16_t>(in));
+}
+template<>
+inline int8_t convert(const uint32_t &in) {
+    return static_cast<int8_t>(convert<uint8_t, uint32_t>(in));
+}
+template<>
+inline int8_t convert(const uint64_t &in) {
+    return static_cast<int8_t>(convert<uint8_t, uint64_t>(in));
+}
+template<>
+inline int8_t convert(const int8_t &in) {
+    return static_cast<int8_t>(convert<uint8_t, int8_t>(in));
+}
+template<>
+inline int8_t convert(const int16_t &in) {
+    return static_cast<int8_t>(convert<uint8_t, int16_t>(in));
+}
+template<>
+inline int8_t convert(const int32_t &in) {
+    return static_cast<int8_t>(convert<uint8_t, int32_t>(in));
+}
+template<>
+inline int8_t convert(const int64_t &in) {
+    return static_cast<int8_t>(convert<uint8_t, int64_t>(in));
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline int8_t convert(const _Float16 &in) {
+    return static_cast<int8_t>(convert<uint8_t, float>(static_cast<float>(in)));
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline int8_t convert(const float &in) {
+    return static_cast<int8_t>(convert<uint8_t, float>(in));
+}
+template<>
+inline int8_t convert(const double &in) {
+    return static_cast<int8_t>(convert<uint8_t, double>(in));
+}
+// Convert to i16: each overload computes the u16 conversion of the same input
+// and converts that result to int16_t.
+template<>
+inline int16_t convert(const bool &in) {
+    return static_cast<int16_t>(in);
+}
+template<>
+inline int16_t convert(const uint8_t &in) {
+    return static_cast<int16_t>(convert<uint16_t, uint8_t>(in));
+}
+template<>
+inline int16_t convert(const uint16_t &in) {
+    return static_cast<int16_t>(convert<uint16_t, uint16_t>(in));
+}
+template<>
+inline int16_t convert(const uint32_t &in) {
+    return static_cast<int16_t>(convert<uint16_t, uint32_t>(in));
+}
+template<>
+inline int16_t convert(const uint64_t &in) {
+    return static_cast<int16_t>(convert<uint16_t, uint64_t>(in));
+}
+template<>
+inline int16_t convert(const int8_t &in) {
+    return static_cast<int16_t>(convert<uint16_t, int8_t>(in));
+}
+template<>
+inline int16_t convert(const int16_t &in) {
+    return static_cast<int16_t>(convert<uint16_t, int16_t>(in));
+}
+template<>
+inline int16_t convert(const int32_t &in) {
+    return static_cast<int16_t>(convert<uint16_t, int32_t>(in));
+}
+template<>
+inline int16_t convert(const int64_t &in) {
+    return static_cast<int16_t>(convert<uint16_t, int64_t>(in));
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline int16_t convert(const _Float16 &in) {
+    return static_cast<int16_t>(convert<uint16_t, float>(static_cast<float>(in)));
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline int16_t convert(const float &in) {
+    return static_cast<int16_t>(convert<uint16_t, float>(in));
+}
+template<>
+inline int16_t convert(const double &in) {
+    return static_cast<int16_t>(convert<uint16_t, double>(in));
+}
+// Convert to i32: each overload computes the u32 conversion of the same input
+// and converts that result to int32_t.
+template<>
+inline int32_t convert(const bool &in) {
+    return static_cast<int32_t>(in);
+}
+template<>
+inline int32_t convert(const uint8_t &in) {
+    return static_cast<int32_t>(convert<uint32_t, uint8_t>(in));
+}
+template<>
+inline int32_t convert(const uint16_t &in) {
+    return static_cast<int32_t>(convert<uint32_t, uint16_t>(in));
+}
+template<>
+inline int32_t convert(const uint32_t &in) {
+    return static_cast<int32_t>(convert<uint32_t, uint32_t>(in));
+}
+template<>
+inline int32_t convert(const uint64_t &in) {
+    return static_cast<int32_t>(convert<uint32_t, uint64_t>(in));
+}
+template<>
+inline int32_t convert(const int8_t &in) {
+    return static_cast<int32_t>(convert<uint32_t, int8_t>(in));
+}
+template<>
+inline int32_t convert(const int16_t &in) {
+    return static_cast<int32_t>(convert<uint32_t, int16_t>(in));
+}
+template<>
+inline int32_t convert(const int32_t &in) {
+    return static_cast<int32_t>(convert<uint32_t, int32_t>(in));
+}
+template<>
+inline int32_t convert(const int64_t &in) {
+    return static_cast<int32_t>(convert<uint32_t, int64_t>(in));
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline int32_t convert(const _Float16 &in) {
+    return static_cast<int32_t>(convert<uint32_t, float>(static_cast<float>(in)));
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline int32_t convert(const float &in) {
+    return static_cast<int32_t>(convert<uint32_t, float>(in));
+}
+template<>
+inline int32_t convert(const double &in) {
+    return static_cast<int32_t>(convert<uint32_t, double>(in));
+}
+// Convert to i64: each overload computes the u64 conversion of the same input
+// and converts that result to int64_t.
+template<>
+inline int64_t convert(const bool &in) {
+    return static_cast<int64_t>(in);
+}
+template<>
+inline int64_t convert(const uint8_t &in) {
+    return static_cast<int64_t>(convert<uint64_t, uint8_t>(in));
+}
+template<>
+inline int64_t convert(const uint16_t &in) {
+    return static_cast<int64_t>(convert<uint64_t, uint16_t>(in));
+}
+template<>
+inline int64_t convert(const uint32_t &in) {
+    return static_cast<int64_t>(convert<uint64_t, uint32_t>(in));
+}
+template<>
+inline int64_t convert(const uint64_t &in) {
+    return static_cast<int64_t>(convert<uint64_t, uint64_t>(in));
+}
+template<>
+inline int64_t convert(const int8_t &in) {
+    return static_cast<int64_t>(convert<uint64_t, int8_t>(in));
+}
+template<>
+inline int64_t convert(const int16_t &in) {
+    return static_cast<int64_t>(convert<uint64_t, int16_t>(in));
+}
+template<>
+inline int64_t convert(const int32_t &in) {
+    return static_cast<int64_t>(convert<uint64_t, int32_t>(in));
+}
+template<>
+inline int64_t convert(const int64_t &in) {
+    return static_cast<int64_t>(convert<uint64_t, int64_t>(in));
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline int64_t convert(const _Float16 &in) {
+    return static_cast<int64_t>(convert<uint64_t, float>(static_cast<float>(in)));
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline int64_t convert(const float &in) {
+    return static_cast<int64_t>(convert<uint64_t, float>(in));
+}
+template<>
+inline int64_t convert(const double &in) {
+    return static_cast<int64_t>(convert<uint64_t, double>(in));
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+// Convert to f16. Unsigned integer inputs are divided by their maximum value;
+// uint64 uses only its upper 32 bits; signed inputs are first converted to
+// the unsigned type of the same width.
+template<>
+inline _Float16 convert(const bool &in) {
+    return in;
+}
+template<>
+inline _Float16 convert(const uint8_t &in) {
+    return (_Float16)(in / 255.0f);
+}
+template<>
+inline _Float16 convert(const uint16_t &in) {
+    return (_Float16)(in / 65535.0f);
+}
+template<>
+inline _Float16 convert(const uint32_t &in) {
+    return (_Float16)(in / 4294967295.0);
+}
+template<>
+inline _Float16 convert(const uint64_t &in) {
+    return convert<_Float16, uint32_t>(uint32_t(in >> 32));
+}
+template<>
+inline _Float16 convert(const int8_t &in) {
+    return convert<_Float16, uint8_t>(in);
+}
+template<>
+inline _Float16 convert(const int16_t &in) {
+    return convert<_Float16, uint16_t>(in);
+}
+template<>
+inline _Float16 convert(const int32_t &in) {
+    // Use the 32-bit unsigned conversion. Going through uint64_t would widen
+    // the value and then keep only bits 63..32, turning every non-negative
+    // input into 0 and every negative input into 1.
+    return convert<_Float16, uint32_t>(static_cast<uint32_t>(in));
+}
+template<>
+inline _Float16 convert(const int64_t &in) {
+    return convert<_Float16, uint64_t>(in);
+}
+template<>
+inline _Float16 convert(const _Float16 &in) {
+    return in;
+}
+template<>
+inline _Float16 convert(const float &in) {
+    return (_Float16)in;
+}
+template<>
+inline _Float16 convert(const double &in) {
+    return (_Float16)in;
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+// Convert to f32. Unsigned integer inputs are divided by their maximum value;
+// uint64 uses only its upper 32 bits; signed inputs are first converted to
+// the unsigned type of the same width.
+template<>
+inline float convert(const bool &in) {
+    return in;
+}
+template<>
+inline float convert(const uint8_t &in) {
+    return in / 255.0f;
+}
+template<>
+inline float convert(const uint16_t &in) {
+    return in / 65535.0f;
+}
+template<>
+inline float convert(const uint32_t &in) {
+    return (float)(in / 4294967295.0);
+}
+template<>
+inline float convert(const uint64_t &in) {
+    return convert<float, uint32_t>(uint32_t(in >> 32));
+}
+template<>
+inline float convert(const int8_t &in) {
+    return convert<float, uint8_t>(in);
+}
+template<>
+inline float convert(const int16_t &in) {
+    return convert<float, uint16_t>(in);
+}
+template<>
+inline float convert(const int32_t &in) {
+    // Use the 32-bit unsigned conversion. Going through uint64_t would widen
+    // the value and then keep only bits 63..32, turning every non-negative
+    // input into 0.0f and every negative input into 1.0f.
+    return convert<float, uint32_t>(static_cast<uint32_t>(in));
+}
+template<>
+inline float convert(const int64_t &in) {
+    return convert<float, uint64_t>(in);
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline float convert(const _Float16 &in) {
+    return (float)in;
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline float convert(const float &in) {
+    return in;
+}
+template<>
+inline float convert(const double &in) {
+    return (float)in;
+}
+// Convert to f64. Unsigned integer inputs are divided by their maximum value
+// in double precision; uint64 uses only its upper 32 bits; signed inputs are
+// first converted to the unsigned type of the same width.
+template<>
+inline double convert(const bool &in) {
+    return in;
+}
+template<>
+inline double convert(const uint8_t &in) {
+    // Divide in double precision: a float divisor would round the quotient
+    // to single precision before it is widened to double.
+    return in / 255.0;
+}
+template<>
+inline double convert(const uint16_t &in) {
+    return in / 65535.0;
+}
+template<>
+inline double convert(const uint32_t &in) {
+    return (double)(in / 4294967295.0);
+}
+template<>
+inline double convert(const uint64_t &in) {
+    return convert<double, uint32_t>(uint32_t(in >> 32));
+}
+template<>
+inline double convert(const int8_t &in) {
+    return convert<double, uint8_t>(in);
+}
+template<>
+inline double convert(const int16_t &in) {
+    return convert<double, uint16_t>(in);
+}
+template<>
+inline double convert(const int32_t &in) {
+    // Use the 32-bit unsigned conversion. Going through uint64_t would widen
+    // the value and then keep only bits 63..32, turning every non-negative
+    // input into 0.0 and every negative input into 1.0.
+    return convert<double, uint32_t>(static_cast<uint32_t>(in));
+}
+template<>
+inline double convert(const int64_t &in) {
+    return convert<double, uint64_t>(in);
+}
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline double convert(const _Float16 &in) {
+    return (double)in;
+}
+#endif  // HALIDE_CPP_COMPILER_HAS_FLOAT16
+template<>
+inline double convert(const float &in) {
+    return (double)in;
+}
+template<>
+inline double convert(const double &in) {
+    return in;
+}
+// Returns a copy of s in which every character has been passed through
+// std::tolower.
+inline std::string to_lowercase(const std::string &s) {
+    std::string r = s;
+    // std::tolower is undefined for negative arguments other than EOF, so each
+    // char is routed through unsigned char first. This matters for bytes
+    // >= 0x80 on platforms where plain char is signed.
+    std::transform(r.begin(), r.end(), r.begin(),
+                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+    return r;
+}
+// Returns the lowercased text that follows the last '.' in path, or an empty
+// string when path contains no '.' at all.
+inline std::string get_lowercase_extension(const std::string &path) {
+    const size_t dot_pos = path.find_last_of('.');
+    return dot_pos == std::string::npos ? std::string() : to_lowercase(path.substr(dot_pos + 1));
+}
+// Reads one ElemType from src, interpreting the bytes as big-endian.
+// Only the uint8_t and uint16_t specializations below are defined.
+template<typename ElemType>
+ElemType read_big_endian(const uint8_t *src);
+template<>
+inline uint8_t read_big_endian(const uint8_t *src) {
+    return src[0];
+}
+template<>
+inline uint16_t read_big_endian(const uint8_t *src) {
+    // High byte comes first in the stream.
+    const unsigned hi = src[0];
+    const unsigned lo = src[1];
+    return static_cast<uint16_t>((hi << 8) | lo);
+}
+// Writes src to dst in big-endian byte order.
+// Only the uint8_t and uint16_t specializations below are defined.
+template<typename ElemType>
+void write_big_endian(const ElemType &src, uint8_t *dst);
+template<>
+inline void write_big_endian(const uint8_t &src, uint8_t *dst) {
+    dst[0] = src;
+}
+template<>
+inline void write_big_endian(const uint16_t &src, uint8_t *dst) {
+    // High byte first, then low byte.
+    dst[0] = static_cast<uint8_t>(src >> 8);
+    dst[1] = static_cast<uint8_t>(src & 0xff);
+}
+// Owns a C FILE* and offers small helpers for line-oriented and block-oriented
+// reads and writes. The file, if it was opened, is closed on destruction.
+struct FileOpener {
+    // Opens filename with the given fopen() mode. If the open fails, f is
+    // nullptr; the other members pass f to the C library unchecked, so callers
+    // must test f first.
+    FileOpener(const std::string &filename, const char *mode)
+        : f(fopen(filename.c_str(), mode)) {
+        // nothing
+    }
+    ~FileOpener() {
+        if (f != nullptr) {
+            fclose(f);
+        }
+    }
+    // The destructor closes f, so a copy would close the same FILE* twice.
+    FileOpener(const FileOpener &) = delete;
+    FileOpener &operator=(const FileOpener &) = delete;
+    // read a line of data, skipping lines that begin with '#'
+    // Returns buf, or nullptr once fgets() reports end-of-file or an error.
+    char *read_line(char *buf, int maxlen) {
+        char *status;
+        do {
+            status = fgets(buf, maxlen, f);
+        } while (status && buf[0] == '#');
+        return (status);
+    }
+    // call read_line and do a sscanf() on it
+    // Returns the number of items assigned, or 0 if no line could be read.
+    int scan_line(const char *fmt, ...) {
+        char buf[1024];
+        if (!read_line(buf, 1024)) {
+            return 0;
+        }
+        va_list args;
+        va_start(args, fmt);
+        int result = vsscanf(buf, fmt, args);
+        va_end(args);
+        return result;
+    }
+    // Reads exactly count bytes into data; returns false on a short read.
+    bool read_bytes(void *data, size_t count) {
+        return fread(data, 1, count, f) == count;
+    }
+    // Fills the whole fixed-size array from the file.
+    template<typename T, size_t N>
+    bool read_array(T (&data)[N]) {
+        return read_bytes(&data[0], sizeof(T) * N);
+    }
+    // Fills the vector's existing elements (its size is not changed).
+    template<typename T>
+    bool read_vector(std::vector<T> *v) {
+        return read_bytes(v->data(), v->size() * sizeof(T));
+    }
+    // Writes exactly count bytes from data; returns false on a short write.
+    bool write_bytes(const void *data, size_t count) {
+        return fwrite(data, 1, count, f) == count;
+    }
+    // Writes all elements of the vector as raw bytes.
+    template<typename T>
+    bool write_vector(const std::vector<T> &v) {
+        return write_bytes(v.data(), v.size() * sizeof(T));
+    }
+    // Writes the whole fixed-size array as raw bytes.
+    template<typename T, size_t N>
+    bool write_array(const T (&data)[N]) {
+        return write_bytes(&data[0], sizeof(T) * N);
+    }
+    FILE *const f;
+};
+// Dimension-count template argument passed to ImageType::as<>() by the row helpers in this file.
+// NOTE(review): presumably a "dimensionality not fixed at compile time" sentinel — confirm against the Buffer API.
+constexpr int AnyDims = -1;
+// Read a row of ElemTypes from a byte buffer and copy them into a specific image row.
+// Multibyte elements are assumed to be big-endian. When the image has more than
+// two dimensions, src is consumed as all dim-2 values of one x followed by those
+// of the next x.
+template<typename ElemType, typename ImageType>
+void read_big_endian_row(const uint8_t *src, int y, ImageType *im) {
+    auto im_typed = im->template as<ElemType, AnyDims>();
+    const int xmin = im_typed.dim(0).min();
+    const int xmax = im_typed.dim(0).max();
+    if (im_typed.dimensions() > 2) {
+        const int cmin = im_typed.dim(2).min();
+        const int cmax = im_typed.dim(2).max();
+        for (int x = xmin; x <= xmax; x++) {
+            // c already ranges over [cmin, cmax], so it is used as the
+            // coordinate directly; adding cmin again would address the wrong
+            // channel (or run out of bounds) whenever cmin is nonzero.
+            for (int c = cmin; c <= cmax; c++) {
+                im_typed(x, y, c) = read_big_endian<ElemType>(src);
+                src += sizeof(ElemType);
+            }
+        }
+    } else {
+        for (int x = xmin; x <= xmax; x++) {
+            im_typed(x, y) = read_big_endian<ElemType>(src);
+            src += sizeof(ElemType);
+        }
+    }
+}
+// Copy a row from an image into a byte buffer.
+// Multibyte elements are written in big-endian layout. When the image has more
+// than two dimensions, the dim-2 values of each x are emitted back to back.
+template<typename ElemType, typename ImageType>
+void write_big_endian_row(const ImageType &im, int y, uint8_t *dst) {
+    using ConstElem = typename std::add_const<ElemType>::type;
+    auto typed = im.template as<ConstElem, AnyDims>();
+    const int x_first = typed.dim(0).min();
+    const int x_last = typed.dim(0).max();
+    uint8_t *out = dst;
+    if (typed.dimensions() <= 2) {
+        // No third dimension: one element per x.
+        for (int x = x_first; x <= x_last; ++x) {
+            write_big_endian<ElemType>(typed(x, y), out);
+            out += sizeof(ElemType);
+        }
+        return;
+    }
+    const int c_first = typed.dim(2).min();
+    const int c_last = typed.dim(2).max();
+    for (int x = x_first; x <= x_last; ++x) {
+        for (int c = c_first; c <= c_last; ++c) {
+            write_big_endian<ElemType>(typed(x, y, c), out);
+            out += sizeof(ElemType);
+        }
+    }
+}
+#ifndef HALIDE_NO_PNG
+// Loads the PNG file at filename into *im. *im is replaced by a newly
+// constructed ImageType whose element type is unsigned with the file's bit
+// depth and whose dimensions are {width, height}, plus a channel dimension
+// when the file has more than one channel.
+// Every failure is reported through check; if check returns, load_png returns
+// false. The libpng structures are released on all paths once created.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load_png(const std::string &filename, ImageType *im) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    /* open file and test for it being a png */
+    Internal::FileOpener f(filename, "rb");
+    if (!check(f.f != nullptr, "File could not be opened for reading")) {
+        return false;
+    }
+    png_byte header[8];
+    if (!check(f.read_array(header), "File ended before end of header")) {
+        return false;
+    }
+    if (!check(!png_sig_cmp(header, 0, 8), "File is not recognized as a PNG file")) {
+        return false;
+    }
+    /* initialize stuff */
+    png_structp png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr);
+    if (!check(png_ptr != nullptr, "png_create_read_struct failed")) {
+        return false;
+    }
+    png_infop info_ptr = png_create_info_struct(png_ptr);
+    if (info_ptr == nullptr) {
+        // Release the read struct before reporting, otherwise it leaks.
+        png_destroy_read_struct(&png_ptr, nullptr, nullptr);
+        check(false, "png_create_info_struct failed");
+        return false;
+    }
+    // libpng reports errors by longjmp()ing back to this point. setjmp() is
+    // used directly as the controlling expression of the if statement, and the
+    // libpng structures are freed before the failure is reported.
+    if (setjmp(png_jmpbuf(png_ptr))) {
+        png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
+        check(false, "Error loading PNG");
+        return false;
+    }
+    png_init_io(png_ptr, f.f);
+    png_set_sig_bytes(png_ptr, 8);
+    png_read_info(png_ptr, info_ptr);
+    const int width = png_get_image_width(png_ptr, info_ptr);
+    const int height = png_get_image_height(png_ptr, info_ptr);
+    const int channels = png_get_channels(png_ptr, info_ptr);
+    const int bit_depth = png_get_bit_depth(png_ptr, info_ptr);
+    // The row copiers below only handle whole 8- or 16-bit samples; any other
+    // depth would be routed to the 16-bit copier and misread the row buffer.
+    if (bit_depth != 8 && bit_depth != 16) {
+        png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
+        check(false, "Unsupported PNG bit depth");
+        return false;
+    }
+    const halide_type_t im_type(halide_type_uint, bit_depth);
+    std::vector<int> im_dimensions = {width, height};
+    if (channels != 1) {
+        im_dimensions.push_back(channels);
+    }
+    *im = ImageType(im_type, im_dimensions);
+    png_read_update_info(png_ptr, info_ptr);
+    auto copy_to_image = bit_depth == 8 ?
+                             Internal::read_big_endian_row<uint8_t, ImageType> :
+                             Internal::read_big_endian_row<uint16_t, ImageType>;
+    std::vector<uint8_t> row(png_get_rowbytes(png_ptr, info_ptr));
+    const int ymin = im->dim(1).min();
+    const int ymax = im->dim(1).max();
+    for (int y = ymin; y <= ymax; ++y) {
+        png_read_row(png_ptr, row.data(), nullptr);
+        copy_to_image(row.data(), y, im);
+    }
+    png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
+    return true;
+}
+// Returns the set of formats listed for PNG: unsigned 8- and 16-bit element
+// types, each with 2 and with 3 dimensions. The set is built once on first use.
+inline const std::set<FormatInfo> &query_png() {
+    static std::set<FormatInfo> formats = [] {
+        std::set<FormatInfo> s;
+        for (int dims = 2; dims <= 3; ++dims) {
+            for (int bits : {8, 16}) {
+                s.insert(FormatInfo{halide_type_t(halide_type_uint, bits), dims});
+            }
+        }
+        return s;
+    }();
+    return formats;
+}
+// Writes im to filename as a non-interlaced PNG. The channel count (1-4)
+// selects gray, gray+alpha, RGB or RGBA; the element bit width (8 or 16)
+// becomes the PNG bit depth.
+// Every failure is reported through check; if check returns, save_png returns
+// false. The libpng structures are released on all paths once created.
+// "im" is not const-ref because copy_to_host() is not const.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_png(ImageType &im, const std::string &filename) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) {
+        return false;
+    }
+    const int width = im.width();
+    const int height = im.height();
+    const int channels = im.channels();
+    if (!check(channels >= 1 && channels <= 4,
+               "Can't write PNG files that have other than 1, 2, 3, or 4 channels")) {
+        return false;
+    }
+    const halide_type_t im_type = im.type();
+    const int bit_depth = im_type.bits;
+    // The row copiers below only handle whole 8- or 16-bit elements. Reject
+    // anything else before the output file is created or truncated.
+    if (!check(bit_depth == 8 || bit_depth == 16,
+               "Can't write PNG files with a bit depth other than 8 or 16")) {
+        return false;
+    }
+    const png_byte color_types[4] = {
+        PNG_COLOR_TYPE_GRAY,
+        PNG_COLOR_TYPE_GRAY_ALPHA,
+        PNG_COLOR_TYPE_RGB,
+        PNG_COLOR_TYPE_RGB_ALPHA};
+    png_byte color_type = color_types[channels - 1];
+    // open file
+    Internal::FileOpener f(filename, "wb");
+    if (!check(f.f != nullptr, "[write_png_file] File could not be opened for writing")) {
+        return false;
+    }
+    // initialize stuff
+    png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr);
+    if (!check(png_ptr != nullptr, "[write_png_file] png_create_write_struct failed")) {
+        return false;
+    }
+    png_infop info_ptr = png_create_info_struct(png_ptr);
+    if (info_ptr == nullptr) {
+        // Release the write struct before reporting, otherwise it leaks.
+        png_destroy_write_struct(&png_ptr, nullptr);
+        check(false, "[write_png_file] png_create_info_struct failed");
+        return false;
+    }
+    // libpng reports errors by longjmp()ing back to this point. setjmp() is
+    // used directly as the controlling expression of the if statement, and the
+    // libpng structures are freed before the failure is reported.
+    if (setjmp(png_jmpbuf(png_ptr))) {
+        png_destroy_write_struct(&png_ptr, &info_ptr);
+        check(false, "Error saving PNG");
+        return false;
+    }
+    png_init_io(png_ptr, f.f);
+    png_set_IHDR(png_ptr, info_ptr, width, height,
+                 bit_depth, color_type, PNG_INTERLACE_NONE,
+                 PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE);
+    png_write_info(png_ptr, info_ptr);
+    auto copy_from_image = bit_depth == 8 ?
+                               Internal::write_big_endian_row<uint8_t, ImageType> :
+                               Internal::write_big_endian_row<uint16_t, ImageType>;
+    std::vector<uint8_t> row(png_get_rowbytes(png_ptr, info_ptr));
+    const int ymin = im.dim(1).min();
+    const int ymax = im.dim(1).max();
+    for (int y = ymin; y <= ymax; ++y) {
+        copy_from_image(im, y, row.data());
+        png_write_row(png_ptr, row.data());
+    }
+    png_write_end(png_ptr, nullptr);
+    png_destroy_write_struct(&png_ptr, &info_ptr);
+    return true;
+}
+#endif  // not HALIDE_NO_PNG
+// Read and validate the text header of a binary PNM file.
+//
+//   f          file to read from; a null f.f is reported through `check`.
+//   hdr_fmt    expected magic token, compared case-insensitively against
+//              the first token in the file.
+//   width,
+//   height     receive the two integers of the size line.
+//   bit_depth  receives 8 for a max value of 255, 16 for 65535, and is set
+//              to 0 (with an error) for any other max value.
+//
+// Returns true on success. Failures are reported through `check`.
+template<Internal::CheckFunc check>
+bool read_pnm_header(Internal::FileOpener &f, const std::string &hdr_fmt, int *width, int *height, int *bit_depth) {
+    if (!check(f.f != nullptr, "File could not be opened for reading")) {
+        return false;
+    }
+    // 256-byte buffer paired with a %255s conversion leaves room for the terminator.
+    char header[256];
+    if (!check(f.scan_line("%255s", header) == 1, "Could not read header")) {
+        return false;
+    }
+    if (!check(to_lowercase(hdr_fmt) == to_lowercase(header), "Unexpected file header")) {
+        return false;
+    }
+    if (!check(f.scan_line("%d %d\n", width, height) == 2, "Could not read width and height")) {
+        return false;
+    }
+    // The values come straight from the file; a negative size must not be
+    // handed on to callers that use it for extents and row-buffer sizes.
+    if (!check(*width >= 0 && *height >= 0, "Invalid image dimensions")) {
+        return false;
+    }
+    int maxval;
+    if (!check(f.scan_line("%d", &maxval) == 1, "Could not read max value")) {
+        return false;
+    }
+    if (maxval == 255) {
+        *bit_depth = 8;
+    } else if (maxval == 65535) {
+        *bit_depth = 16;
+    } else {
+        *bit_depth = 0;
+        return check(false, "Invalid bit depth");
+    }
+    return true;
+}
+// Load a binary PNM file into *im. The expected magic token is "P6" when
+// channels == 3 and "P5" otherwise. *im is replaced by a freshly
+// constructed unsigned 8- or 16-bit image of shape {width, height}, with a
+// third dimension of size `channels` appended when channels > 1.
+// Returns true on success; failures are reported through `check`.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load_pnm(const std::string &filename, int channels, ImageType *im) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    Internal::FileOpener f(filename, "rb");
+    const std::string expected_magic = (channels == 3) ? "P6" : "P5";
+    int width, height, bit_depth;
+    if (!Internal::read_pnm_header<check>(f, expected_magic, &width, &height, &bit_depth)) {
+        return false;
+    }
+    std::vector<int> shape;
+    shape.push_back(width);
+    shape.push_back(height);
+    if (channels > 1) {
+        shape.push_back(channels);
+    }
+    *im = ImageType(halide_type_t(halide_type_uint, bit_depth), shape);
+    const bool is_8_bit = (bit_depth == 8);
+    // One scanline of raw file bytes: width * channels samples of 1 or 2 bytes each.
+    std::vector<uint8_t> scanline(width * channels * (bit_depth / 8));
+    const int y_last = im->dim(1).max();
+    for (int y = im->dim(1).min(); y <= y_last; ++y) {
+        if (!check(f.read_vector(&scanline), "Could not read data")) {
+            return false;
+        }
+        if (is_8_bit) {
+            Internal::read_big_endian_row<uint8_t, ImageType>(scanline.data(), y, im);
+        } else {
+            Internal::read_big_endian_row<uint16_t, ImageType>(scanline.data(), y, im);
+        }
+    }
+    return true;
+}
+// Save `im` as a binary PNM file, using magic token "P6" when
+// channels == 3 and "P5" otherwise.
+//
+// `im` must have exactly `channels` channels and 8- or 16-bit samples; it
+// is copied to the host first (which is why it is not a const reference).
+// Returns true on success; failures are reported through `check`.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_pnm(ImageType &im, const int channels, const std::string &filename) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    if (!check(im.channels() == channels, "Wrong number of channels")) {
+        return false;
+    }
+    if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) {
+        return false;
+    }
+    const halide_type_t im_type = im.type();
+    const int width = im.width();
+    const int height = im.height();
+    const int bit_depth = im_type.bits;
+    // Any other depth would yield a wrong max value / row size below and
+    // select a row writer that does not match the sample size. Checked
+    // before opening the file so that no truncated file is left behind.
+    if (!check(bit_depth == 8 || bit_depth == 16, "Unsupported bit depth")) {
+        return false;
+    }
+    Internal::FileOpener f(filename, "wb");
+    if (!check(f.f != nullptr, "File could not be opened for writing")) {
+        return false;
+    }
+    const char *hdr_fmt = channels == 3 ? "P6" : "P5";
+    // Header: magic, "width height", max sample value (255 or 65535).
+    if (!check(fprintf(f.f, "%s\n%d %d\n%d\n", hdr_fmt, width, height, (1 << bit_depth) - 1) > 0,
+               "Could not write header")) {
+        return false;
+    }
+    auto copy_from_image = bit_depth == 8 ?
+                               Internal::write_big_endian_row<uint8_t, ImageType> :
+                               Internal::write_big_endian_row<uint16_t, ImageType>;
+    std::vector<uint8_t> row(width * channels * (bit_depth / 8));
+    const int ymin = im.dim(1).min();
+    const int ymax = im.dim(1).max();
+    for (int y = ymin; y <= ymax; ++y) {
+        copy_from_image(im, y, row.data());
+        if (!check(f.write_vector(row), "Could not write data")) {
+            return false;
+        }
+    }
+    return true;
+}
+// Load a PGM file: forwards to load_pnm with a channel count of 1.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load_pgm(const std::string &filename, ImageType *im) {
+    constexpr int kPgmChannels = 1;
+    const bool loaded = Internal::load_pnm<ImageType, check>(filename, kPgmChannels, im);
+    return loaded;
+}
+// Formats accepted for PGM: unsigned 8- and 16-bit types, each paired with
+// a dimension count of 2.
+inline const std::set<FormatInfo> &query_pgm() {
+    static std::set<FormatInfo> info = []() {
+        std::set<FormatInfo> formats;
+        for (int bits : {8, 16}) {
+            formats.insert({halide_type_t(halide_type_uint, bits), 2});
+        }
+        return formats;
+    }();
+    return info;
+}
+// "im" is not const-ref because copy_to_host() is not const.
+// Save a PGM file: forwards to save_pnm with a channel count of 1.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_pgm(ImageType &im, const std::string &filename) {
+    constexpr int kPgmChannels = 1;
+    const bool saved = Internal::save_pnm<ImageType, check>(im, kPgmChannels, filename);
+    return saved;
+}
+// Load a PPM file: forwards to load_pnm with a channel count of 3.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load_ppm(const std::string &filename, ImageType *im) {
+    constexpr int kPpmChannels = 3;
+    const bool loaded = Internal::load_pnm<ImageType, check>(filename, kPpmChannels, im);
+    return loaded;
+}
+// Formats accepted for PPM: unsigned 8- and 16-bit types, each paired with
+// a dimension count of 3.
+inline const std::set<FormatInfo> &query_ppm() {
+    static std::set<FormatInfo> info = []() {
+        std::set<FormatInfo> formats;
+        for (int bits : {8, 16}) {
+            formats.insert({halide_type_t(halide_type_uint, bits), 3});
+        }
+        return formats;
+    }();
+    return info;
+}
+// "im" is not const-ref because copy_to_host() is not const.
+// Save a PPM file: forwards to save_pnm with a channel count of 3.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_ppm(ImageType &im, const std::string &filename) {
+    constexpr int kPpmChannels = 3;
+    const bool saved = Internal::save_pnm<ImageType, check>(im, kPpmChannels, filename);
+    return saved;
+}
+// -------------- .npy file format
+// Based on documentation at https://numpy.org/devdocs/reference/generated/numpy.lib.format.html
+// and elsewhere
+// The host is treated as big-endian only if the toolchain's __BYTE_ORDER
+// macro says so or the build defines HALIDE_FORCE_BIG_ENDIAN; in every
+// other case (including when __BYTE_ORDER is undefined) it is treated as
+// little-endian.
+#if (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN) || defined(HALIDE_FORCE_BIG_ENDIAN)
+constexpr bool host_is_big_endian = true;
+#else
+constexpr bool host_is_big_endian = false;
+#endif
+// Byte-order characters used as the first character of a dtype
+// description string (see npy_dtype_info_t::descr()).
+constexpr char little_endian_char = '<';
+constexpr char big_endian_char = '>';
+// Used in the dtype table for the single-byte integer types.
+constexpr char no_endian_char = '|';
+// Byte-order character matching the host, per host_is_big_endian.
+constexpr char host_endian_char = (host_is_big_endian ? big_endian_char : little_endian_char);
+// The three parts of a dtype description string: a byte-order character,
+// a type-code character and an element size in bytes.
+struct npy_dtype_info_t {
+    char byte_order;
+    char type_code;
+    char type_bytes;
+    // Returns byte_order and type_code followed by type_bytes in decimal,
+    // e.g. {'<', 'f', 4} gives "<f4".
+    std::string descr() const {
+        std::string text;
+        text += byte_order;
+        text += type_code;
+        text += std::to_string(static_cast<int>(type_bytes));
+        return text;
+    }
+};
+// Table pairing each Halide type handled by the .npy code with its dtype
+// description parts. Multi-byte types use the host byte order; the 8-bit
+// integer types use no_endian_char. load_npy() searches this table by
+// (type_code, type_bytes) and save_npy() by Halide type.
+inline static const std::array<std::pair<halide_type_t, npy_dtype_info_t>, 11> npy_dtypes = {{
+    {halide_type_t(halide_type_float, 16), {host_endian_char, 'f', 2}},
+    {halide_type_of<float>(), {host_endian_char, 'f', sizeof(float)}},
+    {halide_type_of<double>(), {host_endian_char, 'f', sizeof(double)}},
+    {halide_type_of<int8_t>(), {no_endian_char, 'i', sizeof(int8_t)}},
+    {halide_type_of<int16_t>(), {host_endian_char, 'i', sizeof(int16_t)}},
+    {halide_type_of<int32_t>(), {host_endian_char, 'i', sizeof(int32_t)}},
+    {halide_type_of<int64_t>(), {host_endian_char, 'i', sizeof(int64_t)}},
+    {halide_type_of<uint8_t>(), {no_endian_char, 'u', sizeof(uint8_t)}},
+    {halide_type_of<uint16_t>(), {host_endian_char, 'u', sizeof(uint16_t)}},
+    {halide_type_of<uint32_t>(), {host_endian_char, 'u', sizeof(uint32_t)}},
+    {halide_type_of<uint64_t>(), {host_endian_char, 'u', sizeof(uint64_t)}},
+}};
+// The six bytes that load_npy() requires at the start of a file and that
+// save_npy() writes first.
+inline static const std::array<char, 6> npy_magic_string = {'\x93', 'N', 'U', 'M', 'P', 'Y'};
+// The two version bytes (major 1, minor 0) that save_npy() writes right after the magic string.
+inline static const std::array<char, 2> npy_v1_bytes = {'\x01', '\x00'};
+// Return a copy of `s` with leading and trailing spaces, tabs and newlines
+// removed. A string made only of those characters yields "".
+inline std::string trim_whitespace(const std::string &s) {
+    const char *const blanks = " \t\n";
+    const size_t begin = s.find_first_not_of(blanks);
+    if (begin != std::string::npos) {
+        const size_t end = s.find_last_not_of(blanks);
+        const size_t length = end - begin + 1;
+        return s.substr(begin, length);
+    }
+    return "";
+}
+// Minimal parser for the dict-literal header of a .npy file.
+//
+// Only three entries are understood, in any order:
+//   'descr': '<byte order><type code><size>'  (byte order must be '<' or '|')
+//   'fortran_order': False
+//   'shape': (<int>, <int>, ...)
+// Any other entry makes parse() fail.
+struct NpyHeader {
+    // Zero-initialized so that a header without a 'descr' entry leaves
+    // well-defined values rather than indeterminate ones.
+    char type_code = 0;
+    int type_bytes = 0;
+    // Extents from the 'shape' entry, in file order.
+    std::vector<int> extents;
+    // Parse `header`, which must start with '{'. Returns true once the
+    // closing '}' is reached, false on anything unrecognized. Entries
+    // parsed before a failure remain stored in the members.
+    bool parse(const std::string &header) {
+        const char *ptr = &header[0];
+        if (*ptr++ != '{') {
+            return false;
+        }
+        while (true) {
+            char endian;
+            // sscanf stores %n only if everything before it matched. A
+            // sentinel lets us detect the case where all three fields were
+            // converted but the closing quote was missing.
+            int consumed = -1;
+            if (std::sscanf(ptr, "'descr': '%c%c%d'%n", &endian, &type_code, &type_bytes, &consumed) == 3) {
+                if (consumed < 0) {
+                    return false;
+                }
+                if (endian != '<' && endian != '|') {
+                    return false;
+                }
+                ptr += consumed;
+            } else if (std::strncmp(ptr, "'fortran_order': False", 22) == 0) {
+                ptr += 22;
+            } else if (std::strncmp(ptr, "'shape': (", 10) == 0) {
+                ptr += 10;
+                int n;
+                // "%d%n" has nothing between the two conversions, so
+                // `consumed` is always stored when the call returns 1.
+                while (std::sscanf(ptr, "%d%n", &n, &consumed) == 1) {
+                    if (n < 0) {
+                        // A negative extent cannot describe an array.
+                        return false;
+                    }
+                    extents.push_back(n);
+                    ptr += consumed;
+                    if (*ptr == ',') {
+                        ptr++;
+                    }
+                    if (*ptr == ' ') {
+                        ptr++;
+                    }
+                }
+                if (*ptr++ != ')') {
+                    return false;
+                }
+            } else if (*ptr == '}') {
+                return true;
+            } else {
+                return false;
+            }
+            // Skip the ", " separator between entries (also before '}').
+            if (*ptr == ',') {
+                ptr++;
+            }
+            if (*ptr == ' ') {
+                ptr++;
+            }
+            assert(ptr <= &header.back());
+        }
+    }
+};
+// Returns true iff the buffer's storage has no padding between elements
+// and its dimensions are laid out in strictly planar order (strides
+// increase with dimension index).
+template<typename ImageType>
+bool buffer_is_compact_planar(ImageType &im) {
+    const size_t bytes_per_element = im.type().bits / 8;
+    // The [begin, end) byte range must be exactly as long as the elements it holds.
+    const uint8_t *expected_end = (const uint8_t *)im.begin() + (im.number_of_elements() * bytes_per_element);
+    if (expected_end != (const uint8_t *)im.end()) {
+        return false;
+    }
+    for (int outer = 1; outer < im.dimensions(); ++outer) {
+        const int inner = outer - 1;
+        const bool inner_is_larger = im.dim(inner).stride() > im.dim(outer).stride();
+        // Equal strides are tolerated only when the inner dimension has
+        // extent 1 (this happens when dimensions are added artificially,
+        // e.g. to write a .tmp file).
+        const bool bad_tie = im.dim(inner).stride() == im.dim(outer).stride() && im.dim(inner).extent() != 1;
+        if (inner_is_larger || bad_tie) {
+            return false;
+        }
+    }
+    return true;
+}
+// Load a .npy file into *im.
+//
+// Accepts major versions 1, 2 and 3 with minor version 0. Version 1 stores
+// the header length in 2 little-endian bytes, the others in 4. The header
+// dict is parsed by NpyHeader, the element type is looked up in npy_dtypes
+// by (type code, size), and the payload is read in one piece into a newly
+// constructed image. Returns true on success; failures are reported
+// through `check`.
+template<typename ImageType, CheckFunc check = CheckReturn>
+bool load_npy(const std::string &filename, ImageType *im) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    FileOpener f(filename, "rb");
+    if (!check(f.f != nullptr, "File could not be opened for reading")) {
+        return false;
+    }
+    // 6 magic bytes followed by the major and minor version bytes.
+    char magic_and_version[8];
+    if (!check(f.read_bytes(magic_and_version, 8), "Could not read .npy header")) {
+        return false;
+    }
+    if (memcmp(magic_and_version, npy_magic_string.data(), npy_magic_string.size()) != 0) {
+        return check(false, "Bad .npy magic string");
+    }
+    if ((magic_and_version[6] != 1 && magic_and_version[6] != 2 && magic_and_version[6] != 3) || magic_and_version[7] != 0) {
+        return check(false, "Bad .npy version");
+    }
+    size_t header_len;
+    uint8_t header_len_le[4];
+    if (magic_and_version[6] == 1) {
+        if (!check(f.read_bytes(header_len_le, 2), "Could not read .npy header")) {
+            return false;
+        }
+        header_len = (size_t)header_len_le[0] | ((size_t)header_len_le[1] << 8);
+        if (!check((6 + 2 + 2 + header_len) % 64 == 0, ".npy header is not aligned properly")) {
+            return false;
+        }
+    } else {
+        if (!check(f.read_bytes(header_len_le, 4), "Could not read .npy header")) {
+            return false;
+        }
+        // Widen each byte before shifting: shifting the int-promoted top
+        // byte left by 24 would reach the sign bit and then sign-extend
+        // into size_t.
+        header_len = (size_t)header_len_le[0] |
+                     ((size_t)header_len_le[1] << 8) |
+                     ((size_t)header_len_le[2] << 16) |
+                     ((size_t)header_len_le[3] << 24);
+        if (!check((6 + 2 + 4 + header_len) % 64 == 0, ".npy header is not aligned properly")) {
+            return false;
+        }
+    }
+    // One extra trailing space so the parser always has a character after
+    // the header text proper.
+    std::string header(header_len + 1, ' ');
+    if (!check(f.read_bytes(header.data(), header_len), "Could not read .npy header string")) {
+        return false;
+    }
+    NpyHeader h;
+    if (!check(h.parse(header), "Could not parse .npy header dict")) {
+        return false;
+    }
+    // A zero-bit type marks "no match found".
+    halide_type_t im_type((halide_type_code_t)0, 0, 0);
+    for (const auto &d : npy_dtypes) {
+        if (h.type_code == d.second.type_code && h.type_bytes == d.second.type_bytes) {
+            im_type = d.first;
+            break;
+        }
+    }
+    if (!check(im_type.bits != 0, "Unsupported type in load_npy")) {
+        return false;
+    }
+    *im = ImageType(im_type, h.extents);
+    // This should never fail unless the default Buffer<> constructor behavior changes.
+    if (!check(buffer_is_compact_planar(*im), "load_npy() requires compact planar images")) {
+        return false;
+    }
+    if (!check(f.read_bytes(im->begin(), im->size_in_bytes()), "Could not read .npy payload")) {
+        return false;
+    }
+    im->set_host_dirty();
+    return true;
+}
+// Write the elements of `im` to `f` in planar order.
+//
+// A zero-dimensional or compact planar buffer is written with a single
+// write_bytes() call. Otherwise the buffer is sliced along its last
+// dimension and each slice is written recursively. Returns true on
+// success; failures are reported through `check`.
+template<typename ImageType, CheckFunc check = CheckReturn>
+bool write_planar_payload(ImageType &im, FileOpener &f) {
+    if (im.dimensions() == 0 || buffer_is_compact_planar(im)) {
+        // Contiguous buffer: write it all at once.
+        if (!check(f.write_bytes(im.begin(), im.size_in_bytes()), "Could not write planar payload")) {
+            return false;
+        }
+    } else {
+        // We have to do this the hard way, one slice at a time.
+        const int d = im.dimensions() - 1;
+        for (int i = im.dim(d).min(); i <= im.dim(d).max(); i++) {
+            auto slice = im.sliced(d, i);
+            // Pass `check` along explicitly; a plain recursive call would
+            // silently fall back to the default check function.
+            if (!write_planar_payload<decltype(slice), check>(slice, f)) {
+                return false;
+            }
+        }
+    }
+    return true;
+}
+// Save `im` as a version-1.0 .npy file.
+//
+// The image is copied to the host, its type is looked up in npy_dtypes,
+// and a header dict with 'descr', 'fortran_order' and 'shape' entries is
+// written, padded with spaces so that the payload starts on a 64-byte
+// boundary. Returns true on success; failures are reported through `check`.
+template<typename ImageType, CheckFunc check = CheckReturn>
+bool save_npy(ImageType &im, const std::string &filename) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) {
+        return false;
+    }
+    const halide_type_t im_type = im.type();
+    // byte_order == 0 marks "no match found".
+    npy_dtype_info_t di = {0, 0, 0};
+    for (const auto &d : npy_dtypes) {
+        if (d.first == im_type) {
+            di = d.second;
+            break;
+        }
+    }
+    if (!check(di.byte_order != 0, "Unsupported type in save_npy")) {
+        return false;
+    }
+    std::string shape = "(";
+    for (int d = 0; d < im.dimensions(); ++d) {
+        if (d > 0) {
+            shape += ",";
+        }
+        shape += std::to_string(im.dim(d).extent());
+        if (im.dimensions() == 1) {
+            shape += ",";  // special-case for single-element tuples
+        }
+    }
+    shape += ")";
+    std::string header_dict_str = "{'descr': '" + di.descr() + "', 'fortran_order': False, 'shape': " + shape + "}\n";
+    // Magic + version + 2-byte length + dict, rounded up to a multiple of 64.
+    const size_t unpadded_length = npy_magic_string.size() + npy_v1_bytes.size() + 2 + header_dict_str.size();
+    const size_t padded_length = (unpadded_length + 64 - 1) & ~(64 - 1);
+    const size_t padding = padded_length - unpadded_length;
+    header_dict_str += std::string(padding, ' ');
+    if (!check(header_dict_str.size() <= 65535, "Header is too large for v1 .npy file")) {
+        return false;
+    }
+    const uint16_t header_len = (uint16_t)(header_dict_str.size());
+    const uint8_t header_len_le[2] = {
+        (uint8_t)((header_len >> 0) & 0xff),
+        (uint8_t)((header_len >> 8) & 0xff)};
+    FileOpener f(filename, "wb");
+    // Without this check a failed open would reach the writes below with a
+    // null file handle.
+    if (!check(f.f != nullptr, "File could not be opened for writing")) {
+        return false;
+    }
+    if (!check(f.write_bytes(npy_magic_string.data(), npy_magic_string.size()), ".npy write failed")) {
+        return false;
+    }
+    if (!check(f.write_bytes(npy_v1_bytes.data(), npy_v1_bytes.size()), ".npy write failed")) {
+        return false;
+    }
+    if (!check(f.write_bytes(header_len_le, 2), ".npy write failed")) {
+        return false;
+    }
+    if (!check(f.write_bytes(header_dict_str.data(), header_dict_str.size()), ".npy write failed")) {
+        return false;
+    }
+    if (!write_planar_payload<ImageType, check>(im, f)) {
+        return false;
+    }
+    return true;
+}
+// Formats accepted for .npy: int and uint at 8/16/32/64 bits and float at
+// 16/32/64 bits, each with 1 to 4 dimensions.
+inline const std::set<FormatInfo> &query_npy() {
+    static std::set<FormatInfo> info = []() {
+        // NumPy doesn't support bfloat16, not sure if they plan to,
+        // so we don't attempt to support it here
+        const halide_type_code_t codes[] = {halide_type_int, halide_type_uint, halide_type_float};
+        std::set<FormatInfo> formats;
+        for (int dims = 1; dims <= 4; ++dims) {
+            for (halide_type_code_t code : codes) {
+                for (int bits = 8; bits <= 64; bits *= 2) {
+                    // There is no 8-bit float.
+                    const bool is_tiny_float = (code == halide_type_float && bits < 16);
+                    if (!is_tiny_float) {
+                        formats.insert({halide_type_t(code, bits), dims});
+                    }
+                }
+            }
+        }
+        return formats;
+    }();
+    return info;
+}
+#ifndef HALIDE_NO_JPEG
+// Load a JPEG file into *im as an unsigned 8-bit image of shape
+// {width, height}, with a third dimension of size `channels` appended when
+// the decoder reports more than one output component.
+//
+// Only a failed file open is reported through `check`; the return values
+// of the libjpeg calls are not inspected here.
+// NOTE(review): the error manager is whatever jpeg_std_error() installs;
+// confirm how decode errors surface to the caller.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load_jpg(const std::string &filename, ImageType *im) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    Internal::FileOpener f(filename, "rb");
+    if (!check(f.f != nullptr, "File could not be opened for reading")) {
+        return false;
+    }
+    struct jpeg_decompress_struct cinfo;
+    struct jpeg_error_mgr jerr;
+    cinfo.err = jpeg_std_error(&jerr);
+    jpeg_create_decompress(&cinfo);
+    jpeg_stdio_src(&cinfo, f.f);
+    jpeg_read_header(&cinfo, TRUE);
+    jpeg_start_decompress(&cinfo);
+    // Output geometry is read only after jpeg_start_decompress().
+    const int width = cinfo.output_width;
+    const int height = cinfo.output_height;
+    const int channels = cinfo.output_components;
+    const halide_type_t im_type(halide_type_uint, 8);
+    std::vector<int> im_dimensions = {width, height};
+    if (channels > 1) {
+        im_dimensions.push_back(channels);
+    }
+    *im = ImageType(im_type, im_dimensions);
+    auto copy_to_image = Internal::read_big_endian_row<uint8_t, ImageType>;
+    // Scratch buffer for one decoded scanline (width * channels bytes).
+    std::vector<uint8_t> row(width * channels);
+    const int ymin = im->dim(1).min();
+    const int ymax = im->dim(1).max();
+    for (int y = ymin; y <= ymax; ++y) {
+        // jpeg_read_scanlines takes an array of row pointers; we pass one row.
+        uint8_t *src = row.data();
+        jpeg_read_scanlines(&cinfo, &src, 1);
+        copy_to_image(row.data(), y, im);
+    }
+    jpeg_finish_decompress(&cinfo);
+    jpeg_destroy_decompress(&cinfo);
+    return true;
+}
+// Formats accepted for JPEG: unsigned 8-bit with 2 or 3 dimensions.
+inline const std::set<FormatInfo> &query_jpg() {
+    static std::set<FormatInfo> info = []() {
+        const halide_type_t u8(halide_type_uint, 8);
+        std::set<FormatInfo> formats;
+        formats.insert({u8, 2});
+        formats.insert({u8, 3});
+        return formats;
+    }();
+    return info;
+}
+// Save `im` as a JPEG file at a fixed quality setting of 99.
+//
+// The image is copied to the host first and must have 1 channel (written
+// as JCS_GRAYSCALE) or 3 channels (written as JCS_RGB). Rows are written
+// as 8-bit samples. Failures of copy_to_host(), the channel-count test and
+// the file open are reported through `check`; the libjpeg calls are not
+// checked here.
+// NOTE(review): the error manager is whatever jpeg_std_error() installs;
+// confirm how encode errors surface to the caller.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_jpg(ImageType &im, const std::string &filename) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) {
+        return false;
+    }
+    const int width = im.width();
+    const int height = im.height();
+    const int channels = im.channels();
+    if (!check(channels == 1 || channels == 3, "Wrong number of channels")) {
+        return false;
+    }
+    Internal::FileOpener f(filename, "wb");
+    if (!check(f.f != nullptr, "File could not be opened for writing")) {
+        return false;
+    }
+    // TODO: Make this an argument?
+    constexpr int quality = 99;
+    struct jpeg_compress_struct cinfo;
+    struct jpeg_error_mgr jerr;
+    cinfo.err = jpeg_std_error(&jerr);
+    jpeg_create_compress(&cinfo);
+    jpeg_stdio_dest(&cinfo, f.f);
+    // Image description is filled in before jpeg_set_defaults() is called.
+    cinfo.image_width = width;
+    cinfo.image_height = height;
+    cinfo.input_components = channels;
+    cinfo.in_color_space = (channels == 3) ? JCS_RGB : JCS_GRAYSCALE;
+    jpeg_set_defaults(&cinfo);
+    jpeg_set_quality(&cinfo, quality, TRUE);
+    jpeg_start_compress(&cinfo, TRUE);
+    auto copy_from_image = Internal::write_big_endian_row<uint8_t, ImageType>;
+    // Scratch buffer for one scanline (width * channels bytes).
+    std::vector<uint8_t> row(width * channels);
+    const int ymin = im.dim(1).min();
+    const int ymax = im.dim(1).max();
+    for (int y = ymin; y <= ymax; ++y) {
+        // jpeg_write_scanlines takes an array of row pointers; we pass one row.
+        uint8_t *dst = row.data();
+        copy_from_image(im, y, dst);
+        jpeg_write_scanlines(&cinfo, &dst, 1);
+    }
+    jpeg_finish_compress(&cinfo);
+    jpeg_destroy_compress(&cinfo);
+    return true;
+}
+#endif  // not HALIDE_NO_JPEG
+// Number of type codes understood by the .tmp reader and writer.
+constexpr int kNumTmpCodes = 10;
+// Returns a table of kNumTmpCodes entries in which the Halide type for
+// .tmp type code `i` is stored at index `i`.
+inline const halide_type_t *tmp_code_to_halide_type() {
+    static const halide_type_t types_by_code[kNumTmpCodes] = {
+        halide_type_t(halide_type_float, 32),  // 0
+        halide_type_t(halide_type_float, 64),  // 1
+        halide_type_t(halide_type_uint, 8),    // 2
+        halide_type_t(halide_type_int, 8),     // 3
+        halide_type_t(halide_type_uint, 16),   // 4
+        halide_type_t(halide_type_int, 16),    // 5
+        halide_type_t(halide_type_uint, 32),   // 6
+        halide_type_t(halide_type_int, 32),    // 7
+        halide_type_t(halide_type_uint, 64),   // 8
+        halide_type_t(halide_type_int, 64),    // 9
+    };
+    return &types_by_code[0];
+}
+// ".tmp" is a file format used by the ImageStack tool (see https://github.com/abadams/ImageStack)
+// Load a .tmp file into *im.
+//
+// The file starts with five 32-bit integers: four extents, which must all
+// be positive, and a type code that indexes tmp_code_to_halide_type().
+// The payload is then read in one piece into a newly constructed
+// 4-dimensional image. Returns true on success; failures are reported
+// through `check`.
+template<typename ImageType, CheckFunc check = CheckReturn>
+bool load_tmp(const std::string &filename, ImageType *im) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    FileOpener f(filename, "rb");
+    if (!check(f.f != nullptr, "File could not be opened for reading")) {
+        return false;
+    }
+    int32_t header[5];
+    if (!check(f.read_array(header), "Could not read .tmp header")) {
+        return false;
+    }
+    // Validate before header[4] is used as a table index.
+    if (!check(header[0] > 0 && header[1] > 0 && header[2] > 0 && header[3] > 0 &&
+                   header[4] >= 0 && header[4] < kNumTmpCodes,
+               "Bad header on .tmp file")) {
+        return false;
+    }
+    const halide_type_t im_type = tmp_code_to_halide_type()[header[4]];
+    std::vector<int> im_dimensions = {header[0], header[1], header[2], header[3]};
+    *im = ImageType(im_type, im_dimensions);
+    // This should never fail unless the default Buffer<> constructor behavior changes.
+    if (!check(buffer_is_compact_planar(*im), "load_tmp() requires compact planar images")) {
+        return false;
+    }
+    if (!check(f.read_bytes(im->begin(), im->size_in_bytes()), "Could not read .tmp payload")) {
+        return false;
+    }
+    im->set_host_dirty();
+    return true;
+}
+// Formats accepted for .tmp: every type in the tmp_code_to_halide_type()
+// table, always with a dimension count of 4.
+inline const std::set<FormatInfo> &query_tmp() {
+    // TMP files require exactly 4 dimensions.
+    static std::set<FormatInfo> info = []() {
+        std::set<FormatInfo> formats;
+        const halide_type_t *types_by_code = tmp_code_to_halide_type();
+        for (int code = 0; code < kNumTmpCodes; ++code) {
+            formats.insert({types_by_code[code], 4});
+        }
+        return formats;
+    }();
+    return info;
+}
+// ".tmp" is a file format used by the ImageStack tool (see https://github.com/abadams/ImageStack)
+// Save `im` as a .tmp file.
+//
+// The header is five 32-bit integers: four extents followed by a type
+// code. Images with fewer than four dimensions have the missing extents
+// written as 1; images with more than four are rejected. Returns true on
+// success; failures are reported through `check`.
+template<typename ImageType, CheckFunc check = CheckReturn>
+bool save_tmp(ImageType &im, const std::string &filename) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) {
+        return false;
+    }
+    // The header has room for four extents only; a fifth would overwrite
+    // the type-code slot and any further one would write past the array.
+    if (!check(im.dimensions() <= 4, "Too many dimensions for .tmp file")) {
+        return false;
+    }
+    // Extents default to 1; -1 in the last slot means "type not found yet".
+    int32_t header[5] = {1, 1, 1, 1, -1};
+    for (int i = 0; i < im.dimensions(); ++i) {
+        header[i] = im.dim(i).extent();
+    }
+    const auto *table = tmp_code_to_halide_type();
+    for (int i = 0; i < kNumTmpCodes; i++) {
+        if (im.type() == table[i]) {
+            header[4] = i;
+            break;
+        }
+    }
+    if (!check(header[4] >= 0, "Unsupported type for .tmp file")) {
+        return false;
+    }
+    FileOpener f(filename, "wb");
+    if (!check(f.f != nullptr, "File could not be opened for writing")) {
+        return false;
+    }
+    if (!check(f.write_array(header), "Could not write .tmp header")) {
+        return false;
+    }
+    if (!write_planar_payload<ImageType, check>(im, f)) {
+        return false;
+    }
+    return true;
+}
+// ".mat" is the matlab level 5 format documented here:
+// http://www.mathworks.com/help/pdf_doc/matlab/matfile_format.pdf
+// Data-type tags of the .mat format. In this file they appear as the
+// first 32-bit word of each element header read by load_mat() and written
+// by save_mat(); the numeric-type tags in the payload header select the
+// Halide element type.
+enum MatlabTypeCode {
+    miINT8 = 1,
+    miUINT8 = 2,
+    miINT16 = 3,
+    miUINT16 = 4,
+    miINT32 = 5,
+    miUINT32 = 6,
+    miSINGLE = 7,
+    miDOUBLE = 9,
+    miINT64 = 12,
+    miUINT64 = 13,
+    miMATRIX = 14,
+    miCOMPRESSED = 15,
+    miUTF8 = 16,
+    miUTF16 = 17,
+    miUTF32 = 18
+};
+// Array-class codes of the .mat format. save_mat() picks one from the
+// image's element type and stores it in the array-flags element.
+enum MatlabClassCode {
+    mxCHAR_CLASS = 3,
+    mxDOUBLE_CLASS = 6,
+    mxSINGLE_CLASS = 7,
+    mxINT8_CLASS = 8,
+    mxUINT8_CLASS = 9,
+    mxINT16_CLASS = 10,
+    mxUINT16_CLASS = 11,
+    mxINT32_CLASS = 12,
+    mxUINT32_CLASS = 13,
+    mxINT64_CLASS = 14,
+    mxUINT64_CLASS = 15
+};
+// Load the first matrix stored in a .mat file into *im.
+//
+// Reads, in order: the 128-byte file header (contents ignored), the
+// matrix element header, the array flags, the shape, the array name
+// (skipped) and the payload header, whose type tag selects the Halide
+// element type. The payload is then read in one piece into a newly
+// constructed image with the extents from the shape element.
+// Returns true on success; failures are reported through `check`.
+template<typename ImageType, CheckFunc check = CheckReturn>
+bool load_mat(const std::string &filename, ImageType *im) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    FileOpener f(filename, "rb");
+    if (!check(f.f != nullptr, "File could not be opened for reading")) {
+        return false;
+    }
+    uint8_t header[128];
+    if (!check(f.read_array(header), "Could not read .mat header\n")) {
+        return false;
+    }
+    // Matrix header
+    uint32_t matrix_header[2];
+    if (!check(f.read_array(matrix_header), "Could not read .mat header\n")) {
+        return false;
+    }
+    if (!check(matrix_header[0] == miMATRIX, "Could not parse this .mat file: bad matrix header\n")) {
+        return false;
+    }
+    // Array flags
+    uint32_t flags[4];
+    if (!check(f.read_array(flags), "Could not read .mat header\n")) {
+        return false;
+    }
+    if (!check(flags[0] == miUINT32 && flags[1] == 8, "Could not parse this .mat file: bad flags\n")) {
+        return false;
+    }
+    // Shape
+    uint32_t shape_header[2];
+    if (!check(f.read_array(shape_header), "Could not read .mat header\n")) {
+        return false;
+    }
+    if (!check(shape_header[0] == miINT32, "Could not parse this .mat file: bad shape header\n")) {
+        return false;
+    }
+    // shape_header[1] is a byte count; each extent occupies 4 bytes.
+    int dims = shape_header[1] / 4;
+    std::vector<int> extents(dims);
+    if (!check(f.read_vector(&extents), "Could not read .mat header\n")) {
+        return false;
+    }
+    // The extents come straight from the file; a negative one cannot
+    // describe an array and must not reach the image constructor.
+    for (const int extent : extents) {
+        if (!check(extent >= 0, "Could not parse this .mat file: bad shape\n")) {
+            return false;
+        }
+    }
+    // An odd number of 4-byte extents is followed by 4 bytes of padding.
+    if (dims & 1) {
+        uint32_t padding;
+        if (!check(f.read_bytes(&padding, 4), "Could not read .mat header\n")) {
+            return false;
+        }
+    }
+    // Skip over the name
+    uint32_t name_header[2];
+    if (!check(f.read_array(name_header), "Could not read .mat header\n")) {
+        return false;
+    }
+    if (name_header[0] >> 16) {
+        // Name must be fewer than 4 chars, and so the whole name
+        // field was stored packed into 8 bytes
+    } else {
+        if (!check(name_header[0] == miINT8, "Could not parse this .mat file: bad name header\n")) {
+            return false;
+        }
+        // The name is read in whole 8-byte units and discarded.
+        std::vector<uint64_t> scratch((name_header[1] + 7) / 8);
+        if (!check(f.read_vector(&scratch), "Could not read .mat header\n")) {
+            return false;
+        }
+    }
+    // Payload header
+    uint32_t payload_header[2];
+    if (!check(f.read_array(payload_header), "Could not read .mat header\n")) {
+        return false;
+    }
+    halide_type_t type;
+    switch (payload_header[0]) {
+    case miINT8:
+        type = halide_type_of<int8_t>();
+        break;
+    case miINT16:
+        type = halide_type_of<int16_t>();
+        break;
+    case miINT32:
+        type = halide_type_of<int32_t>();
+        break;
+    case miINT64:
+        type = halide_type_of<int64_t>();
+        break;
+    case miUINT8:
+        type = halide_type_of<uint8_t>();
+        break;
+    case miUINT16:
+        type = halide_type_of<uint16_t>();
+        break;
+    case miUINT32:
+        type = halide_type_of<uint32_t>();
+        break;
+    case miUINT64:
+        type = halide_type_of<uint64_t>();
+        break;
+    case miSINGLE:
+        type = halide_type_of<float>();
+        break;
+    case miDOUBLE:
+        type = halide_type_of<double>();
+        break;
+    default:
+        check(false, "Unknown header");
+        return false;
+    }
+    *im = ImageType(type, extents);
+    // This should never fail unless the default Buffer<> constructor behavior changes.
+    if (!check(buffer_is_compact_planar(*im), "load_mat() requires compact planar images")) {
+        return false;
+    }
+    if (!check(f.read_bytes(im->begin(), im->size_in_bytes()), "Could not read .mat payload")) {
+        return false;
+    }
+    im->set_host_dirty();
+    return true;
+}
+// Formats accepted for .mat: float32/float64 and signed/unsigned integers
+// of 8, 16, 32 and 64 bits, each with 2 to 15 dimensions.
+inline const std::set<FormatInfo> &query_mat() {
+    // MAT files must have at least 2 dimensions, but there's no upper
+    // bound. Our support arbitrarily stops at 16 dimensions.
+    static std::set<FormatInfo> info = []() {
+        const halide_type_t element_types[] = {
+            halide_type_t(halide_type_float, 32),
+            halide_type_t(halide_type_float, 64),
+            halide_type_t(halide_type_uint, 8),
+            halide_type_t(halide_type_int, 8),
+            halide_type_t(halide_type_uint, 16),
+            halide_type_t(halide_type_int, 16),
+            halide_type_t(halide_type_uint, 32),
+            halide_type_t(halide_type_int, 32),
+            halide_type_t(halide_type_uint, 64),
+            halide_type_t(halide_type_int, 64),
+        };
+        std::set<FormatInfo> formats;
+        for (const halide_type_t &element_type : element_types) {
+            for (int dims = 2; dims < 16; dims++) {
+                formats.insert({element_type, dims});
+            }
+        }
+        return formats;
+    }();
+    return info;
+}
+template<typename ImageType, CheckFunc check = CheckReturn>
+bool save_mat(ImageType &im, const std::string &filename) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) {
+        return false;
+    }
+    uint32_t class_code = 0, type_code = 0;
+    switch (im.raw_buffer()->type.code) {
+    case halide_type_int:
+        switch (im.raw_buffer()->type.bits) {
+        case 8:
+            class_code = mxINT8_CLASS;
+            type_code = miINT8;
+            break;
+        case 16:
+            class_code = mxINT16_CLASS;
+            type_code = miINT16;
+            break;
+        case 32:
+            class_code = mxINT32_CLASS;
+            type_code = miINT32;
+            break;
+        case 64:
+            class_code = mxINT64_CLASS;
+            type_code = miINT64;
+            break;
+        default:
+            check(false, "unreachable");
+        };
+        break;
+    case halide_type_uint:
+        switch (im.raw_buffer()->type.bits) {
+        case 8:
+            class_code = mxUINT8_CLASS;
+            type_code = miUINT8;
+            break;
+        case 16:
+            class_code = mxUINT16_CLASS;
+            type_code = miUINT16;
+            break;
+        case 32:
+            class_code = mxUINT32_CLASS;
+            type_code = miUINT32;
+            break;
+        case 64:
+            class_code = mxUINT64_CLASS;
+            type_code = miUINT64;
+            break;
+        default:
+            check(false, "unreachable");
+        };
+        break;
+    case halide_type_float:
+        switch (im.raw_buffer()->type.bits) {
+        case 16:
+            check(false, "float16 not supported by .mat");
+            break;
+        case 32:
+            class_code = mxSINGLE_CLASS;
+            type_code = miSINGLE;
+            break;
+        case 64:
+            class_code = mxDOUBLE_CLASS;
+            type_code = miDOUBLE;
+            break;
+        default:
+            check(false, "unreachable");
+        };
+        break;
+    case halide_type_bfloat:
+        check(false, "bfloat not supported by .mat");
+        break;
+    default:
+        check(false, "unreachable");
+    }
+    FileOpener f(filename, "wb");
+    if (!check(f.f != nullptr, "File could not be opened for writing")) {
+        return false;
+    }
+    // Pick a name for the array
+    size_t idx = filename.rfind('.');
+    std::string name = filename.substr(0, idx);
+    idx = filename.rfind('/');
+    if (idx != std::string::npos) {
+        name = name.substr(idx + 1);
+    }
+    // Matlab variable names conform to similar rules as C
+    if (name.empty() || !std::isalpha(name[0])) {
+        name = "v" + name;
+    }
+    for (char &c : name) {
+        if (!std::isalnum(c)) {
+            c = '_';
+        }
+    }
+    uint32_t name_size = (int)name.size();
+    while (name.size() & 0x7) {
+        name += '\0';
+    }
+    char header[128] = "MATLAB 5.0 MAT-file, produced by Halide";
+    int len = strlen(header);
+    memset(header + len, ' ', sizeof(header) - len);
+    // Version
+    *((uint16_t *)(header + 124)) = 0x0100;
+    // Endianness check
+    header[126] = 'I';
+    header[127] = 'M';
+    uint64_t payload_bytes = im.size_in_bytes();
+    if (!check((payload_bytes >> 32) == 0, "Buffer too large to save as .mat")) {
+        return false;
+    }
+    int dims = im.dimensions();
+    if (dims < 2) {
+        dims = 2;
+    }
+    int padded_dims = dims + (dims & 1);
+    uint32_t padding_bytes = 7 - ((payload_bytes - 1) & 7);
+    // Matrix header
+    uint32_t matrix_header[2] = {
+        miMATRIX, 40 + padded_dims * 4 + (uint32_t)name.size() + (uint32_t)payload_bytes + padding_bytes};
+    // Array flags
+    uint32_t flags[4] = {
+        miUINT32, 8, class_code, 1};
+    // Shape
+    int32_t shape[2] = {
+        miINT32,
+        im.dimensions() * 4,
+    };
+    std::vector<int> extents(im.dimensions());
+    for (int d = 0; d < im.dimensions(); d++) {
+        extents[d] = im.dim(d).extent();
+    }
+    while ((int)extents.size() < dims) {
+        extents.push_back(1);
+    }
+    while ((int)extents.size() < padded_dims) {
+        extents.push_back(0);
+    }
+    // Name
+    uint32_t name_header[2] = {
+        miINT8, name_size};
+    // Payload header
+    uint32_t payload_header[2] = {
+        type_code, (uint32_t)payload_bytes};
+    bool success =
+        f.write_array(header) &&
+        f.write_array(matrix_header) &&
+        f.write_array(flags) &&
+        f.write_array(shape) &&
+        f.write_vector(extents) &&
+        f.write_array(name_header) &&
+        f.write_bytes(&name[0], name.size()) &&
+        f.write_array(payload_header);
+    if (!check(success, "Could not write .mat header")) {
+        return false;
+    }
+    if (!write_planar_payload<ImageType, check>(im, f)) {
+        return false;
+    }
+    // Padding
+    if (!check(padding_bytes < 8, "Too much padding!\n")) {
+        return false;
+    }
+    uint64_t padding = 0;
+    if (!f.write_bytes(&padding, padding_bytes)) {
+        return false;
+    }
+    return true;
+}
+// TIFF loading is not implemented. This always reports the failure through
+// `check` and returns false; `filename` and `im` are not used.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load_tiff(const std::string &filename, ImageType *im) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    constexpr bool tiff_reading_supported = false;
+    check(tiff_reading_supported, "Reading TIFF is not yet supported");
+    return tiff_reading_supported;
+}
+// Returns the set of FormatInfo entries advertised for the TIFF format:
+// int and uint with 8/16/32/64 bits, float with 32/64 bits, each paired
+// with 1, 2, 3 and 4 dimensions. The set is built once, on first call.
+inline const std::set<FormatInfo> &query_tiff() {
+    static std::set<FormatInfo> formats = []() -> std::set<FormatInfo> {
+        std::set<FormatInfo> result;
+        for (halide_type_code_t code : {halide_type_int, halide_type_uint, halide_type_float}) {
+            for (int bits : {8, 16, 32, 64}) {
+                // Floats narrower than 32 bits are not offered.
+                const bool narrow_float = (code == halide_type_float) && (bits < 32);
+                if (narrow_float) {
+                    continue;
+                }
+                for (int dims : {1, 2, 3, 4}) {
+                    result.insert({halide_type_t(code, bits), dims});
+                }
+            }
+        }
+        return result;
+    }();
+    return formats;
+}
+// The two structs below are packed on 2-byte boundaries; save_tiff() writes
+// a halide_tiff_header to the file byte-for-byte and asserts that a
+// halide_tiff_tag occupies exactly 12 bytes.
+#pragma pack(push)
+#pragma pack(2)
+// A single tag entry: tag code, value type code, value count, and a value
+// slot that is accessed as 8, 16 or 32 bits.
+struct halide_tiff_tag {
+    uint16_t tag_code;
+    int16_t type_code;
+    int32_t count;
+    union {
+        int8_t i8;
+        int16_t i16;
+        int32_t i32;
+    } value;
+    // Fill in the entry with a 16-bit value (type code 3, SHORT).
+    // Only the i16 view of the value slot is written.
+    void assign16(uint16_t tag_code, int32_t count, int16_t value) {
+        this->value.i16 = value;
+        this->count = count;
+        this->type_code = 3;  // SHORT
+        this->tag_code = tag_code;
+    }
+    // Fill in the entry with a 32-bit value (type code 4, LONG).
+    void assign32(uint16_t tag_code, int32_t count, int32_t value) {
+        const int16_t long_type = 4;  // LONG
+        assign32(tag_code, long_type, count, value);
+    }
+    // Fill in the entry with a 32-bit value and a caller-chosen type code.
+    void assign32(uint16_t tag_code, int16_t type_code, int32_t count, int32_t value) {
+        this->value.i32 = value;
+        this->count = count;
+        this->type_code = type_code;
+        this->tag_code = tag_code;
+    }
+};
+// Fixed-size file prologue used by save_tiff(): byte-order marker, version,
+// the offset of the first IFD, the IFD itself (entry count, 15 entries and
+// an end marker), followed by storage for the two resolution values that
+// the XResolution/YResolution entries point at.
+struct halide_tiff_header {
+    int16_t byte_order_marker;
+    int16_t version;
+    int32_t ifd0_offset;
+    int16_t entry_count;
+    halide_tiff_tag entries[15];
+    int32_t ifd0_end;
+    int32_t width_resolution[2];
+    int32_t height_resolution[2];
+};
+#pragma pack(pop)
+// Callable that collects elements of type ElemType into a fixed buffer of
+// BUFFER_SIZE entries and writes the buffer to `f` whenever it fills up.
+// Any remaining elements are written by flush(), which the destructor also
+// calls. After the first failed write `ok` becomes false and every later
+// element and flush is ignored.
+template<typename ElemType, int BUFFER_SIZE = 1024>
+struct ElemWriter {
+    ElemWriter(FileOpener *f)
+        : f(f), next(buf) {
+    }
+    ~ElemWriter() {
+        flush();
+    }
+    // Append one element, writing the buffer out as soon as it becomes full.
+    void operator()(const ElemType &elem) {
+        if (ok) {
+            *next = elem;
+            ++next;
+            if (next == buf + BUFFER_SIZE) {
+                flush();
+            }
+        }
+    }
+    // Write out whatever is buffered; a no-op when empty or already failed.
+    void flush() {
+        if (!ok || !(next > buf)) {
+            return;
+        }
+        const size_t pending_bytes = (next - buf) * sizeof(ElemType);
+        if (!f->write_bytes(buf, pending_bytes)) {
+            ok = false;
+        }
+        // The buffer is reset whether or not the write succeeded.
+        next = buf;
+    }
+    FileOpener *const f;
+    ElemType buf[BUFFER_SIZE];
+    ElemType *next;
+    bool ok = true;
+};
+// Note that this is a fairly simpleminded TIFF writer that doesn't
+// do any compression. It would be desirable to (optionally) support using libtiff
+// here instead, which would also allow us to provide a useful implementation
+// for TIFF reading.
+//
+// Writes `im` to `filename` as an uncompressed TIFF. The image must have at
+// most 4 dimensions and an int, uint or float element type. The file consists
+// of a fixed halide_tiff_header, then (for multi-channel images) the
+// per-channel strip offset and strip byte-count tables, then the pixel data.
+// Every failure is reported through `check`; returns true only if all writes
+// succeeded.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_tiff(ImageType &im, const std::string &filename) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) {
+        return false;
+    }
+    if (!check(im.dimensions() <= 4, "Can only save TIFF files with <= 4 dimensions")) {
+        return false;
+    }
+    FileOpener f(filename, "wb");
+    if (!check(f.f != nullptr, "File could not be opened for writing")) {
+        return false;
+    }
+    const size_t elements = im.number_of_elements();
+    // Copy the image shape, padding missing dimensions with extent 1.
+    halide_dimension_t shape[4];
+    for (int i = 0; i < im.dimensions() && i < 4; i++) {
+        const auto &d = im.dim(i);
+        shape[i].min = d.min();
+        shape[i].extent = d.extent();
+        shape[i].stride = d.stride();
+    }
+    for (int i = im.dimensions(); i < 4; i++) {
+        shape[i].min = 0;
+        shape[i].extent = 1;
+        shape[i].stride = 0;
+    }
+    const halide_type_t im_type = im.type();
+    // Only codes 0..2 index into type_code_to_tiff_sample_type below.
+    if (!check(im_type.code >= 0 && im_type.code < 3, "Unsupported image type")) {
+        return false;
+    }
+    const int32_t bytes_per_element = im_type.bytes();
+    const int32_t width = shape[0].extent;
+    const int32_t height = shape[1].extent;
+    int32_t depth = shape[2].extent;
+    int32_t channels = shape[3].extent;
+    // With no real 4th dimension and a small 3rd one, treat the 3rd
+    // dimension as channels rather than depth.
+    if ((channels == 0 || channels == 1) && (depth < 5)) {
+        channels = depth;
+        depth = 1;
+    }
+    // The table is indexed by Halide type code:
+    //     0 => Signed int
+    //     1 => Unsigned int
+    //     2 => Floating-point
+    // and yields the corresponding TIFF SampleFormat value
+    // (1 = unsigned int, 2 = signed int, 3 = floating-point).
+    static const int16_t type_code_to_tiff_sample_type[] = {
+        2, 1, 3};
+    struct halide_tiff_header header;
+    memset(&header, 0, sizeof(header));
+    const int32_t MMII = 0x4d4d4949;
+    // Select the appropriate two bytes signaling byte order automatically
+    const char *c = (const char *)&MMII;
+    header.byte_order_marker = (c[0] << 8) | c[1];
+    header.version = 42;
+    header.ifd0_offset = offsetof(halide_tiff_header, entry_count);
+    header.entry_count = sizeof(header.entries) / sizeof(header.entries[0]);
+    static_assert(sizeof(halide_tiff_tag) == 12, "Unexpected halide_tiff_tag packing");
+    halide_tiff_tag *tag = &header.entries[0];
+    tag++->assign32(256, 1, width);                           // ImageWidth
+    tag++->assign32(257, 1, height);                          // ImageLength
+    tag++->assign16(258, 1, int16_t(bytes_per_element * 8));  // BitsPerSample
+    tag++->assign16(259, 1, 1);                               // Compression -- none
+    tag++->assign16(262, 1, channels >= 3 ? 2 : 1);           // PhotometricInterpretation -- black is zero or RGB
+    tag++->assign32(273, channels, sizeof(header));           // StripOffsets
+    tag++->assign16(277, 1, int16_t(channels));               // SamplesPerPixel
+    tag++->assign32(278, 1, height);                          // RowsPerStrip
+    tag++->assign32(279, channels,                            // StripByteCounts
+                    (channels == 1) ?
+                        elements * bytes_per_element :
+                        sizeof(header) + channels * sizeof(int32_t));  // for channels > 1, this is an offset
+    tag++->assign32(282, 5, 1,
+                    offsetof(halide_tiff_header, width_resolution));  // XResolution
+    tag++->assign32(283, 5, 1,
+                    offsetof(halide_tiff_header, height_resolution));      // YResolution
+    tag++->assign16(284, 1, channels == 1 ? 1 : 2);                        // PlanarConfiguration -- contig or planar
+    tag++->assign16(296, 1, 1);                                            // ResolutionUnit -- none
+    tag++->assign16(339, 1, type_code_to_tiff_sample_type[im_type.code]);  // SampleFormat
+    tag++->assign32(32997, 1, depth);                                      // Image depth
+    // Verify we used exactly the number we declared
+    assert(tag == &header.entries[header.entry_count]);
+    header.ifd0_end = 0;
+    header.width_resolution[0] = 1;
+    header.width_resolution[1] = 1;
+    header.height_resolution[0] = 1;
+    header.height_resolution[1] = 1;
+    if (!check(f.write_bytes(&header, sizeof(header)), "TIFF write failed")) {
+        return false;
+    }
+    if (channels > 1) {
+        // Fill in the values for StripOffsets. The pixel data starts after
+        // the header and both per-channel tables.
+        int32_t offset = sizeof(header) + channels * sizeof(int32_t) * 2;
+        for (int32_t i = 0; i < channels; i++) {
+            if (!check(f.write_bytes(&offset, sizeof(offset)), "TIFF write failed")) {
+                return false;
+            }
+            offset += width * height * depth * bytes_per_element;
+        }
+        // Fill in the values for StripByteCounts
+        int32_t count = width * height * depth * bytes_per_element;
+        for (int32_t i = 0; i < channels; i++) {
+            if (!check(f.write_bytes(&count, sizeof(count)), "TIFF write failed")) {
+                return false;
+            }
+        }
+    }
+    // If image is dense, we can write it in one fell swoop
+    if (elements * bytes_per_element == im.size_in_bytes()) {
+        if (!check(f.write_bytes(im.data(), im.size_in_bytes()), "TIFF write failed")) {
+            return false;
+        }
+        return true;
+    }
+    // Otherwise, write it out via manual traversal.
+    // The writer is flushed explicitly before `ok` is inspected: the last,
+    // partially filled buffer would otherwise only be written by the
+    // destructor, after the check, and a failure there would go unreported.
+#define HANDLE_CASE(CODE, BITS, TYPE)                             \
+    case halide_type_t(CODE, BITS).as_u32(): {                    \
+        ElemWriter<TYPE> ew(&f);                                  \
+        im.template as<const TYPE, AnyDims>().for_each_value(ew); \
+        ew.flush();                                               \
+        if (!check(ew.ok, "TIFF write failed")) {                 \
+            return false;                                         \
+        }                                                         \
+        break;                                                    \
+    }
+    switch (im_type.element_of().as_u32()) {
+        HANDLE_CASE(halide_type_float, 32, float)
+        HANDLE_CASE(halide_type_float, 64, double)
+        HANDLE_CASE(halide_type_int, 8, int8_t)
+        HANDLE_CASE(halide_type_int, 16, int16_t)
+        HANDLE_CASE(halide_type_int, 32, int32_t)
+        HANDLE_CASE(halide_type_int, 64, int64_t)
+        HANDLE_CASE(halide_type_uint, 1, bool)
+        HANDLE_CASE(halide_type_uint, 8, uint8_t)
+        HANDLE_CASE(halide_type_uint, 16, uint16_t)
+        HANDLE_CASE(halide_type_uint, 32, uint32_t)
+        HANDLE_CASE(halide_type_uint, 64, uint64_t)
+    // Note that we don't attempt to handle halide_type_handle here.
+    default:
+        assert(false && "Unsupported type");
+        return false;
+    }
+#undef HANDLE_CASE
+    return true;
+}
+// Maps ImageType<Foo, N> to ImageType<Foo, AnyDims>: same element type,
+// dimensionality made dynamic. The result is whatever type as<>() yields.
+template<typename ImageType>
+struct ImageTypeWithDynamicDims {
+    using SameElemType = typename ImageType::ElemType;
+    using type = decltype(std::declval<ImageType>().template as<SameElemType, AnyDims>());
+};
+// Maps ImageType<Foo> to ImageType<Bar, AnyDims>, where Bar is the ElemType
+// argument. The result is whatever type as<>() yields.
+template<typename ImageType, typename ElemType>
+struct ImageTypeWithElemType {
+    using NewElemType = ElemType;
+    using type = decltype(std::declval<ImageType>().template as<NewElemType, AnyDims>());
+};
+// Maps ImageType<Foo> to ImageType<const Bar, AnyDims>, where Bar is the
+// ElemType argument.
+template<typename ImageType, typename ElemType>
+struct ImageTypeWithConstElemType {
+    using ConstElemType = typename std::add_const<ElemType>::type;
+    using type = decltype(std::declval<ImageType>().template as<ConstElemType, AnyDims>());
+};
+// The three entry points for one file format: a loader, a saver (which takes
+// the image with a const element type) and a query returning the set of
+// FormatInfo the format can hold. The member order (load, save, query) is
+// relied upon by brace-initialization in find_imageio().
+template<typename ImageType, Internal::CheckFunc check>
+struct ImageIO {
+    using ConstImageType = typename ImageTypeWithConstElemType<ImageType, typename ImageType::ElemType>::type;
+    using LoadSignature = bool(const std::string &, ImageType *);
+    using SaveSignature = bool(ConstImageType &, const std::string &);
+    using QuerySignature = const std::set<FormatInfo> &();
+    std::function<LoadSignature> load;
+    std::function<SaveSignature> save;
+    std::function<QuerySignature> query;
+};
+// Looks up the load/save/query routines for the lowercase extension of
+// `filename` and stores them in `*result`, returning true. For an unknown
+// extension, reports an error listing every supported extension through
+// `check` and returns check's result; `*result` is left untouched.
+template<typename ImageType, Internal::CheckFunc check>
+bool find_imageio(const std::string &filename, ImageIO<ImageType, check> *result) {
+    static_assert(!ImageType::has_static_halide_type, "");
+    using ConstImageType = typename ImageTypeWithConstElemType<ImageType, typename ImageType::ElemType>::type;
+    using Handlers = ImageIO<ImageType, check>;
+    // Extension -> handlers. JPEG and PNG entries are compiled out when the
+    // corresponding HALIDE_NO_* macro is defined.
+    const std::map<std::string, Handlers> by_extension = {
+#ifndef HALIDE_NO_JPEG
+        {"jpeg", {load_jpg<ImageType, check>, save_jpg<ConstImageType, check>, query_jpg}},
+        {"jpg", {load_jpg<ImageType, check>, save_jpg<ConstImageType, check>, query_jpg}},
+#endif
+        {"npy", {load_npy<ImageType, check>, save_npy<ConstImageType, check>, query_npy}},
+        {"pgm", {load_pgm<ImageType, check>, save_pgm<ConstImageType, check>, query_pgm}},
+#ifndef HALIDE_NO_PNG
+        {"png", {load_png<ImageType, check>, save_png<ConstImageType, check>, query_png}},
+#endif
+        {"ppm", {load_ppm<ImageType, check>, save_ppm<ConstImageType, check>, query_ppm}},
+        {"tmp", {load_tmp<ImageType, check>, save_tmp<ConstImageType, check>, query_tmp}},
+        {"mat", {load_mat<ImageType, check>, save_mat<ConstImageType, check>, query_mat}},
+        {"tiff", {load_tiff<ImageType, check>, save_tiff<ConstImageType, check>, query_tiff}},
+    };
+    const std::string ext = Internal::get_lowercase_extension(filename);
+    const auto found = by_extension.find(ext);
+    if (found == by_extension.end()) {
+        // Unknown extension: build the message from the map's keys.
+        std::string err = "unsupported file extension \"" + ext + "\", supported are:";
+        for (const auto &entry : by_extension) {
+            err += " " + entry.first;
+        }
+        err += "\n";
+        return check(false, err.c_str());
+    }
+    *result = found->second;
+    return true;
+}
+// Picks, from `info`, the FormatInfo that most closely matches the type and
+// dimensionality of `im`. Each candidate gets a penalty (0 is an exact
+// match, larger is worse); the first candidate with the lowest penalty wins.
+// Returns a value-initialized FormatInfo when `info` is empty.
+template<typename ImageType>
+FormatInfo best_save_format(const ImageType &im, const std::set<FormatInfo> &info) {
+    const halide_type_t im_type = im.type();
+    const int im_dimensions = im.dimensions();
+    // A bit ad hoc, but will do for now.
+    const auto penalty_of = [&](const FormatInfo &candidate) -> int {
+        int penalty = 0;
+        // Too few dimensions in the format: very bad.
+        penalty += std::max(0, im_dimensions - candidate.dimensions) * 1024;
+        // Too few bits in the format: pretty bad.
+        penalty += std::max(0, im_type.bits - candidate.type.bits) * 8;
+        // Too many bits in the format: a little bad.
+        penalty += std::max(0, candidate.type.bits - im_type.bits);
+        // A different type code: a little bad.
+        if (candidate.type.code != im_type.code) {
+            penalty += 1;
+        }
+        return penalty;
+    };
+    FormatInfo winner{};
+    int lowest_penalty = 0x7fffffff;
+    for (const auto &candidate : info) {
+        const int penalty = penalty_of(candidate);
+        if (penalty < lowest_penalty) {
+            lowest_penalty = penalty;
+            winner = candidate;
+        }
+    }
+    return winner;
+}
+}  // namespace Internal
+// Collection of convert_image() overloads that convert an image from one
+// element type to another. Four variants are selected via enable_if on
+// whether the source image type and the destination element type are known
+// statically. The dynamically-typed variants dispatch on the runtime
+// halide_type_t and forward to the statically-typed ones.
+struct ImageTypeConversion {
+    // Convert an Image from one ElemType to another, where the src and
+    // dst types are statically known (e.g. Buffer<uint8_t> -> Buffer<float>).
+    // Note that this does conversion with scaling -- interpreting integers
+    // as fixed-point numbers between 0 and 1 -- not merely C-style casting.
+    //
+    // You'd normally call this with an explicit type for DstElemType and
+    // allow ImageType to be inferred, e.g.
+    //     Buffer<uint8_t> src = ...;
+    //     Buffer<float> dst = convert_image<float>(src);
+    template<typename DstElemType, typename ImageType,
+             typename std::enable_if<ImageType::has_static_halide_type && !std::is_void<DstElemType>::value>::type * = nullptr>
+    static auto convert_image(const ImageType &src) ->
+        typename Internal::ImageTypeWithElemType<ImageType, DstElemType>::type {
+        // The enable_if ensures this will never fire; this is here primarily
+        // as documentation and a backstop against breakage.
+        static_assert(ImageType::has_static_halide_type,
+                      "This variant of convert_image() requires a statically-typed image");
+        using SrcImageType = ImageType;
+        using SrcElemType = typename SrcImageType::ElemType;
+        using DstImageType = typename Internal::ImageTypeWithElemType<ImageType, DstElemType>::type;
+        // The destination gets the same shape as the source and is filled
+        // element by element.
+        DstImageType dst = DstImageType::make_with_shape_of(src);
+        const auto converter = [](DstElemType &dst_elem, SrcElemType src_elem) {
+            dst_elem = Internal::convert<DstElemType>(src_elem);
+        };
+        dst.for_each_value(converter, src);
+        dst.set_host_dirty();
+        return dst;
+    }
+    // Convert an Image from one ElemType to another, where the dst type is statically
+    // known but the src type is not (e.g. Buffer<> -> Buffer<float>).
+    // You'd normally call this with an explicit type for DstElemType and
+    // allow ImageType to be inferred, e.g.
+    //     Buffer<> src = ...;
+    //     Buffer<float> dst = convert_image<float>(src);
+    template<typename DstElemType, typename ImageType,
+             typename std::enable_if<!ImageType::has_static_halide_type && !std::is_void<DstElemType>::value>::type * = nullptr>
+    static auto convert_image(const ImageType &src) ->
+        typename Internal::ImageTypeWithElemType<ImageType, DstElemType>::type {
+        // The enable_if ensures this will never fire; this is here primarily
+        // as documentation and a backstop against breakage.
+        static_assert(!ImageType::has_static_halide_type,
+                      "This variant of convert_image() requires a dynamically-typed image");
+        constexpr int AnyDims = Internal::AnyDims;
+        // Dispatch on the runtime type of src and forward to the
+        // static-to-static variant.
+        const halide_type_t src_type = src.type();
+        switch (src_type.element_of().as_u32()) {
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+        case halide_type_t(halide_type_float, 16).as_u32():
+            return convert_image<DstElemType>(src.template as<_Float16, AnyDims>());
+#endif
+        case halide_type_t(halide_type_float, 32).as_u32():
+            return convert_image<DstElemType>(src.template as<float, AnyDims>());
+        case halide_type_t(halide_type_float, 64).as_u32():
+            return convert_image<DstElemType>(src.template as<double, AnyDims>());
+        case halide_type_t(halide_type_int, 8).as_u32():
+            return convert_image<DstElemType>(src.template as<int8_t, AnyDims>());
+        case halide_type_t(halide_type_int, 16).as_u32():
+            return convert_image<DstElemType>(src.template as<int16_t, AnyDims>());
+        case halide_type_t(halide_type_int, 32).as_u32():
+            return convert_image<DstElemType>(src.template as<int32_t, AnyDims>());
+        case halide_type_t(halide_type_int, 64).as_u32():
+            return convert_image<DstElemType>(src.template as<int64_t, AnyDims>());
+        case halide_type_t(halide_type_uint, 1).as_u32():
+            return convert_image<DstElemType>(src.template as<bool, AnyDims>());
+        case halide_type_t(halide_type_uint, 8).as_u32():
+            return convert_image<DstElemType>(src.template as<uint8_t, AnyDims>());
+        case halide_type_t(halide_type_uint, 16).as_u32():
+            return convert_image<DstElemType>(src.template as<uint16_t, AnyDims>());
+        case halide_type_t(halide_type_uint, 32).as_u32():
+            return convert_image<DstElemType>(src.template as<uint32_t, AnyDims>());
+        case halide_type_t(halide_type_uint, 64).as_u32():
+            return convert_image<DstElemType>(src.template as<uint64_t, AnyDims>());
+        default:
+            assert(false && "Unsupported type");
+            using DstImageType = typename Internal::ImageTypeWithElemType<ImageType, DstElemType>::type;
+            return DstImageType();
+        }
+    }
+    // Convert an Image from one ElemType to another, where the src type
+    // is statically known but the dst type is not
+    // (e.g. Buffer<uint8_t> -> Buffer<>(halide_type_t)).
+    template<typename DstElemType = void,
+             typename ImageType,
+             typename std::enable_if<ImageType::has_static_halide_type && std::is_void<DstElemType>::value>::type * = nullptr>
+    static auto convert_image(const ImageType &src, const halide_type_t &dst_type) ->
+        typename Internal::ImageTypeWithElemType<ImageType, void>::type {
+        // The enable_if ensures this will never fire; this is here primarily
+        // as documentation and a backstop against breakage.
+        static_assert(ImageType::has_static_halide_type,
+                      "This variant of convert_image() requires a statically-typed image");
+        // Call the appropriate static-to-static conversion routine
+        // based on the desired dst type.
+        switch (dst_type.element_of().as_u32()) {
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+        case halide_type_t(halide_type_float, 16).as_u32():
+            return convert_image<_Float16>(src);
+#endif
+        case halide_type_t(halide_type_float, 32).as_u32():
+            return convert_image<float>(src);
+        case halide_type_t(halide_type_float, 64).as_u32():
+            return convert_image<double>(src);
+        case halide_type_t(halide_type_int, 8).as_u32():
+            return convert_image<int8_t>(src);
+        case halide_type_t(halide_type_int, 16).as_u32():
+            return convert_image<int16_t>(src);
+        case halide_type_t(halide_type_int, 32).as_u32():
+            return convert_image<int32_t>(src);
+        case halide_type_t(halide_type_int, 64).as_u32():
+            return convert_image<int64_t>(src);
+        case halide_type_t(halide_type_uint, 1).as_u32():
+            return convert_image<bool>(src);
+        case halide_type_t(halide_type_uint, 8).as_u32():
+            return convert_image<uint8_t>(src);
+        case halide_type_t(halide_type_uint, 16).as_u32():
+            return convert_image<uint16_t>(src);
+        case halide_type_t(halide_type_uint, 32).as_u32():
+            return convert_image<uint32_t>(src);
+        case halide_type_t(halide_type_uint, 64).as_u32():
+            return convert_image<uint64_t>(src);
+        default:
+            assert(false && "Unsupported type");
+            using RetImageType = typename Internal::ImageTypeWithDynamicDims<ImageType>::type;
+            return RetImageType();
+        }
+    }
+    // Convert an Image from one ElemType to another, where neither src type
+    // nor dst type are statically known
+    // (e.g. Buffer<>(halide_type_t) -> Buffer<>(halide_type_t)).
+    template<typename DstElemType = void,
+             typename ImageType,
+             typename std::enable_if<!ImageType::has_static_halide_type && std::is_void<DstElemType>::value>::type * = nullptr>
+    static auto convert_image(const ImageType &src, const halide_type_t &dst_type) ->
+        typename Internal::ImageTypeWithElemType<ImageType, void>::type {
+        // The enable_if ensures this will never fire; this is here primarily
+        // as documentation and a backstop against breakage.
+        static_assert(!ImageType::has_static_halide_type,
+                      "This variant of convert_image() requires a dynamically-typed image");
+        constexpr int AnyDims = Internal::AnyDims;
+        // Sniff the runtime type of src, coerce it to that type using as<>(),
+        // and call the static-to-dynamic variant of this method. (Note that
+        // this forces instantiation of the complete any-to-any conversion
+        // matrix of code.)
+        const halide_type_t src_type = src.type();
+        switch (src_type.element_of().as_u32()) {
+#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
+        // Handled here as well so that float16 sources are accepted by every
+        // dynamically-typed variant, not only the dynamic-to-static one.
+        case halide_type_t(halide_type_float, 16).as_u32():
+            return convert_image(src.template as<_Float16, AnyDims>(), dst_type);
+#endif
+        case halide_type_t(halide_type_float, 32).as_u32():
+            return convert_image(src.template as<float, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_float, 64).as_u32():
+            return convert_image(src.template as<double, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_int, 8).as_u32():
+            return convert_image(src.template as<int8_t, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_int, 16).as_u32():
+            return convert_image(src.template as<int16_t, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_int, 32).as_u32():
+            return convert_image(src.template as<int32_t, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_int, 64).as_u32():
+            return convert_image(src.template as<int64_t, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_uint, 1).as_u32():
+            return convert_image(src.template as<bool, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_uint, 8).as_u32():
+            return convert_image(src.template as<uint8_t, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_uint, 16).as_u32():
+            return convert_image(src.template as<uint16_t, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_uint, 32).as_u32():
+            return convert_image(src.template as<uint32_t, AnyDims>(), dst_type);
+        case halide_type_t(halide_type_uint, 64).as_u32():
+            return convert_image(src.template as<uint64_t, AnyDims>(), dst_type);
+        default:
+            assert(false && "Unsupported type");
+            using RetImageType = typename Internal::ImageTypeWithDynamicDims<ImageType>::type;
+            return RetImageType();
+        }
+    }
+};
+// Load the Image from the file named `filename` into `*im`, using the
+// loader registered for the filename's extension.
+// If the output Image has a static type and the loaded image's type differs
+// from it, fail.
+// Returns false upon failure.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool load(const std::string &filename, ImageType *im) {
+    using DynamicImageType = typename Internal::ImageTypeWithElemType<ImageType, void>::type;
+    Internal::ImageIO<DynamicImageType, check> imageio;
+    const bool format_known = Internal::find_imageio<DynamicImageType, check>(filename, &imageio);
+    if (!format_known) {
+        return false;
+    }
+    // Always load into a dynamically-typed image first.
+    DynamicImageType loaded;
+    const bool load_ok = imageio.load(filename, &loaded);
+    if (!load_ok) {
+        return false;
+    }
+    // Allow statically-typed images to be passed as the out-param, but do
+    // a runtime check to ensure the loaded type matches the static type.
+    if (ImageType::has_static_halide_type) {
+        const bool type_matches = (loaded.type() == ImageType::static_halide_type());
+        if (!check(type_matches, "Image loaded did not match the expected type")) {
+            return false;
+        }
+    }
+    *im = loaded.template as<typename ImageType::ElemType, Internal::AnyDims>();
+    im->set_host_dirty();
+    return true;
+}
+// Save the Image in the format associated with the filename's extension.
+// If the format's query set does not contain the image's exact type and
+// dimensionality (i.e. it can't represent the Image without losing data),
+// fail. Returns false upon failure.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save(ImageType &im, const std::string &filename) {
+    using DynamicImageType = typename Internal::ImageTypeWithElemType<ImageType, void>::type;
+    Internal::ImageIO<DynamicImageType, check> imageio;
+    const bool format_known = Internal::find_imageio<DynamicImageType, check>(filename, &imageio);
+    if (!format_known) {
+        return false;
+    }
+    const bool representable = imageio.query().count({im.type(), im.dimensions()}) > 0;
+    if (!check(representable, "Image cannot be saved in this format")) {
+        return false;
+    }
+    // Allow statically-typed images to be passed in, but quietly pass them on
+    // as dynamically-typed images.
+    auto dynamic_view = im.template as<const void, Internal::AnyDims>();
+    return imageio.save(dynamic_view, filename);
+}
+// Copy into `*info` the set of FormatInfo structs describing the legal
+// type-and-dimensions combinations that can be saved in the format implied
+// by the extension of `filename`. Most applications won't ever need to use
+// this call. Returns false (leaving `*info` untouched) upon failure.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckReturn>
+bool save_query(const std::string &filename, std::set<FormatInfo> *info) {
+    using DynamicImageType = typename Internal::ImageTypeWithElemType<ImageType, void>::type;
+    Internal::ImageIO<DynamicImageType, check> imageio;
+    const bool format_known = Internal::find_imageio<DynamicImageType, check>(filename, &imageio);
+    if (format_known) {
+        *info = imageio.query();
+    }
+    return format_known;
+}
+// Fancy wrapper to call load() with CheckFail, inferring the return type;
+// this allows you to simply use
+//
+//    Image im = load_image("filename");
+//
+// without bothering to check error results (all errors go through
+// CheckFail).
+//
+// Note that if the image being loaded doesn't match the static type and
+// dimensions of the image on the LHS, a runtime error will occur.
+class load_image {
+public:
+    load_image(const std::string &f)
+        : filename(f) {
+    }
+    // The load happens here, at the point of conversion to the target type.
+    template<typename ImageType>
+    operator ImageType() {
+        using DynamicImageType = typename Internal::ImageTypeWithElemType<ImageType, void>::type;
+        DynamicImageType loaded;
+        const bool load_ok = load<DynamicImageType, Internal::CheckFail>(filename, &loaded);
+        Internal::CheckFail(load_ok, "load() failed");
+        const bool convertible = ImageType::can_convert_from(loaded);
+        Internal::CheckFail(convertible,
+                            "Type mismatch assigning the result of load_image. "
+                            "Did you mean to use load_and_convert_image?");
+        return loaded.template as<typename ImageType::ElemType, Internal::AnyDims>();
+    }
+private:
+    const std::string filename;
+};
+// Like load_image, but quietly convert the loaded image to the type of the LHS
+// if necessary, discarding information if necessary.
+class load_and_convert_image {
+public:
+    load_and_convert_image(const std::string &f)
+        : filename(f) {
+    }
+    // The load (and any conversion) happens here, at the point of
+    // conversion to the target type.
+    template<typename ImageType>
+    inline operator ImageType() {
+        using DynamicImageType = typename Internal::ImageTypeWithElemType<ImageType, void>::type;
+        using TargetElemType = typename ImageType::ElemType;
+        DynamicImageType loaded;
+        const bool load_ok = load<DynamicImageType, Internal::CheckFail>(filename, &loaded);
+        Internal::CheckFail(load_ok, "load() failed");
+        const halide_type_t wanted_type = ImageType::static_halide_type();
+        if (!(loaded.type() == wanted_type)) {
+            // Types differ: produce a converted copy.
+            return ImageTypeConversion::convert_image<TargetElemType>(loaded);
+        }
+        // Types already agree: just reinterpret.
+        return loaded.template as<TargetElemType, Internal::AnyDims>();
+    }
+private:
+    const std::string filename;
+};
+// Fancy wrapper to call save() with CheckFail (by default); this allows you
+// to simply use
+//
+//    save_image(im, "filename");
+//
+// without bothering to check error results: the result of save() is
+// discarded and errors are left to `check`.
+//
+// If the specified image file format cannot represent the image without
+// losing data (e.g, a float32 or 4-dimensional image saved as a JPEG),
+// a runtime error will occur.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckFail>
+void save_image(ImageType &im, const std::string &filename) {
+    // Hand the image to save() as a dynamically-typed, const-element view.
+    auto dynamic_view = im.template as<const void, Internal::AnyDims>();
+    using DynamicViewType = decltype(dynamic_view);
+    static_cast<void>(save<DynamicViewType, check>(dynamic_view, filename));
+}
+// Like save_image, but quietly convert the saved image to a type that the
+// specified image file format can hold, discarding information if necessary.
+// (Note that the input image is unaffected!)
+//
+// If `check` returns instead of aborting, the function gives up without
+// writing anything as soon as copy_to_host() or the format query fails.
+template<typename ImageType, Internal::CheckFunc check = Internal::CheckFail>
+void convert_and_save_image(ImageType &im, const std::string &filename) {
+    using DynamicImageDims = typename Internal::ImageTypeWithDynamicDims<ImageType>::type;
+    // We'll be doing any conversion on the CPU
+    if (!check(im.copy_to_host() == halide_error_code_success, "copy_to_host() failed.")) {
+        return;
+    }
+    std::set<FormatInfo> info;
+    if (!save_query<DynamicImageDims, check>(filename, &info)) {
+        // The query failed, so `info` holds no usable formats; choosing a
+        // "best" format from it and converting to that would be meaningless.
+        return;
+    }
+    const FormatInfo best = Internal::best_save_format(im, info);
+    if (best.type == im.type() && best.dimensions == im.dimensions()) {
+        // It's an exact match, we can save as-is.
+        (void)save<DynamicImageDims, check>(im.template as<typename ImageType::ElemType, Internal::AnyDims>(), filename);
+    } else {
+        using DynamicImageType = typename Internal::ImageTypeWithElemType<ImageType, void>::type;
+        DynamicImageType im_converted = ImageTypeConversion::convert_image(im, best.type);
+        // Pad with extra dimensions until the image has as many as the
+        // chosen format expects.
+        while (im_converted.dimensions() < best.dimensions) {
+            im_converted.add_dimension();
+        }
+        (void)save<DynamicImageType, check>(im_converted, filename);
+    }
+}
+}  // namespace Tools
+}  // namespace Halide
+#endif  // HALIDE_IMAGE_IO_H