halide 19.0.0__cp39-cp39-macosx_11_0_arm64.whl → 21.0.0__cp39-cp39-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. halide/__init__.py +10 -6
  2. halide/_generator_helpers.py +190 -127
  3. halide/bin/adams2019_retrain_cost_model +0 -0
  4. halide/bin/anderson2021_retrain_cost_model +0 -0
  5. halide/bin/gengen +0 -0
  6. halide/bin/get_host_target +0 -0
  7. halide/halide_.cpython-39-darwin.so +0 -0
  8. halide/imageio.py +1 -1
  9. halide/include/Halide.h +1775 -1477
  10. halide/include/HalideBuffer.h +13 -13
  11. halide/include/HalidePyTorchCudaHelpers.h +1 -1
  12. halide/include/HalideRuntime.h +35 -16
  13. halide/lib/cmake/Halide/FindHalide_LLVM.cmake +44 -15
  14. halide/lib/cmake/Halide/FindV8.cmake +0 -12
  15. halide/lib/cmake/Halide/Halide-shared-targets.cmake +1 -1
  16. halide/lib/cmake/Halide/HalideConfig.cmake +1 -1
  17. halide/lib/cmake/Halide/HalideConfigVersion.cmake +3 -3
  18. halide/lib/cmake/HalideHelpers/Halide-Interfaces.cmake +1 -0
  19. halide/lib/cmake/HalideHelpers/HalideGeneratorHelpers.cmake +31 -9
  20. halide/lib/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +3 -3
  21. halide/lib/cmake/Halide_Python/Halide_PythonConfigVersion.cmake +3 -3
  22. halide/lib/libHalide.dylib +0 -0
  23. halide/lib/libHalidePyStubs.a +0 -0
  24. halide/lib/libHalide_GenGen.a +0 -0
  25. halide/lib/libautoschedule_adams2019.so +0 -0
  26. halide/lib/libautoschedule_anderson2021.so +0 -0
  27. halide/lib/libautoschedule_li2018.so +0 -0
  28. halide/lib/libautoschedule_mullapudi2016.so +0 -0
  29. halide/share/doc/Halide/README.md +7 -6
  30. halide/share/doc/Halide/doc/BuildingHalideWithCMake.md +78 -6
  31. halide/share/doc/Halide/doc/HalideCMakePackage.md +9 -2
  32. halide/share/doc/Halide/doc/Python.md +19 -4
  33. halide/share/doc/Halide/doc/RunGen.md +1 -1
  34. {halide-19.0.0.data → halide-21.0.0.data}/data/share/cmake/Halide/HalideConfig.cmake +4 -1
  35. {halide-19.0.0.data → halide-21.0.0.data}/data/share/cmake/Halide/HalideConfigVersion.cmake +3 -3
  36. {halide-19.0.0.data → halide-21.0.0.data}/data/share/cmake/HalideHelpers/HalideHelpersConfig.cmake +4 -1
  37. {halide-19.0.0.data → halide-21.0.0.data}/data/share/cmake/HalideHelpers/HalideHelpersConfigVersion.cmake +3 -3
  38. halide-21.0.0.dist-info/METADATA +302 -0
  39. {halide-19.0.0.dist-info → halide-21.0.0.dist-info}/RECORD +41 -41
  40. {halide-19.0.0.dist-info → halide-21.0.0.dist-info}/WHEEL +1 -1
  41. halide-19.0.0.dist-info/METADATA +0 -301
  42. {halide-19.0.0.dist-info → halide-21.0.0.dist-info}/licenses/LICENSE.txt +0 -0
halide/include/Halide.h CHANGED
@@ -315,7 +315,7 @@
315
315
  // our CMake build, so that we ensure that the in-build metadata (eg soversion)
316
316
  // matches, but keeping the canonical version here makes it easier to keep
317
317
  // downstream build systems (eg Blaze/Bazel) properly in sync with the source.
318
- #define HALIDE_VERSION_MAJOR 19
318
+ #define HALIDE_VERSION_MAJOR 21
319
319
  #define HALIDE_VERSION_MINOR 0
320
320
  #define HALIDE_VERSION_PATCH 0
321
321
 
@@ -1643,21 +1643,27 @@ extern int halide_error_vscale_invalid(void *user_context, const char *func_name
1643
1643
  // @}
1644
1644
 
1645
1645
  /** Optional features a compilation Target can have.
1646
- * Be sure to keep this in sync with the Feature enum in Target.h and the implementation of
1647
- * get_runtime_compatible_target in Target.cpp if you add a new feature.
1646
+ *
1647
+ * Be sure to keep this in sync with:
1648
+ * 1. the Feature enum in Target.h,
1649
+ * 2. the implementation of get_runtime_compatible_target in Target.cpp,
1650
+ * 3. PyEnums.cpp,
1651
+ * if you add a new feature.
1648
1652
  */
1649
1653
  typedef enum halide_target_feature_t {
1650
- halide_target_feature_jit = 0, ///< Generate code that will run immediately inside the calling process.
1651
- halide_target_feature_debug, ///< Turn on debug info and output for runtime code.
1652
- halide_target_feature_no_asserts, ///< Disable all runtime checks, for slightly tighter code.
1653
- halide_target_feature_no_bounds_query, ///< Disable the bounds querying functionality.
1654
-
1655
- halide_target_feature_sse41, ///< Use SSE 4.1 and earlier instructions. Only relevant on x86.
1656
- halide_target_feature_avx, ///< Use AVX 1 instructions. Only relevant on x86.
1657
- halide_target_feature_avx2, ///< Use AVX 2 instructions. Only relevant on x86.
1658
- halide_target_feature_fma, ///< Enable x86 FMA instruction
1659
- halide_target_feature_fma4, ///< Enable x86 (AMD) FMA4 instruction set
1660
- halide_target_feature_f16c, ///< Enable x86 16-bit float support
1654
+ halide_target_feature_jit = 0, ///< Generate code that will run immediately inside the calling process.
1655
+ halide_target_feature_debug, ///< Turn on debug info and output for runtime code.
1656
+ halide_target_feature_enable_backtraces, ///< Preserve frame pointers and include unwind tables to support accurate backtraces for debugging and profiling.
1657
+ halide_target_feature_no_asserts, ///< Disable all runtime checks, for slightly tighter code.
1658
+ halide_target_feature_no_bounds_query, ///< Disable the bounds querying functionality.
1659
+
1660
+ halide_target_feature_sse41, ///< Use SSE 4.1 and earlier instructions. Only relevant on x86.
1661
+ halide_target_feature_avx, ///< Use AVX 1 instructions. Only relevant on x86.
1662
+ halide_target_feature_avx2, ///< Use AVX 2 instructions. Only relevant on x86.
1663
+ halide_target_feature_avxvnni, ///< Enable the AVX-VNNI features supported by AVX2 instructions. Supports 256-bit VNNI instructions without EVEX encoding.
1664
+ halide_target_feature_fma, ///< Enable x86 FMA instruction
1665
+ halide_target_feature_fma4, ///< Enable x86 (AMD) FMA4 instruction set
1666
+ halide_target_feature_f16c, ///< Enable x86 16-bit float support
1661
1667
 
1662
1668
  halide_target_feature_armv7s, ///< Generate code for ARMv7s. Only relevant for 32-bit ARM.
1663
1669
  halide_target_feature_no_neon, ///< Avoid using NEON instructions. Only relevant for 32-bit ARM.
@@ -1701,6 +1707,7 @@ typedef enum halide_target_feature_t {
1701
1707
  halide_target_feature_avx512_skylake, ///< Enable the AVX512 features supported by Skylake Xeon server processors. This adds AVX512-VL, AVX512-BW, and AVX512-DQ to the base set. The main difference from the base AVX512 set is better support for small integer ops. Note that this does not include the Knight's Landing features. Note also that these features are not available on Skylake desktop and mobile processors.
1702
1708
  halide_target_feature_avx512_cannonlake, ///< Enable the AVX512 features expected to be supported by future Cannonlake processors. This includes all of the Skylake features, plus AVX512-IFMA and AVX512-VBMI.
1703
1709
  halide_target_feature_avx512_zen4, ///< Enable the AVX512 features supported by Zen4 processors. This include all of the Cannonlake features, plus AVX512-VNNI, AVX512-BF16, and more.
1710
+ halide_target_feature_avx512_zen5, ///< Enable the AVX512 features supported by Zen5 processors. This include all of the Cannonlake features, plus AVX512-VNNI, AVX512-BF16, AVX-VNNI and more.
1704
1711
  halide_target_feature_avx512_sapphirerapids, ///< Enable the AVX512 features supported by Sapphire Rapids processors. This include all of the Zen4 features, plus AVX-VNNI and AMX instructions.
1705
1712
  halide_target_feature_trace_loads, ///< Trace all loads done by the pipeline. Equivalent to calling Func::trace_loads on every non-inlined Func.
1706
1713
  halide_target_feature_trace_stores, ///< Trace all stores done by the pipeline. Equivalent to calling Func::trace_stores on every non-inlined Func.
@@ -1755,6 +1762,7 @@ typedef enum halide_target_feature_t {
1755
1762
  halide_target_feature_semihosting, ///< Used together with Target::NoOS for the baremetal target built with semihosting library and run with semihosting mode where minimum I/O communication with a host PC is available.
1756
1763
  halide_target_feature_avx10_1, ///< Intel AVX10 version 1 support. vector_bits is used to indicate width.
1757
1764
  halide_target_feature_x86_apx, ///< Intel x86 APX support. Covers initial set of features released as APX: egpr,push2pop2,ppx,ndd .
1765
+ halide_target_feature_simulator, ///< Target is for a simulator environment. Currently only applies to iOS.
1758
1766
  halide_target_feature_end ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing.
1759
1767
  } halide_target_feature_t;
1760
1768
 
@@ -1831,8 +1839,19 @@ typedef struct halide_dimension_t {
1831
1839
  } // extern "C"
1832
1840
  #endif
1833
1841
 
1834
- typedef enum { halide_buffer_flag_host_dirty = 1,
1835
- halide_buffer_flag_device_dirty = 2 } halide_buffer_flags;
1842
+ #if __cplusplus > 201100L || _MSVC_LANG > 201100L || __STDC_VERSION__ > 202300L
1843
+ // In C++, an underlying type is required to let the user define their own flag
1844
+ // values, without those values being undefined behavior when passed around as
1845
+ // this enum typedef.
1846
+ #define BUFFER_FLAGS_UNDERLYING_TYPE : uint64_t
1847
+ #else
1848
+ #define BUFFER_FLAGS_UNDERLYING_TYPE
1849
+ #endif
1850
+ typedef enum BUFFER_FLAGS_UNDERLYING_TYPE {
1851
+ halide_buffer_flag_host_dirty = 1,
1852
+ halide_buffer_flag_device_dirty = 2
1853
+ } halide_buffer_flags;
1854
+ #undef BUFFER_FLAGS_UNDERLYING_TYPE
1836
1855
 
1837
1856
  /**
1838
1857
  * The raw representation of an image passed around by generated
@@ -2730,12 +2749,15 @@ std::ostream &operator<<(std::ostream &stream, const Stmt &);
2730
2749
  struct LoweredFunc;
2731
2750
  std::ostream &operator<<(std::ostream &, const LoweredFunc &);
2732
2751
 
2733
- /** For optional debugging during codegen, use the debug class as
2752
+ bool debug_is_active_impl(int verbosity, const char *file, const char *function, int line);
2753
+ #define debug_is_active(n) (::Halide::Internal::debug_is_active_impl((n), __FILE__, __FUNCTION__, __LINE__))
2754
+
2755
+ /** For optional debugging during codegen, use the debug macro as
2734
2756
  * follows:
2735
2757
  *
2736
- \code
2737
- debug(verbosity) << "The expression is " << expr << "\n";
2738
- \endcode
2758
+ * \code
2759
+ * debug(verbosity) << "The expression is " << expr << "\n";
2760
+ * \endcode
2739
2761
  *
2740
2762
  * verbosity of 0 always prints, 1 should print after every major
2741
2763
  * stage, 2 should be used for more detail, and 3 should be used for
@@ -2743,25 +2765,11 @@ std::ostream &operator<<(std::ostream &, const LoweredFunc &);
2743
2765
  * is determined by the value of the environment variable
2744
2766
  * HL_DEBUG_CODEGEN
2745
2767
  */
2746
-
2747
- class debug {
2748
- const bool logging;
2749
-
2750
- public:
2751
- debug(int verbosity)
2752
- : logging(verbosity <= debug_level()) {
2753
- }
2754
-
2755
- template<typename T>
2756
- debug &operator<<(T &&x) {
2757
- if (logging) {
2758
- std::cerr << std::forward<T>(x);
2759
- }
2760
- return *this;
2761
- }
2762
-
2763
- static int debug_level();
2764
- };
2768
+ // clang-format off
2769
+ #define debug(n) \
2770
+ /* NOLINTNEXTLINE(bugprone-macro-parentheses) */ \
2771
+ if (debug_is_active((n))) std::cerr
2772
+ // clang-format on
2765
2773
 
2766
2774
  /** Allow easily printing the contents of containers, or std::vector-like containers,
2767
2775
  * in debug output. Used like so:
@@ -2867,14 +2875,18 @@ private:
2867
2875
  };
2868
2876
 
2869
2877
  /** An error that occurs while running a JIT-compiled Halide pipeline. */
2870
- struct HALIDE_EXPORT_SYMBOL RuntimeError : public Error {
2878
+ struct HALIDE_EXPORT_SYMBOL RuntimeError final : Error {
2879
+ static constexpr auto error_name = "Runtime error";
2880
+
2871
2881
  explicit RuntimeError(const char *msg);
2872
2882
  explicit RuntimeError(const std::string &msg);
2873
2883
  };
2874
2884
 
2875
2885
  /** An error that occurs while compiling a Halide pipeline that Halide
2876
2886
  * attributes to a user error. */
2877
- struct HALIDE_EXPORT_SYMBOL CompileError : public Error {
2887
+ struct HALIDE_EXPORT_SYMBOL CompileError final : Error {
2888
+ static constexpr auto error_name = "User error";
2889
+
2878
2890
  explicit CompileError(const char *msg);
2879
2891
  explicit CompileError(const std::string &msg);
2880
2892
  };
@@ -2882,7 +2894,9 @@ struct HALIDE_EXPORT_SYMBOL CompileError : public Error {
2882
2894
  /** An error that occurs while compiling a Halide pipeline that Halide
2883
2895
  * attributes to an internal compiler bug, or to an invalid use of
2884
2896
  * Halide's internals. */
2885
- struct HALIDE_EXPORT_SYMBOL InternalError : public Error {
2897
+ struct HALIDE_EXPORT_SYMBOL InternalError final : Error {
2898
+ static constexpr auto error_name = "Internal error";
2899
+
2886
2900
  explicit InternalError(const char *msg);
2887
2901
  explicit InternalError(const std::string &msg);
2888
2902
  };
@@ -2898,7 +2912,7 @@ class CompileTimeErrorReporter {
2898
2912
  public:
2899
2913
  virtual ~CompileTimeErrorReporter() = default;
2900
2914
  virtual void warning(const char *msg) = 0;
2901
- virtual void error(const char *msg) = 0;
2915
+ [[noreturn]] virtual void error(const char *msg) = 0;
2902
2916
  };
2903
2917
 
2904
2918
  /** The default error reporter logs to stderr, then throws an exception
@@ -2912,84 +2926,136 @@ void set_custom_compile_time_error_reporter(CompileTimeErrorReporter *error_repo
2912
2926
 
2913
2927
  namespace Internal {
2914
2928
 
2915
- struct ErrorReport {
2916
- enum {
2917
- User = 0x0001,
2918
- Warning = 0x0002,
2919
- Runtime = 0x0004
2920
- };
2929
+ /**
2930
+ * If a custom error reporter is configured, notifies the reporter by calling
2931
+ * its error() function with the value of \p e.what()
2932
+ *
2933
+ * Otherwise, if Halide was built with exceptions, throw \p e unless an
2934
+ * existing exception is in flight. On the other hand, if Halide was built
2935
+ * without exceptions, print the error message to stderr and abort().
2936
+ *
2937
+ * @param e The error to throw or report
2938
+ */
2939
+ /// @{
2940
+ [[noreturn]] void throw_error(const RuntimeError &e);
2941
+ [[noreturn]] void throw_error(const CompileError &e);
2942
+ [[noreturn]] void throw_error(const InternalError &e);
2943
+ /// @}
2921
2944
 
2922
- std::ostringstream msg;
2923
- const int flags;
2945
+ /**
2946
+ * If a custom error reporter is configured, notifies the reporter by calling
2947
+ * its warning() function. Otherwise, prints the warning to stderr.
2948
+ *
2949
+ * @param warning The warning to issue
2950
+ */
2951
+ void issue_warning(const char *warning);
2924
2952
 
2925
- ErrorReport(const char *f, int l, const char *cs, int flags);
2953
+ template<typename T>
2954
+ struct ReportBase {
2955
+ template<typename S>
2956
+ HALIDE_ALWAYS_INLINE T &operator<<(const S &x) {
2957
+ msg << x;
2958
+ return *static_cast<T *>(this);
2959
+ }
2926
2960
 
2927
- // Just a trick used to convert RValue into LValue
2928
- HALIDE_ALWAYS_INLINE ErrorReport &ref() {
2929
- return *this;
2961
+ HALIDE_ALWAYS_INLINE operator bool() const {
2962
+ return !finalized;
2930
2963
  }
2931
2964
 
2932
- template<typename T>
2933
- ErrorReport &operator<<(const T &x) {
2934
- msg << x;
2935
- return *this;
2965
+ protected:
2966
+ std::ostringstream msg{};
2967
+ bool finalized{false};
2968
+
2969
+ // This function is called as part of issue() below. We can't use a
2970
+ // virtual function because issue() needs to be marked [[noreturn]]
2971
+ // for errors and be left alone for warnings (i.e., they have
2972
+ // different signatures).
2973
+ std::string finalize_message() {
2974
+ if (!msg.str().empty() && msg.str().back() != '\n') {
2975
+ msg << "\n";
2976
+ }
2977
+ finalized = true;
2978
+ return msg.str();
2979
+ }
2980
+
2981
+ T &init(const char *file, const char *function, const int line, const char *condition_string, const char *prefix) {
2982
+ if (debug_is_active_impl(1, file, function, line)) {
2983
+ msg << prefix << " at " << file << ":" << line << ' ';
2984
+ if (condition_string) {
2985
+ msg << "Condition failed: " << condition_string << ' ';
2986
+ }
2987
+ }
2988
+ return *static_cast<T *>(this);
2936
2989
  }
2990
+ };
2937
2991
 
2938
- /** When you're done using << on the object, and let it fall out of
2939
- * scope, this errors out, or throws an exception if they are
2940
- * enabled. This is a little dangerous because the destructor will
2941
- * also be called if there's an exception in flight due to an
2942
- * error in one of the arguments passed to operator<<. We handle
2943
- * this by only actually throwing if there isn't an exception in
2944
- * flight already.
2945
- */
2946
- ~ErrorReport() noexcept(false);
2992
+ template<typename Exception>
2993
+ struct ErrorReport final : ReportBase<ErrorReport<Exception>> {
2994
+ ErrorReport &init(const char *file, const char *function, const int line, const char *condition_string) {
2995
+ return ReportBase<ErrorReport>::init(file, function, line, condition_string, Exception::error_name) << "Error: ";
2996
+ }
2997
+
2998
+ [[noreturn]] void issue() noexcept(false) {
2999
+ throw_error(Exception(this->finalize_message()));
3000
+ }
2947
3001
  };
2948
3002
 
2949
- // This uses operator precedence as a trick to avoid argument evaluation if
2950
- // an assertion is true: it is intended to be used as part of the
2951
- // _halide_internal_assertion macro, to coerce the result of the stream
2952
- // expression to void (to match the condition-is-false case).
2953
- class Voidifier {
2954
- public:
2955
- HALIDE_ALWAYS_INLINE Voidifier() = default;
2956
- // This has to be an operator with a precedence lower than << but
2957
- // higher than ?:
2958
- HALIDE_ALWAYS_INLINE void operator&(ErrorReport &) {
3003
+ struct WarningReport final : ReportBase<WarningReport> {
3004
+ WarningReport &init(const char *file, const char *function, const int line, const char *condition_string) {
3005
+ return ReportBase::init(file, function, line, condition_string, "Warning") << "Warning: ";
3006
+ }
3007
+
3008
+ void issue() {
3009
+ issue_warning(this->finalize_message().c_str());
2959
3010
  }
2960
3011
  };
2961
3012
 
2962
3013
  /**
2963
- * _halide_internal_assertion is used to implement our assertion macros
2964
- * in such a way that the messages output for the assertion are only
2965
- * evaluated if the assertion's value is false.
2966
- *
2967
- * Note that this macro intentionally has no parens internally; in actual
2968
- * use, the implicit grouping will end up being
2969
- *
2970
- * condition ? (void) : (Voidifier() & (ErrorReport << arg1 << arg2 ... << argN))
3014
+ * The following three diagnostic macros are implemented such that the
3015
+ * message is evaluated only if the assertion's value is false.
2971
3016
  *
2972
3017
  * This (regrettably) requires a macro to work, but has the highly desirable
2973
3018
  * effect that all assertion parameters are totally skipped (not ever evaluated)
2974
3019
  * when the assertion is true.
3020
+ *
3021
+ * The macros work by deferring the call to issue() until after the stream
3022
+ * has been evaluated. This previously used a trick where ErrorReport would
3023
+ * throw in the destructor, but throwing in a destructor is UB in a lot of
3024
+ * scenarios, and it was easy to break things by mistake.
2975
3025
  */
2976
- #define _halide_internal_assertion(condition, flags) \
2977
- /* NOLINTNEXTLINE(bugprone-macro-parentheses) */ \
2978
- (condition) ? (void)0 : ::Halide::Internal::Voidifier() & ::Halide::Internal::ErrorReport(__FILE__, __LINE__, #condition, flags).ref()
3026
+ /// @{
3027
+ #define _halide_error_impl(type) \
3028
+ for (Halide::Internal::ErrorReport<type> _err; 1; _err.issue()) \
3029
+ /**/ _err.init(__FILE__, __FUNCTION__, __LINE__, nullptr)
3030
+
3031
+ #define _halide_assert_impl(condition, type) \
3032
+ if (!(condition)) \
3033
+ for (Halide::Internal::ErrorReport<type> _err; 1; _err.issue()) \
3034
+ /*****/ _err.init(__FILE__, __FUNCTION__, __LINE__, #condition)
3035
+
3036
+ #define _halide_user_warning \
3037
+ for (Halide::Internal::WarningReport _err; _err; _err.issue()) \
3038
+ /**/ _err.init(__FILE__, __FUNCTION__, __LINE__, nullptr)
3039
+ /// @}
3040
+
3041
+ #define user_warning _halide_user_warning
2979
3042
 
2980
- #define internal_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, 0)
2981
- #define user_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User)
2982
- #define user_warning Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User | Halide::Internal::ErrorReport::Warning)
2983
- #define halide_runtime_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User | Halide::Internal::ErrorReport::Runtime)
3043
+ #define user_error _halide_error_impl(Halide::CompileError)
3044
+ #define internal_error _halide_error_impl(Halide::InternalError)
3045
+ #define halide_runtime_error _halide_error_impl(Halide::RuntimeError)
2984
3046
 
2985
- #define internal_assert(c) _halide_internal_assertion(c, 0)
2986
- #define user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User)
3047
+ #define internal_assert(c) _halide_assert_impl(c, Halide::InternalError)
3048
+ #define user_assert(c) _halide_assert_impl(c, Halide::CompileError)
2987
3049
 
2988
3050
  // The nicely named versions get cleaned up at the end of Halide.h,
2989
3051
  // but user code might want to do halide-style user_asserts (e.g. the
2990
3052
  // Extern macros introduce calls to user_assert), so for that purpose
2991
3053
  // we define an equivalent macro that can be used outside of Halide.h
2992
- #define _halide_user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User)
3054
+ #define _halide_user_error _halide_error_impl(Halide::CompileError)
3055
+ #define _halide_internal_error _halide_error_impl(Halide::InternalError)
3056
+ #define _halide_runtime_error _halide_error_impl(Halide::RuntimeError)
3057
+ #define _halide_internal_assert(c) _halide_assert_impl(c, Halide::InternalError)
3058
+ #define _halide_user_assert(c) _halide_assert_impl(c, Halide::CompileError)
2993
3059
 
2994
3060
  // N.B. Any function that might throw a user_assert or user_error may
2995
3061
  // not be inlined into the user's code, or the line number will be
@@ -3459,8 +3525,12 @@ bool starts_with(const std::string &str, const std::string &prefix);
3459
3525
  /** Test if the first string ends with the second string */
3460
3526
  bool ends_with(const std::string &str, const std::string &suffix);
3461
3527
 
3462
- /** Replace all matches of the second string in the first string with the last string */
3463
- std::string replace_all(const std::string &str, const std::string &find, const std::string &replace);
3528
+ /** Replace all matches of the second string in the first string with the last string.
3529
+ * The string to search-and-replace in is passed by value, offering the ability to
3530
+ * std::move() a string in if you're not interested in keeping the original string.
3531
+ * This is useful when the original string does not contain the find-string, causing
3532
+ * this function to return the same string without any copies being made. */
3533
+ std::string replace_all(std::string str, const std::string &find, const std::string &replace);
3464
3534
 
3465
3535
  /** Split the source string using 'delim' as the divider. */
3466
3536
  std::vector<std::string> split_string(const std::string &source, const std::string &delim);
@@ -3671,7 +3741,7 @@ struct ScopedValue {
3671
3741
  : var(var), old_value(var) {
3672
3742
  }
3673
3743
  /** Preserve the old value, then set the var to a new value. */
3674
- ScopedValue(T &var, T new_value)
3744
+ ScopedValue(T &var, const T &new_value)
3675
3745
  : var(var), old_value(var) {
3676
3746
  var = new_value;
3677
3747
  }
@@ -4980,6 +5050,7 @@ struct Target {
4980
5050
  ZnVer2, /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019).
4981
5051
  ZnVer3, /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020).
4982
5052
  ZnVer4, /// Tune for AMD Zen 4 CPU (AMD Family 19h, launched 2022).
5053
+ ZnVer5, /// Tune for AMD Zen 5 CPU (AMD Family 1Ah, launched 2024).
4983
5054
  } processor_tune = ProcessorGeneric;
4984
5055
 
4985
5056
  /** Optional features a target can have.
@@ -4989,11 +5060,13 @@ struct Target {
4989
5060
  enum Feature {
4990
5061
  JIT = halide_target_feature_jit,
4991
5062
  Debug = halide_target_feature_debug,
5063
+ EnableBacktraces = halide_target_feature_enable_backtraces,
4992
5064
  NoAsserts = halide_target_feature_no_asserts,
4993
5065
  NoBoundsQuery = halide_target_feature_no_bounds_query,
4994
5066
  SSE41 = halide_target_feature_sse41,
4995
5067
  AVX = halide_target_feature_avx,
4996
5068
  AVX2 = halide_target_feature_avx2,
5069
+ AVXVNNI = halide_target_feature_avxvnni,
4997
5070
  FMA = halide_target_feature_fma,
4998
5071
  FMA4 = halide_target_feature_fma4,
4999
5072
  F16C = halide_target_feature_f16c,
@@ -5038,6 +5111,7 @@ struct Target {
5038
5111
  AVX512_Cannonlake = halide_target_feature_avx512_cannonlake,
5039
5112
  AVX512_SapphireRapids = halide_target_feature_avx512_sapphirerapids,
5040
5113
  AVX512_Zen4 = halide_target_feature_avx512_zen4,
5114
+ AVX512_Zen5 = halide_target_feature_avx512_zen5,
5041
5115
  TraceLoads = halide_target_feature_trace_loads,
5042
5116
  TraceStores = halide_target_feature_trace_stores,
5043
5117
  TraceRealizations = halide_target_feature_trace_realizations,
@@ -5085,6 +5159,7 @@ struct Target {
5085
5159
  Semihosting = halide_target_feature_semihosting,
5086
5160
  AVX10_1 = halide_target_feature_avx10_1,
5087
5161
  X86APX = halide_target_feature_x86_apx,
5162
+ Simulator = halide_target_feature_simulator,
5088
5163
  FeatureEnd = halide_target_feature_end
5089
5164
  };
5090
5165
  Target() = default;
@@ -5413,10 +5488,12 @@ static_assert(((HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT & (HALIDE_RUNTIME_BUF
5413
5488
  #ifndef HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
5414
5489
 
5415
5490
  // clang-format off
5416
- #ifdef _MSC_VER
5491
+ #ifdef _WIN32
5417
5492
 
5418
- // MSVC doesn't implement aligned_alloc(), even in C++17 mode, and
5419
- // has stated they probably never will, so, always default it off here.
5493
+ // Windows (regardless of which compiler) doesn't implement aligned_alloc(),
5494
+ // even in C++17 mode, and has stated they probably never will, as the issue
5495
+ // is in the incompatibility that free() needs to be able to free both pointers
5496
+ // returned by malloc() and aligned_alloc(). So, always default it off here.
5420
5497
  #define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0
5421
5498
 
5422
5499
  #elif defined(__ANDROID_API__) && __ANDROID_API__ < 28
@@ -6317,7 +6394,7 @@ public:
6317
6394
 
6318
6395
  /** Allocate a new image of the given size with a runtime
6319
6396
  * type. Only used when you do know what size you want but you
6320
- * don't know statically what type the elements are. Pass zeroes
6397
+ * don't know statically what type the elements are. Pass zeros
6321
6398
  * to make a buffer suitable for bounds query calls. */
6322
6399
  template<typename... Args,
6323
6400
  typename = typename std::enable_if<AllInts<Args...>::value>::type>
@@ -6336,7 +6413,7 @@ public:
6336
6413
  }
6337
6414
  }
6338
6415
 
6339
- /** Allocate a new image of the given size. Pass zeroes to make a
6416
+ /** Allocate a new image of the given size. Pass zeros to make a
6340
6417
  * buffer suitable for bounds query calls. */
6341
6418
  // @{
6342
6419
 
@@ -7339,9 +7416,12 @@ public:
7339
7416
  /** Make a buffer with the same shape and memory nesting order as
7340
7417
  * another buffer. It may have a different type. */
7341
7418
  template<typename T2, int D2, int S2>
7419
+ // NOLINTNEXTLINE(performance-unnecessary-value-param)
7342
7420
  static Buffer<T, Dims, InClassDimStorage> make_with_shape_of(Buffer<T2, D2, S2> src,
7343
7421
  void *(*allocate_fn)(size_t) = nullptr,
7344
7422
  void (*deallocate_fn)(void *) = nullptr) {
7423
+ // Note that src is taken by value because its dims are mutated
7424
+ // in-place by the helper. Do not change to taking it by reference.
7345
7425
  static_assert(Dims == D2 || Dims == AnyDims);
7346
7426
  const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<typename std::remove_cv<not_void_T>::type>();
7347
7427
  return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
@@ -7407,9 +7487,7 @@ private:
7407
7487
  }
7408
7488
 
7409
7489
  template<typename... Args>
7410
- HALIDE_ALWAYS_INLINE
7411
- storage_T *
7412
- address_of(Args... args) const {
7490
+ HALIDE_ALWAYS_INLINE storage_T *address_of(Args... args) const {
7413
7491
  if (T_is_void) {
7414
7492
  return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
7415
7493
  } else {
@@ -7464,8 +7542,7 @@ public:
7464
7542
  }
7465
7543
 
7466
7544
  HALIDE_ALWAYS_INLINE
7467
- const not_void_T &
7468
- operator()() const {
7545
+ const not_void_T &operator()() const {
7469
7546
  static_assert(!T_is_void,
7470
7547
  "Cannot use operator() on Buffer<void> types");
7471
7548
  constexpr int expected_dims = 0;
@@ -7485,9 +7562,7 @@ public:
7485
7562
 
7486
7563
  template<typename... Args,
7487
7564
  typename = typename std::enable_if<AllInts<Args...>::value>::type>
7488
- HALIDE_ALWAYS_INLINE
7489
- not_void_T &
7490
- operator()(int first, Args... rest) {
7565
+ HALIDE_ALWAYS_INLINE not_void_T &operator()(int first, Args... rest) {
7491
7566
  static_assert(!T_is_void,
7492
7567
  "Cannot use operator() on Buffer<void> types");
7493
7568
  constexpr int expected_dims = 1 + (int)(sizeof...(rest));
@@ -8181,7 +8256,7 @@ public:
8181
8256
 
8182
8257
  template<typename... Args,
8183
8258
  typename = typename std::enable_if<Internal::all_ints_and_optional_name<Args...>::value>::type>
8184
- explicit Buffer(int first, Args... rest)
8259
+ explicit Buffer(int first, const Args &...rest)
8185
8260
  : Buffer(Runtime::Buffer<T, Dims>(Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
8186
8261
  Internal::get_name_from_end_of_parameter_pack(rest...)) {
8187
8262
  }
@@ -8408,6 +8483,7 @@ public:
8408
8483
  HALIDE_BUFFER_FORWARD_CONST(contains)
8409
8484
  HALIDE_BUFFER_FORWARD(crop)
8410
8485
  HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(crop, std::vector<std::pair<int, int>>)
8486
+ HALIDE_BUFFER_FORWARD_CONST(cropped)
8411
8487
  HALIDE_BUFFER_FORWARD(slice)
8412
8488
  HALIDE_BUFFER_FORWARD_CONST(sliced)
8413
8489
  HALIDE_BUFFER_FORWARD(embed)
@@ -8415,6 +8491,7 @@ public:
8415
8491
  HALIDE_BUFFER_FORWARD(set_min)
8416
8492
  HALIDE_BUFFER_FORWARD(translate)
8417
8493
  HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(translate, std::vector<int>)
8494
+ HALIDE_BUFFER_FORWARD_CONST(translated)
8418
8495
  HALIDE_BUFFER_FORWARD(transpose)
8419
8496
  HALIDE_BUFFER_FORWARD_CONST(transposed)
8420
8497
  HALIDE_BUFFER_FORWARD(add_dimension)
@@ -8935,6 +9012,12 @@ public:
8935
9012
 
8936
9013
  void store_in(MemoryType memory_type);
8937
9014
  MemoryType memory_type() const;
9015
+
9016
+ void trace_loads();
9017
+ bool is_tracing_loads() const;
9018
+
9019
+ void add_trace_tag(const std::string &trace_tag);
9020
+ std::vector<std::string> get_trace_tags() const;
8938
9021
  };
8939
9022
 
8940
9023
  namespace Internal {
@@ -10251,8 +10334,7 @@ struct Split {
10251
10334
 
10252
10335
  enum SplitType { SplitVar = 0,
10253
10336
  RenameVar,
10254
- FuseVars,
10255
- PurifyRVar };
10337
+ FuseVars };
10256
10338
 
10257
10339
  // If split_type is Rename, then this is just a renaming of the
10258
10340
  // old_var to the outer and not a split. The inner var should
@@ -10260,10 +10342,6 @@ struct Split {
10260
10342
  // the same list as splits so that ordering between them is
10261
10343
  // respected.
10262
10344
 
10263
- // If split type is Purify, this replaces the old_var RVar to
10264
- // the outer Var. The inner var should be ignored, and factor
10265
- // should be one.
10266
-
10267
10345
  // If split_type is Fuse, then this does the opposite of a
10268
10346
  // split, it joins the outer and inner into the old_var.
10269
10347
  SplitType split_type;
@@ -10854,7 +10932,12 @@ class IRMutator;
10854
10932
 
10855
10933
  /** A single named dimension of a reduction domain */
10856
10934
  struct ReductionVariable {
10935
+ /**
10936
+ * A variable name for the reduction variable. This name must be a
10937
+ * valid Var name, i.e. it must not contain a <tt>.</tt> character.
10938
+ */
10857
10939
  std::string var;
10940
+
10858
10941
  Expr min, extent;
10859
10942
 
10860
10943
  /** This lets you use a ReductionVariable as a key in a map of the form
@@ -11680,7 +11763,7 @@ struct ExternFuncArgument {
11680
11763
  }
11681
11764
 
11682
11765
  template<typename T, int Dims>
11683
- ExternFuncArgument(Buffer<T, Dims> b)
11766
+ ExternFuncArgument(const Buffer<T, Dims> &b)
11684
11767
  : arg_type(BufferArg), buffer(b) {
11685
11768
  }
11686
11769
  ExternFuncArgument(Expr e)
@@ -12323,9 +12406,25 @@ struct Call : public ExprNode<Call> {
12323
12406
 
12324
12407
  // Compute (arg[0] + arg[1]) / 2, assuming arg[0] < arg[1].
12325
12408
  sorted_avg,
12326
- strict_float,
12409
+
12410
+ // strict floating point ops. These are floating point ops that we would
12411
+ // like to optimize around (or let llvm optimize around) by treating
12412
+ // them as reals and ignoring the existence of nan and inf. Using these
12413
+ // intrinsics instead prevents any such optimizations.
12414
+ strict_add,
12415
+ strict_div,
12416
+ strict_eq,
12417
+ strict_le,
12418
+ strict_lt,
12419
+ strict_max,
12420
+ strict_min,
12421
+ strict_mul,
12422
+ strict_sub,
12423
+
12424
+ // Convert a list of Exprs to a string
12327
12425
  stringify,
12328
12426
 
12427
+ // Query properties of the compiled-for target (resolved at compile-time)
12329
12428
  target_arch_is,
12330
12429
  target_bits,
12331
12430
  target_has_feature,
@@ -12450,7 +12549,7 @@ struct Call : public ExprNode<Call> {
12450
12549
  }
12451
12550
 
12452
12551
  bool is_tag() const {
12453
- return is_intrinsic({Call::likely, Call::likely_if_innermost, Call::strict_float});
12552
+ return is_intrinsic({Call::likely, Call::likely_if_innermost});
12454
12553
  }
12455
12554
 
12456
12555
  /** Returns a pointer to a call node if the expression is a call to
@@ -12467,7 +12566,7 @@ struct Call : public ExprNode<Call> {
12467
12566
  }
12468
12567
 
12469
12568
  static const Call *as_tag(const Expr &e) {
12470
- return as_intrinsic(e, {Call::likely, Call::likely_if_innermost, Call::strict_float});
12569
+ return as_intrinsic(e, {Call::likely, Call::likely_if_innermost});
12471
12570
  }
12472
12571
 
12473
12572
  bool is_extern() const {
@@ -12476,6 +12575,19 @@ struct Call : public ExprNode<Call> {
12476
12575
  call_type == PureExtern);
12477
12576
  }
12478
12577
 
12578
+ bool is_strict_float_intrinsic() const {
12579
+ return is_intrinsic(
12580
+ {Call::strict_add,
12581
+ Call::strict_div,
12582
+ Call::strict_max,
12583
+ Call::strict_min,
12584
+ Call::strict_mul,
12585
+ Call::strict_sub,
12586
+ Call::strict_lt,
12587
+ Call::strict_le,
12588
+ Call::strict_eq});
12589
+ }
12590
+
12479
12591
  static const IRNodeType _node_type = IRNodeType::Call;
12480
12592
  };
12481
12593
 
@@ -12628,6 +12740,10 @@ struct Shuffle : public ExprNode<Shuffle> {
12628
12740
  * arguments. */
12629
12741
  bool is_extract_element() const;
12630
12742
 
12743
+ /** Returns the sequence of vector and lane indices that represent each
12744
+ * entry to be used for the shuffled vector */
12745
+ std::vector<std::pair<int, int>> vector_and_lane_indices() const;
12746
+
12631
12747
  static const IRNodeType _node_type = IRNodeType::Shuffle;
12632
12748
  };
12633
12749
 
@@ -13070,6 +13186,577 @@ inline Expr user_context_value() {
13070
13186
  #include <map>
13071
13187
  #include <optional>
13072
13188
 
13189
+ #ifndef HALIDE_CONSTANT_INTERVAL_H
13190
+ #define HALIDE_CONSTANT_INTERVAL_H
13191
+
13192
+ #include <stdint.h>
13193
+
13194
+ /** \file
13195
+ * Defines the ConstantInterval class, and operators on it.
13196
+ */
13197
+
13198
+ namespace Halide {
13199
+
13200
+ struct Type;
13201
+
13202
+ namespace Internal {
13203
+
13204
+ /** A class to represent ranges of integers. Can be unbounded above or below,
13205
+ * but they cannot be empty. */
13206
+ struct ConstantInterval {
13207
+ /** The lower and upper bound of the interval. They are included
13208
+ * in the interval. */
13209
+ int64_t min = 0, max = 0;
13210
+ bool min_defined = false, max_defined = false;
13211
+
13212
+ /* A default-constructed Interval is everything */
13213
+ ConstantInterval() = default;
13214
+
13215
+ /** Construct an interval from a lower and upper bound. */
13216
+ ConstantInterval(int64_t min, int64_t max);
13217
+
13218
+ /** The interval representing everything. */
13219
+ static ConstantInterval everything();
13220
+
13221
+ /** Construct an interval representing a single point. */
13222
+ static ConstantInterval single_point(int64_t x);
13223
+
13224
+ /** Construct intervals bounded above or below. */
13225
+ static ConstantInterval bounded_below(int64_t min);
13226
+ static ConstantInterval bounded_above(int64_t max);
13227
+
13228
+ /** Is the interval the entire range */
13229
+ bool is_everything() const;
13230
+
13231
+ /** Is the interval just a single value (min == max) */
13232
+ bool is_single_point() const;
13233
+
13234
+ /** Is the interval a particular single value */
13235
+ bool is_single_point(int64_t x) const;
13236
+
13237
+ /** Does the interval have a finite upper and lower bound */
13238
+ bool is_bounded() const;
13239
+
13240
+ /** Expand the interval to include another Interval */
13241
+ void include(const ConstantInterval &i);
13242
+
13243
+ /** Expand the interval to include a point */
13244
+ void include(int64_t x);
13245
+
13246
+ /** Test if the interval contains a particular value */
13247
+ bool contains(int32_t x) const;
13248
+
13249
+ /** Test if the interval contains a particular value */
13250
+ bool contains(int64_t x) const;
13251
+
13252
+ /** Test if the interval contains a particular unsigned value */
13253
+ bool contains(uint64_t x) const;
13254
+
13255
+ /** Construct the smallest interval containing two intervals. */
13256
+ static ConstantInterval make_union(const ConstantInterval &a, const ConstantInterval &b);
13257
+
13258
+ /** Construct the largest interval contained within two intervals. Throws an
13259
+ * error if the interval is empty. */
13260
+ static ConstantInterval make_intersection(const ConstantInterval &a, const ConstantInterval &b);
13261
+
13262
+ /** Equivalent to same_as. Exists so that the autoscheduler can
13263
+ * compare two map<string, ConstantInterval> for equality in order to
13264
+ * cache computations. */
13265
+ bool operator==(const ConstantInterval &other) const;
13266
+
13267
+ /** In-place versions of the arithmetic operators below. */
13268
+ // @{
13269
+ void operator+=(const ConstantInterval &other);
13270
+ void operator+=(int64_t);
13271
+ void operator-=(const ConstantInterval &other);
13272
+ void operator-=(int64_t);
13273
+ void operator*=(const ConstantInterval &other);
13274
+ void operator*=(int64_t);
13275
+ void operator/=(const ConstantInterval &other);
13276
+ void operator/=(int64_t);
13277
+ void operator%=(const ConstantInterval &other);
13278
+ void operator%=(int64_t);
13279
+ // @}
13280
+
13281
+ /** Negate an interval. */
13282
+ ConstantInterval operator-() const;
13283
+
13284
+ /** Track what happens if a constant integer interval is forced to fit into
13285
+ * a concrete integer type. */
13286
+ void cast_to(const Type &t);
13287
+
13288
+ /** Get constant integer bounds on a type. */
13289
+ static ConstantInterval bounds_of_type(Type);
13290
+ };
13291
+
13292
+ /** Arithmetic operators on ConstantIntervals. The resulting interval contains
13293
+ * all possible values of the operator applied to any two elements of the
13294
+ * argument intervals. Note that these operator on unbounded integers. If you
13295
+ * are applying this to concrete small integer types, you will need to manually
13296
+ * cast the constant interval back to the desired type to model the effect of
13297
+ * overflow. */
13298
+ // @{
13299
+ ConstantInterval operator+(const ConstantInterval &a, const ConstantInterval &b);
13300
+ ConstantInterval operator+(const ConstantInterval &a, int64_t b);
13301
+ ConstantInterval operator-(const ConstantInterval &a, const ConstantInterval &b);
13302
+ ConstantInterval operator-(const ConstantInterval &a, int64_t b);
13303
+ ConstantInterval operator/(const ConstantInterval &a, const ConstantInterval &b);
13304
+ ConstantInterval operator/(const ConstantInterval &a, int64_t b);
13305
+ ConstantInterval operator*(const ConstantInterval &a, const ConstantInterval &b);
13306
+ ConstantInterval operator*(const ConstantInterval &a, int64_t b);
13307
+ ConstantInterval operator%(const ConstantInterval &a, const ConstantInterval &b);
13308
+ ConstantInterval operator%(const ConstantInterval &a, int64_t b);
13309
+ ConstantInterval min(const ConstantInterval &a, const ConstantInterval &b);
13310
+ ConstantInterval min(const ConstantInterval &a, int64_t b);
13311
+ ConstantInterval max(const ConstantInterval &a, const ConstantInterval &b);
13312
+ ConstantInterval max(const ConstantInterval &a, int64_t b);
13313
+ ConstantInterval abs(const ConstantInterval &a);
13314
+ ConstantInterval operator<<(const ConstantInterval &a, const ConstantInterval &b);
13315
+ ConstantInterval operator<<(const ConstantInterval &a, int64_t b);
13316
+ ConstantInterval operator<<(int64_t a, const ConstantInterval &b);
13317
+ ConstantInterval operator>>(const ConstantInterval &a, const ConstantInterval &b);
13318
+ ConstantInterval operator>>(const ConstantInterval &a, int64_t b);
13319
+ ConstantInterval operator>>(int64_t a, const ConstantInterval &b);
13320
+ // @}
13321
+
13322
+ /** Comparison operators on ConstantIntervals. Returns whether the comparison is
13323
+ * true for all values of the two intervals. */
13324
+ // @{
13325
+ bool operator<=(const ConstantInterval &a, const ConstantInterval &b);
13326
+ bool operator<=(const ConstantInterval &a, int64_t b);
13327
+ bool operator<=(int64_t a, const ConstantInterval &b);
13328
+ bool operator<(const ConstantInterval &a, const ConstantInterval &b);
13329
+ bool operator<(const ConstantInterval &a, int64_t b);
13330
+ bool operator<(int64_t a, const ConstantInterval &b);
13331
+
13332
+ inline bool operator>=(const ConstantInterval &a, const ConstantInterval &b) {
13333
+ return b <= a;
13334
+ }
13335
+ inline bool operator>(const ConstantInterval &a, const ConstantInterval &b) {
13336
+ return b < a;
13337
+ }
13338
+ inline bool operator>=(const ConstantInterval &a, int64_t b) {
13339
+ return b <= a;
13340
+ }
13341
+ inline bool operator>(const ConstantInterval &a, int64_t b) {
13342
+ return b < a;
13343
+ }
13344
+ inline bool operator>=(int64_t a, const ConstantInterval &b) {
13345
+ return b <= a;
13346
+ }
13347
+ inline bool operator>(int64_t a, const ConstantInterval &b) {
13348
+ return b < a;
13349
+ }
13350
+
13351
+ // @}
13352
+ } // namespace Internal
13353
+
13354
+ /** Cast operators for ConstantIntervals. These ones have to live out in
13355
+ * Halide::, to avoid C++ name lookup confusion with the Halide::cast variants
13356
+ * that take Exprs. */
13357
+ // @{
13358
+ Internal::ConstantInterval cast(Type t, const Internal::ConstantInterval &a);
13359
+ Internal::ConstantInterval saturating_cast(Type t, const Internal::ConstantInterval &a);
13360
+ // @}
13361
+
13362
+ } // namespace Halide
13363
+
13364
+ #endif
13365
+ #ifndef HALIDE_SCOPE_H
13366
+ #define HALIDE_SCOPE_H
13367
+
13368
+ #include <iostream>
13369
+ #include <map>
13370
+ #include <stack>
13371
+ #include <string>
13372
+ #include <utility>
13373
+ #include <vector>
13374
+
13375
+
13376
+ /** \file
13377
+ * Defines the Scope class, which is used for keeping track of names in a scope while traversing IR
13378
+ */
13379
+
13380
+ namespace Halide {
13381
+ namespace Internal {
13382
+
13383
+ /** A stack which can store one item very efficiently. Using this
13384
+ * instead of std::stack speeds up Scope substantially. */
13385
+ template<typename T>
13386
+ class SmallStack {
13387
+ private:
13388
+ T _top;
13389
+ std::vector<T> _rest;
13390
+ bool _empty = true;
13391
+
13392
+ public:
13393
+ SmallStack() = default;
13394
+
13395
+ void pop() {
13396
+ if (_rest.empty()) {
13397
+ _empty = true;
13398
+ _top = T();
13399
+ } else {
13400
+ _top = std::move(_rest.back());
13401
+ _rest.pop_back();
13402
+ }
13403
+ }
13404
+
13405
+ void push(T t) {
13406
+ if (!_empty) {
13407
+ _rest.push_back(std::move(_top));
13408
+ }
13409
+ _top = std::move(t);
13410
+ _empty = false;
13411
+ }
13412
+
13413
+ T top() const {
13414
+ return _top;
13415
+ }
13416
+
13417
+ T &top_ref() {
13418
+ return _top;
13419
+ }
13420
+
13421
+ const T &top_ref() const {
13422
+ return _top;
13423
+ }
13424
+
13425
+ bool empty() const {
13426
+ return _empty;
13427
+ }
13428
+
13429
+ size_t size() const {
13430
+ return _empty ? 0 : (_rest.size() + 1);
13431
+ }
13432
+ };
13433
+
13434
+ template<>
13435
+ class SmallStack<void> {
13436
+ // A stack of voids. Voids are all the same, so just record how many voids are in the stack
13437
+ int counter = 0;
13438
+
13439
+ public:
13440
+ void pop() {
13441
+ counter--;
13442
+ }
13443
+ void push() {
13444
+ counter++;
13445
+ }
13446
+ bool empty() const {
13447
+ return counter == 0;
13448
+ }
13449
+ };
13450
+
13451
+ /** A common pattern when traversing Halide IR is that you need to
13452
+ * keep track of stuff when you find a Let or a LetStmt, and that it
13453
+ * should hide previous values with the same name until you leave the
13454
+ * Let or LetStmt nodes. This class helps with that. */
13455
+ template<typename T = void>
13456
+ class Scope {
13457
+ private:
13458
+ std::map<std::string, SmallStack<T>> table;
13459
+
13460
+ const Scope<T> *containing_scope = nullptr;
13461
+
13462
+ public:
13463
+ Scope() = default;
13464
+ Scope(Scope &&that) noexcept = default;
13465
+ Scope &operator=(Scope &&that) noexcept = default;
13466
+
13467
+ // Copying a scope object copies a large table full of strings and
13468
+ // stacks. Bad idea.
13469
+ Scope(const Scope<T> &) = delete;
13470
+ Scope<T> &operator=(const Scope<T> &) = delete;
13471
+
13472
+ /** Set the parent scope. If lookups fail in this scope, they
13473
+ * check the containing scope before returning an error. Caller is
13474
+ * responsible for managing the memory of the containing scope. */
13475
+ void set_containing_scope(const Scope<T> *s) {
13476
+ containing_scope = s;
13477
+ }
13478
+
13479
+ /** A const ref to an empty scope. Useful for default function
13480
+ * arguments, which would otherwise require a copy constructor
13481
+ * (with llvm in c++98 mode) */
13482
+ static const Scope<T> &empty_scope() {
13483
+ static Scope<T> _empty_scope;
13484
+ return _empty_scope;
13485
+ }
13486
+
13487
+ /** Retrieve the value referred to by a name */
13488
+ template<typename T2 = T,
13489
+ typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
13490
+ T2 get(const std::string &name) const {
13491
+ typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
13492
+ if (iter == table.end() || iter->second.empty()) {
13493
+ if (containing_scope) {
13494
+ return containing_scope->get(name);
13495
+ } else {
13496
+ internal_error << "Name not in Scope: " << name << "\n"
13497
+ << *this << "\n";
13498
+ }
13499
+ }
13500
+ return iter->second.top();
13501
+ }
13502
+
13503
+ /** Return a reference to an entry. Does not consider the containing scope. */
13504
+ template<typename T2 = T,
13505
+ typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
13506
+ T2 &ref(const std::string &name) {
13507
+ typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
13508
+ if (iter == table.end() || iter->second.empty()) {
13509
+ internal_error << "Name not in Scope: " << name << "\n"
13510
+ << *this << "\n";
13511
+ }
13512
+ return iter->second.top_ref();
13513
+ }
13514
+
13515
+ /** Returns a const pointer to an entry if it exists in this scope or any
13516
+ * containing scope, or nullptr if it does not. Use this instead of if
13517
+ * (scope.contains(foo)) { ... scope.get(foo) ... } to avoid doing two
13518
+ * lookups. */
13519
+ template<typename T2 = T,
13520
+ typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
13521
+ const T2 *find(const std::string &name) const {
13522
+ typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
13523
+ if (iter == table.end() || iter->second.empty()) {
13524
+ if (containing_scope) {
13525
+ return containing_scope->find(name);
13526
+ } else {
13527
+ return nullptr;
13528
+ }
13529
+ }
13530
+ return &(iter->second.top_ref());
13531
+ }
13532
+
13533
+ /** A version of find that returns a non-const pointer, but ignores
13534
+ * containing scope. */
13535
+ template<typename T2 = T,
13536
+ typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
13537
+ T2 *shallow_find(const std::string &name) {
13538
+ typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
13539
+ if (iter == table.end() || iter->second.empty()) {
13540
+ return nullptr;
13541
+ } else {
13542
+ return &(iter->second.top_ref());
13543
+ }
13544
+ }
13545
+
13546
+ /** Tests if a name is in scope. If you plan to use the value if it is, call
13547
+ * find instead. */
13548
+ bool contains(const std::string &name) const {
13549
+ typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
13550
+ if (iter == table.end() || iter->second.empty()) {
13551
+ if (containing_scope) {
13552
+ return containing_scope->contains(name);
13553
+ } else {
13554
+ return false;
13555
+ }
13556
+ }
13557
+ return true;
13558
+ }
13559
+
13560
+ /** How many nested definitions of a single name exist? */
13561
+ size_t count(const std::string &name) const {
13562
+ auto it = table.find(name);
13563
+ if (it == table.end()) {
13564
+ return 0;
13565
+ } else {
13566
+ return it->second.size();
13567
+ }
13568
+ }
13569
+
13570
+ /** How many distinct names exist (does not count nested definitions of the same name) */
13571
+ size_t size() const {
13572
+ return table.size();
13573
+ }
13574
+
13575
+ struct PushToken {
13576
+ typename std::map<std::string, SmallStack<T>>::iterator iter;
13577
+ };
13578
+
13579
+ /** Add a new (name, value) pair to the current scope. Hide old values that
13580
+ * have this name until we pop this name. Returns a token that can be used
13581
+ * to pop the same value without doing a fresh lookup.
13582
+ */
13583
+ template<typename T2 = T,
13584
+ typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
13585
+ PushToken push(const std::string &name, T2 &&value) {
13586
+ auto it = table.try_emplace(name).first;
13587
+ it->second.push(std::forward<T2>(value));
13588
+ return PushToken{it};
13589
+ }
13590
+
13591
+ template<typename T2 = T,
13592
+ typename = typename std::enable_if<std::is_same<T2, void>::value>::type>
13593
+ PushToken push(const std::string &name) {
13594
+ auto it = table.try_emplace(name).first;
13595
+ it->second.push();
13596
+ return PushToken{it};
13597
+ }
13598
+
13599
+ /** A name goes out of scope. Restore whatever its old value
13600
+ * was (or remove it entirely if there was nothing else of the
13601
+ * same name in an outer scope) */
13602
+ void pop(const std::string &name) {
13603
+ typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
13604
+ internal_assert(iter != table.end()) << "Name not in Scope: " << name << "\n"
13605
+ << *this << "\n";
13606
+ iter->second.pop();
13607
+ if (iter->second.empty()) {
13608
+ table.erase(iter);
13609
+ }
13610
+ }
13611
+
13612
+ /** Pop a name using a token returned by push instead of a string. */
13613
+ void pop(PushToken p) {
13614
+ p.iter->second.pop();
13615
+ if (p.iter->second.empty()) {
13616
+ table.erase(p.iter);
13617
+ }
13618
+ }
13619
+
13620
+ /** Iterate through the scope. Does not capture any containing scope. */
13621
+ class const_iterator {
13622
+ typename std::map<std::string, SmallStack<T>>::const_iterator iter;
13623
+
13624
+ public:
13625
+ explicit const_iterator(const typename std::map<std::string, SmallStack<T>>::const_iterator &i)
13626
+ : iter(i) {
13627
+ }
13628
+
13629
+ const_iterator() = default;
13630
+
13631
+ bool operator!=(const const_iterator &other) {
13632
+ return iter != other.iter;
13633
+ }
13634
+
13635
+ void operator++() {
13636
+ ++iter;
13637
+ }
13638
+
13639
+ const std::string &name() {
13640
+ return iter->first;
13641
+ }
13642
+
13643
+ const SmallStack<T> &stack() {
13644
+ return iter->second;
13645
+ }
13646
+
13647
+ template<typename T2 = T,
13648
+ typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
13649
+ const T2 &value() {
13650
+ return iter->second.top_ref();
13651
+ }
13652
+ };
13653
+
13654
+ const_iterator cbegin() const {
13655
+ return const_iterator(table.begin());
13656
+ }
13657
+
13658
+ const_iterator cend() const {
13659
+ return const_iterator(table.end());
13660
+ }
13661
+
13662
+ void swap(Scope<T> &other) noexcept {
13663
+ table.swap(other.table);
13664
+ std::swap(containing_scope, other.containing_scope);
13665
+ }
13666
+ };
13667
+
13668
+ template<typename T>
13669
+ std::ostream &operator<<(std::ostream &stream, const Scope<T> &s) {
13670
+ stream << "{\n";
13671
+ typename Scope<T>::const_iterator iter;
13672
+ for (iter = s.cbegin(); iter != s.cend(); ++iter) {
13673
+ stream << " " << iter.name() << "\n";
13674
+ }
13675
+ stream << "}";
13676
+ return stream;
13677
+ }
13678
+
13679
+ /** Helper class for pushing/popping Scope<> values, to allow
13680
+ * for early-exit in Visitor/Mutators that preserves correctness.
13681
+ * Note that this name can be a bit confusing, since there are two "scopes"
13682
+ * involved here:
13683
+ * - the Scope object itself
13684
+ * - the lifetime of this helper object
13685
+ * The "Scoped" in this class name refers to the latter, as it temporarily binds
13686
+ * a name within the scope of this helper's lifetime. */
13687
+ template<typename T = void>
13688
+ struct ScopedBinding {
13689
+ Scope<T> *scope = nullptr;
13690
+ typename Scope<T>::PushToken token;
13691
+
13692
+ ScopedBinding() = default;
13693
+
13694
+ ScopedBinding(Scope<T> &s, const std::string &n, T value)
13695
+ : scope(&s), token(scope->push(n, std::move(value))) {
13696
+ }
13697
+
13698
+ ScopedBinding(bool condition, Scope<T> &s, const std::string &n, const T &value)
13699
+ : scope(condition ? &s : nullptr),
13700
+ token(condition ? scope->push(n, value) : typename Scope<T>::PushToken{}) {
13701
+ }
13702
+
13703
+ bool bound() const {
13704
+ return scope != nullptr;
13705
+ }
13706
+
13707
+ ~ScopedBinding() {
13708
+ if (scope) {
13709
+ scope->pop(token);
13710
+ }
13711
+ }
13712
+
13713
+ // allow move but not copy
13714
+ ScopedBinding(const ScopedBinding &that) = delete;
13715
+ ScopedBinding(ScopedBinding &&that) noexcept
13716
+ : scope(that.scope),
13717
+ token(that.token) {
13718
+ // The move constructor must null out scope, so we don't try to pop it
13719
+ that.scope = nullptr;
13720
+ }
13721
+
13722
+ void operator=(const ScopedBinding &that) = delete;
13723
+ void operator=(ScopedBinding &&that) = delete;
13724
+ };
13725
+
13726
+ template<>
13727
+ struct ScopedBinding<void> {
13728
+ Scope<> *scope;
13729
+ Scope<>::PushToken token;
13730
+ ScopedBinding(Scope<> &s, const std::string &n)
13731
+ : scope(&s), token(scope->push(n)) {
13732
+ }
13733
+ ScopedBinding(bool condition, Scope<> &s, const std::string &n)
13734
+ : scope(condition ? &s : nullptr),
13735
+ token(condition ? scope->push(n) : Scope<>::PushToken{}) {
13736
+ }
13737
+ ~ScopedBinding() {
13738
+ if (scope) {
13739
+ scope->pop(token);
13740
+ }
13741
+ }
13742
+
13743
+ // allow move but not copy
13744
+ ScopedBinding(const ScopedBinding &that) = delete;
13745
+ ScopedBinding(ScopedBinding &&that) noexcept
13746
+ : scope(that.scope),
13747
+ token(that.token) {
13748
+ // The move constructor must null out scope, so we don't try to pop it
13749
+ that.scope = nullptr;
13750
+ }
13751
+
13752
+ void operator=(const ScopedBinding &that) = delete;
13753
+ void operator=(ScopedBinding &&that) = delete;
13754
+ };
13755
+
13756
+ } // namespace Internal
13757
+ } // namespace Halide
13758
+
13759
+ #endif
13073
13760
  #ifndef HALIDE_TUPLE_H
13074
13761
  #define HALIDE_TUPLE_H
13075
13762
 
@@ -13275,13 +13962,16 @@ Expr const_false(int lanes = 1);
13275
13962
  /** Attempt to cast an expression to a smaller type while provably not losing
13276
13963
  * information. If it can't be done, return an undefined Expr.
13277
13964
  *
13278
- * Optionally accepts a map that gives the constant bounds of exprs already
13279
- * analyzed to avoid redoing work across many calls to lossless_cast. It is not
13280
- * safe to use this optional map in contexts where the same Expr object may
13281
- * take on a different value. For example:
13282
- * (let x = 4 in some_expr_object) + (let x = 5 in the_same_expr_object)).
13283
- * It is safe to use it after uniquify_variable_names has been run. */
13284
- Expr lossless_cast(Type t, Expr e, std::map<Expr, ConstantInterval, ExprCompare> *cache = nullptr);
13965
+ * Optionally accepts a scope giving the constant bounds of any variables, and a
13966
+ * map that gives the constant bounds of exprs already analyzed to avoid redoing
13967
+ * work across many calls to lossless_cast. It is not safe to use this optional
13968
+ * map in contexts where the same Expr object may take on a different value. For
13969
+ * example: (let x = 4 in some_expr_object) + (let x = 5 in
13970
+ * the_same_expr_object)). It is safe to use it after uniquify_variable_names
13971
+ * has been run. */
13972
+ Expr lossless_cast(Type t, Expr e,
13973
+ const Scope<ConstantInterval> &scope = Scope<ConstantInterval>::empty_scope(),
13974
+ std::map<Expr, ConstantInterval, ExprCompare> *cache = nullptr);
13285
13975
 
13286
13976
  /** Attempt to negate x without introducing new IR and without overflow.
13287
13977
  * If it can't be done, return an undefined Expr. */
@@ -14095,8 +14785,9 @@ Expr pow(Expr x, Expr y);
14095
14785
  * mantissa. Vectorizes cleanly. */
14096
14786
  Expr erf(const Expr &x);
14097
14787
 
14098
- /** Fast vectorizable approximation to some trigonometric functions for Float(32).
14099
- * Absolute approximation error is less than 1e-5. */
14788
+ /** Fast vectorizable approximation to some trigonometric functions for
14789
+ * Float(32). Absolute approximation error is less than 1e-5. Slow on x86 if
14790
+ * you don't have at least sse 4.1. */
14100
14791
  // @{
14101
14792
  Expr fast_sin(const Expr &x);
14102
14793
  Expr fast_cos(const Expr &x);
@@ -14104,19 +14795,22 @@ Expr fast_cos(const Expr &x);
14104
14795
 
14105
14796
  /** Fast approximate cleanly vectorizable log for Float(32). Returns
14106
14797
  * nonsense for x <= 0.0f. Accurate up to the last 5 bits of the
14107
- * mantissa. Vectorizes cleanly. */
14798
+ * mantissa. Vectorizes cleanly. Slow on x86 if you don't
14799
+ * have at least sse 4.1. */
14108
14800
  Expr fast_log(const Expr &x);
14109
14801
 
14110
14802
  /** Fast approximate cleanly vectorizable exp for Float(32). Returns
14111
14803
  * nonsense for inputs that would overflow or underflow. Typically
14112
14804
  * accurate up to the last 5 bits of the mantissa. Gets worse when
14113
- * approaching overflow. Vectorizes cleanly. */
14805
+ * approaching overflow. Vectorizes cleanly. Slow on x86 if you don't
14806
+ * have at least sse 4.1. */
14114
14807
  Expr fast_exp(const Expr &x);
14115
14808
 
14116
14809
  /** Fast approximate cleanly vectorizable pow for Float(32). Returns
14117
14810
  * nonsense for x < 0.0f. Accurate up to the last 5 bits of the
14118
14811
  * mantissa for typical exponents. Gets worse when approaching
14119
- * overflow. Vectorizes cleanly. */
14812
+ * overflow. Vectorizes cleanly. Slow on x86 if you don't
14813
+ * have at least sse 4.1. */
14120
14814
  Expr fast_pow(Expr x, Expr y);
14121
14815
 
14122
14816
  /** Fast approximate inverse for Float(32). Corresponds to the rcpps
@@ -14559,7 +15253,7 @@ Expr saturating_cast(Type t, Expr e);
14559
15253
  * all backends. (E.g. it is difficult to do this for C++ code
14560
15254
  * generation as it depends on the compiler flags used to compile the
14561
15255
  * generated code.) */
14562
- Expr strict_float(Expr e);
15256
+ Expr strict_float(const Expr &e);
14563
15257
 
14564
15258
  /** Create an Expr that that promises another Expr is clamped but do
14565
15259
  * not generate code to check the assertion or modify the value. No
@@ -14671,7 +15365,7 @@ f(scatter(3, 5)) = f(select(p, gather(5, 3), gather(3, 5)));
14671
15365
  f(select(p, scatter(3, 5, 5), scatter(1, 2, 3))) = f(select(p, gather(5, 3, 3), gather(2, 3, 1)));
14672
15366
  \endcode
14673
15367
  *
14674
- * Note that in the p == true case, we redudantly load from 3 and write
15368
+ * Note that in the p == true case, we redundantly load from 3 and write
14675
15369
  * to 5 twice.
14676
15370
  */
14677
15371
  //@{
@@ -14952,7 +15646,7 @@ struct PipelineContents;
14952
15646
  *
14953
15647
  * The 'name' field specifies the type of Autoscheduler
14954
15648
  * to be used (e.g. Adams2019, Mullapudi2016). If this is an empty string,
14955
- * no autoscheduling will be done; if not, it mustbe the name of a known Autoscheduler.
15649
+ * no autoscheduling will be done; if not, it must be the name of a known Autoscheduler.
14956
15650
  *
14957
15651
  * At this time, well-known autoschedulers include:
14958
15652
  * "Mullapudi2016" -- heuristics-based; the first working autoscheduler; currently built in to libHalide
@@ -15743,7 +16437,7 @@ public:
15743
16437
  }
15744
16438
 
15745
16439
  template<typename... Args>
15746
- HALIDE_NO_USER_CODE_INLINE RDom(Expr min, Expr extent, Args &&...args) {
16440
+ HALIDE_NO_USER_CODE_INLINE RDom(const Expr &min, const Expr &extent, Args &&...args) {
15747
16441
  // This should really just be a delegating constructor, but I couldn't make
15748
16442
  // that work with variadic template unpacking in visual studio 2013
15749
16443
  Region region;
@@ -15895,12 +16589,14 @@ class Var {
15895
16589
  /* The expression representing the Var. Guaranteed to be an
15896
16590
  * Internal::Variable of type Int(32). Created once on
15897
16591
  * construction of the Var to avoid making a fresh Expr every time
15898
- * the Var is used in a context in which is will be converted to
16592
+ * the Var is used in a context in which it will be converted to
15899
16593
  * one. */
15900
16594
  Expr e;
15901
16595
 
15902
16596
  public:
15903
- /** Construct a Var with the given name */
16597
+ /** Construct a Var with the given name. Unlike Funcs, this will be treated
16598
+ * as the same Var as any other Var with the same name, including
16599
+ * implicit Vars. */
15904
16600
  Var(const std::string &n);
15905
16601
 
15906
16602
  /** Construct a Var with an automatically-generated unique name. */
@@ -15995,9 +16691,6 @@ public:
15995
16691
  static Var implicit(int n);
15996
16692
 
15997
16693
  /** Return whether a variable name is of the form for an implicit argument.
15998
- * TODO: This is almost guaranteed to incorrectly fire on user
15999
- * declared variables at some point. We should likely prevent
16000
- * user Var declarations from making names of this form.
16001
16694
  */
16002
16695
  //{
16003
16696
  static bool is_implicit(const std::string &name);
@@ -16130,6 +16823,7 @@ struct VarOrRVar {
16130
16823
  class ImageParam;
16131
16824
 
16132
16825
  namespace Internal {
16826
+ struct AssociativeOp;
16133
16827
  class Function;
16134
16828
  struct Split;
16135
16829
  struct StorageDim;
@@ -16151,7 +16845,6 @@ class Stage {
16151
16845
  void split(const std::string &old, const std::string &outer, const std::string &inner,
16152
16846
  const Expr &factor, bool exact, TailStrategy tail);
16153
16847
  void remove(const std::string &var);
16154
- Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
16155
16848
 
16156
16849
  const std::vector<Internal::StorageDim> &storage_dims() const {
16157
16850
  return function.schedule().storage_dims();
@@ -16159,6 +16852,9 @@ class Stage {
16159
16852
 
16160
16853
  Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
16161
16854
 
16855
+ std::pair<std::vector<Internal::Split>, std::vector<Internal::Split>>
16856
+ rfactor_validate_args(const std::vector<std::pair<RVar, Var>> &preserved, const Internal::AssociativeOp &prover_result);
16857
+
16162
16858
  public:
16163
16859
  Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
16164
16860
  : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
@@ -16254,7 +16950,7 @@ public:
16254
16950
  *
16255
16951
  */
16256
16952
  // @{
16257
- Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
16953
+ Func rfactor(const std::vector<std::pair<RVar, Var>> &preserved);
16258
16954
  Func rfactor(const RVar &r, const Var &v);
16259
16955
  // @}
16260
16956
 
@@ -16575,7 +17271,7 @@ class FuncRef {
16575
17271
  * already have a pure definition, init_val will be used as RHS in
16576
17272
  * the initial function definition. */
16577
17273
  template<typename BinaryOp>
16578
- Stage func_ref_update(Expr e, int init_val);
17274
+ Stage func_ref_update(const Expr &e, int init_val);
16579
17275
 
16580
17276
  public:
16581
17277
  FuncRef(const Internal::Function &, const std::vector<Expr> &,
@@ -16598,7 +17294,7 @@ public:
16598
17294
  * pure definition, this sets it to zero.
16599
17295
  */
16600
17296
  // @{
16601
- Stage operator+=(Expr);
17297
+ Stage operator+=(const Expr &);
16602
17298
  Stage operator+=(const Tuple &);
16603
17299
  Stage operator+=(const FuncRef &);
16604
17300
  // @}
@@ -16609,7 +17305,7 @@ public:
16609
17305
  * not already have a pure definition, this sets it to zero.
16610
17306
  */
16611
17307
  // @{
16612
- Stage operator-=(Expr);
17308
+ Stage operator-=(const Expr &);
16613
17309
  Stage operator-=(const Tuple &);
16614
17310
  Stage operator-=(const FuncRef &);
16615
17311
  // @}
@@ -16620,7 +17316,7 @@ public:
16620
17316
  * definition, this sets it to 1.
16621
17317
  */
16622
17318
  // @{
16623
- Stage operator*=(Expr);
17319
+ Stage operator*=(const Expr &);
16624
17320
  Stage operator*=(const Tuple &);
16625
17321
  Stage operator*=(const FuncRef &);
16626
17322
  // @}
@@ -16631,7 +17327,7 @@ public:
16631
17327
  * function does not already have a pure definition, this sets it to 1.
16632
17328
  */
16633
17329
  // @{
16634
- Stage operator/=(Expr);
17330
+ Stage operator/=(const Expr &);
16635
17331
  Stage operator/=(const Tuple &);
16636
17332
  Stage operator/=(const FuncRef &);
16637
17333
  // @}
@@ -16654,6 +17350,9 @@ public:
16654
17350
  /** How many outputs does the function this refers to produce. */
16655
17351
  size_t size() const;
16656
17352
 
17353
+ /** Is this FuncRef syntactically equivalent to another one? */
17354
+ bool equivalent_to(const FuncRef &other) const;
17355
+
16657
17356
  /** What function is this calling? */
16658
17357
  Internal::Function function() const {
16659
17358
  return func;
@@ -16820,7 +17519,7 @@ public:
16820
17519
  * not contain free variables). */
16821
17520
  explicit Func(const Expr &e);
16822
17521
 
16823
- /** Construct a new Func to wrap an existing, already-define
17522
+ /** Construct a new Func to wrap an existing, already-defined
16824
17523
  * Function object. */
16825
17524
  explicit Func(Internal::Function f);
16826
17525
 
@@ -17231,14 +17930,6 @@ public:
17231
17930
  device_api);
17232
17931
  }
17233
17932
 
17234
- void define_extern(const std::string &function_name,
17235
- const std::vector<ExternFuncArgument> &params,
17236
- const std::vector<Type> &types, int dimensionality,
17237
- NameMangling mangling) {
17238
- define_extern(function_name, params, types,
17239
- Internal::make_argument_list(dimensionality), mangling);
17240
- }
17241
-
17242
17933
  void define_extern(const std::string &function_name,
17243
17934
  const std::vector<ExternFuncArgument> &params,
17244
17935
  const std::vector<Type> &types, int dimensionality,
@@ -18331,6 +19022,11 @@ public:
18331
19022
  * to remove memoized entries using this eviction key from the
18332
19023
  * cache. Memoized computations that do not provide an eviction
18333
19024
  * key will never be evicted by this mechanism.
19025
+ *
19026
+ * It is invalid to memoize the output of a Pipeline; attempting
19027
+ * to do so will issue an error. To cache an entire pipeline,
19028
+ * either implement a caching mechanism outside of Halide or
19029
+ * explicitly copy out of the cache with another output Func.
18334
19030
  */
18335
19031
  Func &memoize(const EvictionKey &eviction_key = EvictionKey());
18336
19032
 
@@ -19195,401 +19891,6 @@ private:
19195
19891
  } // namespace Internal
19196
19892
  } // namespace Halide
19197
19893
 
19198
- #endif
19199
- #ifndef HALIDE_SCOPE_H
19200
- #define HALIDE_SCOPE_H
19201
-
19202
- #include <iostream>
19203
- #include <map>
19204
- #include <stack>
19205
- #include <string>
19206
- #include <utility>
19207
- #include <vector>
19208
-
19209
-
19210
- /** \file
19211
- * Defines the Scope class, which is used for keeping track of names in a scope while traversing IR
19212
- */
19213
-
19214
- namespace Halide {
19215
- namespace Internal {
19216
-
19217
- /** A stack which can store one item very efficiently. Using this
19218
- * instead of std::stack speeds up Scope substantially. */
19219
- template<typename T>
19220
- class SmallStack {
19221
- private:
19222
- T _top;
19223
- std::vector<T> _rest;
19224
- bool _empty = true;
19225
-
19226
- public:
19227
- SmallStack() = default;
19228
-
19229
- void pop() {
19230
- if (_rest.empty()) {
19231
- _empty = true;
19232
- _top = T();
19233
- } else {
19234
- _top = std::move(_rest.back());
19235
- _rest.pop_back();
19236
- }
19237
- }
19238
-
19239
- void push(T t) {
19240
- if (!_empty) {
19241
- _rest.push_back(std::move(_top));
19242
- }
19243
- _top = std::move(t);
19244
- _empty = false;
19245
- }
19246
-
19247
- T top() const {
19248
- return _top;
19249
- }
19250
-
19251
- T &top_ref() {
19252
- return _top;
19253
- }
19254
-
19255
- const T &top_ref() const {
19256
- return _top;
19257
- }
19258
-
19259
- bool empty() const {
19260
- return _empty;
19261
- }
19262
-
19263
- size_t size() const {
19264
- return _empty ? 0 : (_rest.size() + 1);
19265
- }
19266
- };
19267
-
19268
- template<>
19269
- class SmallStack<void> {
19270
- // A stack of voids. Voids are all the same, so just record how many voids are in the stack
19271
- int counter = 0;
19272
-
19273
- public:
19274
- void pop() {
19275
- counter--;
19276
- }
19277
- void push() {
19278
- counter++;
19279
- }
19280
- bool empty() const {
19281
- return counter == 0;
19282
- }
19283
- };
19284
-
19285
- /** A common pattern when traversing Halide IR is that you need to
19286
- * keep track of stuff when you find a Let or a LetStmt, and that it
19287
- * should hide previous values with the same name until you leave the
19288
- * Let or LetStmt nodes This class helps with that. */
19289
- template<typename T = void>
19290
- class Scope {
19291
- private:
19292
- std::map<std::string, SmallStack<T>> table;
19293
-
19294
- const Scope<T> *containing_scope = nullptr;
19295
-
19296
- public:
19297
- Scope() = default;
19298
- Scope(Scope &&that) noexcept = default;
19299
- Scope &operator=(Scope &&that) noexcept = default;
19300
-
19301
- // Copying a scope object copies a large table full of strings and
19302
- // stacks. Bad idea.
19303
- Scope(const Scope<T> &) = delete;
19304
- Scope<T> &operator=(const Scope<T> &) = delete;
19305
-
19306
- /** Set the parent scope. If lookups fail in this scope, they
19307
- * check the containing scope before returning an error. Caller is
19308
- * responsible for managing the memory of the containing scope. */
19309
- void set_containing_scope(const Scope<T> *s) {
19310
- containing_scope = s;
19311
- }
19312
-
19313
- /** A const ref to an empty scope. Useful for default function
19314
- * arguments, which would otherwise require a copy constructor
19315
- * (with llvm in c++98 mode) */
19316
- static const Scope<T> &empty_scope() {
19317
- static Scope<T> _empty_scope;
19318
- return _empty_scope;
19319
- }
19320
-
19321
- /** Retrieve the value referred to by a name */
19322
- template<typename T2 = T,
19323
- typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
19324
- T2 get(const std::string &name) const {
19325
- typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
19326
- if (iter == table.end() || iter->second.empty()) {
19327
- if (containing_scope) {
19328
- return containing_scope->get(name);
19329
- } else {
19330
- internal_error << "Name not in Scope: " << name << "\n"
19331
- << *this << "\n";
19332
- }
19333
- }
19334
- return iter->second.top();
19335
- }
19336
-
19337
- /** Return a reference to an entry. Does not consider the containing scope. */
19338
- template<typename T2 = T,
19339
- typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
19340
- T2 &ref(const std::string &name) {
19341
- typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
19342
- if (iter == table.end() || iter->second.empty()) {
19343
- internal_error << "Name not in Scope: " << name << "\n"
19344
- << *this << "\n";
19345
- }
19346
- return iter->second.top_ref();
19347
- }
19348
-
19349
- /** Returns a const pointer to an entry if it exists in this scope or any
19350
- * containing scope, or nullptr if it does not. Use this instead of if
19351
- * (scope.contains(foo)) { ... scope.get(foo) ... } to avoid doing two
19352
- * lookups. */
19353
- template<typename T2 = T,
19354
- typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
19355
- const T2 *find(const std::string &name) const {
19356
- typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
19357
- if (iter == table.end() || iter->second.empty()) {
19358
- if (containing_scope) {
19359
- return containing_scope->find(name);
19360
- } else {
19361
- return nullptr;
19362
- }
19363
- }
19364
- return &(iter->second.top_ref());
19365
- }
19366
-
19367
- /** A version of find that returns a non-const pointer, but ignores
19368
- * containing scope. */
19369
- template<typename T2 = T,
19370
- typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
19371
- T2 *shallow_find(const std::string &name) {
19372
- typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
19373
- if (iter == table.end() || iter->second.empty()) {
19374
- return nullptr;
19375
- } else {
19376
- return &(iter->second.top_ref());
19377
- }
19378
- }
19379
-
19380
- /** Tests if a name is in scope. If you plan to use the value if it is, call
19381
- * find instead. */
19382
- bool contains(const std::string &name) const {
19383
- typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
19384
- if (iter == table.end() || iter->second.empty()) {
19385
- if (containing_scope) {
19386
- return containing_scope->contains(name);
19387
- } else {
19388
- return false;
19389
- }
19390
- }
19391
- return true;
19392
- }
19393
-
19394
- /** How many nested definitions of a single name exist? */
19395
- size_t count(const std::string &name) const {
19396
- auto it = table.find(name);
19397
- if (it == table.end()) {
19398
- return 0;
19399
- } else {
19400
- return it->second.size();
19401
- }
19402
- }
19403
-
19404
- /** How many distinct names exist (does not count nested definitions of the same name) */
19405
- size_t size() const {
19406
- return table.size();
19407
- }
19408
-
19409
- struct PushToken {
19410
- typename std::map<std::string, SmallStack<T>>::iterator iter;
19411
- };
19412
-
19413
- /** Add a new (name, value) pair to the current scope. Hide old values that
19414
- * have this name until we pop this name. Returns a token that can be used
19415
- * to pop the same value without doing a fresh lookup.
19416
- */
19417
- template<typename T2 = T,
19418
- typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
19419
- PushToken push(const std::string &name, T2 &&value) {
19420
- auto it = table.try_emplace(name).first;
19421
- it->second.push(std::forward<T2>(value));
19422
- return PushToken{it};
19423
- }
19424
-
19425
- template<typename T2 = T,
19426
- typename = typename std::enable_if<std::is_same<T2, void>::value>::type>
19427
- PushToken push(const std::string &name) {
19428
- auto it = table.try_emplace(name).first;
19429
- it->second.push();
19430
- return PushToken{it};
19431
- }
19432
-
19433
- /** A name goes out of scope. Restore whatever its old value
19434
- * was (or remove it entirely if there was nothing else of the
19435
- * same name in an outer scope) */
19436
- void pop(const std::string &name) {
19437
- typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
19438
- internal_assert(iter != table.end()) << "Name not in Scope: " << name << "\n"
19439
- << *this << "\n";
19440
- iter->second.pop();
19441
- if (iter->second.empty()) {
19442
- table.erase(iter);
19443
- }
19444
- }
19445
-
19446
- /** Pop a name using a token returned by push instead of a string. */
19447
- void pop(PushToken p) {
19448
- p.iter->second.pop();
19449
- if (p.iter->second.empty()) {
19450
- table.erase(p.iter);
19451
- }
19452
- }
19453
-
19454
- /** Iterate through the scope. Does not capture any containing scope. */
19455
- class const_iterator {
19456
- typename std::map<std::string, SmallStack<T>>::const_iterator iter;
19457
-
19458
- public:
19459
- explicit const_iterator(const typename std::map<std::string, SmallStack<T>>::const_iterator &i)
19460
- : iter(i) {
19461
- }
19462
-
19463
- const_iterator() = default;
19464
-
19465
- bool operator!=(const const_iterator &other) {
19466
- return iter != other.iter;
19467
- }
19468
-
19469
- void operator++() {
19470
- ++iter;
19471
- }
19472
-
19473
- const std::string &name() {
19474
- return iter->first;
19475
- }
19476
-
19477
- const SmallStack<T> &stack() {
19478
- return iter->second;
19479
- }
19480
-
19481
- template<typename T2 = T,
19482
- typename = typename std::enable_if<!std::is_same<T2, void>::value>::type>
19483
- const T2 &value() {
19484
- return iter->second.top_ref();
19485
- }
19486
- };
19487
-
19488
- const_iterator cbegin() const {
19489
- return const_iterator(table.begin());
19490
- }
19491
-
19492
- const_iterator cend() const {
19493
- return const_iterator(table.end());
19494
- }
19495
-
19496
- void swap(Scope<T> &other) noexcept {
19497
- table.swap(other.table);
19498
- std::swap(containing_scope, other.containing_scope);
19499
- }
19500
- };
19501
-
19502
- template<typename T>
19503
- std::ostream &operator<<(std::ostream &stream, const Scope<T> &s) {
19504
- stream << "{\n";
19505
- typename Scope<T>::const_iterator iter;
19506
- for (iter = s.cbegin(); iter != s.cend(); ++iter) {
19507
- stream << " " << iter.name() << "\n";
19508
- }
19509
- stream << "}";
19510
- return stream;
19511
- }
19512
-
19513
- /** Helper class for pushing/popping Scope<> values, to allow
19514
- * for early-exit in Visitor/Mutators that preserves correctness.
19515
- * Note that this name can be a bit confusing, since there are two "scopes"
19516
- * involved here:
19517
- * - the Scope object itself
19518
- * - the lifetime of this helper object
19519
- * The "Scoped" in this class name refers to the latter, as it temporarily binds
19520
- * a name within the scope of this helper's lifetime. */
19521
- template<typename T = void>
19522
- struct ScopedBinding {
19523
- Scope<T> *scope = nullptr;
19524
- typename Scope<T>::PushToken token;
19525
-
19526
- ScopedBinding() = default;
19527
-
19528
- ScopedBinding(Scope<T> &s, const std::string &n, T value)
19529
- : scope(&s), token(scope->push(n, std::move(value))) {
19530
- }
19531
-
19532
- ScopedBinding(bool condition, Scope<T> &s, const std::string &n, const T &value)
19533
- : scope(condition ? &s : nullptr),
19534
- token(condition ? scope->push(n, value) : typename Scope<T>::PushToken{}) {
19535
- }
19536
-
19537
- bool bound() const {
19538
- return scope != nullptr;
19539
- }
19540
-
19541
- ~ScopedBinding() {
19542
- if (scope) {
19543
- scope->pop(token);
19544
- }
19545
- }
19546
-
19547
- // allow move but not copy
19548
- ScopedBinding(const ScopedBinding &that) = delete;
19549
- ScopedBinding(ScopedBinding &&that) noexcept
19550
- : scope(that.scope),
19551
- token(that.token) {
19552
- // The move constructor must null out scope, so we don't try to pop it
19553
- that.scope = nullptr;
19554
- }
19555
-
19556
- void operator=(const ScopedBinding &that) = delete;
19557
- void operator=(ScopedBinding &&that) = delete;
19558
- };
19559
-
19560
- template<>
19561
- struct ScopedBinding<void> {
19562
- Scope<> *scope;
19563
- Scope<>::PushToken token;
19564
- ScopedBinding(Scope<> &s, const std::string &n)
19565
- : scope(&s), token(scope->push(n)) {
19566
- }
19567
- ScopedBinding(bool condition, Scope<> &s, const std::string &n)
19568
- : scope(condition ? &s : nullptr),
19569
- token(condition ? scope->push(n) : Scope<>::PushToken{}) {
19570
- }
19571
- ~ScopedBinding() {
19572
- if (scope) {
19573
- scope->pop(token);
19574
- }
19575
- }
19576
-
19577
- // allow move but not copy
19578
- ScopedBinding(const ScopedBinding &that) = delete;
19579
- ScopedBinding(ScopedBinding &&that) noexcept
19580
- : scope(that.scope),
19581
- token(that.token) {
19582
- // The move constructor must null out scope, so we don't try to pop it
19583
- that.scope = nullptr;
19584
- }
19585
-
19586
- void operator=(const ScopedBinding &that) = delete;
19587
- void operator=(ScopedBinding &&that) = delete;
19588
- };
19589
-
19590
- } // namespace Internal
19591
- } // namespace Halide
19592
-
19593
19894
  #endif
19594
19895
 
19595
19896
  namespace Halide {
@@ -20029,7 +20330,7 @@ bool graph_equal(const IRNode &a, const IRNode &b) {
20029
20330
  } else if (a.node_type != b.node_type) {
20030
20331
  return false;
20031
20332
  } else {
20032
- return equal_impl(a, b);
20333
+ return graph_equal_impl(a, b);
20033
20334
  }
20034
20335
  }
20035
20336
 
@@ -20042,7 +20343,7 @@ bool graph_equal(const IRHandle &a, const IRHandle &b) {
20042
20343
  } else if (!b.defined()) {
20043
20344
  return false;
20044
20345
  } else {
20045
- return equal(*(a.get()), *(b.get()));
20346
+ return graph_equal(*(a.get()), *(b.get()));
20046
20347
  }
20047
20348
  }
20048
20349
 
@@ -20438,8 +20739,10 @@ protected:
20438
20739
  // @}
20439
20740
 
20440
20741
  private:
20441
- /** The nodes visited so far */
20442
- std::set<IRHandle> visited;
20742
+ /** The nodes visited so far. Only includes nodes with a ref count greater
20743
+ * than one, because we know that nodes with a ref count of 1 will only be
20744
+ * visited once if their parents are only visited once. */
20745
+ std::set<const IRNode *> visited;
20443
20746
 
20444
20747
  protected:
20445
20748
  /** These methods should call 'include' on the children to only
@@ -20822,360 +21125,6 @@ void propagate_estimate_test();
20822
21125
  } // namespace Internal
20823
21126
  } // namespace Halide
20824
21127
 
20825
- #endif
20826
- #ifndef HALIDE_BOUNDARY_CONDITIONS_H
20827
- #define HALIDE_BOUNDARY_CONDITIONS_H
20828
-
20829
- /** \file
20830
- * Support for imposing boundary conditions on Halide::Funcs.
20831
- */
20832
-
20833
- #include <vector>
20834
-
20835
- #ifndef HALIDE_LAMBDA_H
20836
- #define HALIDE_LAMBDA_H
20837
-
20838
-
20839
- /** \file
20840
- * Convenience functions for creating small anonymous Halide
20841
- * functions. See test/lambda.cpp for example usage. */
20842
-
20843
- namespace Halide {
20844
-
20845
- /** Create a zero-dimensional halide function that returns the given
20846
- * expression. The function may have more dimensions if the expression
20847
- * contains implicit arguments. */
20848
- Func lambda(const Expr &e);
20849
-
20850
- /** Create a 1-D halide function in the first argument that returns
20851
- * the second argument. The function may have more dimensions if the
20852
- * expression contains implicit arguments and the list of Var
20853
- * arguments contains a placeholder ("_"). */
20854
- Func lambda(const Var &x, const Expr &e);
20855
-
20856
- /** Create a 2-D halide function in the first two arguments that
20857
- * returns the last argument. The function may have more dimensions if
20858
- * the expression contains implicit arguments and the list of Var
20859
- * arguments contains a placeholder ("_"). */
20860
- Func lambda(const Var &x, const Var &y, const Expr &e);
20861
-
20862
- /** Create a 3-D halide function in the first three arguments that
20863
- * returns the last argument. The function may have more dimensions
20864
- * if the expression contains implicit arguments and the list of Var
20865
- * arguments contains a placeholder ("_"). */
20866
- Func lambda(const Var &x, const Var &y, const Var &z, const Expr &e);
20867
-
20868
- /** Create a 4-D halide function in the first four arguments that
20869
- * returns the last argument. The function may have more dimensions if
20870
- * the expression contains implicit arguments and the list of Var
20871
- * arguments contains a placeholder ("_"). */
20872
- Func lambda(const Var &x, const Var &y, const Var &z, const Var &w, const Expr &e);
20873
-
20874
- /** Create a 5-D halide function in the first five arguments that
20875
- * returns the last argument. The function may have more dimensions if
20876
- * the expression contains implicit arguments and the list of Var
20877
- * arguments contains a placeholder ("_"). */
20878
- Func lambda(const Var &x, const Var &y, const Var &z, const Var &w, const Var &v, const Expr &e);
20879
-
20880
- } // namespace Halide
20881
-
20882
- #endif // HALIDE_LAMBDA_H
20883
-
20884
- namespace Halide {
20885
-
20886
- /** namespace to hold functions for imposing boundary conditions on
20887
- * Halide Funcs.
20888
- *
20889
- * All functions in this namespace transform a source Func to a
20890
- * result Func where the result produces the values of the source
20891
- * within a given region and a different set of values outside the
20892
- * given region. A region is an N dimensional box specified by
20893
- * mins and extents.
20894
- *
20895
- * Three areas are defined:
20896
- * The image is the entire set of values in the region.
20897
- * The edge is the set of pixels in the image but adjacent
20898
- * to coordinates that are not
20899
- * The interior is the image minus the edge (and is undefined
20900
- * if the extent of any region is 1 or less).
20901
- *
20902
- * If the source Func has more dimensions than are specified, the extra ones
20903
- * are unmodified. Additionally, passing an undefined (default constructed)
20904
- * 'Expr' for the min and extent of a dimension will keep that dimension
20905
- * unmodified.
20906
- *
20907
- * Numerous options for specifing the outside area are provided,
20908
- * including replacement with an expression, repeating the edge
20909
- * samples, mirroring over the edge, and repeating or mirroring the
20910
- * entire image.
20911
- *
20912
- * Using these functions to express your boundary conditions is highly
20913
- * recommended for correctness and performance. Some of these are hard
20914
- * to get right. The versions here are both understood by bounds
20915
- * inference, and also judiciously use the 'likely' intrinsic to minimize
20916
- * runtime overhead.
20917
- *
20918
- */
20919
- namespace BoundaryConditions {
20920
-
20921
- namespace Internal {
20922
-
20923
- inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
20924
- const Expr &a1, const Expr &a2) {
20925
- collected_args.emplace_back(a1, a2);
20926
- }
20927
-
20928
- template<typename... Args>
20929
- inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
20930
- const Expr &a1, const Expr &a2, Args &&...args) {
20931
- collected_args.emplace_back(a1, a2);
20932
- collect_region(collected_args, std::forward<Args>(args)...);
20933
- }
20934
-
20935
- inline const Func &func_like_to_func(const Func &func) {
20936
- return func;
20937
- }
20938
-
20939
- template<typename T>
20940
- inline HALIDE_NO_USER_CODE_INLINE Func func_like_to_func(const T &func_like) {
20941
- return lambda(_, func_like(_));
20942
- }
20943
-
20944
- } // namespace Internal
20945
-
20946
- /** Impose a boundary condition such that a given expression is returned
20947
- * everywhere outside the boundary. Generally the expression will be a
20948
- * constant, though the code currently allows accessing the arguments
20949
- * of source.
20950
- *
20951
- * An ImageParam, Buffer<T>, or similar can be passed instead of a
20952
- * Func. If this is done and no bounds are given, the boundaries will
20953
- * be taken from the min and extent methods of the passed
20954
- * object. Note that objects are taken by mutable ref. Pipelines
20955
- * capture Buffers via mutable refs, because running a pipeline might
20956
- * alter the Buffer metadata (e.g. device allocation state).
20957
- *
20958
- * (This is similar to setting GL_TEXTURE_WRAP_* to GL_CLAMP_TO_BORDER
20959
- * and putting value in the border of the texture.)
20960
- *
20961
- * You may pass undefined Exprs for dimensions that you do not wish
20962
- * to bound.
20963
- */
20964
- // @{
20965
- Func constant_exterior(const Func &source, const Tuple &value,
20966
- const Region &bounds);
20967
- Func constant_exterior(const Func &source, const Expr &value,
20968
- const Region &bounds);
20969
-
20970
- template<typename T>
20971
- HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value, const Region &bounds) {
20972
- return constant_exterior(Internal::func_like_to_func(func_like), value, bounds);
20973
- }
20974
-
20975
- template<typename T>
20976
- HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value, const Region &bounds) {
20977
- return constant_exterior(Internal::func_like_to_func(func_like), value, bounds);
20978
- }
20979
-
20980
- template<typename T>
20981
- HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value) {
20982
- Region object_bounds;
20983
- for (int i = 0; i < func_like.dimensions(); i++) {
20984
- object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
20985
- }
20986
-
20987
- return constant_exterior(Internal::func_like_to_func(func_like), value, object_bounds);
20988
- }
20989
- template<typename T>
20990
- HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value) {
20991
- return constant_exterior(func_like, Tuple(value));
20992
- }
20993
-
20994
- template<typename T, typename... Bounds,
20995
- typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
20996
- HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value,
20997
- Bounds &&...bounds) {
20998
- Region collected_bounds;
20999
- Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
21000
- return constant_exterior(Internal::func_like_to_func(func_like), value, collected_bounds);
21001
- }
21002
- template<typename T, typename... Bounds,
21003
- typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
21004
- HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value,
21005
- Bounds &&...bounds) {
21006
- return constant_exterior(func_like, Tuple(value), std::forward<Bounds>(bounds)...);
21007
- }
21008
- // @}
21009
-
21010
- /** Impose a boundary condition such that the nearest edge sample is returned
21011
- * everywhere outside the given region.
21012
- *
21013
- * An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
21014
- * is done and no bounds are given, the boundaries will be taken from the
21015
- * min and extent methods of the passed object.
21016
- *
21017
- * (This is similar to setting GL_TEXTURE_WRAP_* to GL_CLAMP_TO_EDGE.)
21018
- *
21019
- * You may pass undefined Exprs for dimensions that you do not wish
21020
- * to bound.
21021
- */
21022
- // @{
21023
- Func repeat_edge(const Func &source, const Region &bounds);
21024
-
21025
- template<typename T>
21026
- HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like, const Region &bounds) {
21027
- return repeat_edge(Internal::func_like_to_func(func_like), bounds);
21028
- }
21029
-
21030
- template<typename T>
21031
- HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like) {
21032
- Region object_bounds;
21033
- for (int i = 0; i < func_like.dimensions(); i++) {
21034
- object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21035
- }
21036
-
21037
- return repeat_edge(Internal::func_like_to_func(func_like), object_bounds);
21038
- }
21039
- // @}
21040
-
21041
- /** Impose a boundary condition such that the entire coordinate space is
21042
- * tiled with copies of the image abutted against each other.
21043
- *
21044
- * An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
21045
- * is done and no bounds are given, the boundaries will be taken from the
21046
- * min and extent methods of the passed object.
21047
- *
21048
- * (This is similar to setting GL_TEXTURE_WRAP_* to GL_REPEAT.)
21049
- *
21050
- * You may pass undefined Exprs for dimensions that you do not wish
21051
- * to bound.
21052
- */
21053
- // @{
21054
- Func repeat_image(const Func &source, const Region &bounds);
21055
-
21056
- template<typename T>
21057
- HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like, const Region &bounds) {
21058
- return repeat_image(Internal::func_like_to_func(func_like), bounds);
21059
- }
21060
-
21061
- template<typename T>
21062
- HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like) {
21063
- Region object_bounds;
21064
- for (int i = 0; i < func_like.dimensions(); i++) {
21065
- object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21066
- }
21067
-
21068
- return repeat_image(Internal::func_like_to_func(func_like), object_bounds);
21069
- }
21070
-
21071
- /** Impose a boundary condition such that the entire coordinate space is
21072
- * tiled with copies of the image abutted against each other, but mirror
21073
- * them such that adjacent edges are the same.
21074
- *
21075
- * An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
21076
- * is done and no bounds are given, the boundaries will be taken from the
21077
- * min and extent methods of the passed object.
21078
- *
21079
- * (This is similar to setting GL_TEXTURE_WRAP_* to GL_MIRRORED_REPEAT.)
21080
- *
21081
- * You may pass undefined Exprs for dimensions that you do not wish
21082
- * to bound.
21083
- */
21084
- // @{
21085
- Func mirror_image(const Func &source, const Region &bounds);
21086
-
21087
- template<typename T>
21088
- HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like, const Region &bounds) {
21089
- return mirror_image(Internal::func_like_to_func(func_like), bounds);
21090
- }
21091
-
21092
- template<typename T>
21093
- HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like) {
21094
- Region object_bounds;
21095
- for (int i = 0; i < func_like.dimensions(); i++) {
21096
- object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21097
- }
21098
-
21099
- return mirror_image(Internal::func_like_to_func(func_like), object_bounds);
21100
- }
21101
-
21102
- // @}
21103
-
21104
- /** Impose a boundary condition such that the entire coordinate space is
21105
- * tiled with copies of the image abutted against each other, but mirror
21106
- * them such that adjacent edges are the same and then overlap the edges.
21107
- *
21108
- * This produces an error if any extent is 1 or less. (TODO: check this.)
21109
- *
21110
- * An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
21111
- * is done and no bounds are given, the boundaries will be taken from the
21112
- * min and extent methods of the passed object.
21113
- *
21114
- * (I do not believe there is a direct GL_TEXTURE_WRAP_* equivalent for this.)
21115
- *
21116
- * You may pass undefined Exprs for dimensions that you do not wish
21117
- * to bound.
21118
- */
21119
- // @{
21120
- Func mirror_interior(const Func &source, const Region &bounds);
21121
-
21122
- template<typename T>
21123
- HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like, const Region &bounds) {
21124
- return mirror_interior(Internal::func_like_to_func(func_like), bounds);
21125
- }
21126
-
21127
- template<typename T>
21128
- HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like) {
21129
- Region object_bounds;
21130
- for (int i = 0; i < func_like.dimensions(); i++) {
21131
- object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21132
- }
21133
-
21134
- return mirror_interior(Internal::func_like_to_func(func_like), object_bounds);
21135
- }
21136
-
21137
- // @}
21138
-
21139
- } // namespace BoundaryConditions
21140
-
21141
- } // namespace Halide
21142
-
21143
- #endif
21144
- #ifndef HALIDE_BOUNDS_INFERENCE_H
21145
- #define HALIDE_BOUNDS_INFERENCE_H
21146
-
21147
- /** \file
21148
- * Defines the bounds_inference lowering pass.
21149
- */
21150
-
21151
- #include <map>
21152
- #include <string>
21153
- #include <vector>
21154
-
21155
-
21156
- namespace Halide {
21157
-
21158
- struct Target;
21159
-
21160
- namespace Internal {
21161
-
21162
- class Function;
21163
-
21164
- /** Take a partially lowered statement that includes symbolic
21165
- * representations of the bounds over which things should be realized,
21166
- * and inject expressions defining those bounds.
21167
- */
21168
- Stmt bounds_inference(Stmt,
21169
- const std::vector<Function> &outputs,
21170
- const std::vector<std::string> &realization_order,
21171
- const std::vector<std::vector<std::string>> &fused_groups,
21172
- const std::map<std::string, Function> &environment,
21173
- const std::map<std::pair<std::string, int>, Interval> &func_bounds,
21174
- const Target &target);
21175
-
21176
- } // namespace Internal
21177
- } // namespace Halide
21178
-
21179
21128
  #endif
21180
21129
  #ifndef HALIDE_BOUND_CONSTANT_EXTENT_LOOPS_H
21181
21130
  #define HALIDE_BOUND_CONSTANT_EXTENT_LOOPS_H
@@ -21223,6 +21172,431 @@ Stmt bound_small_allocations(const Stmt &s);
21223
21172
  } // namespace Internal
21224
21173
  } // namespace Halide
21225
21174
 
21175
+ #endif
21176
+ #ifndef HALIDE_BOUNDARY_CONDITIONS_H
21177
+ #define HALIDE_BOUNDARY_CONDITIONS_H
21178
+
21179
+ /** \file
21180
+ * Support for imposing boundary conditions on Halide::Funcs.
21181
+ */
21182
+
21183
+ #include <vector>
21184
+
21185
+ #ifndef HALIDE_LAMBDA_H
21186
+ #define HALIDE_LAMBDA_H
21187
+
21188
+
21189
+ /** \file
21190
+ * Convenience functions for creating small anonymous Halide
21191
+ * functions. See test/lambda.cpp for example usage. */
21192
+
21193
+ namespace Halide {
21194
+
21195
+ /** Create a zero-dimensional halide function that returns the given
21196
+ * expression. The function may have more dimensions if the expression
21197
+ * contains implicit arguments. */
21198
+ Func lambda(const Expr &e);
21199
+
21200
+ /** Create a 1-D halide function in the first argument that returns
21201
+ * the second argument. The function may have more dimensions if the
21202
+ * expression contains implicit arguments and the list of Var
21203
+ * arguments contains a placeholder ("_"). */
21204
+ Func lambda(const Var &x, const Expr &e);
21205
+
21206
+ /** Create a 2-D halide function in the first two arguments that
21207
+ * returns the last argument. The function may have more dimensions if
21208
+ * the expression contains implicit arguments and the list of Var
21209
+ * arguments contains a placeholder ("_"). */
21210
+ Func lambda(const Var &x, const Var &y, const Expr &e);
21211
+
21212
+ /** Create a 3-D halide function in the first three arguments that
21213
+ * returns the last argument. The function may have more dimensions
21214
+ * if the expression contains implicit arguments and the list of Var
21215
+ * arguments contains a placeholder ("_"). */
21216
+ Func lambda(const Var &x, const Var &y, const Var &z, const Expr &e);
21217
+
21218
+ /** Create a 4-D halide function in the first four arguments that
21219
+ * returns the last argument. The function may have more dimensions if
21220
+ * the expression contains implicit arguments and the list of Var
21221
+ * arguments contains a placeholder ("_"). */
21222
+ Func lambda(const Var &x, const Var &y, const Var &z, const Var &w, const Expr &e);
21223
+
21224
+ /** Create a 5-D halide function in the first five arguments that
21225
+ * returns the last argument. The function may have more dimensions if
21226
+ * the expression contains implicit arguments and the list of Var
21227
+ * arguments contains a placeholder ("_"). */
21228
+ Func lambda(const Var &x, const Var &y, const Var &z, const Var &w, const Var &v, const Expr &e);
21229
+
21230
+ } // namespace Halide
21231
+
21232
+ #endif // HALIDE_LAMBDA_H
21233
+
21234
+ namespace Halide {
21235
+
21236
+ /** namespace to hold functions for imposing boundary conditions on
21237
+ * Halide Funcs.
21238
+ *
21239
+ * All functions in this namespace transform a source Func to a
21240
+ * result Func where the result produces the values of the source
21241
+ * within a given region and a different set of values outside the
21242
+ * given region. A region is an N dimensional box specified by
21243
+ * mins and extents.
21244
+ *
21245
+ * Three areas are defined:
21246
+ * The image is the entire set of values in the region.
21247
+ * The edge is the set of pixels in the image but adjacent
21248
+ * to coordinates that are not
21249
+ * The interior is the image minus the edge (and is undefined
21250
+ * if the extent of any region is 1 or less).
21251
+ *
21252
+ * If the source Func has more dimensions than are specified, the extra ones
21253
+ * are unmodified. Additionally, passing an undefined (default constructed)
21254
+ * 'Expr' for the min and extent of a dimension will keep that dimension
21255
+ * unmodified.
21256
+ *
21257
+ * Numerous options for specifing the outside area are provided,
21258
+ * including replacement with an expression, repeating the edge
21259
+ * samples, mirroring over the edge, and repeating or mirroring the
21260
+ * entire image.
21261
+ *
21262
+ * Using these functions to express your boundary conditions is highly
21263
+ * recommended for correctness and performance. Some of these are hard
21264
+ * to get right. The versions here are both understood by bounds
21265
+ * inference, and also judiciously use the 'likely' intrinsic to minimize
21266
+ * runtime overhead.
21267
+ *
21268
+ */
21269
+ namespace BoundaryConditions {
21270
+
21271
+ namespace Internal {
21272
+
21273
+ inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
21274
+ const Expr &a1, const Expr &a2) {
21275
+ collected_args.emplace_back(a1, a2);
21276
+ }
21277
+
21278
+ template<typename... Args>
21279
+ inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
21280
+ const Expr &a1, const Expr &a2, Args &&...args) {
21281
+ collected_args.emplace_back(a1, a2);
21282
+ collect_region(collected_args, std::forward<Args>(args)...);
21283
+ }
21284
+
21285
+ inline const Func &func_like_to_func(const Func &func) {
21286
+ return func;
21287
+ }
21288
+
21289
+ template<typename T>
21290
+ inline HALIDE_NO_USER_CODE_INLINE Func func_like_to_func(const T &func_like) {
21291
+ return lambda(_, func_like(_));
21292
+ }
21293
+
21294
+ } // namespace Internal
21295
+
21296
+ /** Impose a boundary condition such that a given expression is returned
21297
+ * everywhere outside the boundary. Generally the expression will be a
21298
+ * constant, though the code currently allows accessing the arguments
21299
+ * of source.
21300
+ *
21301
+ * An ImageParam, Buffer<T>, or similar can be passed instead of a
21302
+ * Func. If this is done and no bounds are given, the boundaries will
21303
+ * be taken from the min and extent methods of the passed
21304
+ * object. Note that objects are taken by mutable ref. Pipelines
21305
+ * capture Buffers via mutable refs, because running a pipeline might
21306
+ * alter the Buffer metadata (e.g. device allocation state).
21307
+ *
21308
+ * (This is similar to setting GL_TEXTURE_WRAP_* to GL_CLAMP_TO_BORDER
21309
+ * and putting value in the border of the texture.)
21310
+ *
21311
+ * You may pass undefined Exprs for dimensions that you do not wish
21312
+ * to bound.
21313
+ */
21314
+ // @{
21315
+ Func constant_exterior(const Func &source, const Tuple &value,
21316
+ const Region &bounds);
21317
+ Func constant_exterior(const Func &source, const Expr &value,
21318
+ const Region &bounds);
21319
+
21320
+ template<typename T>
21321
+ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value, const Region &bounds) {
21322
+ return constant_exterior(Internal::func_like_to_func(func_like), value, bounds);
21323
+ }
21324
+
21325
+ template<typename T>
21326
+ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value, const Region &bounds) {
21327
+ return constant_exterior(Internal::func_like_to_func(func_like), value, bounds);
21328
+ }
21329
+
21330
+ template<typename T>
21331
+ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value) {
21332
+ Region object_bounds;
21333
+ for (int i = 0; i < func_like.dimensions(); i++) {
21334
+ object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21335
+ }
21336
+
21337
+ return constant_exterior(Internal::func_like_to_func(func_like), value, object_bounds);
21338
+ }
21339
+ template<typename T>
21340
+ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value) {
21341
+ return constant_exterior(func_like, Tuple(value));
21342
+ }
21343
+
21344
+ template<typename T, typename... Bounds,
21345
+ typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
21346
+ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value,
21347
+ Bounds &&...bounds) {
21348
+ Region collected_bounds;
21349
+ Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
21350
+ return constant_exterior(Internal::func_like_to_func(func_like), value, collected_bounds);
21351
+ }
21352
+ template<typename T, typename... Bounds,
21353
+ typename std::enable_if<Halide::Internal::all_are_convertible<Expr, Bounds...>::value>::type * = nullptr>
21354
+ HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value,
21355
+ Bounds &&...bounds) {
21356
+ return constant_exterior(func_like, Tuple(value), std::forward<Bounds>(bounds)...);
21357
+ }
21358
+ // @}
21359
+
21360
+ /** Impose a boundary condition such that the nearest edge sample is returned
21361
+ * everywhere outside the given region.
21362
+ *
21363
+ * An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
21364
+ * is done and no bounds are given, the boundaries will be taken from the
21365
+ * min and extent methods of the passed object.
21366
+ *
21367
+ * (This is similar to setting GL_TEXTURE_WRAP_* to GL_CLAMP_TO_EDGE.)
21368
+ *
21369
+ * You may pass undefined Exprs for dimensions that you do not wish
21370
+ * to bound.
21371
+ */
21372
+ // @{
21373
+ Func repeat_edge(const Func &source, const Region &bounds);
21374
+
21375
+ template<typename T>
21376
+ HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like, const Region &bounds) {
21377
+ return repeat_edge(Internal::func_like_to_func(func_like), bounds);
21378
+ }
21379
+
21380
+ template<typename T>
21381
+ HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like) {
21382
+ Region object_bounds;
21383
+ for (int i = 0; i < func_like.dimensions(); i++) {
21384
+ object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21385
+ }
21386
+
21387
+ return repeat_edge(Internal::func_like_to_func(func_like), object_bounds);
21388
+ }
21389
+ // @}
21390
+
21391
+ /** Impose a boundary condition such that the entire coordinate space is
21392
+ * tiled with copies of the image abutted against each other.
21393
+ *
21394
+ * An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
21395
+ * is done and no bounds are given, the boundaries will be taken from the
21396
+ * min and extent methods of the passed object.
21397
+ *
21398
+ * (This is similar to setting GL_TEXTURE_WRAP_* to GL_REPEAT.)
21399
+ *
21400
+ * You may pass undefined Exprs for dimensions that you do not wish
21401
+ * to bound.
21402
+ */
21403
+ // @{
21404
+ Func repeat_image(const Func &source, const Region &bounds);
21405
+
21406
+ template<typename T>
21407
+ HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like, const Region &bounds) {
21408
+ return repeat_image(Internal::func_like_to_func(func_like), bounds);
21409
+ }
21410
+
21411
+ template<typename T>
21412
+ HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like) {
21413
+ Region object_bounds;
21414
+ for (int i = 0; i < func_like.dimensions(); i++) {
21415
+ object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21416
+ }
21417
+
21418
+ return repeat_image(Internal::func_like_to_func(func_like), object_bounds);
21419
+ }
21420
+
21421
+ /** Impose a boundary condition such that the entire coordinate space is
21422
+ * tiled with copies of the image abutted against each other, but mirror
21423
+ * them such that adjacent edges are the same.
21424
+ *
21425
+ * An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
21426
+ * is done and no bounds are given, the boundaries will be taken from the
21427
+ * min and extent methods of the passed object.
21428
+ *
21429
+ * (This is similar to setting GL_TEXTURE_WRAP_* to GL_MIRRORED_REPEAT.)
21430
+ *
21431
+ * You may pass undefined Exprs for dimensions that you do not wish
21432
+ * to bound.
21433
+ */
21434
+ // @{
21435
+ Func mirror_image(const Func &source, const Region &bounds);
21436
+
21437
+ template<typename T>
21438
+ HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like, const Region &bounds) {
21439
+ return mirror_image(Internal::func_like_to_func(func_like), bounds);
21440
+ }
21441
+
21442
+ template<typename T>
21443
+ HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like) {
21444
+ Region object_bounds;
21445
+ for (int i = 0; i < func_like.dimensions(); i++) {
21446
+ object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21447
+ }
21448
+
21449
+ return mirror_image(Internal::func_like_to_func(func_like), object_bounds);
21450
+ }
21451
+
21452
+ // @}
21453
+
21454
+ /** Impose a boundary condition such that the entire coordinate space is
21455
+ * tiled with copies of the image abutted against each other, but mirror
21456
+ * them such that adjacent edges are the same and then overlap the edges.
21457
+ *
21458
+ * This produces an error if any extent is 1 or less. (TODO: check this.)
21459
+ *
21460
+ * An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
21461
+ * is done and no bounds are given, the boundaries will be taken from the
21462
+ * min and extent methods of the passed object.
21463
+ *
21464
+ * (I do not believe there is a direct GL_TEXTURE_WRAP_* equivalent for this.)
21465
+ *
21466
+ * You may pass undefined Exprs for dimensions that you do not wish
21467
+ * to bound.
21468
+ */
21469
+ // @{
21470
+ Func mirror_interior(const Func &source, const Region &bounds);
21471
+
21472
+ template<typename T>
21473
+ HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like, const Region &bounds) {
21474
+ return mirror_interior(Internal::func_like_to_func(func_like), bounds);
21475
+ }
21476
+
21477
+ template<typename T>
21478
+ HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like) {
21479
+ Region object_bounds;
21480
+ for (int i = 0; i < func_like.dimensions(); i++) {
21481
+ object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
21482
+ }
21483
+
21484
+ return mirror_interior(Internal::func_like_to_func(func_like), object_bounds);
21485
+ }
21486
+
21487
+ // @}
21488
+
21489
+ } // namespace BoundaryConditions
21490
+
21491
+ } // namespace Halide
21492
+
21493
+ #endif
21494
+ #ifndef HALIDE_BOUNDS_INFERENCE_H
21495
+ #define HALIDE_BOUNDS_INFERENCE_H
21496
+
21497
+ /** \file
21498
+ * Defines the bounds_inference lowering pass.
21499
+ */
21500
+
21501
+ #include <map>
21502
+ #include <string>
21503
+ #include <vector>
21504
+
21505
+
21506
+ namespace Halide {
21507
+
21508
+ struct Target;
21509
+
21510
+ namespace Internal {
21511
+
21512
+ class Function;
21513
+
21514
+ /** Take a partially lowered statement that includes symbolic
21515
+ * representations of the bounds over which things should be realized,
21516
+ * and inject expressions defining those bounds.
21517
+ */
21518
+ Stmt bounds_inference(Stmt,
21519
+ const std::vector<Function> &outputs,
21520
+ const std::vector<std::string> &realization_order,
21521
+ const std::vector<std::vector<std::string>> &fused_groups,
21522
+ const std::map<std::string, Function> &environment,
21523
+ const std::map<std::pair<std::string, int>, Interval> &func_bounds,
21524
+ const Target &target);
21525
+
21526
+ } // namespace Internal
21527
+ } // namespace Halide
21528
+
21529
+ #endif
21530
+ #ifndef HALIDE_CPLUSPLUS_MANGLE_H
21531
+ #define HALIDE_CPLUSPLUS_MANGLE_H
21532
+
21533
+ /** \file
21534
+ *
21535
+ * A simple function to get a C++ mangled function name for a function.
21536
+ */
21537
+ #include <string>
21538
+ #include <vector>
21539
+
21540
+
21541
+ namespace Halide {
21542
+
21543
+ struct ExternFuncArgument;
21544
+ struct Target;
21545
+
21546
+ namespace Internal {
21547
+
21548
+ /** Return the mangled C++ name for a function.
21549
+ * The target parameter is used to decide on the C++
21550
+ * ABI/mangling style to use.
21551
+ */
21552
+ std::string cplusplus_function_mangled_name(const std::string &name,
21553
+ const std::vector<std::string> &namespaces,
21554
+ Type return_type,
21555
+ const std::vector<ExternFuncArgument> &args,
21556
+ const Target &target);
21557
+
21558
+ void cplusplus_mangle_test();
21559
+
21560
+ } // namespace Internal
21561
+
21562
+ } // namespace Halide
21563
+
21564
+ #endif
21565
+ #ifndef HALIDE_INTERNAL_CSE_H
21566
+ #define HALIDE_INTERNAL_CSE_H
21567
+
21568
+ /** \file
21569
+ * Defines a pass for introducing let expressions to wrap common sub-expressions. */
21570
+
21571
+
21572
+ namespace Halide {
21573
+ namespace Internal {
21574
+
21575
+ /** Replace each common sub-expression in the argument with a
21576
+ * variable, and wrap the resulting expr in a let statement giving a
21577
+ * value to that variable.
21578
+ *
21579
+ * This is important to do within Halide (instead of punting to llvm),
21580
+ * because exprs that come in from the front-end are small when
21581
+ * considered as a graph, but combinatorially large when considered as
21582
+ * a tree. For an example of a such a case, see
21583
+ * test/code_explosion.cpp
21584
+ *
21585
+ * The last parameter determines whether all common subexpressions are
21586
+ * lifted, or only those that the simplifier would not subsitute back
21587
+ * in (e.g. addition of a constant).
21588
+ */
21589
+ Expr common_subexpression_elimination(const Expr &, bool lift_all = false);
21590
+
21591
+ /** Do common-subexpression-elimination on each expression in a
21592
+ * statement. Does not introduce let statements. */
21593
+ Stmt common_subexpression_elimination(const Stmt &, bool lift_all = false);
21594
+
21595
+ void cse_test();
21596
+
21597
+ } // namespace Internal
21598
+ } // namespace Halide
21599
+
21226
21600
  #endif
21227
21601
  #ifndef HALIDE_CANONICALIZE_GPU_VARS_H
21228
21602
  #define HALIDE_CANONICALIZE_GPU_VARS_H
@@ -21498,6 +21872,24 @@ struct Indentation {
21498
21872
  };
21499
21873
  std::ostream &operator<<(std::ostream &stream, const Indentation &);
21500
21874
 
21875
+ template<typename T>
21876
+ struct Ansi {
21877
+ const T &cnt;
21878
+ const char *open, *close;
21879
+ };
21880
+
21881
+ template<typename T>
21882
+ std::ostream &operator<<(std::ostream &out, const Ansi<T> &a) {
21883
+ if (a.open) {
21884
+ out << a.open;
21885
+ }
21886
+ out << a.cnt;
21887
+ if (a.close) {
21888
+ out << a.close;
21889
+ }
21890
+ return out;
21891
+ }
21892
+
21501
21893
  /** An IRVisitor that emits IR to the given output stream in a human
21502
21894
  * readable form. Can be subclassed if you want to modify the way in
21503
21895
  * which it prints.
@@ -21547,12 +21939,51 @@ protected:
21547
21939
  * ellipses (...). */
21548
21940
  bool is_summary = false;
21549
21941
 
21942
+ bool ansi = false;
21943
+ int paren_depth = 0;
21944
+
21945
+ const char *ansi_hl = "";
21946
+ const char *ansi_dim = "";
21947
+ const char *ansi_kw = "";
21948
+ const char *ansi_imm_int = "";
21949
+ const char *ansi_imm_float = "";
21950
+ const char *ansi_imm_str = "";
21951
+ const char *ansi_var = "";
21952
+ const char *ansi_buf = "";
21953
+ const char *ansi_fn = "";
21954
+ const char *ansi_type = "";
21955
+ const char *ansi_reset_col = "";
21956
+ const char *ansi_reset = "";
21957
+
21958
+ // clang-format off
21959
+ template<typename T> Ansi<T> hl(const T &t);
21960
+ template<typename T> Ansi<T> kw(const T &t);
21961
+ template<typename T> Ansi<T> imm_int(const T &t);
21962
+ template<typename T> Ansi<T> imm_float(const T &t);
21963
+ template<typename T> Ansi<T> imm_str(const T &t);
21964
+ template<typename T> Ansi<T> var(const T &t);
21965
+ template<typename T> Ansi<T> buf(const T &t);
21966
+ template<typename T> Ansi<T> fn(const T &t);
21967
+ template<typename T> Ansi<T> type(const T &t);
21968
+ template<typename T> Ansi<T> typep(const T &t);
21969
+ template<typename T> Ansi<T> paren(const T &t, bool bold = true, int d = -1);
21970
+ // clang-format on
21971
+
21550
21972
  /** Either emits "(" or "", depending on the value of implicit_parens */
21551
21973
  void open();
21552
21974
 
21553
21975
  /** Either emits ")" or "", depending on the value of implicit_parens */
21554
21976
  void close();
21555
21977
 
21978
+ /** Emits "(" always */
21979
+ void openf();
21980
+
21981
+ /** Emits "name(" always */
21982
+ void openf(const char *name);
21983
+
21984
+ /** Emits ")" always */
21985
+ void closef();
21986
+
21556
21987
  /** The symbols whose types can be inferred from values printed
21557
21988
  * already. */
21558
21989
  Scope<> known_type;
@@ -21625,6 +22056,8 @@ std::string lldb_string(const Stmt &);
21625
22056
 
21626
22057
  #endif
21627
22058
 
22059
+ #include <unordered_map>
22060
+
21628
22061
  namespace Halide {
21629
22062
 
21630
22063
  struct Argument;
@@ -21749,6 +22182,8 @@ protected:
21749
22182
  * use different syntax for other C-like languages. */
21750
22183
  virtual void add_vector_typedefs(const std::set<Type> &vector_types);
21751
22184
 
22185
+ std::unordered_map<std::string, std::string> extern_function_name_map;
22186
+
21752
22187
  /** Bottleneck to allow customization of calls to generic Extern/PureExtern calls. */
21753
22188
  virtual std::string print_extern_call(const Call *op);
21754
22189
 
@@ -22153,7 +22588,10 @@ protected:
22153
22588
  void visit(const Shuffle *op) override;
22154
22589
  void visit(const Call *op) override;
22155
22590
 
22591
+ std::string print_extern_call(const Call *op) override;
22592
+
22156
22593
  VectorDeclarationStyle vector_declaration_style = VectorDeclarationStyle::CLikeSyntax;
22594
+ bool abs_returns_unsigned_type{false};
22157
22595
  };
22158
22596
 
22159
22597
  } // namespace Internal
@@ -22292,6 +22730,7 @@ template<typename, typename>
22292
22730
  class IRBuilder;
22293
22731
  class LLVMContext;
22294
22732
  class Type;
22733
+ class PointerType;
22295
22734
  class StructType;
22296
22735
  class Instruction;
22297
22736
  class CallInst;
@@ -22437,10 +22876,31 @@ protected:
22437
22876
  std::unique_ptr<llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>> builder;
22438
22877
  llvm::Value *value = nullptr;
22439
22878
  llvm::MDNode *very_likely_branch = nullptr;
22440
- llvm::MDNode *default_fp_math_md = nullptr;
22879
+ llvm::MDNode *fast_fp_math_md = nullptr;
22441
22880
  llvm::MDNode *strict_fp_math_md = nullptr;
22442
22881
  std::vector<LoweredArgument> current_function_args;
22443
22882
 
22883
+ bool in_strict_float = false;
22884
+ bool any_strict_float = false;
22885
+
22886
+ /** Change floating-point math op emission to use fast flags. */
22887
+ void set_fast_fp_math();
22888
+
22889
+ /** Change floating-point math op emission to use strict flags. */
22890
+ void set_strict_fp_math();
22891
+
22892
+ /** If any_strict_float is true, sets fast math flags for the lifetime of
22893
+ * this object, then sets them to strict on destruction. If any_strict_float
22894
+ * is false, does nothing. Any call to an IRBuilder method that starts with
22895
+ * "CreateF" should probably be wrapped in one of these, but it's safe to
22896
+ * miss one - we just miss out on some optimizations. In this way codegen is
22897
+ * designed to fail safe. */
22898
+ struct ScopedFastMath {
22899
+ CodeGen_LLVM *codegen;
22900
+ ScopedFastMath(CodeGen_LLVM *);
22901
+ ~ScopedFastMath();
22902
+ };
22903
+
22444
22904
  /** The target we're generating code for */
22445
22905
  Halide::Target target;
22446
22906
 
@@ -22478,6 +22938,7 @@ protected:
22478
22938
  /** Some useful llvm types */
22479
22939
  // @{
22480
22940
  llvm::Type *void_t = nullptr, *i1_t = nullptr, *i8_t = nullptr, *i16_t = nullptr, *i32_t = nullptr, *i64_t = nullptr, *f16_t = nullptr, *f32_t = nullptr, *f64_t = nullptr;
22941
+ llvm::PointerType *ptr_t = nullptr;
22481
22942
  llvm::StructType *halide_buffer_t_type = nullptr,
22482
22943
  *type_t_type,
22483
22944
  *dimension_t_type,
@@ -22985,7 +23446,7 @@ private:
22985
23446
 
22986
23447
  void codegen_atomic_rmw(const Store *op);
22987
23448
 
22988
- void init_codegen(const std::string &name, bool any_strict_float = false);
23449
+ void init_codegen(const std::string &name);
22989
23450
  std::unique_ptr<llvm::Module> finish_codegen();
22990
23451
 
22991
23452
  /** A helper routine for generating folded vector reductions. */
@@ -23054,6 +23515,29 @@ std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_OpenCL_Dev(const Target &target);
23054
23515
  } // namespace Internal
23055
23516
  } // namespace Halide
23056
23517
 
23518
+ #endif
23519
+ #ifndef HALIDE_CODEGEN_PTX_DEV_H
23520
+ #define HALIDE_CODEGEN_PTX_DEV_H
23521
+
23522
+ /** \file
23523
+ * Defines the code-generator for producing CUDA host code
23524
+ */
23525
+
23526
+ #include <memory>
23527
+
23528
+ namespace Halide {
23529
+
23530
+ struct Target;
23531
+
23532
+ namespace Internal {
23533
+
23534
+ struct CodeGen_GPU_Dev;
23535
+
23536
+ std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target);
23537
+
23538
+ } // namespace Internal
23539
+ } // namespace Halide
23540
+
23057
23541
  #endif
23058
23542
  #ifndef HALIDE_CODEGEN_POSIX_H
23059
23543
  #define HALIDE_CODEGEN_POSIX_H
@@ -23163,29 +23647,6 @@ private:
23163
23647
  } // namespace Internal
23164
23648
  } // namespace Halide
23165
23649
 
23166
- #endif
23167
- #ifndef HALIDE_CODEGEN_PTX_DEV_H
23168
- #define HALIDE_CODEGEN_PTX_DEV_H
23169
-
23170
- /** \file
23171
- * Defines the code-generator for producing CUDA host code
23172
- */
23173
-
23174
- #include <memory>
23175
-
23176
- namespace Halide {
23177
-
23178
- struct Target;
23179
-
23180
- namespace Internal {
23181
-
23182
- struct CodeGen_GPU_Dev;
23183
-
23184
- std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target);
23185
-
23186
- } // namespace Internal
23187
- } // namespace Halide
23188
-
23189
23650
  #endif
23190
23651
  #ifndef HALIDE_CODEGEN_PYTORCH_H
23191
23652
  #define HALIDE_CODEGEN_PYTORCH_H
@@ -23549,221 +24010,10 @@ inline Expr u64_sat(Expr e) {
23549
24010
  }; // namespace ConciseCasts
23550
24011
  }; // namespace Halide
23551
24012
 
23552
- #endif
23553
- #ifndef HALIDE_CPLUSPLUS_MANGLE_H
23554
- #define HALIDE_CPLUSPLUS_MANGLE_H
23555
-
23556
- /** \file
23557
- *
23558
- * A simple function to get a C++ mangled function name for a function.
23559
- */
23560
- #include <string>
23561
- #include <vector>
23562
-
23563
-
23564
- namespace Halide {
23565
-
23566
- struct ExternFuncArgument;
23567
- struct Target;
23568
-
23569
- namespace Internal {
23570
-
23571
- /** Return the mangled C++ name for a function.
23572
- * The target parameter is used to decide on the C++
23573
- * ABI/mangling style to use.
23574
- */
23575
- std::string cplusplus_function_mangled_name(const std::string &name,
23576
- const std::vector<std::string> &namespaces,
23577
- Type return_type,
23578
- const std::vector<ExternFuncArgument> &args,
23579
- const Target &target);
23580
-
23581
- void cplusplus_mangle_test();
23582
-
23583
- } // namespace Internal
23584
-
23585
- } // namespace Halide
23586
-
23587
24013
  #endif
23588
24014
  #ifndef HALIDE_CONSTANT_BOUNDS_H
23589
24015
  #define HALIDE_CONSTANT_BOUNDS_H
23590
24016
 
23591
- #ifndef HALIDE_CONSTANT_INTERVAL_H
23592
- #define HALIDE_CONSTANT_INTERVAL_H
23593
-
23594
- #include <stdint.h>
23595
-
23596
- /** \file
23597
- * Defines the ConstantInterval class, and operators on it.
23598
- */
23599
-
23600
- namespace Halide {
23601
-
23602
- struct Type;
23603
-
23604
- namespace Internal {
23605
-
23606
- /** A class to represent ranges of integers. Can be unbounded above or below,
23607
- * but they cannot be empty. */
23608
- struct ConstantInterval {
23609
- /** The lower and upper bound of the interval. They are included
23610
- * in the interval. */
23611
- int64_t min = 0, max = 0;
23612
- bool min_defined = false, max_defined = false;
23613
-
23614
- /* A default-constructed Interval is everything */
23615
- ConstantInterval() = default;
23616
-
23617
- /** Construct an interval from a lower and upper bound. */
23618
- ConstantInterval(int64_t min, int64_t max);
23619
-
23620
- /** The interval representing everything. */
23621
- static ConstantInterval everything();
23622
-
23623
- /** Construct an interval representing a single point. */
23624
- static ConstantInterval single_point(int64_t x);
23625
-
23626
- /** Construct intervals bounded above or below. */
23627
- static ConstantInterval bounded_below(int64_t min);
23628
- static ConstantInterval bounded_above(int64_t max);
23629
-
23630
- /** Is the interval the entire range */
23631
- bool is_everything() const;
23632
-
23633
- /** Is the interval just a single value (min == max) */
23634
- bool is_single_point() const;
23635
-
23636
- /** Is the interval a particular single value */
23637
- bool is_single_point(int64_t x) const;
23638
-
23639
- /** Does the interval have a finite upper and lower bound */
23640
- bool is_bounded() const;
23641
-
23642
- /** Expand the interval to include another Interval */
23643
- void include(const ConstantInterval &i);
23644
-
23645
- /** Expand the interval to include a point */
23646
- void include(int64_t x);
23647
-
23648
- /** Test if the interval contains a particular value */
23649
- bool contains(int32_t x) const;
23650
-
23651
- /** Test if the interval contains a particular value */
23652
- bool contains(int64_t x) const;
23653
-
23654
- /** Test if the interval contains a particular unsigned value */
23655
- bool contains(uint64_t x) const;
23656
-
23657
- /** Construct the smallest interval containing two intervals. */
23658
- static ConstantInterval make_union(const ConstantInterval &a, const ConstantInterval &b);
23659
-
23660
- /** Construct the largest interval contained within two intervals. Throws an
23661
- * error if the interval is empty. */
23662
- static ConstantInterval make_intersection(const ConstantInterval &a, const ConstantInterval &b);
23663
-
23664
- /** Equivalent to same_as. Exists so that the autoscheduler can
23665
- * compare two map<string, Interval> for equality in order to
23666
- * cache computations. */
23667
- bool operator==(const ConstantInterval &other) const;
23668
-
23669
- /** In-place versions of the arithmetic operators below. */
23670
- // @{
23671
- void operator+=(const ConstantInterval &other);
23672
- void operator+=(int64_t);
23673
- void operator-=(const ConstantInterval &other);
23674
- void operator-=(int64_t);
23675
- void operator*=(const ConstantInterval &other);
23676
- void operator*=(int64_t);
23677
- void operator/=(const ConstantInterval &other);
23678
- void operator/=(int64_t);
23679
- void operator%=(const ConstantInterval &other);
23680
- void operator%=(int64_t);
23681
- // @}
23682
-
23683
- /** Negate an interval. */
23684
- ConstantInterval operator-() const;
23685
-
23686
- /** Track what happens if a constant integer interval is forced to fit into
23687
- * a concrete integer type. */
23688
- void cast_to(const Type &t);
23689
-
23690
- /** Get constant integer bounds on a type. */
23691
- static ConstantInterval bounds_of_type(Type);
23692
- };
23693
-
23694
- /** Arithmetic operators on ConstantIntervals. The resulting interval contains
23695
- * all possible values of the operator applied to any two elements of the
23696
- * argument intervals. Note that these operator on unbounded integers. If you
23697
- * are applying this to concrete small integer types, you will need to manually
23698
- * cast the constant interval back to the desired type to model the effect of
23699
- * overflow. */
23700
- // @{
23701
- ConstantInterval operator+(const ConstantInterval &a, const ConstantInterval &b);
23702
- ConstantInterval operator+(const ConstantInterval &a, int64_t b);
23703
- ConstantInterval operator-(const ConstantInterval &a, const ConstantInterval &b);
23704
- ConstantInterval operator-(const ConstantInterval &a, int64_t b);
23705
- ConstantInterval operator/(const ConstantInterval &a, const ConstantInterval &b);
23706
- ConstantInterval operator/(const ConstantInterval &a, int64_t b);
23707
- ConstantInterval operator*(const ConstantInterval &a, const ConstantInterval &b);
23708
- ConstantInterval operator*(const ConstantInterval &a, int64_t b);
23709
- ConstantInterval operator%(const ConstantInterval &a, const ConstantInterval &b);
23710
- ConstantInterval operator%(const ConstantInterval &a, int64_t b);
23711
- ConstantInterval min(const ConstantInterval &a, const ConstantInterval &b);
23712
- ConstantInterval min(const ConstantInterval &a, int64_t b);
23713
- ConstantInterval max(const ConstantInterval &a, const ConstantInterval &b);
23714
- ConstantInterval max(const ConstantInterval &a, int64_t b);
23715
- ConstantInterval abs(const ConstantInterval &a);
23716
- ConstantInterval operator<<(const ConstantInterval &a, const ConstantInterval &b);
23717
- ConstantInterval operator<<(const ConstantInterval &a, int64_t b);
23718
- ConstantInterval operator<<(int64_t a, const ConstantInterval &b);
23719
- ConstantInterval operator>>(const ConstantInterval &a, const ConstantInterval &b);
23720
- ConstantInterval operator>>(const ConstantInterval &a, int64_t b);
23721
- ConstantInterval operator>>(int64_t a, const ConstantInterval &b);
23722
- // @}
23723
-
23724
- /** Comparison operators on ConstantIntervals. Returns whether the comparison is
23725
- * true for all values of the two intervals. */
23726
- // @{
23727
- bool operator<=(const ConstantInterval &a, const ConstantInterval &b);
23728
- bool operator<=(const ConstantInterval &a, int64_t b);
23729
- bool operator<=(int64_t a, const ConstantInterval &b);
23730
- bool operator<(const ConstantInterval &a, const ConstantInterval &b);
23731
- bool operator<(const ConstantInterval &a, int64_t b);
23732
- bool operator<(int64_t a, const ConstantInterval &b);
23733
-
23734
- inline bool operator>=(const ConstantInterval &a, const ConstantInterval &b) {
23735
- return b <= a;
23736
- }
23737
- inline bool operator>(const ConstantInterval &a, const ConstantInterval &b) {
23738
- return b < a;
23739
- }
23740
- inline bool operator>=(const ConstantInterval &a, int64_t b) {
23741
- return b <= a;
23742
- }
23743
- inline bool operator>(const ConstantInterval &a, int64_t b) {
23744
- return b < a;
23745
- }
23746
- inline bool operator>=(int64_t a, const ConstantInterval &b) {
23747
- return b <= a;
23748
- }
23749
- inline bool operator>(int64_t a, const ConstantInterval &b) {
23750
- return b < a;
23751
- }
23752
-
23753
- // @}
23754
- } // namespace Internal
23755
-
23756
- /** Cast operators for ConstantIntervals. These ones have to live out in
23757
- * Halide::, to avoid C++ name lookup confusion with the Halide::cast variants
23758
- * that take Exprs. */
23759
- // @{
23760
- Internal::ConstantInterval cast(Type t, const Internal::ConstantInterval &a);
23761
- Internal::ConstantInterval saturating_cast(Type t, const Internal::ConstantInterval &a);
23762
- // @}
23763
-
23764
- } // namespace Halide
23765
-
23766
- #endif
23767
24017
 
23768
24018
  /** \file
23769
24019
  * Methods for computing compile-time constant int64_t upper and lower bounds of
@@ -23792,42 +24042,6 @@ ConstantInterval constant_integer_bounds(const Expr &e,
23792
24042
  } // namespace Internal
23793
24043
  } // namespace Halide
23794
24044
 
23795
- #endif
23796
- #ifndef HALIDE_INTERNAL_CSE_H
23797
- #define HALIDE_INTERNAL_CSE_H
23798
-
23799
- /** \file
23800
- * Defines a pass for introducing let expressions to wrap common sub-expressions. */
23801
-
23802
-
23803
- namespace Halide {
23804
- namespace Internal {
23805
-
23806
- /** Replace each common sub-expression in the argument with a
23807
- * variable, and wrap the resulting expr in a let statement giving a
23808
- * value to that variable.
23809
- *
23810
- * This is important to do within Halide (instead of punting to llvm),
23811
- * because exprs that come in from the front-end are small when
23812
- * considered as a graph, but combinatorially large when considered as
23813
- * a tree. For an example of a such a case, see
23814
- * test/code_explosion.cpp
23815
- *
23816
- * The last parameter determines whether all common subexpressions are
23817
- * lifted, or only those that the simplifier would not subsitute back
23818
- * in (e.g. addition of a constant).
23819
- */
23820
- Expr common_subexpression_elimination(const Expr &, bool lift_all = false);
23821
-
23822
- /** Do common-subexpression-elimination on each expression in a
23823
- * statement. Does not introduce let statements. */
23824
- Stmt common_subexpression_elimination(const Stmt &, bool lift_all = false);
23825
-
23826
- void cse_test();
23827
-
23828
- } // namespace Internal
23829
- } // namespace Halide
23830
-
23831
24045
  #endif
23832
24046
  #ifndef HALIDE_INTERNAL_DEBUG_ARGUMENTS_H
23833
24047
  #define HALIDE_INTERNAL_DEBUG_ARGUMENTS_H
@@ -24124,27 +24338,27 @@ Pipeline deserialize_pipeline(const std::string &filename, const std::map<std::s
24124
24338
  /// @return Returns a newly constructed deserialized Pipeline object/
24125
24339
  Pipeline deserialize_pipeline(std::istream &in, const std::map<std::string, Parameter> &user_params);
24126
24340
 
24127
- /// @brief Deserialize a Halide pipeline from a byte buffer containing a serizalized pipeline in binary format
24341
+ /// @brief Deserialize a Halide pipeline from a byte buffer containing a serialized pipeline in binary format
24128
24342
  /// @param data The data buffer containing a serialized Halide pipeline
24129
24343
  /// @param user_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead).
24130
24344
  /// @return Returns a newly constructed deserialized Pipeline object/
24131
24345
  Pipeline deserialize_pipeline(const std::vector<uint8_t> &data, const std::map<std::string, Parameter> &user_params);
24132
24346
 
24133
- /// @brief Deserialize the extenal parameters for the Halide pipeline from a file.
24347
+ /// @brief Deserialize the external parameters for the Halide pipeline from a file.
24134
24348
  /// This method allows a minimal deserialization of just the external pipeline parameters, so they can be
24135
24349
  /// remapped and overridden with user parameters prior to deserializing the pipeline definition.
24136
24350
  /// @param filename The location of the file to deserialize. Must use .hlpipe extension.
24137
24351
  /// @return Returns a map containing the names and description of external parameters referenced in the pipeline
24138
24352
  std::map<std::string, Parameter> deserialize_parameters(const std::string &filename);
24139
24353
 
24140
- /// @brief Deserialize the extenal parameters for the Halide pipeline from input stream.
24354
+ /// @brief Deserialize the external parameters for the Halide pipeline from input stream.
24141
24355
  /// This method allows a minimal deserialization of just the external pipeline parameters, so they can be
24142
24356
  /// remapped and overridden with user parameters prior to deserializing the pipeline definition.
24143
24357
  /// @param in The input stream to read from containing a serialized Halide pipeline
24144
24358
  /// @return Returns a map containing the names and description of external parameters referenced in the pipeline
24145
24359
  std::map<std::string, Parameter> deserialize_parameters(std::istream &in);
24146
24360
 
24147
- /// @brief Deserialize the extenal parameters for the Halide pipeline from a byte buffer containing a serialized
24361
+ /// @brief Deserialize the external parameters for the Halide pipeline from a byte buffer containing a serialized
24148
24362
  /// pipeline in binary format. This method allows a minimal deserialization of just the external pipeline
24149
24363
  /// parameters, so they can be remapped and overridden with user parameters prior to deserializing the
24150
24364
  /// pipeline definition.
@@ -24301,6 +24515,7 @@ Stmt inject_early_frees(const Stmt &s);
24301
24515
  #define HALIDE_ELF_H
24302
24516
 
24303
24517
  #include <algorithm>
24518
+ #include <cstdint>
24304
24519
  #include <iterator>
24305
24520
  #include <list>
24306
24521
  #include <memory>
@@ -24524,17 +24739,17 @@ public:
24524
24739
  SHT_REL = 9,
24525
24740
  SHT_SHLIB = 10,
24526
24741
  SHT_DYNSYM = 11,
24527
- SHT_LOPROC = 0x70000000,
24528
- SHT_HIPROC = 0x7fffffff,
24529
- SHT_LOUSER = 0x80000000,
24530
- SHT_HIUSER = 0xffffffff,
24742
+ SHT_LOPROC = 0x70000000u,
24743
+ SHT_HIPROC = 0x7fffffffu,
24744
+ SHT_LOUSER = 0x80000000u,
24745
+ SHT_HIUSER = 0xffffffffu,
24531
24746
  };
24532
24747
 
24533
24748
  enum Flag : uint32_t {
24534
24749
  SHF_WRITE = 0x1,
24535
24750
  SHF_ALLOC = 0x2,
24536
24751
  SHF_EXECINSTR = 0x4,
24537
- SHF_MASKPROC = 0xf0000000,
24752
+ SHF_MASKPROC = 0xf0000000u,
24538
24753
  };
24539
24754
 
24540
24755
  typedef std::vector<Relocation> RelocationList;
@@ -24762,8 +24977,8 @@ public:
24762
24977
  ET_EXEC = 2,
24763
24978
  ET_DYN = 3,
24764
24979
  ET_CORE = 4,
24765
- ET_LOPROC = 0xff00,
24766
- ET_HIPROC = 0xffff,
24980
+ ET_LOPROC = 0xff00u,
24981
+ ET_HIPROC = 0xffffu,
24767
24982
  };
24768
24983
 
24769
24984
  // We use lists for sections and symbols to avoid iterator
@@ -28128,6 +28343,11 @@ public:
28128
28343
  #undef HALIDE_OUTPUT_FORWARD
28129
28344
  #undef HALIDE_OUTPUT_FORWARD_CONST
28130
28345
 
28346
+ using GIOBase::set_type;
28347
+
28348
+ /** Set types dynamically for tuple outputs. */
28349
+ void set_type(const std::vector<Type> &types);
28350
+
28131
28351
  protected:
28132
28352
  GeneratorOutputBase(size_t array_size,
28133
28353
  const std::string &name,
@@ -28999,11 +29219,21 @@ public:
28999
29219
  // long as all Outputs have been defined.)
29000
29220
  Pipeline get_pipeline();
29001
29221
 
29222
+ protected:
29223
+ void claim_name(const std::string &name, const char *param_type) {
29224
+ user_assert(param_info_ptr->names.count(name) == 0)
29225
+ << "Cannot add " << param_type << " with name " << name
29226
+ << ". It is already taken by another input or output parameter.";
29227
+ param_info_ptr->names.insert(name);
29228
+ }
29229
+
29230
+ public:
29002
29231
  // Create Input<Func> with dynamic type & dimensions
29003
29232
  template<typename T,
29004
29233
  typename std::enable_if<std::is_same<T, Halide::Func>::value>::type * = nullptr>
29005
29234
  GeneratorInput<T> *add_input(const std::string &name, const Type &t, int dimensions) {
29006
29235
  check_exact_phase(GeneratorBase::ConfigureCalled);
29236
+ claim_name(name, "input");
29007
29237
  auto *p = new GeneratorInput<T>(name, t, dimensions);
29008
29238
  p->generator = this;
29009
29239
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29018,6 +29248,7 @@ public:
29018
29248
  static_assert(!T::has_static_halide_type, "You can only call this version of add_input() for a Buffer<T, D> where T is void or omitted .");
29019
29249
  static_assert(!T::has_static_dimensions, "You can only call this version of add_input() for a Buffer<T, D> where D is -1 or omitted.");
29020
29250
  check_exact_phase(GeneratorBase::ConfigureCalled);
29251
+ claim_name(name, "input");
29021
29252
  auto *p = new GeneratorInput<T>(name, t, dimensions);
29022
29253
  p->generator = this;
29023
29254
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29032,6 +29263,7 @@ public:
29032
29263
  static_assert(T::has_static_halide_type, "You can only call this version of add_input() for a Buffer<T, D> where T is not void.");
29033
29264
  static_assert(!T::has_static_dimensions, "You can only call this version of add_input() for a Buffer<T, D> where D is -1 or omitted.");
29034
29265
  check_exact_phase(GeneratorBase::ConfigureCalled);
29266
+ claim_name(name, "input");
29035
29267
  auto *p = new GeneratorInput<T>(name, dimensions);
29036
29268
  p->generator = this;
29037
29269
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29046,6 +29278,7 @@ public:
29046
29278
  static_assert(T::has_static_halide_type, "You can only call this version of add_input() for a Buffer<T, D> where T is not void.");
29047
29279
  static_assert(T::has_static_dimensions, "You can only call this version of add_input() for a Buffer<T, D> where D is not -1.");
29048
29280
  check_exact_phase(GeneratorBase::ConfigureCalled);
29281
+ claim_name(name, "input");
29049
29282
  auto *p = new GeneratorInput<T>(name);
29050
29283
  p->generator = this;
29051
29284
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29057,6 +29290,7 @@ public:
29057
29290
  typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr>
29058
29291
  GeneratorInput<T> *add_input(const std::string &name) {
29059
29292
  check_exact_phase(GeneratorBase::ConfigureCalled);
29293
+ claim_name(name, "input");
29060
29294
  auto *p = new GeneratorInput<T>(name);
29061
29295
  p->generator = this;
29062
29296
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29068,6 +29302,7 @@ public:
29068
29302
  typename std::enable_if<std::is_same<T, Expr>::value>::type * = nullptr>
29069
29303
  GeneratorInput<T> *add_input(const std::string &name, const Type &type) {
29070
29304
  check_exact_phase(GeneratorBase::ConfigureCalled);
29305
+ claim_name(name, "input");
29071
29306
  auto *p = new GeneratorInput<Expr>(name);
29072
29307
  p->generator = this;
29073
29308
  p->set_type(type);
@@ -29079,8 +29314,9 @@ public:
29079
29314
  // Create Output<Func> with dynamic type & dimensions
29080
29315
  template<typename T,
29081
29316
  typename std::enable_if<std::is_same<T, Halide::Func>::value>::type * = nullptr>
29082
- GeneratorOutput<T> *add_output(const std::string &name, const Type &t, int dimensions) {
29317
+ GeneratorOutput<T> *add_output(const std::string &name, const std::vector<Type> &t, int dimensions) {
29083
29318
  check_exact_phase(GeneratorBase::ConfigureCalled);
29319
+ claim_name(name, "output");
29084
29320
  auto *p = new GeneratorOutput<T>(name, t, dimensions);
29085
29321
  p->generator = this;
29086
29322
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29088,13 +29324,20 @@ public:
29088
29324
  return p;
29089
29325
  }
29090
29326
 
29327
+ template<typename T,
29328
+ typename std::enable_if<std::is_same<T, Halide::Func>::value>::type * = nullptr>
29329
+ GeneratorOutput<T> *add_output(const std::string &name, const Type &t, int dimensions) {
29330
+ return add_output<T>(name, std::vector<Type>{t}, dimensions);
29331
+ }
29332
+
29091
29333
  // Create Output<Buffer> with dynamic type & dimensions
29092
29334
  template<typename T,
29093
29335
  typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<T, Halide::Func>::value>::type * = nullptr>
29094
- GeneratorOutput<T> *add_output(const std::string &name, const Type &t, int dimensions) {
29336
+ GeneratorOutput<T> *add_output(const std::string &name, const std::vector<Type> &t, int dimensions) {
29095
29337
  static_assert(!T::has_static_halide_type, "You can only call this version of add_output() for a Buffer<T, D> where T is void or omitted .");
29096
29338
  static_assert(!T::has_static_dimensions, "You can only call this version of add_output() for a Buffer<T, D> where D is -1 or omitted.");
29097
29339
  check_exact_phase(GeneratorBase::ConfigureCalled);
29340
+ claim_name(name, "output");
29098
29341
  auto *p = new GeneratorOutput<T>(name, t, dimensions);
29099
29342
  p->generator = this;
29100
29343
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29102,13 +29345,20 @@ public:
29102
29345
  return p;
29103
29346
  }
29104
29347
 
29105
- // Create Output<Buffer> with compile-time type
29348
+ template<typename T,
29349
+ typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<T, Halide::Func>::value>::type * = nullptr>
29350
+ GeneratorOutput<T> *add_output(const std::string &name, const Type &t, int dimensions) {
29351
+ return add_output<T>(name, std::vector<Type>{t}, dimensions);
29352
+ }
29353
+
29354
+ // Create Output<Buffer> with either a compile-time type or a
29355
+ // to-be-set-later type and dynamic dimensions
29106
29356
  template<typename T,
29107
29357
  typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<T, Halide::Func>::value>::type * = nullptr>
29108
29358
  GeneratorOutput<T> *add_output(const std::string &name, int dimensions) {
29109
- static_assert(T::has_static_halide_type, "You can only call this version of add_output() for a Buffer<T, D> where T is not void.");
29110
29359
  static_assert(!T::has_static_dimensions, "You can only call this version of add_output() for a Buffer<T, D> where D is -1 or omitted.");
29111
29360
  check_exact_phase(GeneratorBase::ConfigureCalled);
29361
+ claim_name(name, "output");
29112
29362
  auto *p = new GeneratorOutput<T>(name, dimensions);
29113
29363
  p->generator = this;
29114
29364
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29116,13 +29366,35 @@ public:
29116
29366
  return p;
29117
29367
  }
29118
29368
 
29119
- // Create Output<Buffer> with compile-time type & dimensions
29369
+ // Create Output<Buffer> with compile-time dimensions and dynamic type
29370
+ template<typename T,
29371
+ typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<T, Halide::Func>::value>::type * = nullptr>
29372
+ GeneratorOutput<T> *add_output(const std::string &name, const std::vector<Type> &t) {
29373
+ static_assert(!T::has_static_halide_type, "You can only call this version of add_output() for a Buffer<T, D> where T is void or omitted.");
29374
+ static_assert(T::has_static_dimensions, "You can only call this version of add_output() for a Buffer<void, D> where D is not -1.");
29375
+ check_exact_phase(GeneratorBase::ConfigureCalled);
29376
+ claim_name(name, "output");
29377
+ auto *p = new GeneratorOutput<T>(name, t);
29378
+ p->generator = this;
29379
+ param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
29380
+ param_info_ptr->filter_outputs.push_back(p);
29381
+ return p;
29382
+ }
29383
+
29384
+ template<typename T,
29385
+ typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<T, Halide::Func>::value>::type * = nullptr>
29386
+ GeneratorOutput<T> *add_output(const std::string &name, const Type &t) {
29387
+ return add_output<T>(name, std::vector<Type>{t});
29388
+ }
29389
+
29390
+ // Create Output<Buffer> with compile-time type and dimensions
29120
29391
  template<typename T,
29121
29392
  typename std::enable_if<!std::is_arithmetic<T>::value && !std::is_same<T, Halide::Func>::value>::type * = nullptr>
29122
29393
  GeneratorOutput<T> *add_output(const std::string &name) {
29123
29394
  static_assert(T::has_static_halide_type, "You can only call this version of add_output() for a Buffer<T, D> where T is not void.");
29124
29395
  static_assert(T::has_static_dimensions, "You can only call this version of add_output() for a Buffer<T, D> where D is not -1.");
29125
29396
  check_exact_phase(GeneratorBase::ConfigureCalled);
29397
+ claim_name(name, "output");
29126
29398
  auto *p = new GeneratorOutput<T>(name);
29127
29399
  p->generator = this;
29128
29400
  param_info_ptr->owned_extras.push_back(std::unique_ptr<Internal::GIOBase>(p));
@@ -29954,240 +30226,6 @@ std::string type_suffix(const std::vector<Expr> &ops, bool signed_variants = tru
29954
30226
  } // namespace Internal
29955
30227
  } // namespace Halide
29956
30228
 
29957
- #endif
29958
- #ifndef HALIDE_INFER_ARGUMENTS_H
29959
- #define HALIDE_INFER_ARGUMENTS_H
29960
-
29961
- #include <vector>
29962
-
29963
-
29964
- /** \file
29965
- *
29966
- * Interface for a visitor to infer arguments used in a body Stmt.
29967
- */
29968
-
29969
- namespace Halide {
29970
- namespace Internal {
29971
-
29972
- /** An inferred argument. Inferred args are either Params,
29973
- * ImageParams, or Buffers. The first two are handled by the param
29974
- * field, and global images are tracked via the buf field. These
29975
- * are used directly when jitting, or used for validation when
29976
- * compiling with an explicit argument list. */
29977
- struct InferredArgument {
29978
- Argument arg;
29979
- Parameter param;
29980
- Buffer<> buffer;
29981
-
29982
- bool operator<(const InferredArgument &other) const {
29983
- if (arg.is_buffer() && !other.arg.is_buffer()) {
29984
- return true;
29985
- } else if (other.arg.is_buffer() && !arg.is_buffer()) {
29986
- return false;
29987
- } else {
29988
- return arg.name < other.arg.name;
29989
- }
29990
- }
29991
- };
29992
-
29993
- class Function;
29994
-
29995
- std::vector<InferredArgument> infer_arguments(const Stmt &body, const std::vector<Function> &outputs);
29996
-
29997
- } // namespace Internal
29998
- } // namespace Halide
29999
-
30000
- #endif
30001
- #ifndef HALIDE_HOST_GPU_BUFFER_COPIES_H
30002
- #define HALIDE_HOST_GPU_BUFFER_COPIES_H
30003
-
30004
- /** \file
30005
- * Defines the lowering passes that deal with host and device buffer flow.
30006
- */
30007
-
30008
- #include <string>
30009
- #include <vector>
30010
-
30011
-
30012
- namespace Halide {
30013
-
30014
- struct Target;
30015
-
30016
- namespace Internal {
30017
-
30018
- /** A helper function to call an extern function, and assert that it
30019
- * returns 0. */
30020
- Stmt call_extern_and_assert(const std::string &name, const std::vector<Expr> &args);
30021
-
30022
- /** Inject calls to halide_device_malloc, halide_copy_to_device, and
30023
- * halide_copy_to_host as needed. */
30024
- Stmt inject_host_dev_buffer_copies(Stmt s, const Target &t);
30025
-
30026
- } // namespace Internal
30027
- } // namespace Halide
30028
-
30029
- #endif
30030
- #ifndef HALIDE_INLINE_H
30031
- #define HALIDE_INLINE_H
30032
-
30033
- /** \file
30034
- * Methods for replacing calls to functions with their definitions.
30035
- */
30036
-
30037
-
30038
- namespace Halide {
30039
- namespace Internal {
30040
-
30041
- class Function;
30042
-
30043
- /** Inline a single named function, which must be pure. For a pure function to
30044
- * be inlined, it must not have any specializations (i.e. it can only have one
30045
- * values definition). */
30046
- // @{
30047
- Stmt inline_function(Stmt s, const Function &f);
30048
- Expr inline_function(Expr e, const Function &f);
30049
- void inline_function(Function caller, const Function &f);
30050
- // @}
30051
-
30052
- /** Check if the schedule of an inlined function is legal, throwing an error
30053
- * if it is not. */
30054
- void validate_schedule_inlined_function(Function f);
30055
-
30056
- } // namespace Internal
30057
- } // namespace Halide
30058
-
30059
- #endif
30060
- #ifndef HALIDE_INLINE_REDUCTIONS_H
30061
- #define HALIDE_INLINE_REDUCTIONS_H
30062
-
30063
- #include <string>
30064
-
30065
-
30066
- /** \file
30067
- * Defines some inline reductions: sum, product, minimum, maximum.
30068
- */
30069
- namespace Halide {
30070
-
30071
- class Func;
30072
-
30073
- /** An inline reduction. This is suitable for convolution-type
30074
- * operations - the reduction will be computed in the innermost loop
30075
- * that it is used in. The argument may contain free or implicit
30076
- * variables, and must refer to some reduction domain. The free
30077
- * variables are still free in the return value, but the reduction
30078
- * domain is captured - the result expression does not refer to a
30079
- * reduction domain and can be used in a pure function definition.
30080
- *
30081
- * An example using \ref sum :
30082
- *
30083
- \code
30084
- Func f, g;
30085
- Var x;
30086
- RDom r(0, 10);
30087
- f(x) = x*x;
30088
- g(x) = sum(f(x + r));
30089
- \endcode
30090
- *
30091
- * Here g computes some blur of x, but g is still a pure function. The
30092
- * sum is being computed by an anonymous reduction function that is
30093
- * scheduled innermost within g.
30094
- */
30095
- //@{
30096
- Expr sum(Expr, const std::string &s = "sum");
30097
- Expr saturating_sum(Expr, const std::string &s = "saturating_sum");
30098
- Expr product(Expr, const std::string &s = "product");
30099
- Expr maximum(Expr, const std::string &s = "maximum");
30100
- Expr minimum(Expr, const std::string &s = "minimum");
30101
- //@}
30102
-
30103
- /** Variants of the inline reduction in which the RDom is stated
30104
- * explicitly. The expression can refer to multiple RDoms, and only
30105
- * the inner one is captured by the reduction. This allows you to
30106
- * write expressions like:
30107
- \code
30108
- RDom r1(0, 10), r2(0, 10), r3(0, 10);
30109
- Expr e = minimum(r1, product(r2, sum(r3, r1 + r2 + r3)));
30110
- \endcode
30111
- */
30112
- // @{
30113
- Expr sum(const RDom &, Expr, const std::string &s = "sum");
30114
- Expr saturating_sum(const RDom &r, Expr e, const std::string &s = "saturating_sum");
30115
- Expr product(const RDom &, Expr, const std::string &s = "product");
30116
- Expr maximum(const RDom &, Expr, const std::string &s = "maximum");
30117
- Expr minimum(const RDom &, Expr, const std::string &s = "minimum");
30118
- // @}
30119
-
30120
- /** Returns an Expr or Tuple representing the coordinates of the point
30121
- * in the RDom which minimizes or maximizes the expression. The
30122
- * expression must refer to some RDom. Also returns the extreme value
30123
- * of the expression as the last element of the tuple. */
30124
- // @{
30125
- Tuple argmax(Expr, const std::string &s = "argmax");
30126
- Tuple argmin(Expr, const std::string &s = "argmin");
30127
- Tuple argmax(const RDom &, Expr, const std::string &s = "argmax");
30128
- Tuple argmin(const RDom &, Expr, const std::string &s = "argmin");
30129
- // @}
30130
-
30131
- /** Inline reductions create an anonymous helper Func to do the
30132
- * work. The variants below instead take a named Func object to use,
30133
- * so that it is no longer anonymous and can be scheduled
30134
- * (e.g. unrolled across the reduction domain). The Func passed must
30135
- * not have any existing definition. */
30136
- //@{
30137
- Expr sum(Expr, const Func &);
30138
- Expr saturating_sum(Expr, const Func &);
30139
- Expr product(Expr, const Func &);
30140
- Expr maximum(Expr, const Func &);
30141
- Expr minimum(Expr, const Func &);
30142
- Expr sum(const RDom &, Expr, const Func &);
30143
- Expr saturating_sum(const RDom &r, Expr e, const Func &);
30144
- Expr product(const RDom &, Expr, const Func &);
30145
- Expr maximum(const RDom &, Expr, const Func &);
30146
- Expr minimum(const RDom &, Expr, const Func &);
30147
- Tuple argmax(Expr, const Func &);
30148
- Tuple argmin(Expr, const Func &);
30149
- Tuple argmax(const RDom &, Expr, const Func &);
30150
- Tuple argmin(const RDom &, Expr, const Func &);
30151
- //@}
30152
-
30153
- } // namespace Halide
30154
-
30155
- #endif
30156
- #ifndef HALIDE_INTEGER_DIVISION_TABLE_H
30157
- #define HALIDE_INTEGER_DIVISION_TABLE_H
30158
-
30159
- #include <cstdint>
30160
-
30161
- /** \file
30162
- * Tables telling us how to do integer division via fixed-point
30163
- * multiplication for various small constants. This file is
30164
- * automatically generated by find_inverse.cpp.
30165
- */
30166
- namespace Halide {
30167
- namespace Internal {
30168
- namespace IntegerDivision {
30169
- extern const int64_t table_u8[256][4];
30170
- extern const int64_t table_s8[256][4];
30171
- extern const int64_t table_srz8[256][4];
30172
- extern const int64_t table_u16[256][4];
30173
- extern const int64_t table_s16[256][4];
30174
- extern const int64_t table_srz16[256][4];
30175
- extern const int64_t table_u32[256][4];
30176
- extern const int64_t table_s32[256][4];
30177
- extern const int64_t table_srz32[256][4];
30178
- extern const int64_t table_runtime_u8[256][4];
30179
- extern const int64_t table_runtime_s8[256][4];
30180
- extern const int64_t table_runtime_srz8[256][4];
30181
- extern const int64_t table_runtime_u16[256][4];
30182
- extern const int64_t table_runtime_s16[256][4];
30183
- extern const int64_t table_runtime_srz16[256][4];
30184
- extern const int64_t table_runtime_u32[256][4];
30185
- extern const int64_t table_runtime_s32[256][4];
30186
- extern const int64_t table_runtime_srz32[256][4];
30187
- } // namespace IntegerDivision
30188
- } // namespace Internal
30189
- } // namespace Halide
30190
-
30191
30229
  #endif
30192
30230
  #ifndef HALIDE_IR_MATCH_H
30193
30231
  #define HALIDE_IR_MATCH_H
@@ -30836,14 +30874,14 @@ struct BinOp {
30836
30874
  }
30837
30875
  const Op &op = (const Op &)e;
30838
30876
  return (a.template match<bound>(*op.a.get(), state) &&
30839
- b.template match<bound | bindings<A>::mask>(*op.b.get(), state));
30877
+ b.template match<(bound | bindings<A>::mask)>(*op.b.get(), state));
30840
30878
  }
30841
30879
 
30842
30880
  template<uint32_t bound, typename Op2, typename A2, typename B2>
30843
30881
  HALIDE_ALWAYS_INLINE bool match(const BinOp<Op2, A2, B2> &op, MatcherState &state) const noexcept {
30844
30882
  return (std::is_same<Op, Op2>::value &&
30845
30883
  a.template match<bound>(unwrap(op.a), state) &&
30846
- b.template match<bound | bindings<A>::mask>(unwrap(op.b), state));
30884
+ b.template match<(bound | bindings<A>::mask)>(unwrap(op.b), state));
30847
30885
  }
30848
30886
 
30849
30887
  constexpr static bool foldable = A::foldable && B::foldable;
@@ -30938,14 +30976,14 @@ struct CmpOp {
30938
30976
  }
30939
30977
  const Op &op = (const Op &)e;
30940
30978
  return (a.template match<bound>(*op.a.get(), state) &&
30941
- b.template match<bound | bindings<A>::mask>(*op.b.get(), state));
30979
+ b.template match<(bound | bindings<A>::mask)>(*op.b.get(), state));
30942
30980
  }
30943
30981
 
30944
30982
  template<uint32_t bound, typename Op2, typename A2, typename B2>
30945
30983
  HALIDE_ALWAYS_INLINE bool match(const CmpOp<Op2, A2, B2> &op, MatcherState &state) const noexcept {
30946
30984
  return (std::is_same<Op, Op2>::value &&
30947
30985
  a.template match<bound>(unwrap(op.a), state) &&
30948
- b.template match<bound | bindings<A>::mask>(unwrap(op.b), state));
30986
+ b.template match<(bound | bindings<A>::mask)>(unwrap(op.b), state));
30949
30987
  }
30950
30988
 
30951
30989
  constexpr static bool foldable = A::foldable && B::foldable;
@@ -31508,11 +31546,6 @@ constexpr bool and_reduce(bool first, Args... rest) {
31508
31546
  return first && and_reduce(rest...);
31509
31547
  }
31510
31548
 
31511
- // TODO: this can be replaced with std::min() once we require C++14 or later
31512
- constexpr int const_min(int a, int b) {
31513
- return a < b ? a : b;
31514
- }
31515
-
31516
31549
  template<Call::IntrinsicOp intrin>
31517
31550
  struct OptionalIntrinType {
31518
31551
  bool check(const Type &) const {
@@ -31550,7 +31583,7 @@ struct Intrin {
31550
31583
  HALIDE_ALWAYS_INLINE bool match_args(int, const Call &c, MatcherState &state) const noexcept {
31551
31584
  using T = decltype(std::get<i>(args));
31552
31585
  return (std::get<i>(args).template match<bound>(*c.args[i].get(), state) &&
31553
- match_args<i + 1, bound | bindings<T>::mask>(0, c, state));
31586
+ match_args<i + 1, (bound | bindings<T>::mask)>(0, c, state));
31554
31587
  }
31555
31588
 
31556
31589
  template<int i, uint32_t binds>
@@ -31601,7 +31634,7 @@ struct Intrin {
31601
31634
  return saturating_cast(optional_type_hint.type, std::move(arg0));
31602
31635
  }
31603
31636
 
31604
- Expr arg1 = std::get<const_min(1, sizeof...(Args) - 1)>(args).make(state, type_hint);
31637
+ Expr arg1 = std::get<std::min<size_t>(1, sizeof...(Args) - 1)>(args).make(state, type_hint);
31605
31638
  if (intrin == Call::absd) {
31606
31639
  return absd(std::move(arg0), std::move(arg1));
31607
31640
  } else if (intrin == Call::widen_right_add) {
@@ -31636,7 +31669,7 @@ struct Intrin {
31636
31669
  return rounding_shift_right(std::move(arg0), std::move(arg1));
31637
31670
  }
31638
31671
 
31639
- Expr arg2 = std::get<const_min(2, sizeof...(Args) - 1)>(args).make(state, type_hint);
31672
+ Expr arg2 = std::get<std::min<size_t>(2, sizeof...(Args) - 1)>(args).make(state, type_hint);
31640
31673
  if (intrin == Call::mul_shift_right) {
31641
31674
  return mul_shift_right(std::move(arg0), std::move(arg1), std::move(arg2));
31642
31675
  } else if (intrin == Call::rounding_mul_shift_right) {
@@ -31880,14 +31913,14 @@ struct SelectOp {
31880
31913
  }
31881
31914
  const Select &op = (const Select &)e;
31882
31915
  return (c.template match<bound>(*op.condition.get(), state) &&
31883
- t.template match<bound | bindings<C>::mask>(*op.true_value.get(), state) &&
31884
- f.template match<bound | bindings<C>::mask | bindings<T>::mask>(*op.false_value.get(), state));
31916
+ t.template match<(bound | bindings<C>::mask)>(*op.true_value.get(), state) &&
31917
+ f.template match<(bound | bindings<C>::mask | bindings<T>::mask)>(*op.false_value.get(), state));
31885
31918
  }
31886
31919
  template<uint32_t bound, typename C2, typename T2, typename F2>
31887
31920
  HALIDE_ALWAYS_INLINE bool match(const SelectOp<C2, T2, F2> &instance, MatcherState &state) const noexcept {
31888
31921
  return (c.template match<bound>(unwrap(instance.c), state) &&
31889
- t.template match<bound | bindings<C>::mask>(unwrap(instance.t), state) &&
31890
- f.template match<bound | bindings<C>::mask | bindings<T>::mask>(unwrap(instance.f), state));
31922
+ t.template match<(bound | bindings<C>::mask)>(unwrap(instance.t), state) &&
31923
+ f.template match<(bound | bindings<C>::mask | bindings<T>::mask)>(unwrap(instance.f), state));
31891
31924
  }
31892
31925
 
31893
31926
  HALIDE_ALWAYS_INLINE
@@ -31953,7 +31986,7 @@ struct BroadcastOp {
31953
31986
  template<uint32_t bound, typename A2, typename B2>
31954
31987
  HALIDE_ALWAYS_INLINE bool match(const BroadcastOp<A2, B2> &op, MatcherState &state) const noexcept {
31955
31988
  return (a.template match<bound>(unwrap(op.a), state) &&
31956
- lanes.template match<bound | bindings<A>::mask>(unwrap(op.lanes), state));
31989
+ lanes.template match<(bound | bindings<A>::mask)>(unwrap(op.lanes), state));
31957
31990
  }
31958
31991
 
31959
31992
  HALIDE_ALWAYS_INLINE
@@ -32017,8 +32050,8 @@ struct RampOp {
32017
32050
  }
32018
32051
  const Ramp &op = (const Ramp &)e;
32019
32052
  if (a.template match<bound>(*op.base.get(), state) &&
32020
- b.template match<bound | bindings<A>::mask>(*op.stride.get(), state) &&
32021
- lanes.template match<bound | bindings<A>::mask | bindings<B>::mask>(op.lanes, state)) {
32053
+ b.template match<(bound | bindings<A>::mask)>(*op.stride.get(), state) &&
32054
+ lanes.template match<(bound | bindings<A>::mask | bindings<B>::mask)>(op.lanes, state)) {
32022
32055
  return true;
32023
32056
  } else {
32024
32057
  return false;
@@ -32028,8 +32061,8 @@ struct RampOp {
32028
32061
  template<uint32_t bound, typename A2, typename B2, typename C2>
32029
32062
  HALIDE_ALWAYS_INLINE bool match(const RampOp<A2, B2, C2> &op, MatcherState &state) const noexcept {
32030
32063
  return (a.template match<bound>(unwrap(op.a), state) &&
32031
- b.template match<bound | bindings<A>::mask>(unwrap(op.b), state) &&
32032
- lanes.template match<bound | bindings<A>::mask | bindings<B>::mask>(unwrap(op.lanes), state));
32064
+ b.template match<(bound | bindings<A>::mask)>(unwrap(op.b), state) &&
32065
+ lanes.template match<(bound | bindings<A>::mask | bindings<B>::mask)>(unwrap(op.lanes), state));
32033
32066
  }
32034
32067
 
32035
32068
  HALIDE_ALWAYS_INLINE
@@ -32080,7 +32113,7 @@ struct VectorReduceOp {
32080
32113
  const VectorReduce &op = (const VectorReduce &)e;
32081
32114
  if (op.op == reduce_op &&
32082
32115
  a.template match<bound>(*op.value.get(), state) &&
32083
- lanes.template match<bound | bindings<A>::mask>(op.type.lanes(), state)) {
32116
+ lanes.template match<(bound | bindings<A>::mask)>(op.type.lanes(), state)) {
32084
32117
  return true;
32085
32118
  }
32086
32119
  }
@@ -32091,7 +32124,7 @@ struct VectorReduceOp {
32091
32124
  HALIDE_ALWAYS_INLINE bool match(const VectorReduceOp<A2, B2, reduce_op_2> &op, MatcherState &state) const noexcept {
32092
32125
  return (reduce_op == reduce_op_2 &&
32093
32126
  a.template match<bound>(unwrap(op.a), state) &&
32094
- lanes.template match<bound | bindings<A>::mask>(unwrap(op.lanes), state));
32127
+ lanes.template match<(bound | bindings<A>::mask)>(unwrap(op.lanes), state));
32095
32128
  }
32096
32129
 
32097
32130
  HALIDE_ALWAYS_INLINE
@@ -32340,9 +32373,9 @@ struct SliceOp {
32340
32373
  return v.vectors.size() == 1 &&
32341
32374
  v.is_slice() &&
32342
32375
  vec.template match<bound>(*v.vectors[0].get(), state) &&
32343
- base.template match<bound | bindings<Vec>::mask>(v.slice_begin(), state) &&
32344
- stride.template match<bound | bindings<Vec>::mask | bindings<Base>::mask>(v.slice_stride(), state) &&
32345
- lanes.template match<bound | bindings<Vec>::mask | bindings<Base>::mask | bindings<Stride>::mask>(v.type.lanes(), state);
32376
+ base.template match<(bound | bindings<Vec>::mask)>(v.slice_begin(), state) &&
32377
+ stride.template match<(bound | bindings<Vec>::mask | bindings<Base>::mask)>(v.slice_stride(), state) &&
32378
+ lanes.template match<(bound | bindings<Vec>::mask | bindings<Base>::mask | bindings<Stride>::mask)>(v.type.lanes(), state);
32346
32379
  }
32347
32380
 
32348
32381
  HALIDE_ALWAYS_INLINE
@@ -33409,13 +33442,59 @@ std::pair<Region, bool> mutate_region(Mutator *mutator, const Region &bounds, Ar
33409
33442
  } // namespace Halide
33410
33443
 
33411
33444
  #endif
33412
- #ifndef HALIDE_LERP_H
33413
- #define HALIDE_LERP_H
33445
+ #ifndef HALIDE_INFER_ARGUMENTS_H
33446
+ #define HALIDE_INFER_ARGUMENTS_H
33447
+
33448
+ #include <vector>
33449
+
33414
33450
 
33415
33451
  /** \file
33416
- * Defines methods for converting a lerp intrinsic into Halide IR.
33452
+ *
33453
+ * Interface for a visitor to infer arguments used in a body Stmt.
33417
33454
  */
33418
33455
 
33456
+ namespace Halide {
33457
+ namespace Internal {
33458
+
33459
+ /** An inferred argument. Inferred args are either Params,
33460
+ * ImageParams, or Buffers. The first two are handled by the param
33461
+ * field, and global images are tracked via the buf field. These
33462
+ * are used directly when jitting, or used for validation when
33463
+ * compiling with an explicit argument list. */
33464
+ struct InferredArgument {
33465
+ Argument arg;
33466
+ Parameter param;
33467
+ Buffer<> buffer;
33468
+
33469
+ bool operator<(const InferredArgument &other) const {
33470
+ if (arg.is_buffer() && !other.arg.is_buffer()) {
33471
+ return true;
33472
+ } else if (other.arg.is_buffer() && !arg.is_buffer()) {
33473
+ return false;
33474
+ } else {
33475
+ return arg.name < other.arg.name;
33476
+ }
33477
+ }
33478
+ };
33479
+
33480
+ class Function;
33481
+
33482
+ std::vector<InferredArgument> infer_arguments(const Stmt &body, const std::vector<Function> &outputs);
33483
+
33484
+ } // namespace Internal
33485
+ } // namespace Halide
33486
+
33487
+ #endif
33488
+ #ifndef HALIDE_HOST_GPU_BUFFER_COPIES_H
33489
+ #define HALIDE_HOST_GPU_BUFFER_COPIES_H
33490
+
33491
+ /** \file
33492
+ * Defines the lowering passes that deal with host and device buffer flow.
33493
+ */
33494
+
33495
+ #include <string>
33496
+ #include <vector>
33497
+
33419
33498
 
33420
33499
  namespace Halide {
33421
33500
 
@@ -33423,15 +33502,179 @@ struct Target;
33423
33502
 
33424
33503
  namespace Internal {
33425
33504
 
33426
- /** Build Halide IR that computes a lerp. Use by codegen targets that don't have
33427
- * a native lerp. The lerp is done in the type of the zero value. The final_type
33428
- * is a cast that should occur after the lerp. It's included because in some
33429
- * cases you can incorporate a final cast into the lerp math. */
33430
- Expr lower_lerp(Type final_type, Expr zero_val, Expr one_val, const Expr &weight, const Target &target);
33505
+ /** A helper function to call an extern function, and assert that it
33506
+ * returns 0. */
33507
+ Stmt call_extern_and_assert(const std::string &name, const std::vector<Expr> &args);
33508
+
33509
+ /** Inject calls to halide_device_malloc, halide_copy_to_device, and
33510
+ * halide_copy_to_host as needed. */
33511
+ Stmt inject_host_dev_buffer_copies(Stmt s, const Target &t);
33431
33512
 
33432
33513
  } // namespace Internal
33433
33514
  } // namespace Halide
33434
33515
 
33516
+ #endif
33517
+ #ifndef HALIDE_INLINE_H
33518
+ #define HALIDE_INLINE_H
33519
+
33520
+ /** \file
33521
+ * Methods for replacing calls to functions with their definitions.
33522
+ */
33523
+
33524
+
33525
+ namespace Halide {
33526
+ namespace Internal {
33527
+
33528
+ class Function;
33529
+
33530
+ /** Inline a single named function, which must be pure. For a pure function to
33531
+ * be inlined, it must not have any specializations (i.e. it can only have one
33532
+ * values definition). */
33533
+ // @{
33534
+ Stmt inline_function(Stmt s, const Function &f);
33535
+ Expr inline_function(Expr e, const Function &f);
33536
+ void inline_function(Function caller, const Function &f);
33537
+ // @}
33538
+
33539
+ /** Check if the schedule of an inlined function is legal, throwing an error
33540
+ * if it is not. */
33541
+ void validate_schedule_inlined_function(Function f);
33542
+
33543
+ } // namespace Internal
33544
+ } // namespace Halide
33545
+
33546
+ #endif
33547
+ #ifndef HALIDE_INLINE_REDUCTIONS_H
33548
+ #define HALIDE_INLINE_REDUCTIONS_H
33549
+
33550
+ #include <string>
33551
+
33552
+
33553
+ /** \file
33554
+ * Defines some inline reductions: sum, product, minimum, maximum.
33555
+ */
33556
+ namespace Halide {
33557
+
33558
+ class Func;
33559
+
33560
+ /** An inline reduction. This is suitable for convolution-type
33561
+ * operations - the reduction will be computed in the innermost loop
33562
+ * that it is used in. The argument may contain free or implicit
33563
+ * variables, and must refer to some reduction domain. The free
33564
+ * variables are still free in the return value, but the reduction
33565
+ * domain is captured - the result expression does not refer to a
33566
+ * reduction domain and can be used in a pure function definition.
33567
+ *
33568
+ * An example using \ref sum :
33569
+ *
33570
+ \code
33571
+ Func f, g;
33572
+ Var x;
33573
+ RDom r(0, 10);
33574
+ f(x) = x*x;
33575
+ g(x) = sum(f(x + r));
33576
+ \endcode
33577
+ *
33578
+ * Here g computes some blur of x, but g is still a pure function. The
33579
+ * sum is being computed by an anonymous reduction function that is
33580
+ * scheduled innermost within g.
33581
+ */
33582
+ //@{
33583
+ Expr sum(Expr, const std::string &s = "sum");
33584
+ Expr saturating_sum(Expr, const std::string &s = "saturating_sum");
33585
+ Expr product(Expr, const std::string &s = "product");
33586
+ Expr maximum(Expr, const std::string &s = "maximum");
33587
+ Expr minimum(Expr, const std::string &s = "minimum");
33588
+ //@}
33589
+
33590
+ /** Variants of the inline reduction in which the RDom is stated
33591
+ * explicitly. The expression can refer to multiple RDoms, and only
33592
+ * the inner one is captured by the reduction. This allows you to
33593
+ * write expressions like:
33594
+ \code
33595
+ RDom r1(0, 10), r2(0, 10), r3(0, 10);
33596
+ Expr e = minimum(r1, product(r2, sum(r3, r1 + r2 + r3)));
33597
+ \endcode
33598
+ */
33599
+ // @{
33600
+ Expr sum(const RDom &, Expr, const std::string &s = "sum");
33601
+ Expr saturating_sum(const RDom &r, Expr e, const std::string &s = "saturating_sum");
33602
+ Expr product(const RDom &, Expr, const std::string &s = "product");
33603
+ Expr maximum(const RDom &, Expr, const std::string &s = "maximum");
33604
+ Expr minimum(const RDom &, Expr, const std::string &s = "minimum");
33605
+ // @}
33606
+
33607
+ /** Returns an Expr or Tuple representing the coordinates of the point
33608
+ * in the RDom which minimizes or maximizes the expression. The
33609
+ * expression must refer to some RDom. Also returns the extreme value
33610
+ * of the expression as the last element of the tuple. */
33611
+ // @{
33612
+ Tuple argmax(Expr, const std::string &s = "argmax");
33613
+ Tuple argmin(Expr, const std::string &s = "argmin");
33614
+ Tuple argmax(const RDom &, Expr, const std::string &s = "argmax");
33615
+ Tuple argmin(const RDom &, Expr, const std::string &s = "argmin");
33616
+ // @}
33617
+
33618
+ /** Inline reductions create an anonymous helper Func to do the
33619
+ * work. The variants below instead take a named Func object to use,
33620
+ * so that it is no longer anonymous and can be scheduled
33621
+ * (e.g. unrolled across the reduction domain). The Func passed must
33622
+ * not have any existing definition. */
33623
+ //@{
33624
+ Expr sum(Expr, const Func &);
33625
+ Expr saturating_sum(Expr, const Func &);
33626
+ Expr product(Expr, const Func &);
33627
+ Expr maximum(Expr, const Func &);
33628
+ Expr minimum(Expr, const Func &);
33629
+ Expr sum(const RDom &, Expr, const Func &);
33630
+ Expr saturating_sum(const RDom &r, Expr e, const Func &);
33631
+ Expr product(const RDom &, Expr, const Func &);
33632
+ Expr maximum(const RDom &, Expr, const Func &);
33633
+ Expr minimum(const RDom &, Expr, const Func &);
33634
+ Tuple argmax(Expr, const Func &);
33635
+ Tuple argmin(Expr, const Func &);
33636
+ Tuple argmax(const RDom &, Expr, const Func &);
33637
+ Tuple argmin(const RDom &, Expr, const Func &);
33638
+ //@}
33639
+
33640
+ } // namespace Halide
33641
+
33642
+ #endif
33643
+ #ifndef HALIDE_INTEGER_DIVISION_TABLE_H
33644
+ #define HALIDE_INTEGER_DIVISION_TABLE_H
33645
+
33646
+ #include <cstdint>
33647
+
33648
+ /** \file
33649
+ * Tables telling us how to do integer division via fixed-point
33650
+ * multiplication for various small constants. This file is
33651
+ * automatically generated by find_inverse.cpp.
33652
+ */
33653
+ namespace Halide {
33654
+ namespace Internal {
33655
+ namespace IntegerDivision {
33656
+ extern const int64_t table_u8[256][4];
33657
+ extern const int64_t table_s8[256][4];
33658
+ extern const int64_t table_srz8[256][4];
33659
+ extern const int64_t table_u16[256][4];
33660
+ extern const int64_t table_s16[256][4];
33661
+ extern const int64_t table_srz16[256][4];
33662
+ extern const int64_t table_u32[256][4];
33663
+ extern const int64_t table_s32[256][4];
33664
+ extern const int64_t table_srz32[256][4];
33665
+ extern const int64_t table_runtime_u8[256][4];
33666
+ extern const int64_t table_runtime_s8[256][4];
33667
+ extern const int64_t table_runtime_srz8[256][4];
33668
+ extern const int64_t table_runtime_u16[256][4];
33669
+ extern const int64_t table_runtime_s16[256][4];
33670
+ extern const int64_t table_runtime_srz16[256][4];
33671
+ extern const int64_t table_runtime_u32[256][4];
33672
+ extern const int64_t table_runtime_s32[256][4];
33673
+ extern const int64_t table_runtime_srz32[256][4];
33674
+ } // namespace IntegerDivision
33675
+ } // namespace Internal
33676
+ } // namespace Halide
33677
+
33435
33678
  #endif
33436
33679
  #ifndef HALIDE_LICM_H
33437
33680
  #define HALIDE_LICM_H
@@ -33524,6 +33767,7 @@ void create_static_library(const std::vector<std::string> &src_files, const Targ
33524
33767
  * Support for linking LLVM modules that comprise the runtime.
33525
33768
  */
33526
33769
 
33770
+ #include <cstdint>
33527
33771
  #include <memory>
33528
33772
  #include <string>
33529
33773
  #include <vector>
@@ -33562,6 +33806,30 @@ std::unique_ptr<llvm::Module> link_with_wasm_jit_runtime(llvm::LLVMContext *c, c
33562
33806
  } // namespace Internal
33563
33807
  } // namespace Halide
33564
33808
 
33809
+ #endif
33810
+ #ifndef HALIDE_LERP_H
33811
+ #define HALIDE_LERP_H
33812
+
33813
+ /** \file
33814
+ * Defines methods for converting a lerp intrinsic into Halide IR.
33815
+ */
33816
+
33817
+
33818
+ namespace Halide {
33819
+
33820
+ struct Target;
33821
+
33822
+ namespace Internal {
33823
+
33824
+ /** Build Halide IR that computes a lerp. Use by codegen targets that don't have
33825
+ * a native lerp. The lerp is done in the type of the zero value. The final_type
33826
+ * is a cast that should occur after the lerp. It's included because in some
33827
+ * cases you can incorporate a final cast into the lerp math. */
33828
+ Expr lower_lerp(Type final_type, Expr zero_val, Expr one_val, const Expr &weight, const Target &target);
33829
+
33830
+ } // namespace Internal
33831
+ } // namespace Halide
33832
+
33565
33833
  #endif
33566
33834
  #ifndef HALIDE_LOOP_CARRY_H
33567
33835
  #define HALIDE_LOOP_CARRY_H
@@ -34754,6 +35022,13 @@ Interval solve_for_inner_interval(const Expr &c, const std::string &variable);
34754
35022
  * 'and' over the vector lanes, and return a scalar result. */
34755
35023
  Expr and_condition_over_domain(const Expr &c, const Scope<Interval> &varying);
34756
35024
 
35025
+ /** Take a conditional that includes variables that vary over some
35026
+ * domain, and convert it to a weaker (less frequently false) condition
35027
+ * that doesn't depend on those variables. Formally, the input expr
35028
+ * implies the output expr. Note that this function might be unable to
35029
+ * provide a better response than simply const_true(). */
35030
+ Expr or_condition_over_domain(const Expr &c, const Scope<Interval> &varying);
35031
+
34757
35032
  void solve_test();
34758
35033
 
34759
35034
  } // namespace Internal
@@ -34948,19 +35223,26 @@ Stmt storage_folding(const Stmt &s, const std::map<std::string, Function> &env);
34948
35223
  namespace Halide {
34949
35224
 
34950
35225
  struct Target;
35226
+ struct Expr;
34951
35227
 
34952
35228
  namespace Internal {
34953
35229
 
34954
35230
  class Function;
35231
+ struct Call;
34955
35232
 
34956
- /** Propagate strict_float intrinisics such that they immediately wrap
34957
- * all floating-point expressions. This makes the IR nodes context
34958
- * independent. If the Target::StrictFloat flag is specified in
34959
- * target, starts in strict_float mode so all floating-point type
34960
- * Exprs in the compilation will be marked with strict_float. Returns
34961
- * whether any strict floating-point is used in any function in the
34962
- * passed in env.
34963
- */
35233
+ /** Replace all rounding floating point ops and floating point ops that need to
35234
+ * handle nan and inf differently with strict float intrinsics. */
35235
+ Expr strictify_float(const Expr &e);
35236
+
35237
+ /** Replace a strict float intrinsic with its non-strict equivalent. Non-recursive. */
35238
+ Expr unstrictify_float(const Call *op);
35239
+
35240
+ /** If the StrictFloat target feature is set, replace add, sub, mul, div, etc
35241
+ * operations with strict float intrinsics for all Funcs in the environment. If
35242
+ * StrictFloat is not set does nothing. Returns whether or not there's any usage
35243
+ * of strict float intrinsics or if the target flag is set (i.e. returns whether
35244
+ * or not the rest of lowering and codegen needs to worry about floating point
35245
+ * strictness). */
34964
35246
  bool strictify_float(std::map<std::string, Function> &env, const Target &t);
34965
35247
 
34966
35248
  } // namespace Internal
@@ -34992,6 +35274,8 @@ Stmt strip_asserts(const Stmt &s);
34992
35274
  * Defines methods for substituting out variables in expressions and
34993
35275
  * statements. */
34994
35276
 
35277
+ #include <algorithm>
35278
+ #include <iterator>
34995
35279
  #include <map>
34996
35280
 
34997
35281
 
@@ -35022,6 +35306,16 @@ Expr substitute(const Expr &find, const Expr &replacement, const Expr &expr);
35022
35306
  Stmt substitute(const Expr &find, const Expr &replacement, const Stmt &stmt);
35023
35307
  // @}
35024
35308
 
35309
+ /** Substitute a container of Exprs or Stmts out of place */
35310
+ template<typename T>
35311
+ T substitute(const std::map<std::string, Expr> &replacements, const T &container) {
35312
+ T output;
35313
+ std::transform(container.begin(), container.end(), std::back_inserter(output), [&](const auto &expr_or_stmt) {
35314
+ return substitute(replacements, expr_or_stmt);
35315
+ });
35316
+ return output;
35317
+ }
35318
+
35025
35319
  /** Substitutions where the IR may be a general graph (and not just a
35026
35320
  * DAG). */
35027
35321
  // @{
@@ -35284,10 +35578,14 @@ std::map<std::string, Function> wrap_func_calls(const std::map<std::string, Func
35284
35578
  #endif
35285
35579
 
35286
35580
  // Clean up macros used inside Halide headers
35581
+ #ifndef HALIDE_KEEP_MACROS
35287
35582
  #undef user_assert
35288
35583
  #undef user_error
35289
35584
  #undef user_warning
35290
35585
  #undef internal_error
35291
35586
  #undef internal_assert
35292
35587
  #undef halide_runtime_error
35588
+ #undef debug
35589
+ #undef debug_is_active
35590
+ #endif
35293
35591
  #endif // HALIDE_H