npm - numbl - Versions diffs - 0.4.1 → 0.4.2 - Mend

numbl 0.4.1 → 0.4.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

package/dist-cli/cli.js +106 -16
package/dist-lib/lib.js +105 -15
package/dist-lib/numbl-core/helpers/effectively-real.d.ts +25 -0
package/dist-lib/numbl-core/jit/builtins/runtime/snippets.gen.d.ts +1 -0
package/dist-lib/numbl-core/jit/builtins/runtime/tensor_ops/tensor_imag_all_zero.d.ts +1 -0
package/dist-lib/numbl-core/version.d.ts +1 -1
package/dist-site-viewer/assets/{index-C5c2lKAx.js → index-D0XGPdHU.js} +267 -267
package/dist-site-viewer/assets/{numbl-worker-CkoM4MUa.js → numbl-worker-B18l6dfh.js} +275 -118
package/dist-site-viewer/index.html +1 -1
package/package.json +1 -1

package/dist-lib/lib.js CHANGED Viewed

@@ -30210,6 +30210,27 @@ function toNumArray(v, name) {
   throw new RuntimeError(`${name}: arguments must be numeric arrays`);
 }
+// src/numbl-core/helpers/effectively-real.ts
+function imagAllZero(imag2) {
+  if (!imag2) return true;
+  for (let i = 0; i < imag2.length; i++) {
+    if (imag2[i] !== 0) return false;
+  }
+  return true;
+}
+function stripZeroImagTensor(t) {
+  if (t.imag && imagAllZero(t.imag)) {
+    const out = RTV.tensor(t.data, t.shape);
+    if (t._isLogical) out._isLogical = true;
+    return out;
+  }
+  return t;
+}
+function stripZeroImagValue(v) {
+  if (isRuntimeTensor(v)) return stripZeroImagTensor(v);
+  return v;
+}
 // src/numbl-core/helpers/reduction/min-max.ts
 function minMaxScan(data, imag2, indices, initial, isBetter, complexIsBetter) {
   let mRe = initial, mIm = 0, mIdx = 0;
@@ -30394,6 +30415,7 @@ function minMaxImpl(name, args, nargout, initial, isBetter, twoArgFn) {
     return isBetter(Math.atan2(imA, reA), Math.atan2(imB, reB));
   };
   args = args.map((a) => isRuntimeSparseMatrix(a) ? sparseToDense(a) : a);
+  args = args.map(stripZeroImagValue);
   if (args.length === 1) {
     const v = args[0];
     if (isRuntimeNumber(v) || isRuntimeLogical(v) || isRuntimeComplexNumber(v)) {
@@ -30881,8 +30903,8 @@ defineBuiltin({
         if (typeof v === "number") return true;
         if (typeof v === "boolean") return true;
         if (isRuntimeComplexNumber(v)) return v.im === 0;
-        if (isRuntimeTensor(v)) return !v.imag;
-        if (isRuntimeSparseMatrix(v)) return !v.pi;
+        if (isRuntimeTensor(v)) return imagAllZero(v.imag);
+        if (isRuntimeSparseMatrix(v)) return !v.pi || imagAllZero(v.pi);
         return true;
       }
     }
@@ -34009,7 +34031,7 @@ defineBuiltin({
           return v;
         }
         if (isRuntimeTensor(v)) {
-          return sortTensor(v, dim, descend, nargout);
+          return sortTensor(stripZeroImagTensor(v), dim, descend, nargout);
         }
         if (isRuntimeCell(v)) {
           return sortCell(v, descend, nargout);
@@ -63821,6 +63843,15 @@ function mtoc2_tensor_flip_complex(a, dimIdx) {
   return r;
 }
+// src/numbl-core/jit/builtins/runtime/tensor_ops/tensor_imag_all_zero.js
+function mtoc2_tensor_imag_all_zero(a) {
+  if (a.imag === void 0) return true;
+  for (let i = 0; i < a.imag.length; i++) {
+    if (a.imag[i] !== 0) return false;
+  }
+  return true;
+}
 // src/numbl-core/jit/builtins/runtime/tensor_ops/tensor_linspace.js
 function mtoc2_tensor_linspace(a, b, n) {
   if (n < 0) n = 0;
@@ -64122,6 +64153,13 @@ function cSqueezeTrailing(dims) {
 function cReduceLaneIm(t, i) {
   return t.imag !== void 0 ? t.imag[i] : 0;
 }
+function cReduceAllImagZero(t) {
+  if (t.imag === void 0) return true;
+  for (let i = 0; i < t.imag.length; i++) {
+    if (t.imag[i] !== 0) return false;
+  }
+  return true;
+}
 function complexAccumAll(t, init, accum, finalize) {
   let acc = { ...init };
   for (let i = 0; i < t.data.length; i++) {
@@ -64179,6 +64217,7 @@ var mtoc2_prod_complex_dim = (t, d) => complexAccumDim(t, d, cProdInit, cProdAcc
 var mtoc2_mean_complex_all = (t) => complexAccumAll(t, cSumInit, cSumAccum, cMeanFinalize);
 var mtoc2_mean_complex_dim = (t, d) => complexAccumDim(t, d, cSumInit, cSumAccum, cMeanFinalize);
 function complexMinmaxAll(t, cmp) {
+  const realMode = cReduceAllImagZero(t);
   let found = false;
   let mRe = NaN;
   let mIm = 0;
@@ -64186,7 +64225,8 @@ function complexMinmaxAll(t, cmp) {
     const xr = t.data[i];
     const xi = cReduceLaneIm(t, i);
     if (xr !== xr || xi !== xi) continue;
-    if (!found || complexBetter(xr, xi, mRe, mIm, cmp)) {
+    const better = realMode ? cmp === "<" ? xr < mRe : xr > mRe : complexBetter(xr, xi, mRe, mIm, cmp);
+    if (!found || better) {
       mRe = xr;
       mIm = xi;
       found = true;
@@ -64209,6 +64249,7 @@ function complexMinmaxDim(t, dim, cmp) {
     return out2;
   }
   const dimIdx = dim - 1;
+  const realMode = cReduceAllImagZero(t);
   const axis = t.shape[dimIdx];
   let before = 1;
   for (let i = 0; i < dimIdx; i++) before *= t.shape[i];
@@ -64229,7 +64270,8 @@ function complexMinmaxDim(t, dim, cmp) {
         const xr = t.data[off];
         const xi = cReduceLaneIm(t, off);
         if (xr !== xr || xi !== xi) continue;
-        if (!found || complexBetter(xr, xi, mRe, mIm, cmp)) {
+        const better = realMode ? cmp === "<" ? xr < mRe : xr > mRe : complexBetter(xr, xi, mRe, mIm, cmp);
+        if (!found || better) {
           mRe = xr;
           mIm = xi;
           found = true;
@@ -64709,6 +64751,35 @@ function complex_sort_indices(a, descending) {
   const n = a.data.length;
   const im = a.imag;
   const idx = new Array(n);
+  for (let i = 0; i < n; i++) idx[i] = i;
+  let realMode = true;
+  if (im !== void 0) {
+    for (let i = 0; i < n; i++) {
+      if (im[i] !== 0) {
+        realMode = false;
+        break;
+      }
+    }
+  }
+  if (realMode) {
+    const re = a.data;
+    idx.sort((p2, q) => {
+      const rp = re[p2];
+      const rq = re[q];
+      const pNaN = rp !== rp;
+      const qNaN = rq !== rq;
+      if (pNaN && qNaN) return 0;
+      if (descending) {
+        if (pNaN) return -1;
+        if (qNaN) return 1;
+        return rp < rq ? 1 : rp > rq ? -1 : 0;
+      }
+      if (pNaN) return 1;
+      if (qNaN) return -1;
+      return rp < rq ? -1 : rp > rq ? 1 : 0;
+    });
+    return idx;
+  }
   const mag = new Float64Array(n);
   const ph = new Float64Array(n);
   for (let i = 0; i < n; i++) {
@@ -64716,7 +64787,6 @@ function complex_sort_indices(a, descending) {
     const xi = im !== void 0 ? im[i] : 0;
     mag[i] = Math.hypot(re, xi);
     ph[i] = Math.atan2(xi, re);
-    idx[i] = i;
   }
   idx.sort((p2, q) => {
     if (mag[p2] < mag[q]) return descending ? 1 : -1;
@@ -65986,6 +66056,7 @@ static void mtoc2__format_walk(mtoc2__writer_fn writer, void *ctx,
   "tensor_fill_nd.h": "/* mtoc2 runtime helper: build an N-D tensor filled with `v`. Real\n * variant allocates a real tensor; complex variant takes `(re, im)`\n * and fills both lanes.\n *\n * Parameterized companion to `mtoc2_tensor_zeros_nd` / `_ones_nd`;\n * activated by the `nan` / `NaN` / `Inf` / `inf` shape-constructor\n * builtins (which would otherwise need their own per-constant fill\n * helpers) and by `repmat(scalar, ...)`. The returned tensor is\n * freshly owned.\n */\n\nstatic mtoc2_tensor_t mtoc2_tensor_fill_nd(double v, int ndim,\n                                           const long *dims) {\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(ndim, dims);\n  size_t n = 1;\n  for (int i = 0; i < ndim; i++) n *= (size_t)out.dims[i];\n  for (size_t i = 0; i < n; i++) out.real[i] = v;\n  return out;\n}\n\nstatic mtoc2_tensor_t mtoc2_tensor_fill_nd_complex(double re, double im,\n                                                   int ndim,\n                                                   const long *dims) {\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(ndim, dims);\n  size_t n = 1;\n  for (int i = 0; i < ndim; i++) n *= (size_t)out.dims[i];\n  for (size_t i = 0; i < n; i++) {\n    out.real[i] = re;\n    out.imag[i] = im;\n  }\n  return out;\n}\n",
   "tensor_fill_square.h": "/* mtoc2 runtime helper: build an n\xD7n real tensor filled with `v`.\n *\n * Single-eval companion to `mtoc2_tensor_fill_nd` for the MATLAB\n * `nan(n)` / `Inf(n)` shorthand when `n` is a runtime expression.\n * See `mtoc2_tensor_zeros_square` for the rationale.\n */\n\nstatic mtoc2_tensor_t mtoc2_tensor_fill_square(double v, long n) {\n  long dims[2] = {n, n};\n  return mtoc2_tensor_fill_nd(v, 2, dims);\n}\n",
   "tensor_flip.h": "/* mtoc2 runtime helper: `flip(t, dimIdx)` \u2014 return a freshly-owned\n * tensor with `t.real` mirrored along axis `dimIdx` (0-based).\n *\n * Numbl's reference is `flipAlongDim` in\n * `interpreter/builtins/array-manipulation.ts` line ~41. Same\n * column-major slab math: stride = product of dims below the axis,\n * outer = product of dims above. For each outer slab, we walk the\n * axis backwards on the source and forward on the destination,\n * copying `strideDim`-element contiguous blocks.\n *\n * `flipud(t)` lowers to `mtoc2_tensor_flip(t, 0)`; `fliplr(t)` to\n * `mtoc2_tensor_flip(t, 1)`; `flip(t, k)` to\n * `mtoc2_tensor_flip(t, k - 1)`. The `dimIdx` is 0-based at the C\n * boundary so the runtime stays uniform across the three source-\n * level builtins. mtoc2 codegen converts MATLAB's 1-based `k` to\n * 0-based at the call site.\n *\n * Out-of-range `dimIdx` (\u2265 ndim) is a no-op flip \u2014 numbl returns\n * the input unchanged in that case (the \"axis is size 1\" rule). We\n * still allocate a fresh copy so the owned-value invariant holds.\n *\n * `mtoc2_tensor_flip_complex` is the sibling that walks both lanes;\n * it tolerates `a.imag == NULL` (a real tensor that flowed in via a\n * complex-typed route) by zero-filling the output imag lane.\n */\n\n#include <string.h>\n#include <stdlib.h>\n\nstatic mtoc2_tensor_t mtoc2_tensor_flip(mtoc2_tensor_t a, long dimIdx) {\n  long total = 1;\n  for (int i = 0; i < a.ndim; i++) total *= a.dims[i];\n  mtoc2_tensor_t r;\n  r.real = mtoc2_alloc((size_t)total * sizeof(double));\n  r.imag = NULL;\n  r.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) r.dims[i] = a.dims[i];\n\n  long axisSize = (dimIdx >= 0 && dimIdx < (long)a.ndim) ? a.dims[dimIdx] : 1;\n  // total==0 (empty along some axis) must short-circuit BEFORE the slab math:\n  // if a dim below the axis is 0, slabSize becomes 0 and total/slabSize is a\n  // 0/0 integer division (SIGILL). 
An empty result needs no copy anyway.\n  if (axisSize <= 1 || total == 0) {\n    // Nothing to flip \u2014 just deep-copy the buffer.\n    if (total > 0) memcpy(r.real, a.real, (size_t)total * sizeof(double));\n    return r;\n  }\n\n  long strideDim = 1;\n  for (long d = 0; d < dimIdx; d++) strideDim *= a.dims[d];\n  long slabSize = strideDim * axisSize;\n  long numOuter = total / slabSize;\n\n  for (long outer = 0; outer < numOuter; outer++) {\n    long base = outer * slabSize;\n    for (long k = 0; k < axisSize; k++) {\n      long srcOff = base + k * strideDim;\n      long dstOff = base + (axisSize - 1 - k) * strideDim;\n      memcpy(\n        r.real + dstOff,\n        a.real + srcOff,\n        (size_t)strideDim * sizeof(double)\n      );\n    }\n  }\n  return r;\n}\n\nstatic mtoc2_tensor_t mtoc2_tensor_flip_complex(mtoc2_tensor_t a, long dimIdx) {\n  long total = 1;\n  for (int i = 0; i < a.ndim; i++) total *= a.dims[i];\n  mtoc2_tensor_t r = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);\n  int srcHasImag = (a.imag != NULL);\n\n  long axisSize = (dimIdx >= 0 && dimIdx < (long)a.ndim) ? 
a.dims[dimIdx] : 1;\n  // See mtoc2_tensor_flip: total==0 must short-circuit before slab math to\n  // avoid a 0/0 division when a dim below the axis is 0.\n  if (axisSize <= 1 || total == 0) {\n    if (total > 0) {\n      memcpy(r.real, a.real, (size_t)total * sizeof(double));\n      if (srcHasImag) {\n        memcpy(r.imag, a.imag, (size_t)total * sizeof(double));\n      } else {\n        memset(r.imag, 0, (size_t)total * sizeof(double));\n      }\n    }\n    return r;\n  }\n\n  if (!srcHasImag && total > 0) {\n    memset(r.imag, 0, (size_t)total * sizeof(double));\n  }\n\n  long strideDim = 1;\n  for (long d = 0; d < dimIdx; d++) strideDim *= a.dims[d];\n  long slabSize = strideDim * axisSize;\n  long numOuter = total / slabSize;\n\n  for (long outer = 0; outer < numOuter; outer++) {\n    long base = outer * slabSize;\n    for (long k = 0; k < axisSize; k++) {\n      long srcOff = base + k * strideDim;\n      long dstOff = base + (axisSize - 1 - k) * strideDim;\n      memcpy(r.real + dstOff, a.real + srcOff,\n             (size_t)strideDim * sizeof(double));\n      if (srcHasImag) {\n        memcpy(r.imag + dstOff, a.imag + srcOff,\n               (size_t)strideDim * sizeof(double));\n      }\n    }\n  }\n  return r;\n}\n",
+  "tensor_imag_all_zero.h": "/* mtoc2 runtime helper: true (1.0) when a tensor carries no imaginary\n * content \u2014 NULL imag lane, or every imag element exactly zero. `isreal`\n * uses this for complex-typed tensors the JIT could not prove real at\n * compile time, reporting realness by value (matching the interpreter\n * and the complex-scalar `cimag(z) == 0` rule). Returns a logical double. */\n\nstatic double mtoc2_tensor_imag_all_zero(mtoc2_tensor_t a) {\n  if (a.imag == NULL) return 1.0;\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  for (long i = 0; i < n; i++) {\n    if (a.imag[i] != 0.0) return 0.0;\n  }\n  return 1.0;\n}\n",
   "tensor_linspace.h": "/* mtoc2 runtime helper: build a 1\xD7n row tensor of n linearly-spaced\n * values from `a` to `b` inclusive. Matches numbl's `linspace` byte-\n * for-byte:\n *\n *   - n <= 0  \u2192 1\xD70 empty tensor.\n *   - n == 1  \u2192 just `[b]` (matches MATLAB; not the midpoint).\n *   - n  > 1  \u2192 first/last slots pinned at `a`/`b` exactly so a NaN\n *               or Inf endpoint doesn't contaminate the other end;\n *               inner values are `a + (b - a) * i / (n - 1)`.\n *\n * Opposite-sign infinite endpoints place 0 at the exact center for\n * odd n (e.g. `linspace(-Inf, Inf, 5)` \u2192 `[-Inf, ?, 0, ?, Inf]`).\n */\n\n#include <math.h>\n\nstatic mtoc2_tensor_t mtoc2_tensor_linspace(double a, double b, long n) {\n  if (n < 0) n = 0;\n  mtoc2_tensor_t out = mtoc2_tensor_alloc(1, n);\n  if (n == 0) return out;\n  if (n == 1) {\n    out.real[0] = b;\n    return out;\n  }\n  out.real[0] = a;\n  out.real[n - 1] = b;\n  for (long i = 1; i < n - 1; i++) {\n    out.real[i] = a + (b - a) * (double)i / (double)(n - 1);\n  }\n  if ((n & 1) == 1 && !isfinite(a) && !isfinite(b)) {\n    double sa = (a > 0) - (a < 0);\n    double sb = (b > 0) - (b < 0);\n    if (sa != sb) {\n      out.real[(n - 1) / 2] = 0.0;\n    }\n  }\n  return out;\n}\n",
   "tensor_logical_mask.h": '/* mtoc2 runtime helper: logical-mask indexing support.\n *\n * `mtoc2_logical_mask_indices` scans `mask` column-major and fills\n * `out_indices` with the 0-based positions where the mask is truthy.\n * Each truthy position must be less than `axis_len`; otherwise this\n * aborts with a numbl-style "Index exceeds array bounds" message via\n * `mtoc2_oob_abort`. Returns the truthy count, which is also the\n * number of entries written into `out_indices`. `out_indices` must\n * have room for at least `mask.numel()` longs.\n *\n * `axis` is the axis number for the diagnostic (0-based) when this\n * helper is called for a per-axis slot (`M(:, mask)` \u2192 axis = 1), or\n * `-1` for the linear single-slot form (`a(mask)`).\n *\n * Used by both reads (`IndexSlice` with a `LogicalMask` slot) and\n * linear-form writes (`IndexSliceStore` with a single `LogicalMask`\n * slot). The caller allocates the index buffer with `mtoc2_alloc`,\n * passes it in, and frees it after the iteration that consumes it.\n */\n\nstatic long mtoc2_logical_mask_indices(\n  mtoc2_tensor_t mask, long axis_len, int axis, const char *loc,\n  long *out_indices\n) {\n  long mask_n = 1;\n  for (int d = 0; d < mask.ndim; d++) mask_n *= mask.dims[d];\n  long count = 0;\n  for (long i = 0; i < mask_n; i++) {\n    if (mask.real[i] != 0.0) {\n      if (i >= axis_len) {\n        mtoc2_oob_abort(loc, axis, i + 1, 1, axis_len);\n      }\n      out_indices[count++] = i;\n    }\n  }\n  return count;\n}\n',
   "tensor_logical_real.h": '/* mtoc2 runtime helpers: elementwise logical ops on tensors.\n *\n * Same allocate-and-fill pattern as tensor_unary_real_math.h, but the\n * per-element kernel is a logical predicate that returns 0.0 or 1.0.\n *\n * `mtoc2_tensor_not` mirrors numbl\'s `not(v)` (runtimeOperators.ts):\n * a real-tensor input produces a freshly-owned logical-typed tensor\n * of the same shape with `out[i] = (in[i] == 0.0) ? 1.0 : 0.0`. The\n * complex sibling fires "true" iff both lanes are exactly zero. The\n * result is a real (logical-typed) tensor in both cases \u2014 the type\n * system records `elem: "logical"` so disp / downstream consumers\n * know how to interpret the doubles.\n */\n#include <stdlib.h>\n\nstatic mtoc2_tensor_t mtoc2_tensor_not(mtoc2_tensor_t a) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t r;\n  r.real = mtoc2_alloc((size_t)n * sizeof(double));\n  r.imag = NULL;\n  r.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) r.dims[i] = a.dims[i];\n  for (long i = 0; i < n; i++) r.real[i] = (a.real[i] == 0.0) ? 1.0 : 0.0;\n  return r;\n}\n\nstatic mtoc2_tensor_t mtoc2_tensor_not_complex(mtoc2_tensor_t a) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t r;\n  r.real = mtoc2_alloc((size_t)n * sizeof(double));\n  r.imag = NULL;\n  r.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) r.dims[i] = a.dims[i];\n  int srcHasImag = (a.imag != NULL);\n  for (long i = 0; i < n; i++) {\n    double re = a.real[i];\n    double im = srcHasImag ? a.imag[i] : 0.0;\n    r.real[i] = (re == 0.0 && im == 0.0) ? 1.0 : 0.0;\n  }\n  return r;\n}\n',
@@ -65997,13 +66068,13 @@ static void mtoc2__format_walk(mtoc2__writer_fn writer, void *ctx,
   "tensor_ones_nd.h": "/* mtoc2 runtime helper: build a real N-D tensor filled with ones.\n *\n * Allocates via `mtoc2_tensor_alloc_nd`, then fills the `real` buffer\n * with `1.0` via a plain element loop (`memset` only works for byte\n * patterns; `1.0` is not such a pattern). The returned tensor is\n * freshly owned; `imag` is NULL.\n */\n\nstatic mtoc2_tensor_t mtoc2_tensor_ones_nd(int ndim, const long *dims) {\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(ndim, dims);\n  size_t n = 1;\n  for (int i = 0; i < ndim; i++) n *= (size_t)out.dims[i];\n  for (size_t i = 0; i < n; i++) out.real[i] = 1.0;\n  return out;\n}\n",
   "tensor_ones_square.h": "/* mtoc2 runtime helper: build an n\xD7n real tensor filled with ones.\n *\n * Single-eval companion to `mtoc2_tensor_ones_nd` for the MATLAB\n * `ones(n)` shorthand when `n` is a runtime expression. See the\n * `mtoc2_tensor_zeros_square` header for the rationale.\n */\n\nstatic mtoc2_tensor_t mtoc2_tensor_ones_square(long n) {\n  long dims[2] = {n, n};\n  return mtoc2_tensor_ones_nd(2, dims);\n}\n",
   "tensor_predicate.h": "/* mtoc2 runtime helpers: elementwise tensor \u2192 logical-tensor\n * predicates (`isnan`, `isinf`, `isfinite`, `logical`). Same\n * allocate-and-fill shape as `tensor_unary_real_math.h`; each element\n * maps to 1.0 / 0.0. The result is logical at the source level (the\n * buffer is the usual Float64 lane, `imag == NULL`).\n *\n * The `_complex` siblings operate per-element on `(re, im)`; for\n * `isnan` / `isinf` the predicate fires if either component\n * triggers, for `isfinite` both must be finite. They tolerate\n * `a.imag == NULL` (a real tensor that flowed through a\n * complex-typed route) by treating the imag input as zero.\n */\n#include <math.h>\n#include <stdlib.h>\n\n#define MTOC2_DEFINE_UNARY_PRED(name, EXPR)              \\\n  static mtoc2_tensor_t name(mtoc2_tensor_t a) {         \\\n    long n = 1;                                          \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];     \\\n    mtoc2_tensor_t r;                                    \\\n    r.real = mtoc2_alloc((size_t)n * sizeof(double));    \\\n    r.imag = NULL;                                       \\\n    r.ndim = a.ndim;                                     \\\n    for (int i = 0; i < a.ndim; i++) r.dims[i] = a.dims[i]; \\\n    MTOC2_OMP_PARFOR_N                                   \\\n    for (long i = 0; i < n; i++) {                       \\\n      double x = a.real[i];                              \\\n      r.real[i] = (EXPR) ? 
1.0 : 0.0;                    \\\n    }                                                    \\\n    return r;                                            \\\n  }\n\nMTOC2_DEFINE_UNARY_PRED(mtoc2_tensor_isnan, isnan(x))\nMTOC2_DEFINE_UNARY_PRED(mtoc2_tensor_logical, x != 0.0)\nMTOC2_DEFINE_UNARY_PRED(mtoc2_tensor_isinf, isinf(x))\nMTOC2_DEFINE_UNARY_PRED(mtoc2_tensor_isfinite, isfinite(x))\n\n#undef MTOC2_DEFINE_UNARY_PRED\n\n#define MTOC2_DEFINE_UNARY_PRED_COMPLEX(name, EXPR)         \\\n  static mtoc2_tensor_t name(mtoc2_tensor_t a) {            \\\n    long n = 1;                                             \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];        \\\n    mtoc2_tensor_t r;                                       \\\n    r.real = mtoc2_alloc((size_t)n * sizeof(double));       \\\n    r.imag = NULL;                                          \\\n    r.ndim = a.ndim;                                        \\\n    for (int i = 0; i < a.ndim; i++) r.dims[i] = a.dims[i]; \\\n    MTOC2_OMP_PARFOR_N                                      \\\n    for (long i = 0; i < n; i++) {                          \\\n      double re = a.real[i];                                \\\n      double im = (a.imag != NULL) ? a.imag[i] : 0.0;       \\\n      r.real[i] = (EXPR) ? 1.0 : 0.0;                       \\\n    }                                                       \\\n    return r;                                               \\\n  }\n\nMTOC2_DEFINE_UNARY_PRED_COMPLEX(mtoc2_tensor_isnan_complex,\n                                isnan(re) || isnan(im))\nMTOC2_DEFINE_UNARY_PRED_COMPLEX(mtoc2_tensor_isinf_complex,\n                                isinf(re) || isinf(im))\nMTOC2_DEFINE_UNARY_PRED_COMPLEX(mtoc2_tensor_isfinite_complex,\n                                isfinite(re) && isfinite(im))\n\n#undef MTOC2_DEFINE_UNARY_PRED_COMPLEX\n",
-  "tensor_reduce_complex.h": '/* mtoc2 runtime helpers: complex-tensor reductions.\n *\n * Sibling of `tensor_reduce_real.h`. Same `_all` / `_dim` shape per\n * op; each kernel walks both lanes and builds intermediate\n * `double _Complex` values through `mtoc2_cmake` / `mtoc2_c*` so the\n * c2js backend can translate the bodies straight (no bare\n * `<complex.h>` operators).\n *\n * Result types per op:\n *   sum / prod / mean         \u2192 complex (lane-pair accumulator)\n *   min / max                 \u2192 complex (magnitude compare, atan2\n *                                tiebreak \u2014 matches numbl\'s\n *                                `complexIsBetter`)\n *   any / all                 \u2192 real (toBool: `re != 0 || im != 0`,\n *                                then aggregate via OR/AND)\n *\n * Input tolerance: `imag == NULL` (a real tensor flowing in through\n * a complex-typed route) is treated as zero on every cell.\n *\n * `_all` returns a `double _Complex` for the numeric reducers and a\n * `double` for the logical reducers. `_dim` returns a freshly-owned\n * complex tensor (numeric reducers) or a real tensor (logical\n * reducers) of the reduced shape \u2014 same trailing-singleton squeeze\n * rule as the real helper.\n */\n\n#include <math.h>\n#include <stdlib.h>\n#include <string.h>\n\n/* Shared with the real helper; defined inline here so this file is\n * standalone-includable (the runtime activator may pull the complex\n * snippet in independently of the real one). */\nstatic void mtoc2__squeeze_trailing_c(int *ndim, long *dims) {\n  while (*ndim > 2 && dims[*ndim - 1] == 1) {\n    (*ndim)--;\n  }\n}\n\n/* Numeric (sum/prod/mean) reduction template \u2014 complex accumulator. 
*/\n#define MTOC2_DEFINE_CACCUM_REDUCTION(name, INIT, ACCUM, FINALIZE)            \\\n  static double _Complex mtoc2_##name##_complex_all(mtoc2_tensor_t a) {       \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    double _Complex acc = (INIT);                                             \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double aim = (a.imag != NULL) ? a.imag[i] : 0.0;                        \\\n      double _Complex x = mtoc2_cmake(a.real[i], aim);                        \\\n      acc = ACCUM(acc, x);                                                    \\\n    }                                                                         \\\n    return FINALIZE(acc, n);                                                  \\\n  }                                                                           \\\n                                                                              \\\n  static mtoc2_tensor_t mtoc2_##name##_complex_dim(                           \\\n      mtoc2_tensor_t a, int dim) {                                            \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_complex_dim: dim must be >= 1 (got %d)\\n", dim); \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);     \\\n      memcpy(out.real, a.real, (size_t)total * sizeof(double));               \\\n      if 
(a.imag != NULL) {                                                   \\\n        memcpy(out.imag, a.imag, (size_t)total * sizeof(double));             \\\n      } else {                                                                \\\n        memset(out.imag, 0, (size_t)total * sizeof(double));                  \\\n      }                                                                       \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing_c(&out_ndim, out_dims);                           \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(out_ndim, out_dims);   \\\n    long slab = before * axis;                                                \\\n    for (long outer = 0; outer < after; outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        double _Complex acc = (INIT);                                         \\\n        for (long k = 0; k < axis; k++) 
{                                     \\\n          long off = slabBase + inner + k * before;                           \\\n          double aim = (a.imag != NULL) ? a.imag[off] : 0.0;                  \\\n          double _Complex x = mtoc2_cmake(a.real[off], aim);                  \\\n          acc = ACCUM(acc, x);                                                \\\n        }                                                                     \\\n        double _Complex fin = FINALIZE(acc, axis);                            \\\n        long dst = outer * before + inner;                                    \\\n        out.real[dst] = mtoc2_creal(fin);                                     \\\n        out.imag[dst] = mtoc2_cimag(fin);                                     \\\n      }                                                                       \\\n    }                                                                         \\\n    return out;                                                               \\\n  }\n\n/* min/max template \u2014 complex compare via magnitude + atan2 tiebreak.\n * NaN-skip on either lane. Accumulator seed is NaN+NaN; first non-\n * NaN element captures, later non-NaN elements compare. 
*/\n#define MTOC2_DEFINE_CMINMAX_REDUCTION(name, CMP)                              \\\n  static int mtoc2__##name##_complex_better(                                  \\\n      double aRe, double aIm, double bRe, double bIm) {                       \\\n    double absA = hypot(aRe, aIm);                                            \\\n    double absB = hypot(bRe, bIm);                                            \\\n    if (absA != absB) return absA CMP absB;                                   \\\n    return atan2(aIm, aRe) CMP atan2(bIm, bRe);                               \\\n  }                                                                           \\\n  static double _Complex mtoc2_##name##_complex_all(mtoc2_tensor_t a) {       \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    int found = 0;                                                            \\\n    double mRe = NAN, mIm = 0.0;                                              \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double xr = a.real[i];                                                  \\\n      double xi = (a.imag != NULL) ? 
a.imag[i] : 0.0;                         \\\n      if (xr != xr || xi != xi) continue;                                     \\\n      if (!found || mtoc2__##name##_complex_better(xr, xi, mRe, mIm)) {       \\\n        mRe = xr;                                                             \\\n        mIm = xi;                                                             \\\n        found = 1;                                                            \\\n      }                                                                       \\\n    }                                                                         \\\n    return mtoc2_cmake(mRe, mIm);                                             \\\n  }                                                                           \\\n                                                                              \\\n  static mtoc2_tensor_t mtoc2_##name##_complex_dim(                           \\\n      mtoc2_tensor_t a, int dim) {                                            \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_complex_dim: dim must be >= 1 (got %d)\\n", dim); \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);     \\\n      memcpy(out.real, a.real, (size_t)total * sizeof(double));               \\\n      if (a.imag != NULL) {                                                   \\\n        memcpy(out.imag, a.imag, (size_t)total * sizeof(double));             \\\n      } else {                                           
                     \\\n        memset(out.imag, 0, (size_t)total * sizeof(double));                  \\\n      }                                                                       \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing_c(&out_ndim, out_dims);                           \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(out_ndim, out_dims);   \\\n    long slab = before * axis;                                                \\\n    for (long outer = 0; outer < after; outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        int found = 0;                                                        \\\n        double mRe = NAN, mIm = 0.0;                                          \\\n        for (long k = 0; k < axis; k++) {                                     \\\n          long off = slabBase + inner + k * before;                           \\\n       
   double xr = a.real[off];                                            \\\n          double xi = (a.imag != NULL) ? a.imag[off] : 0.0;                   \\\n          if (xr != xr || xi != xi) continue;                                 \\\n          if (!found || mtoc2__##name##_complex_better(xr, xi, mRe, mIm)) {   \\\n            mRe = xr;                                                         \\\n            mIm = xi;                                                         \\\n            found = 1;                                                        \\\n          }                                                                   \\\n        }                                                                     \\\n        long dst = outer * before + inner;                                    \\\n        out.real[dst] = mRe;                                                  \\\n        out.imag[dst] = mIm;                                                  \\\n      }                                                                       \\\n    }                                                                         \\\n    return out;                                                               \\\n  }\n\n/* any/all template \u2014 real result; toBool per element (either lane\n * nonzero). Mirrors `MTOC2_DEFINE_LOGICAL_REDUCTION` shape. 
*/\n#define MTOC2_DEFINE_CLOGICAL_REDUCTION(name, EMPTY_RESULT, SHORT_BODY)        \\\n  static double mtoc2_##name##_complex_all(mtoc2_tensor_t a) {                \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    if (n == 0) return (double)(EMPTY_RESULT);                                \\\n    double acc = (double)(EMPTY_RESULT);                                      \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double xr = a.real[i];                                                  \\\n      double xi = (a.imag != NULL) ? a.imag[i] : 0.0;                         \\\n      int x = (xr != 0.0 || xi != 0.0);                                       \\\n      SHORT_BODY;                                                             \\\n    }                                                                         \\\n    return acc;                                                               \\\n  }                                                                           \\\n                                                                              \\\n  static mtoc2_tensor_t mtoc2_##name##_complex_dim(                           \\\n      mtoc2_tensor_t a, int dim) {                                            \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_complex_dim: dim must be >= 1 (got %d)\\n", dim); \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      long 
out_dims[MTOC2_MAX_NDIM];                                          \\\n      int out_ndim = a.ndim;                                                  \\\n      for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];               \\\n      mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, out_dims);         \\\n      for (long i = 0; i < total; i++) {                                      \\\n        double xr = a.real[i];                                                \\\n        double xi = (a.imag != NULL) ? a.imag[i] : 0.0;                       \\\n        out.real[i] = (xr != 0.0 || xi != 0.0) ? 1.0 : 0.0;                   \\\n      }                                                                       \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing_c(&out_ndim, out_dims);                           \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, out_dims);           \\\n    long slab = before * axis;                                                \\\n    for (long outer = 0; outer < after; 
outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        double acc = (double)(EMPTY_RESULT);                                  \\\n        for (long k = 0; k < axis; k++) {                                     \\\n          long off = slabBase + inner + k * before;                           \\\n          double xr = a.real[off];                                            \\\n          double xi = (a.imag != NULL) ? a.imag[off] : 0.0;                   \\\n          int x = (xr != 0.0 || xi != 0.0);                                   \\\n          SHORT_BODY;                                                         \\\n        }                                                                     \\\n        out.real[outer * before + inner] = acc;                               \\\n      }                                                                       \\\n    }                                                                         \\\n    return out;                                                               \\\n  }\n\n/* Accumulator-statement macros. */\n#define MTOC2_CACC_SUM(acc, x) mtoc2_cadd((acc), (x))\n#define MTOC2_CACC_PROD(acc, x) mtoc2_cmul((acc), (x))\n#define MTOC2_CFIN_ID(acc, n) (acc)\n#define MTOC2_CFIN_MEAN(acc, n) mtoc2_cdiv((acc), mtoc2_cmake((double)(n), 0.0))\n\nMTOC2_DEFINE_CACCUM_REDUCTION(sum, mtoc2_cmake(0.0, 0.0), MTOC2_CACC_SUM, MTOC2_CFIN_ID)\nMTOC2_DEFINE_CACCUM_REDUCTION(prod, mtoc2_cmake(1.0, 0.0), MTOC2_CACC_PROD, MTOC2_CFIN_ID)\nMTOC2_DEFINE_CACCUM_REDUCTION(mean, mtoc2_cmake(0.0, 0.0), MTOC2_CACC_SUM, MTOC2_CFIN_MEAN)\n\nMTOC2_DEFINE_CMINMAX_REDUCTION(min, <)\nMTOC2_DEFINE_CMINMAX_REDUCTION(max, >)\n\nMTOC2_DEFINE_CLOGICAL_REDUCTION(any, 0,\n  if (x) { acc = 1.0; break; })\nMTOC2_DEFINE_CLOGICAL_REDUCTION(all, 1,\n  if (!x) { acc = 0.0; break; })\n',
+  "tensor_reduce_complex.h": '/* mtoc2 runtime helpers: complex-tensor reductions.\n *\n * Sibling of `tensor_reduce_real.h`. Same `_all` / `_dim` shape per\n * op; each kernel walks both lanes and builds intermediate\n * `double _Complex` values through `mtoc2_cmake` / `mtoc2_c*` so the\n * c2js backend can translate the bodies straight (no bare\n * `<complex.h>` operators).\n *\n * Result types per op:\n *   sum / prod / mean         \u2192 complex (lane-pair accumulator)\n *   min / max                 \u2192 complex (magnitude compare, atan2\n *                                tiebreak \u2014 matches numbl\'s\n *                                `complexIsBetter`)\n *   any / all                 \u2192 real (toBool: `re != 0 || im != 0`,\n *                                then aggregate via OR/AND)\n *\n * Input tolerance: `imag == NULL` (a real tensor flowing in through\n * a complex-typed route) is treated as zero on every cell.\n *\n * `_all` returns a `double _Complex` for the numeric reducers and a\n * `double` for the logical reducers. `_dim` returns a freshly-owned\n * complex tensor (numeric reducers) or a real tensor (logical\n * reducers) of the reduced shape \u2014 same trailing-singleton squeeze\n * rule as the real helper.\n */\n\n#include <math.h>\n#include <stdlib.h>\n#include <string.h>\n\n/* Shared with the real helper; defined inline here so this file is\n * standalone-includable (the runtime activator may pull the complex\n * snippet in independently of the real one). */\nstatic void mtoc2__squeeze_trailing_c(int *ndim, long *dims) {\n  while (*ndim > 2 && dims[*ndim - 1] == 1) {\n    (*ndim)--;\n  }\n}\n\n/* Numeric (sum/prod/mean) reduction template \u2014 complex accumulator. 
*/\n#define MTOC2_DEFINE_CACCUM_REDUCTION(name, INIT, ACCUM, FINALIZE)            \\\n  static double _Complex mtoc2_##name##_complex_all(mtoc2_tensor_t a) {       \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    double _Complex acc = (INIT);                                             \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double aim = (a.imag != NULL) ? a.imag[i] : 0.0;                        \\\n      double _Complex x = mtoc2_cmake(a.real[i], aim);                        \\\n      acc = ACCUM(acc, x);                                                    \\\n    }                                                                         \\\n    return FINALIZE(acc, n);                                                  \\\n  }                                                                           \\\n                                                                              \\\n  static mtoc2_tensor_t mtoc2_##name##_complex_dim(                           \\\n      mtoc2_tensor_t a, int dim) {                                            \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_complex_dim: dim must be >= 1 (got %d)\\n", dim); \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);     \\\n      memcpy(out.real, a.real, (size_t)total * sizeof(double));               \\\n      if 
(a.imag != NULL) {                                                   \\\n        memcpy(out.imag, a.imag, (size_t)total * sizeof(double));             \\\n      } else {                                                                \\\n        memset(out.imag, 0, (size_t)total * sizeof(double));                  \\\n      }                                                                       \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing_c(&out_ndim, out_dims);                           \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(out_ndim, out_dims);   \\\n    long slab = before * axis;                                                \\\n    for (long outer = 0; outer < after; outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        double _Complex acc = (INIT);                                         \\\n        for (long k = 0; k < axis; k++) 
{                                     \\\n          long off = slabBase + inner + k * before;                           \\\n          double aim = (a.imag != NULL) ? a.imag[off] : 0.0;                  \\\n          double _Complex x = mtoc2_cmake(a.real[off], aim);                  \\\n          acc = ACCUM(acc, x);                                                \\\n        }                                                                     \\\n        double _Complex fin = FINALIZE(acc, axis);                            \\\n        long dst = outer * before + inner;                                    \\\n        out.real[dst] = mtoc2_creal(fin);                                     \\\n        out.imag[dst] = mtoc2_cimag(fin);                                     \\\n      }                                                                       \\\n    }                                                                         \\\n    return out;                                                               \\\n  }\n\n/* True when the tensor carries no imaginary content (NULL lane or every\n * element zero). Such a tensor is real in value \u2014 min/max must order by\n * value, not magnitude, to match the interpreter and MATLAB on real data. */\nstatic int mtoc2__creduce_all_imag_zero(mtoc2_tensor_t a) {\n  if (a.imag == NULL) return 1;\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  for (long i = 0; i < n; i++) {\n    if (a.imag[i] != 0.0) return 0;\n  }\n  return 1;\n}\n\n/* min/max template \u2014 complex compare via magnitude + atan2 tiebreak.\n * When the whole tensor is real (all-zero imag) we order by value\n * instead. NaN-skip on either lane. Accumulator seed is NaN+NaN; first\n * non-NaN element captures, later non-NaN elements compare. 
*/\n#define MTOC2_DEFINE_CMINMAX_REDUCTION(name, CMP)                              \\\n  static int mtoc2__##name##_complex_better(                                  \\\n      double aRe, double aIm, double bRe, double bIm) {                       \\\n    double absA = hypot(aRe, aIm);                                            \\\n    double absB = hypot(bRe, bIm);                                            \\\n    if (absA != absB) return absA CMP absB;                                   \\\n    return atan2(aIm, aRe) CMP atan2(bIm, bRe);                               \\\n  }                                                                           \\\n  static double _Complex mtoc2_##name##_complex_all(mtoc2_tensor_t a) {       \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    int realMode = mtoc2__creduce_all_imag_zero(a);                           \\\n    int found = 0;                                                            \\\n    double mRe = NAN, mIm = 0.0;                                              \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double xr = a.real[i];                                                  \\\n      double xi = (a.imag != NULL) ? a.imag[i] : 0.0;                         \\\n      if (xr != xr || xi != xi) continue;                                     \\\n      int better = realMode ? 
(xr CMP mRe)                                    \\\n                            : mtoc2__##name##_complex_better(xr, xi, mRe, mIm); \\\n      if (!found || better) {                                                 \\\n        mRe = xr;                                                             \\\n        mIm = xi;                                                             \\\n        found = 1;                                                            \\\n      }                                                                       \\\n    }                                                                         \\\n    return mtoc2_cmake(mRe, mIm);                                             \\\n  }                                                                           \\\n                                                                              \\\n  static mtoc2_tensor_t mtoc2_##name##_complex_dim(                           \\\n      mtoc2_tensor_t a, int dim) {                                            \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_complex_dim: dim must be >= 1 (got %d)\\n", dim); \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);     \\\n      memcpy(out.real, a.real, (size_t)total * sizeof(double));               \\\n      if (a.imag != NULL) {                                                   \\\n        memcpy(out.imag, a.imag, (size_t)total * sizeof(double));             \\\n      } else {                                  
                              \\\n        memset(out.imag, 0, (size_t)total * sizeof(double));                  \\\n      }                                                                       \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing_c(&out_ndim, out_dims);                           \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(out_ndim, out_dims);   \\\n    long slab = before * axis;                                                \\\n    int realMode = mtoc2__creduce_all_imag_zero(a);                           \\\n    for (long outer = 0; outer < after; outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        int found = 0;                                                        \\\n        double mRe = NAN, mIm = 0.0;                                          \\\n        for (long k = 0; k < axis; k++) {                                     
\\\n          long off = slabBase + inner + k * before;                           \\\n          double xr = a.real[off];                                            \\\n          double xi = (a.imag != NULL) ? a.imag[off] : 0.0;                   \\\n          if (xr != xr || xi != xi) continue;                                 \\\n          int better = realMode ? (xr CMP mRe)                                \\\n                                : mtoc2__##name##_complex_better(xr, xi, mRe, mIm); \\\n          if (!found || better) {                                             \\\n            mRe = xr;                                                         \\\n            mIm = xi;                                                         \\\n            found = 1;                                                        \\\n          }                                                                   \\\n        }                                                                     \\\n        long dst = outer * before + inner;                                    \\\n        out.real[dst] = mRe;                                                  \\\n        out.imag[dst] = mIm;                                                  \\\n      }                                                                       \\\n    }                                                                         \\\n    return out;                                                               \\\n  }\n\n/* any/all template \u2014 real result; toBool per element (either lane\n * nonzero). Mirrors `MTOC2_DEFINE_LOGICAL_REDUCTION` shape. 
*/\n#define MTOC2_DEFINE_CLOGICAL_REDUCTION(name, EMPTY_RESULT, SHORT_BODY)        \\\n  static double mtoc2_##name##_complex_all(mtoc2_tensor_t a) {                \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    if (n == 0) return (double)(EMPTY_RESULT);                                \\\n    double acc = (double)(EMPTY_RESULT);                                      \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double xr = a.real[i];                                                  \\\n      double xi = (a.imag != NULL) ? a.imag[i] : 0.0;                         \\\n      int x = (xr != 0.0 || xi != 0.0);                                       \\\n      SHORT_BODY;                                                             \\\n    }                                                                         \\\n    return acc;                                                               \\\n  }                                                                           \\\n                                                                              \\\n  static mtoc2_tensor_t mtoc2_##name##_complex_dim(                           \\\n      mtoc2_tensor_t a, int dim) {                                            \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_complex_dim: dim must be >= 1 (got %d)\\n", dim); \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      long 
out_dims[MTOC2_MAX_NDIM];                                          \\\n      int out_ndim = a.ndim;                                                  \\\n      for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];               \\\n      mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, out_dims);         \\\n      for (long i = 0; i < total; i++) {                                      \\\n        double xr = a.real[i];                                                \\\n        double xi = (a.imag != NULL) ? a.imag[i] : 0.0;                       \\\n        out.real[i] = (xr != 0.0 || xi != 0.0) ? 1.0 : 0.0;                   \\\n      }                                                                       \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing_c(&out_ndim, out_dims);                           \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, out_dims);           \\\n    long slab = before * axis;                                                \\\n    for (long outer = 0; outer < after; 
outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        double acc = (double)(EMPTY_RESULT);                                  \\\n        for (long k = 0; k < axis; k++) {                                     \\\n          long off = slabBase + inner + k * before;                           \\\n          double xr = a.real[off];                                            \\\n          double xi = (a.imag != NULL) ? a.imag[off] : 0.0;                   \\\n          int x = (xr != 0.0 || xi != 0.0);                                   \\\n          SHORT_BODY;                                                         \\\n        }                                                                     \\\n        out.real[outer * before + inner] = acc;                               \\\n      }                                                                       \\\n    }                                                                         \\\n    return out;                                                               \\\n  }\n\n/* Accumulator-statement macros. */\n#define MTOC2_CACC_SUM(acc, x) mtoc2_cadd((acc), (x))\n#define MTOC2_CACC_PROD(acc, x) mtoc2_cmul((acc), (x))\n#define MTOC2_CFIN_ID(acc, n) (acc)\n#define MTOC2_CFIN_MEAN(acc, n) mtoc2_cdiv((acc), mtoc2_cmake((double)(n), 0.0))\n\nMTOC2_DEFINE_CACCUM_REDUCTION(sum, mtoc2_cmake(0.0, 0.0), MTOC2_CACC_SUM, MTOC2_CFIN_ID)\nMTOC2_DEFINE_CACCUM_REDUCTION(prod, mtoc2_cmake(1.0, 0.0), MTOC2_CACC_PROD, MTOC2_CFIN_ID)\nMTOC2_DEFINE_CACCUM_REDUCTION(mean, mtoc2_cmake(0.0, 0.0), MTOC2_CACC_SUM, MTOC2_CFIN_MEAN)\n\nMTOC2_DEFINE_CMINMAX_REDUCTION(min, <)\nMTOC2_DEFINE_CMINMAX_REDUCTION(max, >)\n\nMTOC2_DEFINE_CLOGICAL_REDUCTION(any, 0,\n  if (x) { acc = 1.0; break; })\nMTOC2_DEFINE_CLOGICAL_REDUCTION(all, 1,\n  if (!x) { acc = 0.0; break; })\n',
   "tensor_reduce_real.h": '/* mtoc2 runtime helpers: real-tensor reductions.\n *\n * One macro per op generates two helpers:\n *\n *   mtoc2_<name>_all(a)       \u2014 reduce every element to a scalar.\n *   mtoc2_<name>_dim(a, dim)  \u2014 reduce along the 1-based axis `dim`,\n *                               returning a freshly-owned tensor.\n *\n * The `_dim` template mirrors numbl\'s `forEachSlice`: compute\n * `before = prod(dims[0..dim-2])`, `axis = dims[dim-1]`,\n * `after = prod(dims[dim..ndim-1])`. Walk the column-major buffer in\n * `(after \xD7 before)` fiber order with stride `before` between\n * elements along the reduced axis. Output dims are the input dims\n * with `dims[dim-1] = 1`, then trailing singletons stripped subject\n * to a 2-axis floor (matches the type system\'s\n * `tensorDoubleFromDims` rule).\n *\n * If `dim > a.ndim` the runtime emits a per-op no-op: every reducer\n * (sum/prod/mean/min/max) copies `a` as-is; the logical reducers\n * (any/all) emit an elementwise cast to {0, 1}. The transfer step\n * already proved the output shape, so this branch only fires when\n * the type-side dim/shape analysis can\'t fold to AxisAll.\n *\n * Real-only \u2014 complex is out of scope. Same-shape and column-major\n * conventions match the rest of mtoc2\'s tensor runtime.\n */\n\n#include <math.h>\n#include <stdlib.h>\n#include <string.h>\n\n/* Strip trailing singleton axes down to a 2-axis floor. Updates\n * `*ndim` in place; `dims` is the row buffer. */\nstatic void mtoc2__squeeze_trailing(int *ndim, long *dims) {\n  while (*ndim > 2 && dims[*ndim - 1] == 1) {\n    (*ndim)--;\n  }\n}\n\n/* Helper: reduce-all loop for accumulator-based reducers\n * (sum, prod, mean). `INIT` seeds the accumulator; `ACCUM(acc, x)`\n * is a C statement updating `acc`; `FINALIZE(acc, n)` is the final\n * transformation given the count. 
*/\n#define MTOC2_DEFINE_ACCUM_REDUCTION(name, INIT, ACCUM, FINALIZE)             \\\n  static double mtoc2_##name##_all(mtoc2_tensor_t a) {                        \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    double acc = (INIT);                                                      \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double x = a.real[i];                                                   \\\n      ACCUM(acc, x);                                                          \\\n    }                                                                         \\\n    return FINALIZE(acc, n);                                                  \\\n  }                                                                           \\\n                                                                              \\\n  static mtoc2_tensor_t mtoc2_##name##_dim(mtoc2_tensor_t a, int dim) {       \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_dim: dim must be >= 1 (got %d)\\n",    \\\n              dim);                                                           \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      /* No-op axis: output is same shape/data as input (fresh copy). 
*/      \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      mtoc2_tensor_t out;                                                     \\\n      out.ndim = a.ndim;                                                      \\\n      for (int i = 0; i < a.ndim; i++) out.dims[i] = a.dims[i];               \\\n      out.real = mtoc2_alloc((size_t)total * sizeof(double));                 \\\n      out.imag = NULL;                                                        \\\n      memcpy(out.real, a.real, (size_t)total * sizeof(double));               \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_total = before * after;                                          \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing(&out_ndim, out_dims);                             \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, out_dims);           \\\n    long slab = before * axis;                                                \\\n    for (long outer 
= 0; outer < after; outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        double acc = (INIT);                                                  \\\n        for (long k = 0; k < axis; k++) {                                     \\\n          double x = a.real[slabBase + inner + k * before];                   \\\n          ACCUM(acc, x);                                                      \\\n        }                                                                     \\\n        out.real[outer * before + inner] = FINALIZE(acc, axis);               \\\n      }                                                                       \\\n    }                                                                         \\\n    (void)out_total;                                                          \\\n    return out;                                                               \\\n  }\n\n/* Helper: reduce-all loop for min/max. Seed is NaN; first non-NaN\n * element captures, later non-NaN elements compare via CMP.\n * Mirrors numbl\'s NaN-skip convention. 
*/\n#define MTOC2_DEFINE_MINMAX_REDUCTION(name, CMP)                              \\\n  static double mtoc2_##name##_all(mtoc2_tensor_t a) {                        \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    double acc = NAN;                                                         \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double x = a.real[i];                                                   \\\n      if (x != x) continue; /* skip NaN */                                    \\\n      if (acc != acc || (x CMP acc)) acc = x;                                 \\\n    }                                                                         \\\n    return acc;                                                               \\\n  }                                                                           \\\n                                                                              \\\n  static mtoc2_tensor_t mtoc2_##name##_dim(mtoc2_tensor_t a, int dim) {       \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_dim: dim must be >= 1 (got %d)\\n",    \\\n              dim);                                                           \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      mtoc2_tensor_t out;                                                     \\\n      out.ndim = a.ndim;                                                      \\\n      for (int i = 0; i < 
a.ndim; i++) out.dims[i] = a.dims[i];               \\\n      out.real = mtoc2_alloc((size_t)total * sizeof(double));                 \\\n      out.imag = NULL;                                                        \\\n      memcpy(out.real, a.real, (size_t)total * sizeof(double));               \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing(&out_ndim, out_dims);                             \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, out_dims);           \\\n    long slab = before * axis;                                                \\\n    for (long outer = 0; outer < after; outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        double acc = NAN;                                                     \\\n        for (long k = 0; k < axis; k++) {                                     \\\n          double x = a.real[slabBase + inner + k * 
before];                   \\\n          if (x != x) continue;                                               \\\n          if (acc != acc || (x CMP acc)) acc = x;                             \\\n        }                                                                     \\\n        out.real[outer * before + inner] = acc;                               \\\n      }                                                                       \\\n    }                                                                         \\\n    return out;                                                               \\\n  }\n\n/* Helper: any/all reduction. Short-circuits per fiber.\n * `EMPTY_RESULT` is the value when the reduced fiber is empty:\n *  - any: 0 (no element is nonzero in an empty set)\n *  - all: 1 (vacuously true)\n * `SHORT(acc, x)` updates `acc` if `x` triggers the short-circuit;\n * `done` short-circuits the inner loop once `acc` settles. */\n#define MTOC2_DEFINE_LOGICAL_REDUCTION(name, EMPTY_RESULT, SHORT_BODY)        \\\n  static double mtoc2_##name##_all(mtoc2_tensor_t a) {                        \\\n    long n = 1;                                                               \\\n    for (int i = 0; i < a.ndim; i++) n *= a.dims[i];                          \\\n    if (n == 0) return (double)(EMPTY_RESULT);                                \\\n    double acc = (double)(EMPTY_RESULT);                                      \\\n    for (long i = 0; i < n; i++) {                                            \\\n      double x = a.real[i];                                                   \\\n      SHORT_BODY;                                                             \\\n    }                                                                         \\\n    return acc;                                                               \\\n  }                                                                           \\\n                                                                
              \\\n  static mtoc2_tensor_t mtoc2_##name##_dim(mtoc2_tensor_t a, int dim) {       \\\n    if (dim < 1) {                                                            \\\n      fprintf(stderr, "mtoc2: " #name "_dim: dim must be >= 1 (got %d)\\n",    \\\n              dim);                                                           \\\n      abort();                                                                \\\n    }                                                                         \\\n    if (dim > a.ndim) {                                                       \\\n      /* Numbl\'s `logicalAlongDim` with `dim > ndims` does an elementwise   \\\n       * cast to logical: each element becomes 1.0 if nonzero else 0.0. */   \\\n      long total = 1;                                                         \\\n      for (int i = 0; i < a.ndim; i++) total *= a.dims[i];                    \\\n      long out_dims[MTOC2_MAX_NDIM];                                          \\\n      int out_ndim = a.ndim;                                                  \\\n      for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];               \\\n      mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, out_dims);         \\\n      for (long i = 0; i < total; i++) {                                      \\\n        out.real[i] = (a.real[i] != 0.0) ? 
1.0 : 0.0;                         \\\n      }                                                                       \\\n      return out;                                                             \\\n    }                                                                         \\\n    int dimIdx = dim - 1;                                                     \\\n    long axis = a.dims[dimIdx];                                               \\\n    long before = 1;                                                          \\\n    for (int i = 0; i < dimIdx; i++) before *= a.dims[i];                     \\\n    long after = 1;                                                           \\\n    for (int i = dimIdx + 1; i < a.ndim; i++) after *= a.dims[i];             \\\n    long out_dims[MTOC2_MAX_NDIM];                                            \\\n    int out_ndim = a.ndim;                                                    \\\n    for (int i = 0; i < a.ndim; i++) out_dims[i] = a.dims[i];                 \\\n    out_dims[dimIdx] = 1;                                                     \\\n    mtoc2__squeeze_trailing(&out_ndim, out_dims);                             \\\n    mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, out_dims);           \\\n    long slab = before * axis;                                                \\\n    for (long outer = 0; outer < after; outer++) {                            \\\n      long slabBase = outer * slab;                                           \\\n      for (long inner = 0; inner < before; inner++) {                         \\\n        double acc = (double)(EMPTY_RESULT);                                  \\\n        for (long k = 0; k < axis; k++) {                                     \\\n          double x = a.real[slabBase + inner + k * before];                   \\\n          SHORT_BODY;                                                         \\\n        }                                                                  
   \\\n        out.real[outer * before + inner] = acc;                               \\\n      }                                                                       \\\n    }                                                                         \\\n    return out;                                                               \\\n  }\n\n/* Identity finalizer (sum, prod): pass the accumulator through. */\n#define MTOC2_FIN_ID(acc, n) (acc)\n/* Mean finalizer: divide by element count. Empty fiber \u2192 0/0 = NaN. */\n#define MTOC2_FIN_MEAN(acc, n) ((double)(acc) / (double)(n))\n\n/* Accumulator-statement macros. Wrapped in `do {} while(0)` to keep\n * them safe inside any single-statement context the templates use. */\n#define MTOC2_ACC_SUM(acc, x) do { (acc) += (x); } while (0)\n#define MTOC2_ACC_PROD(acc, x) do { (acc) *= (x); } while (0)\n\nMTOC2_DEFINE_ACCUM_REDUCTION(sum, 0.0, MTOC2_ACC_SUM, MTOC2_FIN_ID)\nMTOC2_DEFINE_ACCUM_REDUCTION(prod, 1.0, MTOC2_ACC_PROD, MTOC2_FIN_ID)\nMTOC2_DEFINE_ACCUM_REDUCTION(mean, 0.0, MTOC2_ACC_SUM, MTOC2_FIN_MEAN)\n\nMTOC2_DEFINE_MINMAX_REDUCTION(min, <)\nMTOC2_DEFINE_MINMAX_REDUCTION(max, >)\n\n/* `any` ignores NaN (MATLAB: any(NaN) is 0); `x == x` excludes it so a\n * NaN doesn\'t wrongly short-circuit to true. `all` tests `x == 0.0`,\n * which NaN already fails, so it needs no guard. */\nMTOC2_DEFINE_LOGICAL_REDUCTION(any, 0,\n  if (x != 0.0 && x == x) { acc = 1.0; break; })\nMTOC2_DEFINE_LOGICAL_REDUCTION(all, 1,\n  if (x == 0.0) { acc = 0.0; break; })\n',
   "tensor_repmat.h": '/* mtoc2 runtime helper: `repmat(A, reps)` \u2014 tile a tensor by\n * replicating it along each axis.\n *\n * Numbl\'s reference is the `repmat` builtin in\n * `interpreter/builtins/array-manipulation.ts` (the tensor branch).\n *\n * Contract:\n *   - `in` is the source tensor (real, owned-value invariant unchanged).\n *   - `nreps` is the number of replication factors supplied (1..MTOC2_MAX_NDIM).\n *   - `reps_in[i]` is the per-axis replication factor; negative values\n *     clamp to 0 (yielding an empty axis), matching numbl/MATLAB.\n *\n * Output shape is `padShape[i] * padReps[i]` where the input\'s shape\n * and the reps vector are both right-padded with 1s to a common rank\n * `max(in.ndim, nreps)`. Result is freshly owned; `imag` is NULL.\n *\n * Algorithm: copy the input data into the start of the output buffer\n * (column-major flat layout is preserved when trailing dims are 1),\n * then iteratively expand along each axis. For axis `d` with rep > 1,\n * we walk the existing blocks of size `blockSize = prod(curShape[0..d])`\n * in reverse order and replicate each block `rep` times consecutively.\n * Reverse order avoids overwriting source data; `memmove` covers the\n * b=0 in-place case where the block stays at its original offset.\n */\n\n#include <string.h>\n#include <stdio.h>\n#include <stdlib.h>\n\nstatic mtoc2_tensor_t mtoc2_tensor_repmat(mtoc2_tensor_t in, int nreps,\n                                          const long *reps_in) {\n  if (nreps < 1 || nreps > MTOC2_MAX_NDIM) {\n    fprintf(stderr,\n      "mtoc2: repmat nreps %d out of range [1, %d]\\n", nreps, MTOC2_MAX_NDIM);\n    abort();\n  }\n  long reps[MTOC2_MAX_NDIM];\n  for (int i = 0; i < nreps; i++) reps[i] = reps_in[i] < 0 ? 0 : reps_in[i];\n\n  int in_ndim = in.ndim;\n  int out_ndim = nreps > in_ndim ? 
nreps : in_ndim;\n  if (out_ndim > MTOC2_MAX_NDIM) {\n    fprintf(stderr,\n      "mtoc2: repmat output ndim %d exceeds %d\\n", out_ndim, MTOC2_MAX_NDIM);\n    abort();\n  }\n\n  long padShape[MTOC2_MAX_NDIM];\n  long padReps[MTOC2_MAX_NDIM];\n  long outDims[MTOC2_MAX_NDIM];\n  for (int i = 0; i < out_ndim; i++) {\n    padShape[i] = i < in_ndim ? in.dims[i] : 1;\n    padReps[i] = i < nreps ? reps[i] : 1;\n    outDims[i] = padShape[i] * padReps[i];\n  }\n\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(out_ndim, outDims);\n\n  size_t outTotal = 1;\n  for (int i = 0; i < out_ndim; i++) outTotal *= (size_t)outDims[i];\n  if (outTotal == 0) return out;\n\n  size_t inTotal = 1;\n  for (int i = 0; i < in_ndim; i++) inTotal *= (size_t)in.dims[i];\n  if (inTotal == 0) return out;\n\n  /* Initial copy: input\'s data laid out in column-major with shape\n   * `in.dims` matches the same flat layout under `padShape` (trailing\n   * 1s don\'t change flat indexing). */\n  memcpy(out.real, in.real, inTotal * sizeof(double));\n\n  long curShape[MTOC2_MAX_NDIM];\n  for (int i = 0; i < out_ndim; i++) curShape[i] = padShape[i];\n  size_t curTotal = inTotal;\n\n  for (int d = 0; d < out_ndim; d++) {\n    long rep = padReps[d];\n    if (rep == 1) continue;\n\n    size_t blockSize = 1;\n    for (int i = 0; i <= d; i++) blockSize *= (size_t)curShape[i];\n\n    if (rep == 0 || blockSize == 0) {\n      curShape[d] *= rep;\n      curTotal = 0;\n      /* Once curTotal is 0, no further work needed \u2014 outTotal is also\n       * 0 (because outDims[d] = padShape[d] * 0 = 0). The alloc above\n       * already produced a zero-element tensor; bail out. */\n      return out;\n    }\n\n    size_t numBlocks = curTotal / blockSize;\n    /* Walk blocks in reverse so writes don\'t clobber as-yet-unread\n     * source blocks. Each block of `blockSize` doubles to `blockSize\n     * * rep` consecutive slots at offset `b * blockSize * rep`. 
*/\n    for (size_t b = numBlocks; b > 0;) {\n      b--;\n      size_t srcOff = b * blockSize;\n      size_t dstBase = b * blockSize * (size_t)rep;\n      if (dstBase != srcOff) {\n        memmove(out.real + dstBase, out.real + srcOff,\n                blockSize * sizeof(double));\n      }\n      for (long r = 1; r < rep; r++) {\n        memcpy(out.real + dstBase + (size_t)r * blockSize,\n               out.real + dstBase,\n               blockSize * sizeof(double));\n      }\n    }\n\n    curShape[d] *= rep;\n    curTotal *= (size_t)rep;\n  }\n\n  return out;\n}\n\n/* Complex-input sibling: tiles both lanes. Tolerates `in.imag == NULL`\n * (a real tensor that flowed in via a complex-typed route) by zero-\n * filling the output imag lane. */\nstatic mtoc2_tensor_t mtoc2_tensor_repmat_complex(mtoc2_tensor_t in,\n                                                  int nreps,\n                                                  const long *reps_in) {\n  if (nreps < 1 || nreps > MTOC2_MAX_NDIM) {\n    fprintf(stderr,\n      "mtoc2: repmat_complex nreps %d out of range [1, %d]\\n",\n      nreps, MTOC2_MAX_NDIM);\n    abort();\n  }\n  long reps[MTOC2_MAX_NDIM];\n  for (int i = 0; i < nreps; i++) reps[i] = reps_in[i] < 0 ? 0 : reps_in[i];\n\n  int in_ndim = in.ndim;\n  int out_ndim = nreps > in_ndim ? nreps : in_ndim;\n  if (out_ndim > MTOC2_MAX_NDIM) {\n    fprintf(stderr,\n      "mtoc2: repmat_complex output ndim %d exceeds %d\\n",\n      out_ndim, MTOC2_MAX_NDIM);\n    abort();\n  }\n\n  long padShape[MTOC2_MAX_NDIM];\n  long padReps[MTOC2_MAX_NDIM];\n  long outDims[MTOC2_MAX_NDIM];\n  for (int i = 0; i < out_ndim; i++) {\n    padShape[i] = i < in_ndim ? in.dims[i] : 1;\n    padReps[i] = i < nreps ? 
reps[i] : 1;\n    outDims[i] = padShape[i] * padReps[i];\n  }\n\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(out_ndim, outDims);\n\n  size_t outTotal = 1;\n  for (int i = 0; i < out_ndim; i++) outTotal *= (size_t)outDims[i];\n  if (outTotal == 0) return out;\n\n  size_t inTotal = 1;\n  for (int i = 0; i < in_ndim; i++) inTotal *= (size_t)in.dims[i];\n  if (inTotal == 0) {\n    memset(out.imag, 0, outTotal * sizeof(double));\n    return out;\n  }\n\n  int srcHasImag = (in.imag != NULL);\n\n  memcpy(out.real, in.real, inTotal * sizeof(double));\n  if (srcHasImag) {\n    memcpy(out.imag, in.imag, inTotal * sizeof(double));\n  } else {\n    memset(out.imag, 0, inTotal * sizeof(double));\n  }\n\n  long curShape[MTOC2_MAX_NDIM];\n  for (int i = 0; i < out_ndim; i++) curShape[i] = padShape[i];\n  size_t curTotal = inTotal;\n\n  for (int d = 0; d < out_ndim; d++) {\n    long rep = padReps[d];\n    if (rep == 1) continue;\n\n    size_t blockSize = 1;\n    for (int i = 0; i <= d; i++) blockSize *= (size_t)curShape[i];\n\n    if (rep == 0 || blockSize == 0) {\n      curShape[d] *= rep;\n      curTotal = 0;\n      return out;\n    }\n\n    size_t numBlocks = curTotal / blockSize;\n    for (size_t b = numBlocks; b > 0;) {\n      b--;\n      size_t srcOff = b * blockSize;\n      size_t dstBase = b * blockSize * (size_t)rep;\n      if (dstBase != srcOff) {\n        memmove(out.real + dstBase, out.real + srcOff,\n                blockSize * sizeof(double));\n        memmove(out.imag + dstBase, out.imag + srcOff,\n                blockSize * sizeof(double));\n      }\n      for (long r = 1; r < rep; r++) {\n        memcpy(out.real + dstBase + (size_t)r * blockSize,\n               out.real + dstBase,\n               blockSize * sizeof(double));\n        memcpy(out.imag + dstBase + (size_t)r * blockSize,\n               out.imag + dstBase,\n               blockSize * sizeof(double));\n      }\n    }\n\n    curShape[d] *= rep;\n    curTotal *= (size_t)rep;\n  }\n\n  return 
out;\n}\n',
   "tensor_reshape_nd.h": '/* mtoc2 runtime helper: reshape a real tensor to an N-D shape.\n *\n * Receives the input tensor by value (`mtoc2_tensor_t`) plus a\n * caller-supplied dim list (`ndim`, `dims`). Allocates a fresh\n * output tensor via `mtoc2_tensor_alloc_nd` and copies the input\'s\n * column-major buffer wholesale \u2014 reshape is a layout reinterpret,\n * so the linear element order is unchanged.\n *\n * `dims[i] == -1` is the MATLAB `[]` auto-infer slot: the helper\n * scans for a single -1 and fills it from `in_total / prod(others)`.\n * Two or more sentinels, or an `in_total` not divisible by the\n * explicit dims, abort with a clear message.\n *\n * Element-count check: the lowerer enforces `prod(input.dims) ==\n * prod(dims)` at translate time when the input shape is statically\n * known. This helper is the fallback when the input shape only\n * appears at runtime (e.g. a tensor function param whose\n * specialization arg type came in without a concrete shape). On\n * mismatch it prints to stderr and aborts, matching numbl\'s\n * runtime-error surface.\n *\n * Real-only \u2014 mtoc2\'s tensor side is real-only today; the type\n * lattice rejects complex inputs at lowering. The output\'s `imag`\n * is NULL, set by `mtoc2_tensor_alloc_nd`.\n */\n\n#include <string.h>\n#include <stdio.h>\n#include <stdlib.h>\n\nstatic mtoc2_tensor_t mtoc2_reshape_nd(\n    mtoc2_tensor_t in, int ndim, const long *dims) {\n  size_t in_total = 1;\n  for (int i = 0; i < in.ndim; i++) in_total *= (size_t)in.dims[i];\n  /* Scan for at most one `-1` infer slot and the product of the\n   * remaining explicit dims. 
*/\n  int infer_idx = -1;\n  size_t explicit_prod = 1;\n  for (int i = 0; i < ndim; i++) {\n    if (dims[i] == -1) {\n      if (infer_idx != -1) {\n        fprintf(stderr,\n          "mtoc2: reshape: at most one \'[]\' auto-infer slot allowed\\n");\n        abort();\n      }\n      infer_idx = i;\n    } else if (dims[i] < 0) {\n      fprintf(stderr,\n        "mtoc2: reshape: dim %d must be a non-negative integer "\n        "(got %ld)\\n", i + 1, dims[i]);\n      abort();\n    } else {\n      explicit_prod *= (size_t)dims[i];\n    }\n  }\n  long resolved_dims[MTOC2_MAX_NDIM];\n  for (int i = 0; i < ndim; i++) resolved_dims[i] = dims[i];\n  size_t out_total;\n  if (infer_idx != -1) {\n    if (explicit_prod == 0 && in_total != 0) {\n      fprintf(stderr,\n        "mtoc2: reshape: input has %zu elements but the explicit dims "\n        "around \'[]\' multiply to 0\\n", in_total);\n      abort();\n    }\n    if (explicit_prod > 0 && in_total % explicit_prod != 0) {\n      fprintf(stderr,\n        "mtoc2: reshape: input has %zu elements, not divisible by %zu "\n        "(the explicit dims around \'[]\')\\n", in_total, explicit_prod);\n      abort();\n    }\n    resolved_dims[infer_idx] =\n      (explicit_prod == 0) ? 0 : (long)(in_total / explicit_prod);\n    out_total = in_total;\n  } else {\n    out_total = explicit_prod;\n    if (in_total != out_total) {\n      fprintf(stderr,\n        "mtoc2: reshape: number of elements must not change "\n        "(in=%zu, out=%zu)\\n", in_total, out_total);\n      abort();\n    }\n  }\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd(ndim, resolved_dims);\n  if (out_total > 0)\n    memcpy(out.real, in.real, out_total * sizeof(double));\n  return out;\n}\n',
   "tensor_reshape_nd_complex.h": '/* mtoc2 runtime helper: reshape a complex tensor to an N-D shape.\n *\n * Sibling of `mtoc2_reshape_nd`. Same `-1` auto-infer slot, same\n * element-count check, same runtime-error surface \u2014 the only\n * difference is the output is allocated via\n * `mtoc2_tensor_alloc_nd_complex` (both lanes) and both lanes get\n * memcpy\'d from the input. Reshape is a layout reinterpret, so the\n * linear element order is unchanged on either lane.\n *\n * Tolerates `in.imag == NULL` (a real tensor flowing through a\n * complex-typed reshape route) by zeroing the output imag lane.\n */\n\n#include <string.h>\n#include <stdio.h>\n#include <stdlib.h>\n\nstatic mtoc2_tensor_t mtoc2_reshape_nd_complex(\n    mtoc2_tensor_t in, int ndim, const long *dims) {\n  size_t in_total = 1;\n  for (int i = 0; i < in.ndim; i++) in_total *= (size_t)in.dims[i];\n  int infer_idx = -1;\n  size_t explicit_prod = 1;\n  for (int i = 0; i < ndim; i++) {\n    if (dims[i] == -1) {\n      if (infer_idx != -1) {\n        fprintf(stderr,\n          "mtoc2: reshape: at most one \'[]\' auto-infer slot allowed\\n");\n        abort();\n      }\n      infer_idx = i;\n    } else if (dims[i] < 0) {\n      fprintf(stderr,\n        "mtoc2: reshape: dim %d must be a non-negative integer "\n        "(got %ld)\\n", i + 1, dims[i]);\n      abort();\n    } else {\n      explicit_prod *= (size_t)dims[i];\n    }\n  }\n  long resolved_dims[MTOC2_MAX_NDIM];\n  for (int i = 0; i < ndim; i++) resolved_dims[i] = dims[i];\n  size_t out_total;\n  if (infer_idx != -1) {\n    if (explicit_prod == 0 && in_total != 0) {\n      fprintf(stderr,\n        "mtoc2: reshape: input has %zu elements but the explicit dims "\n        "around \'[]\' multiply to 0\\n", in_total);\n      abort();\n    }\n    if (explicit_prod > 0 && in_total % explicit_prod != 0) {\n      fprintf(stderr,\n        "mtoc2: reshape: input has %zu elements, not divisible by %zu "\n        "(the explicit dims around \'[]\')\\n", 
in_total, explicit_prod);\n      abort();\n    }\n    resolved_dims[infer_idx] =\n      (explicit_prod == 0) ? 0 : (long)(in_total / explicit_prod);\n    out_total = in_total;\n  } else {\n    out_total = explicit_prod;\n    if (in_total != out_total) {\n      fprintf(stderr,\n        "mtoc2: reshape: number of elements must not change "\n        "(in=%zu, out=%zu)\\n", in_total, out_total);\n      abort();\n    }\n  }\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(ndim, resolved_dims);\n  if (out_total > 0) {\n    memcpy(out.real, in.real, out_total * sizeof(double));\n    if (in.imag != NULL) {\n      memcpy(out.imag, in.imag, out_total * sizeof(double));\n    } else {\n      memset(out.imag, 0, out_total * sizeof(double));\n    }\n  }\n  return out;\n}\n',
   "tensor_size.h": "/* mtoc2 runtime helper: `size(t)` \u2014 returns a freshly-owned 1\xD7ndim\n * row tensor whose elements are the input's dim sizes as doubles.\n * MATLAB / numbl semantics: scalars and vectors return at least a\n * 2-element row (the type system already pads to ndim \u2265 2; this\n * helper just copies dims[] into a row vector).\n *\n * For `size(t, k)` mtoc2 emits a scalar `(double)t.dims[k-1]` inline\n * \u2014 no runtime helper needed for that form.\n */\n\n#include <stdlib.h>\n\nstatic mtoc2_tensor_t mtoc2_tensor_size_row(mtoc2_tensor_t a) {\n  long n = a.ndim;\n  mtoc2_tensor_t r;\n  r.real = mtoc2_alloc((size_t)n * sizeof(double));\n  r.imag = NULL;\n  r.ndim = 2;\n  r.dims[0] = 1;\n  r.dims[1] = n;\n  for (long i = 0; i < n; i++) r.real[i] = (double)a.dims[i];\n  return r;\n}\n",
-  "tensor_sort_real.h": '/* mtoc2 runtime helper: stable sort on a tensor.\n *\n *   mtoc2_sort_real(a, descending)\n *     `b = sort(a)` / `sort(a, \'ascend\'|\'descend\')` \u2014 returns a\n *     freshly-owned tensor of the same shape as `a`, with the flat\n *     (column-major) entries sorted in the requested direction.\n *\n *   mtoc2_sort_real_2(a, descending, &out_v, &out_i)\n *     `[v, i] = sort(...)` \u2014 fills `*out_v` with the sorted values\n *     and `*out_i` with 1-based original positions.\n *\n *   mtoc2_sort_complex / mtoc2_sort_complex_2\n *     Complex-input siblings. Numbl / MATLAB sort complex by\n *     magnitude (hypot), tiebreak by phase (atan2). Tolerates\n *     `a.imag == NULL` (real-input flowed through a complex route)\n *     by treating imag as zero.\n *\n * Sort is stable in both directions: ties resolve by ascending\n * original index, matching numbl\'s behaviour (verified against\n * `sort([5 2 8 1 2], \'descend\')` \u2192 indices `3 1 2 5 4`).\n *\n * The lowering layer restricts the input to a 1\xD7N row vector or N\xD71\n * column vector for v1; the helper itself walks the column-major\n * flat buffer and would handle any rank, but the type system rejects\n * the higher-rank cases until the per-axis form is plumbed through.\n */\n\n#include <math.h>\n#include <stdlib.h>\n#include <string.h>\n\ntypedef struct {\n  double v;\n  long ix;\n} mtoc2_sort_pair_t;\n\n/* NaN ranks as the maximum (MATLAB): last when ascending, first when\n * descending. Without this, NaN compares false both ways and falls to\n * the index tie-break, leaving the comparator non-transitive \u2014 which is\n * undefined behavior for qsort and corrupts the array. */\nstatic int mtoc2_sort_cmp_asc(const void *pa, const void *pb) {\n  const mtoc2_sort_pair_t *a = (const mtoc2_sort_pair_t *)pa;\n  const mtoc2_sort_pair_t *b = (const mtoc2_sort_pair_t *)pb;\n  int an = a->v != a->v, bn = b->v != b->v;\n  if (an || bn) {\n    if (!(an && bn)) return an ? 
1 : -1; /* NaN sorts last */\n  } else {\n    if (a->v < b->v) return -1;\n    if (a->v > b->v) return 1;\n  }\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\nstatic int mtoc2_sort_cmp_desc(const void *pa, const void *pb) {\n  const mtoc2_sort_pair_t *a = (const mtoc2_sort_pair_t *)pa;\n  const mtoc2_sort_pair_t *b = (const mtoc2_sort_pair_t *)pb;\n  int an = a->v != a->v, bn = b->v != b->v;\n  if (an || bn) {\n    if (!(an && bn)) return an ? -1 : 1; /* NaN sorts first */\n  } else {\n    if (a->v > b->v) return -1;\n    if (a->v < b->v) return 1;\n  }\n  /* Tie-break still by ascending original index \u2014 both numbl and\n   * MATLAB keep ties in original order in either direction. */\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\nstatic mtoc2_tensor_t mtoc2_sort_real(mtoc2_tensor_t a, int descending) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t r;\n  r.real = mtoc2_alloc((size_t)n * sizeof(double));\n  r.imag = NULL;\n  r.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) r.dims[i] = a.dims[i];\n  if (n == 0) return r;\n  mtoc2_sort_pair_t *buf =\n    (mtoc2_sort_pair_t *)malloc((size_t)n * sizeof(mtoc2_sort_pair_t));\n  if (!buf) {\n    fprintf(stderr, "mtoc2: out of memory (sort buffer)\\n");\n    abort();\n  }\n  for (long i = 0; i < n; i++) {\n    buf[i].v = a.real[i];\n    buf[i].ix = i;\n  }\n  qsort(buf, (size_t)n, sizeof(mtoc2_sort_pair_t),\n        descending ? 
mtoc2_sort_cmp_desc : mtoc2_sort_cmp_asc);\n  for (long i = 0; i < n; i++) r.real[i] = buf[i].v;\n  free(buf);\n  return r;\n}\n\nstatic void mtoc2_sort_real_2(mtoc2_tensor_t a, int descending,\n                              mtoc2_tensor_t *out_v, mtoc2_tensor_t *out_i) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t v;\n  mtoc2_tensor_t ix;\n  v.real = mtoc2_alloc((size_t)n * sizeof(double));\n  v.imag = NULL;\n  v.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) v.dims[i] = a.dims[i];\n  ix.real = mtoc2_alloc((size_t)n * sizeof(double));\n  ix.imag = NULL;\n  ix.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) ix.dims[i] = a.dims[i];\n  if (n > 0) {\n    mtoc2_sort_pair_t *buf =\n      (mtoc2_sort_pair_t *)malloc((size_t)n * sizeof(mtoc2_sort_pair_t));\n    if (!buf) {\n      fprintf(stderr, "mtoc2: out of memory (sort buffer)\\n");\n      abort();\n    }\n    for (long i = 0; i < n; i++) {\n      buf[i].v = a.real[i];\n      buf[i].ix = i;\n    }\n    qsort(buf, (size_t)n, sizeof(mtoc2_sort_pair_t),\n          descending ? 
mtoc2_sort_cmp_desc : mtoc2_sort_cmp_asc);\n    for (long i = 0; i < n; i++) {\n      v.real[i] = buf[i].v;\n      ix.real[i] = (double)(buf[i].ix + 1);\n    }\n    free(buf);\n  }\n  mtoc2_tensor_assign(out_v, v);\n  mtoc2_tensor_assign(out_i, ix);\n}\n\ntypedef struct {\n  double mag;\n  double phase;\n  long ix;\n} mtoc2_sort_complex_pair_t;\n\nstatic int mtoc2_sort_cmp_complex_asc(const void *pa, const void *pb) {\n  const mtoc2_sort_complex_pair_t *a = (const mtoc2_sort_complex_pair_t *)pa;\n  const mtoc2_sort_complex_pair_t *b = (const mtoc2_sort_complex_pair_t *)pb;\n  if (a->mag < b->mag) return -1;\n  if (a->mag > b->mag) return 1;\n  if (a->phase < b->phase) return -1;\n  if (a->phase > b->phase) return 1;\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\nstatic int mtoc2_sort_cmp_complex_desc(const void *pa, const void *pb) {\n  const mtoc2_sort_complex_pair_t *a = (const mtoc2_sort_complex_pair_t *)pa;\n  const mtoc2_sort_complex_pair_t *b = (const mtoc2_sort_complex_pair_t *)pb;\n  if (a->mag > b->mag) return -1;\n  if (a->mag < b->mag) return 1;\n  if (a->phase > b->phase) return -1;\n  if (a->phase < b->phase) return 1;\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\nstatic mtoc2_tensor_t mtoc2_sort_complex(mtoc2_tensor_t a, int descending) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t r = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);\n  if (n == 0) return r;\n  int srcHasImag = (a.imag != NULL);\n  if (!srcHasImag) memset(r.imag, 0, (size_t)n * sizeof(double));\n  mtoc2_sort_complex_pair_t *buf =\n    (mtoc2_sort_complex_pair_t *)malloc(\n      (size_t)n * sizeof(mtoc2_sort_complex_pair_t));\n  if (!buf) {\n    fprintf(stderr, "mtoc2: out of memory (sort complex buffer)\\n");\n    abort();\n  }\n  for (long i = 0; i < n; i++) {\n    double re = a.real[i];\n    double im = srcHasImag ? 
a.imag[i] : 0.0;\n    buf[i].mag = hypot(re, im);\n    buf[i].phase = atan2(im, re);\n    buf[i].ix = i;\n  }\n  qsort(buf, (size_t)n, sizeof(mtoc2_sort_complex_pair_t),\n        descending ? mtoc2_sort_cmp_complex_desc\n                   : mtoc2_sort_cmp_complex_asc);\n  for (long i = 0; i < n; i++) {\n    r.real[i] = a.real[buf[i].ix];\n    r.imag[i] = srcHasImag ? a.imag[buf[i].ix] : 0.0;\n  }\n  free(buf);\n  return r;\n}\n\nstatic void mtoc2_sort_complex_2(mtoc2_tensor_t a, int descending,\n                                 mtoc2_tensor_t *out_v, mtoc2_tensor_t *out_i) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t v = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);\n  mtoc2_tensor_t ix;\n  ix.real = mtoc2_alloc((size_t)n * sizeof(double));\n  ix.imag = NULL;\n  ix.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) ix.dims[i] = a.dims[i];\n  if (n > 0) {\n    int srcHasImag = (a.imag != NULL);\n    if (!srcHasImag) memset(v.imag, 0, (size_t)n * sizeof(double));\n    mtoc2_sort_complex_pair_t *buf =\n      (mtoc2_sort_complex_pair_t *)malloc(\n        (size_t)n * sizeof(mtoc2_sort_complex_pair_t));\n    if (!buf) {\n      fprintf(stderr, "mtoc2: out of memory (sort complex buffer)\\n");\n      abort();\n    }\n    for (long i = 0; i < n; i++) {\n      double re = a.real[i];\n      double im = srcHasImag ? a.imag[i] : 0.0;\n      buf[i].mag = hypot(re, im);\n      buf[i].phase = atan2(im, re);\n      buf[i].ix = i;\n    }\n    qsort(buf, (size_t)n, sizeof(mtoc2_sort_complex_pair_t),\n          descending ? mtoc2_sort_cmp_complex_desc\n                     : mtoc2_sort_cmp_complex_asc);\n    for (long i = 0; i < n; i++) {\n      v.real[i] = a.real[buf[i].ix];\n      v.imag[i] = srcHasImag ? a.imag[buf[i].ix] : 0.0;\n      ix.real[i] = (double)(buf[i].ix + 1);\n    }\n    free(buf);\n  }\n  mtoc2_tensor_assign(out_v, v);\n  mtoc2_tensor_assign(out_i, ix);\n}\n',
+  "tensor_sort_real.h": "/* mtoc2 runtime helper: stable sort on a tensor.\n *\n *   mtoc2_sort_real(a, descending)\n *     `b = sort(a)` / `sort(a, 'ascend'|'descend')` \u2014 returns a\n *     freshly-owned tensor of the same shape as `a`, with the flat\n *     (column-major) entries sorted in the requested direction.\n *\n *   mtoc2_sort_real_2(a, descending, &out_v, &out_i)\n *     `[v, i] = sort(...)` \u2014 fills `*out_v` with the sorted values\n *     and `*out_i` with 1-based original positions.\n *\n *   mtoc2_sort_complex / mtoc2_sort_complex_2\n *     Complex-input siblings. Numbl / MATLAB sort complex by\n *     magnitude (hypot), tiebreak by phase (atan2). Tolerates\n *     `a.imag == NULL` (real-input flowed through a complex route)\n *     by treating imag as zero.\n *\n * Sort is stable in both directions: ties resolve by ascending\n * original index, matching numbl's behaviour (verified against\n * `sort([5 2 8 1 2], 'descend')` \u2192 indices `3 1 2 5 4`).\n *\n * The lowering layer restricts the input to a 1\xD7N row vector or N\xD71\n * column vector for v1; the helper itself walks the column-major\n * flat buffer and would handle any rank, but the type system rejects\n * the higher-rank cases until the per-axis form is plumbed through.\n */\n\n#include <math.h>\n#include <stdlib.h>\n#include <string.h>\n\ntypedef struct {\n  double v;\n  long ix;\n} mtoc2_sort_pair_t;\n\n/* NaN ranks as the maximum (MATLAB): last when ascending, first when\n * descending. Without this, NaN compares false both ways and falls to\n * the index tie-break, leaving the comparator non-transitive \u2014 which is\n * undefined behavior for qsort and corrupts the array. */\nstatic int mtoc2_sort_cmp_asc(const void *pa, const void *pb) {\n  const mtoc2_sort_pair_t *a = (const mtoc2_sort_pair_t *)pa;\n  const mtoc2_sort_pair_t *b = (const mtoc2_sort_pair_t *)pb;\n  int an = a->v != a->v, bn = b->v != b->v;\n  if (an || bn) {\n    if (!(an && bn)) return an ? 
1 : -1; /* NaN sorts last */\n  } else {\n    if (a->v < b->v) return -1;\n    if (a->v > b->v) return 1;\n  }\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\nstatic int mtoc2_sort_cmp_desc(const void *pa, const void *pb) {\n  const mtoc2_sort_pair_t *a = (const mtoc2_sort_pair_t *)pa;\n  const mtoc2_sort_pair_t *b = (const mtoc2_sort_pair_t *)pb;\n  int an = a->v != a->v, bn = b->v != b->v;\n  if (an || bn) {\n    if (!(an && bn)) return an ? -1 : 1; /* NaN sorts first */\n  } else {\n    if (a->v > b->v) return -1;\n    if (a->v < b->v) return 1;\n  }\n  /* Tie-break still by ascending original index \u2014 both numbl and\n   * MATLAB keep ties in original order in either direction. */\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\nstatic mtoc2_tensor_t mtoc2_sort_real(mtoc2_tensor_t a, int descending) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t r;\n  r.real = mtoc2_alloc((size_t)n * sizeof(double));\n  r.imag = NULL;\n  r.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) r.dims[i] = a.dims[i];\n  if (n == 0) return r;\n  mtoc2_sort_pair_t *buf =\n    (mtoc2_sort_pair_t *)malloc((size_t)n * sizeof(mtoc2_sort_pair_t));\n  if (!buf) {\n    fprintf(stderr, \"mtoc2: out of memory (sort buffer)\\n\");\n    abort();\n  }\n  for (long i = 0; i < n; i++) {\n    buf[i].v = a.real[i];\n    buf[i].ix = i;\n  }\n  qsort(buf, (size_t)n, sizeof(mtoc2_sort_pair_t),\n        descending ? 
mtoc2_sort_cmp_desc : mtoc2_sort_cmp_asc);\n  for (long i = 0; i < n; i++) r.real[i] = buf[i].v;\n  free(buf);\n  return r;\n}\n\nstatic void mtoc2_sort_real_2(mtoc2_tensor_t a, int descending,\n                              mtoc2_tensor_t *out_v, mtoc2_tensor_t *out_i) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t v;\n  mtoc2_tensor_t ix;\n  v.real = mtoc2_alloc((size_t)n * sizeof(double));\n  v.imag = NULL;\n  v.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) v.dims[i] = a.dims[i];\n  ix.real = mtoc2_alloc((size_t)n * sizeof(double));\n  ix.imag = NULL;\n  ix.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) ix.dims[i] = a.dims[i];\n  if (n > 0) {\n    mtoc2_sort_pair_t *buf =\n      (mtoc2_sort_pair_t *)malloc((size_t)n * sizeof(mtoc2_sort_pair_t));\n    if (!buf) {\n      fprintf(stderr, \"mtoc2: out of memory (sort buffer)\\n\");\n      abort();\n    }\n    for (long i = 0; i < n; i++) {\n      buf[i].v = a.real[i];\n      buf[i].ix = i;\n    }\n    qsort(buf, (size_t)n, sizeof(mtoc2_sort_pair_t),\n          descending ? 
mtoc2_sort_cmp_desc : mtoc2_sort_cmp_asc);\n    for (long i = 0; i < n; i++) {\n      v.real[i] = buf[i].v;\n      ix.real[i] = (double)(buf[i].ix + 1);\n    }\n    free(buf);\n  }\n  mtoc2_tensor_assign(out_v, v);\n  mtoc2_tensor_assign(out_i, ix);\n}\n\ntypedef struct {\n  double mag;\n  double phase;\n  long ix;\n} mtoc2_sort_complex_pair_t;\n\nstatic int mtoc2_sort_cmp_complex_asc(const void *pa, const void *pb) {\n  const mtoc2_sort_complex_pair_t *a = (const mtoc2_sort_complex_pair_t *)pa;\n  const mtoc2_sort_complex_pair_t *b = (const mtoc2_sort_complex_pair_t *)pb;\n  if (a->mag < b->mag) return -1;\n  if (a->mag > b->mag) return 1;\n  if (a->phase < b->phase) return -1;\n  if (a->phase > b->phase) return 1;\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\nstatic int mtoc2_sort_cmp_complex_desc(const void *pa, const void *pb) {\n  const mtoc2_sort_complex_pair_t *a = (const mtoc2_sort_complex_pair_t *)pa;\n  const mtoc2_sort_complex_pair_t *b = (const mtoc2_sort_complex_pair_t *)pb;\n  if (a->mag > b->mag) return -1;\n  if (a->mag < b->mag) return 1;\n  if (a->phase > b->phase) return -1;\n  if (a->phase < b->phase) return 1;\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\n/* Real-value comparators used when the input's imaginary lane is all\n * zero: order by signed value (the `mag` field holds the signed real\n * part in that mode), NaNs last (asc) / first (desc), tiebreak by index.\n * Matches the interpreter and MATLAB on real data. 
*/\nstatic int mtoc2_sort_cmp_real_asc(const void *pa, const void *pb) {\n  const mtoc2_sort_complex_pair_t *a = (const mtoc2_sort_complex_pair_t *)pa;\n  const mtoc2_sort_complex_pair_t *b = (const mtoc2_sort_complex_pair_t *)pb;\n  int aNaN = a->mag != a->mag;\n  int bNaN = b->mag != b->mag;\n  if (!(aNaN && bNaN)) {\n    if (aNaN) return 1;\n    if (bNaN) return -1;\n    if (a->mag < b->mag) return -1;\n    if (a->mag > b->mag) return 1;\n  }\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\nstatic int mtoc2_sort_cmp_real_desc(const void *pa, const void *pb) {\n  const mtoc2_sort_complex_pair_t *a = (const mtoc2_sort_complex_pair_t *)pa;\n  const mtoc2_sort_complex_pair_t *b = (const mtoc2_sort_complex_pair_t *)pb;\n  int aNaN = a->mag != a->mag;\n  int bNaN = b->mag != b->mag;\n  if (!(aNaN && bNaN)) {\n    if (aNaN) return -1;\n    if (bNaN) return 1;\n    if (a->mag > b->mag) return -1;\n    if (a->mag < b->mag) return 1;\n  }\n  if (a->ix < b->ix) return -1;\n  if (a->ix > b->ix) return 1;\n  return 0;\n}\n\n/* True when the tensor carries no imaginary content (NULL lane or all\n * elements zero) \u2014 then sort orders by signed real value. 
*/\nstatic int mtoc2_sort_all_imag_zero(mtoc2_tensor_t a) {\n  if (a.imag == NULL) return 1;\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  for (long i = 0; i < n; i++) {\n    if (a.imag[i] != 0.0) return 0;\n  }\n  return 1;\n}\n\nstatic mtoc2_tensor_t mtoc2_sort_complex(mtoc2_tensor_t a, int descending) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t r = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);\n  if (n == 0) return r;\n  int srcHasImag = (a.imag != NULL);\n  if (!srcHasImag) memset(r.imag, 0, (size_t)n * sizeof(double));\n  int realMode = mtoc2_sort_all_imag_zero(a);\n  mtoc2_sort_complex_pair_t *buf =\n    (mtoc2_sort_complex_pair_t *)malloc(\n      (size_t)n * sizeof(mtoc2_sort_complex_pair_t));\n  if (!buf) {\n    fprintf(stderr, \"mtoc2: out of memory (sort complex buffer)\\n\");\n    abort();\n  }\n  for (long i = 0; i < n; i++) {\n    double re = a.real[i];\n    double im = srcHasImag ? a.imag[i] : 0.0;\n    buf[i].mag = realMode ? re : hypot(re, im);\n    buf[i].phase = realMode ? 0.0 : atan2(im, re);\n    buf[i].ix = i;\n  }\n  qsort(buf, (size_t)n, sizeof(mtoc2_sort_complex_pair_t),\n        realMode ? (descending ? mtoc2_sort_cmp_real_desc\n                               : mtoc2_sort_cmp_real_asc)\n                 : (descending ? mtoc2_sort_cmp_complex_desc\n                               : mtoc2_sort_cmp_complex_asc));\n  for (long i = 0; i < n; i++) {\n    r.real[i] = a.real[buf[i].ix];\n    r.imag[i] = srcHasImag ? 
a.imag[buf[i].ix] : 0.0;\n  }\n  free(buf);\n  return r;\n}\n\nstatic void mtoc2_sort_complex_2(mtoc2_tensor_t a, int descending,\n                                 mtoc2_tensor_t *out_v, mtoc2_tensor_t *out_i) {\n  long n = 1;\n  for (int i = 0; i < a.ndim; i++) n *= a.dims[i];\n  mtoc2_tensor_t v = mtoc2_tensor_alloc_nd_complex(a.ndim, a.dims);\n  mtoc2_tensor_t ix;\n  ix.real = mtoc2_alloc((size_t)n * sizeof(double));\n  ix.imag = NULL;\n  ix.ndim = a.ndim;\n  for (int i = 0; i < a.ndim; i++) ix.dims[i] = a.dims[i];\n  if (n > 0) {\n    int srcHasImag = (a.imag != NULL);\n    if (!srcHasImag) memset(v.imag, 0, (size_t)n * sizeof(double));\n    int realMode = mtoc2_sort_all_imag_zero(a);\n    mtoc2_sort_complex_pair_t *buf =\n      (mtoc2_sort_complex_pair_t *)malloc(\n        (size_t)n * sizeof(mtoc2_sort_complex_pair_t));\n    if (!buf) {\n      fprintf(stderr, \"mtoc2: out of memory (sort complex buffer)\\n\");\n      abort();\n    }\n    for (long i = 0; i < n; i++) {\n      double re = a.real[i];\n      double im = srcHasImag ? a.imag[i] : 0.0;\n      buf[i].mag = realMode ? re : hypot(re, im);\n      buf[i].phase = realMode ? 0.0 : atan2(im, re);\n      buf[i].ix = i;\n    }\n    qsort(buf, (size_t)n, sizeof(mtoc2_sort_complex_pair_t),\n          realMode ? (descending ? mtoc2_sort_cmp_real_desc\n                                 : mtoc2_sort_cmp_real_asc)\n                   : (descending ? mtoc2_sort_cmp_complex_desc\n                                 : mtoc2_sort_cmp_complex_asc));\n    for (long i = 0; i < n; i++) {\n      v.real[i] = a.real[buf[i].ix];\n      v.imag[i] = srcHasImag ? a.imag[buf[i].ix] : 0.0;\n      ix.real[i] = (double)(buf[i].ix + 1);\n    }\n    free(buf);\n  }\n  mtoc2_tensor_assign(out_v, v);\n  mtoc2_tensor_assign(out_i, ix);\n}\n",
   "tensor_transpose.h": "/* mtoc2 runtime helper: real-tensor non-conjugate transpose for 2-D\n * inputs. Returns a freshly-owned tensor with `dims` swapped. The\n * 2-D restriction is enforced at lowering; by the time this helper\n * runs, `a.ndim` is always 2.\n *\n * Column-major in, column-major out. Source `a` has shape (m \xD7 n);\n * destination has shape (n \xD7 m). Source element at (sr, sc) lives at\n * `a.real[sr + sc*m]`; destination element at (sc, sr) \u2014 the transpose\n * mapping \u2014 lives at `out.real[sc + sr*n]`. The inner loop walks the\n * source's column-major buffer linearly to keep the read stride\n * unit-stride.\n *\n * For complex support (not yet a thing in mtoc2), the conjugate\n * variant would negate `a.imag` while copying; the non-conjugate\n * variant just copies. Mirrors numbl's `transposeCore` in\n * `helpers/arithmetic.ts`.\n */\n\n#include <stdlib.h>\n\nstatic mtoc2_tensor_t mtoc2_tensor_transpose(mtoc2_tensor_t a) {\n  long m = a.dims[0];\n  long n = a.dims[1];\n  mtoc2_tensor_t r;\n  r.real = mtoc2_alloc((size_t)m * (size_t)n * sizeof(double));\n  r.imag = NULL;\n  r.ndim = 2;\n  r.dims[0] = n;\n  r.dims[1] = m;\n  for (long sc = 0; sc < n; sc++) {\n    for (long sr = 0; sr < m; sr++) {\n      r.real[sc + sr * n] = a.real[sr + sc * m];\n    }\n  }\n  return r;\n}\n",
   "tensor_transpose_complex.h": "/* mtoc2 runtime helper: complex-tensor non-conjugate transpose for\n * 2-D inputs. Sibling of `mtoc2_tensor_transpose` \u2014 same shape\n * permutation, but copies BOTH lanes (no conjugation, the `.'`\n * operator). The `'` (conjugate transpose) operator lowers to\n * `transpose(conj(z))` at the lowering layer, so this helper only\n * sees the non-conjugating case.\n *\n * Tolerates `a.imag == NULL` (real-tensor flowing through a\n * complex-typed transpose route) by zeroing the result's imag lane.\n */\n\n#include <stdlib.h>\n#include <string.h>\n\nstatic mtoc2_tensor_t mtoc2_tensor_transpose_complex(mtoc2_tensor_t a) {\n  long m = a.dims[0];\n  long n = a.dims[1];\n  long dims[2];\n  dims[0] = n;\n  dims[1] = m;\n  mtoc2_tensor_t r = mtoc2_tensor_alloc_nd_complex(2, dims);\n  if (a.imag == NULL) {\n    /* Defensive: zero the imag lane so the transposed result is a\n     * well-formed complex tensor with re-only content. */\n    memset(r.imag, 0, (size_t)m * (size_t)n * sizeof(double));\n  }\n  for (long sc = 0; sc < n; sc++) {\n    for (long sr = 0; sr < m; sr++) {\n      r.real[sc + sr * n] = a.real[sr + sc * m];\n      if (a.imag != NULL) {\n        r.imag[sc + sr * n] = a.imag[sr + sc * m];\n      }\n    }\n  }\n  return r;\n}\n",
   "tensor_triangular.h": "/* mtoc2 runtime helper: `triu` / `tril` \u2014 extract upper / lower\n * triangular part of a 2-D matrix around the k-th diagonal. Mirrors\n * numbl's `triPart` in `interpreter/builtins/array-extras.ts`.\n *\n *   - `mtoc2_tensor_triu(A, k)` returns a fresh `rows \xD7 cols` tensor\n *     equal to `A` where `j - i >= k` (column - row), zero elsewhere.\n *     `k = 0` is the main diagonal; `k > 0` selects a super-diagonal;\n *     `k < 0` selects a sub-diagonal.\n *   - `mtoc2_tensor_tril(A, k)` is the mirror: keep entries where\n *     `i - j >= -k` (equivalently `j - i <= k`), zero elsewhere.\n *   - `*_complex` siblings walk both lanes; tolerate `a.imag == NULL`\n *     (real-input flowed through a complex route).\n *\n * Storage column-major to match `mtoc2_tensor_t`. Result is freshly\n * owned.\n */\n\n#include <string.h>\n#include <stdlib.h>\n\nstatic mtoc2_tensor_t mtoc2_tensor_triu(mtoc2_tensor_t a, long k) {\n  long rows = a.dims[0];\n  long cols = a.dims[1];\n  mtoc2_tensor_t out = mtoc2_tensor_alloc(rows, cols);\n  if (rows > 0 && cols > 0)\n    memset(out.real, 0, (size_t)rows * (size_t)cols * sizeof(double));\n  for (long j = 0; j < cols; j++) {\n    for (long i = 0; i < rows; i++) {\n      if (j - i >= k) {\n        long idx = i + j * rows;\n        out.real[idx] = a.real[idx];\n      }\n    }\n  }\n  return out;\n}\n\nstatic mtoc2_tensor_t mtoc2_tensor_tril(mtoc2_tensor_t a, long k) {\n  long rows = a.dims[0];\n  long cols = a.dims[1];\n  mtoc2_tensor_t out = mtoc2_tensor_alloc(rows, cols);\n  if (rows > 0 && cols > 0)\n    memset(out.real, 0, (size_t)rows * (size_t)cols * sizeof(double));\n  for (long j = 0; j < cols; j++) {\n    for (long i = 0; i < rows; i++) {\n      if (i - j >= -k) {\n        long idx = i + j * rows;\n        out.real[idx] = a.real[idx];\n      }\n    }\n  }\n  return out;\n}\n\nstatic mtoc2_tensor_t mtoc2_tensor_triu_complex(mtoc2_tensor_t a, long k) {\n  long rows = a.dims[0];\n  long cols = 
a.dims[1];\n  long dims2[2] = {rows, cols};\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(2, dims2);\n  if (rows > 0 && cols > 0) {\n    memset(out.real, 0, (size_t)rows * (size_t)cols * sizeof(double));\n    memset(out.imag, 0, (size_t)rows * (size_t)cols * sizeof(double));\n  }\n  int srcHasImag = (a.imag != NULL);\n  for (long j = 0; j < cols; j++) {\n    for (long i = 0; i < rows; i++) {\n      if (j - i >= k) {\n        long idx = i + j * rows;\n        out.real[idx] = a.real[idx];\n        if (srcHasImag) out.imag[idx] = a.imag[idx];\n      }\n    }\n  }\n  return out;\n}\n\nstatic mtoc2_tensor_t mtoc2_tensor_tril_complex(mtoc2_tensor_t a, long k) {\n  long rows = a.dims[0];\n  long cols = a.dims[1];\n  long dims2[2] = {rows, cols};\n  mtoc2_tensor_t out = mtoc2_tensor_alloc_nd_complex(2, dims2);\n  if (rows > 0 && cols > 0) {\n    memset(out.real, 0, (size_t)rows * (size_t)cols * sizeof(double));\n    memset(out.imag, 0, (size_t)rows * (size_t)cols * sizeof(double));\n  }\n  int srcHasImag = (a.imag != NULL);\n  for (long j = 0; j < cols; j++) {\n    for (long i = 0; i < rows; i++) {\n      if (i - j >= -k) {\n        long idx = i + j * rows;\n        out.real[idx] = a.real[idx];\n        if (srcHasImag) out.imag[idx] = a.imag[idx];\n      }\n    }\n  }\n  return out;\n}\n",
@@ -66075,6 +66146,7 @@ var JS_SNIPPETS = {
   "tensor_fill_nd.js": "// JS sibling of `tensor_fill_nd.h`. Like zeros/ones but takes the\n// fill value as a leading argument \u2014 used by the `nan` / `Inf` shape-\n// constructor branches and by `repmat(scalar, ...)`. Complex variant\n// takes `(re, im)` and fills both lanes.\n\n\n\nfunction mtoc2_tensor_fill_nd(value, ndim, dims) {\n  const t = mtoc2_tensor_alloc_nd(ndim, dims);\n  t.data.fill(value);\n  return t;\n}\n\nfunction mtoc2_tensor_fill_nd_complex(re, im, ndim, dims) {\n  const t = mtoc2_tensor_alloc_nd_complex(ndim, dims);\n  t.data.fill(re);\n  t.imag.fill(im);\n  return t;\n}\n",
   "tensor_fill_square.js": "// JS sibling of `tensor_fill_square.h`. Single-eval helper for\n// `nan(n)` / `Inf(n)` style square-fill constructors.\n\n\nfunction mtoc2_tensor_fill_square(value, n) {\n  return mtoc2_tensor_fill_nd(value, 2, [n, n]);\n}\n",
   "tensor_flip.js": "// JS sibling of `tensor_flip.h`. Two helpers:\n//   - `mtoc2_tensor_flip(t, dimIdx)` \u2014 real-input variant.\n//   - `mtoc2_tensor_flip_complex(t, dimIdx)` \u2014 walks both lanes;\n//     if `imag` is undefined the output imag stays zero.\n// Out-of-range axis acts as a deep-copy no-op in both.\n\n\n\nfunction mtoc2_tensor_flip(a, dimIdx) {\n  const r = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  const axisSize = dimIdx >= 0 && dimIdx < a.shape.length ? a.shape[dimIdx] : 1;\n  if (axisSize <= 1) {\n    r.data.set(a.data);\n    return r;\n  }\n  let strideDim = 1;\n  for (let d = 0; d < dimIdx; d++) strideDim *= a.shape[d];\n  const slabSize = strideDim * axisSize;\n  const total = r.data.length;\n  const numOuter = total / slabSize;\n  for (let outer = 0; outer < numOuter; outer++) {\n    const base = outer * slabSize;\n    for (let k = 0; k < axisSize; k++) {\n      const srcOff = base + k * strideDim;\n      const dstOff = base + (axisSize - 1 - k) * strideDim;\n      for (let i = 0; i < strideDim; i++) {\n        r.data[dstOff + i] = a.data[srcOff + i];\n      }\n    }\n  }\n  return r;\n}\n\nfunction mtoc2_tensor_flip_complex(a, dimIdx) {\n  const r = mtoc2_tensor_alloc_nd_complex(a.shape.length, a.shape);\n  const im = a.imag;\n  const axisSize = dimIdx >= 0 && dimIdx < a.shape.length ? 
a.shape[dimIdx] : 1;\n  if (axisSize <= 1) {\n    r.data.set(a.data);\n    if (im !== undefined) r.imag.set(im);\n    return r;\n  }\n  let strideDim = 1;\n  for (let d = 0; d < dimIdx; d++) strideDim *= a.shape[d];\n  const slabSize = strideDim * axisSize;\n  const total = r.data.length;\n  const numOuter = total / slabSize;\n  for (let outer = 0; outer < numOuter; outer++) {\n    const base = outer * slabSize;\n    for (let k = 0; k < axisSize; k++) {\n      const srcOff = base + k * strideDim;\n      const dstOff = base + (axisSize - 1 - k) * strideDim;\n      for (let i = 0; i < strideDim; i++) {\n        r.data[dstOff + i] = a.data[srcOff + i];\n        if (im !== undefined) r.imag[dstOff + i] = im[srcOff + i];\n      }\n    }\n  }\n  return r;\n}\n",
+  "tensor_imag_all_zero.js": "// JS sibling of `tensor_imag_all_zero.h`. True when a tensor carries no\n// imaginary content (no imag lane, or every imag element exactly zero).\n// `isreal` uses this for complex-typed tensors the JIT could not prove\n// real at compile time, so it reports realness by value \u2014 matching the\n// interpreter and the complex-scalar `v.im === 0` rule.\nfunction mtoc2_tensor_imag_all_zero(a) {\n  if (a.imag === undefined) return true;\n  for (let i = 0; i < a.imag.length; i++) {\n    if (a.imag[i] !== 0) return false;\n  }\n  return true;\n}\n",
   "tensor_linspace.js": "// JS sibling of `tensor_linspace.h`. Build a 1\xD7n row tensor of n\n// linearly-spaced values from `a` to `b`.\n\n\nfunction mtoc2_tensor_linspace(a, b, n) {\n  if (n < 0) n = 0;\n  const out = mtoc2_tensor_alloc(1, n);\n  if (n === 0) return out;\n  if (n === 1) {\n    out.data[0] = b;\n    return out;\n  }\n  out.data[0] = a;\n  out.data[n - 1] = b;\n  for (let i = 1; i < n - 1; i++) {\n    out.data[i] = a + ((b - a) * i) / (n - 1);\n  }\n  if ((n & 1) === 1 && !Number.isFinite(a) && !Number.isFinite(b)) {\n    const sa = Math.sign(a);\n    const sb = Math.sign(b);\n    if (sa !== sb) out.data[(n - 1) / 2] = 0;\n  }\n  return out;\n}\n",
   "tensor_logical_real.js": '// JS sibling of `tensor_logical_real.h`. Elementwise logical NOT on\n// real and complex tensors. Real input: `out[i] = (in[i] == 0) ? 1 : 0`.\n// Complex input: fires "true" iff both lanes are exactly zero.\n// Result is logical-tagged in both cases.\n\n\nfunction mtoc2_tensor_not(a) {\n  const r = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  for (let i = 0; i < r.data.length; i++) {\n    r.data[i] = a.data[i] === 0 ? 1 : 0;\n  }\n  // Tag as logical so `a(mask)` / `M(:, mask)` etc. take the mask\n  // path in the interpreter (and js-aot, when wired). The tensor\n  // alloc helpers return plain numeric tensors; we mutate the field\n  // here rather than threading a parameter through every allocator.\n  r.isLogical = true;\n  return r;\n}\n\nfunction mtoc2_tensor_not_complex(a) {\n  const r = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  const im = a.imag;\n  for (let i = 0; i < r.data.length; i++) {\n    const re = a.data[i];\n    const v = im !== undefined ? im[i] : 0;\n    r.data[i] = re === 0 && v === 0 ? 1 : 0;\n  }\n  r.isLogical = true;\n  return r;\n}\n',
   "tensor_logspace.js": "// JS sibling of `tensor_logspace.h`. Build a 1\xD7n row tensor of n\n// logarithmically-spaced values from 10^a to 10^b. Byte-for-byte with\n// numbl's interpreter `logspace`, including the MATLAB special case\n// where an upper limit of exactly `pi` makes the last point `pi`.\n\n\nfunction mtoc2_tensor_logspace(a, b, n) {\n  if (n <= 0) return mtoc2_tensor_alloc(1, 0);\n  const isPi = b === Math.PI;\n  const endVal = isPi ? Math.PI : Math.pow(10, b);\n  const out = mtoc2_tensor_alloc(1, n);\n  if (n === 1) {\n    out.data[0] = endVal;\n    return out;\n  }\n  if (isPi) {\n    const logStart = Math.log10(Math.pow(10, a));\n    const logEnd = Math.log10(Math.PI);\n    for (let i = 0; i < n; i++) {\n      const t = logStart + ((logEnd - logStart) * i) / (n - 1);\n      out.data[i] = Math.pow(10, t);\n    }\n  } else {\n    for (let i = 0; i < n; i++) {\n      const t = a + ((b - a) * i) / (n - 1);\n      out.data[i] = Math.pow(10, t);\n    }\n  }\n  return out;\n}\n",
@@ -66085,13 +66157,13 @@ var JS_SNIPPETS = {
   "tensor_ones_nd.js": "// JS sibling of `tensor_ones_nd.h`. Fill the freshly-allocated\n// tensor with `1.0`.\n\n\nfunction mtoc2_tensor_ones_nd(ndim, dims) {\n  const t = mtoc2_tensor_alloc_nd(ndim, dims);\n  t.data.fill(1);\n  return t;\n}\n",
   "tensor_ones_square.js": "// JS sibling of `tensor_ones_square.h`. See `tensor_zeros_square.js`\n// for the rationale.\n\n\nfunction mtoc2_tensor_ones_square(n) {\n  return mtoc2_tensor_ones_nd(2, [n, n]);\n}\n",
   "tensor_predicate.js": '// JS sibling of `tensor_predicate.h`. Real-tensor \u2192 logical-tensor\n// predicate kernels for the js-aot backend, plus their `_complex`\n// siblings (each reads `imag[i]` when defined, treats it as 0\n// otherwise). Result carries `isLogical: true` so downstream index-\n// slot resolution treats it as a mask.\n\nfunction pred_kernel(a, fn) {\n  const out = new Float64Array(a.data.length);\n  for (let i = 0; i < a.data.length; i++) out[i] = fn(a.data[i]) ? 1 : 0;\n  return {\n    mtoc2Tag: "tensor",\n    shape: a.shape.slice(),\n    data: out,\n    isLogical: true,\n  };\n}\n\nfunction pred_kernel_complex(a, fn) {\n  const out = new Float64Array(a.data.length);\n  const im = a.imag;\n  for (let i = 0; i < a.data.length; i++) {\n    out[i] = fn(a.data[i], im !== undefined ? im[i] : 0) ? 1 : 0;\n  }\n  return {\n    mtoc2Tag: "tensor",\n    shape: a.shape.slice(),\n    data: out,\n    isLogical: true,\n  };\n}\n\nfunction mtoc2_tensor_isnan(a) {\n  return pred_kernel(a, Number.isNaN);\n}\n\nfunction mtoc2_tensor_logical(a) {\n  return pred_kernel(a, x => x !== 0);\n}\n\nfunction mtoc2_tensor_isinf(a) {\n  return pred_kernel(a, x => x === Infinity || x === -Infinity);\n}\n\nfunction mtoc2_tensor_isfinite(a) {\n  return pred_kernel(a, Number.isFinite);\n}\n\nfunction mtoc2_tensor_isnan_complex(a) {\n  return pred_kernel_complex(\n    a,\n    (re, im) => Number.isNaN(re) || Number.isNaN(im)\n  );\n}\n\nfunction mtoc2_tensor_isinf_complex(a) {\n  const isInf = x => x === Infinity || x === -Infinity;\n  return pred_kernel_complex(a, (re, im) => isInf(re) || isInf(im));\n}\n\nfunction mtoc2_tensor_isfinite_complex(a) {\n  return pred_kernel_complex(\n    a,\n    (re, im) => Number.isFinite(re) && Number.isFinite(im)\n  );\n}\n',
-  "tensor_reduce_complex.js": '// JS sibling of `tensor_reduce_complex.h`. Complex-tensor reductions.\n// Mirrors the real reducer shape (sum/prod/mean \u2192 complex; min/max\n// \u2192 complex via magnitude+atan2 tiebreak; any/all \u2192 real).\n\n\n\nfunction cSqueezeTrailing(dims) {\n  while (dims.length > 2 && dims[dims.length - 1] === 1) dims.pop();\n  return dims;\n}\n\nfunction cReduceLaneIm(t, i) {\n  return t.imag !== undefined ? t.imag[i] : 0;\n}\n\n// Numeric (sum/prod/mean) \u2014 complex accumulator { re, im }.\nfunction complexAccumAll(t, init, accum, finalize) {\n  let acc = { ...init };\n  for (let i = 0; i < t.data.length; i++) {\n    acc = accum(acc, { re: t.data[i], im: cReduceLaneIm(t, i) });\n  }\n  return finalize(acc, t.data.length);\n}\n\nfunction complexAccumDim(t, dim, init, accum, finalize) {\n  if (dim < 1) throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  if (dim > t.shape.length) {\n    // No-op axis: return a fresh complex copy.\n    const out = mtoc2_tensor_alloc_nd_complex(t.shape.length, t.shape.slice());\n    out.data.set(t.data);\n    if (t.imag !== undefined) out.imag.set(t.imag);\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = t.shape.slice();\n  outDims[dimIdx] = 1;\n  cSqueezeTrailing(outDims);\n  const out = mtoc2_tensor_alloc_nd_complex(outDims.length, outDims);\n  for (let aft = 0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let acc = { ...init };\n      for (let k = 0; k < axis; k++) {\n        const off = base + k * before;\n        acc = accum(acc, { re: t.data[off], im: cReduceLaneIm(t, off) });\n      }\n      const fin = finalize(acc, axis);\n      const dst = aft * before + bef;\n      out.data[dst] = 
fin.re;\n      out.imag[dst] = fin.im;\n    }\n  }\n  return out;\n}\n\nconst cSumInit = { re: 0, im: 0 };\nconst cProdInit = { re: 1, im: 0 };\nconst cSumAccum = (a, x) => ({ re: a.re + x.re, im: a.im + x.im });\nconst cProdAccum = (a, x) => ({\n  re: a.re * x.re - a.im * x.im,\n  im: a.re * x.im + a.im * x.re,\n});\nconst cIdFinalize = a => a;\nconst cMeanFinalize = (a, n) =>\n  n === 0 ? { re: NaN, im: NaN } : { re: a.re / n, im: a.im / n };\n\nconst mtoc2_sum_complex_all = t =>\n  complexAccumAll(t, cSumInit, cSumAccum, cIdFinalize);\nconst mtoc2_sum_complex_dim = (t, d) =>\n  complexAccumDim(t, d, cSumInit, cSumAccum, cIdFinalize);\nconst mtoc2_prod_complex_all = t =>\n  complexAccumAll(t, cProdInit, cProdAccum, cIdFinalize);\nconst mtoc2_prod_complex_dim = (t, d) =>\n  complexAccumDim(t, d, cProdInit, cProdAccum, cIdFinalize);\nconst mtoc2_mean_complex_all = t =>\n  complexAccumAll(t, cSumInit, cSumAccum, cMeanFinalize);\nconst mtoc2_mean_complex_dim = (t, d) =>\n  complexAccumDim(t, d, cSumInit, cSumAccum, cMeanFinalize);\n\n// Min / max \u2014 magnitude compare with atan2 tiebreak (numbl\'s\n// complexIsBetter). Skip NaN-lane elements; result is complex.\nfunction complexMinmaxAll(t, cmp) {\n  let found = false;\n  let mRe = NaN;\n  let mIm = 0;\n  for (let i = 0; i < t.data.length; i++) {\n    const xr = t.data[i];\n    const xi = cReduceLaneIm(t, i);\n    if (xr !== xr || xi !== xi) continue;\n    if (!found || complexBetter(xr, xi, mRe, mIm, cmp)) {\n      mRe = xr;\n      mIm = xi;\n      found = true;\n    }\n  }\n  return { re: mRe, im: mIm };\n}\n\nfunction complexBetter(aRe, aIm, bRe, bIm, cmp) {\n  const absA = Math.hypot(aRe, aIm);\n  const absB = Math.hypot(bRe, bIm);\n  if (absA !== absB) return cmp === "<" ? absA < absB : absA > absB;\n  return cmp === "<"\n    ? 
Math.atan2(aIm, aRe) < Math.atan2(bIm, bRe)\n    : Math.atan2(aIm, aRe) > Math.atan2(bIm, bRe);\n}\n\nfunction complexMinmaxDim(t, dim, cmp) {\n  if (dim < 1) throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  if (dim > t.shape.length) {\n    const out = mtoc2_tensor_alloc_nd_complex(t.shape.length, t.shape.slice());\n    out.data.set(t.data);\n    if (t.imag !== undefined) out.imag.set(t.imag);\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = t.shape.slice();\n  outDims[dimIdx] = 1;\n  cSqueezeTrailing(outDims);\n  const out = mtoc2_tensor_alloc_nd_complex(outDims.length, outDims);\n  for (let aft = 0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let found = false;\n      let mRe = NaN;\n      let mIm = 0;\n      for (let k = 0; k < axis; k++) {\n        const off = base + k * before;\n        const xr = t.data[off];\n        const xi = cReduceLaneIm(t, off);\n        if (xr !== xr || xi !== xi) continue;\n        if (!found || complexBetter(xr, xi, mRe, mIm, cmp)) {\n          mRe = xr;\n          mIm = xi;\n          found = true;\n        }\n      }\n      const dst = aft * before + bef;\n      out.data[dst] = mRe;\n      out.imag[dst] = mIm;\n    }\n  }\n  return out;\n}\n\nconst mtoc2_min_complex_all = t => complexMinmaxAll(t, "<");\nconst mtoc2_min_complex_dim = (t, d) => complexMinmaxDim(t, d, "<");\nconst mtoc2_max_complex_all = t => complexMinmaxAll(t, ">");\nconst mtoc2_max_complex_dim = (t, d) => complexMinmaxDim(t, d, ">");\n\n// any / all \u2014 real result; toBool per element (either lane nonzero).\nfunction complexLogicalAll(t, emptyResult, shortPredicate) {\n  if (t.data.length === 0) return emptyResult;\n  for (let i = 0; i < 
t.data.length; i++) {\n    const xr = t.data[i];\n    const xi = cReduceLaneIm(t, i);\n    const x = xr !== 0 || xi !== 0;\n    if (shortPredicate(x)) return emptyResult === 1 ? 0 : 1;\n  }\n  return emptyResult;\n}\n\nfunction complexLogicalDim(t, dim, emptyResult, shortPredicate) {\n  if (dim < 1) throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  if (dim > t.shape.length) {\n    const out = mtoc2_tensor_alloc_nd(t.shape.length, t.shape.slice());\n    for (let i = 0; i < t.data.length; i++) {\n      const xr = t.data[i];\n      const xi = cReduceLaneIm(t, i);\n      out.data[i] = xr !== 0 || xi !== 0 ? 1 : 0;\n    }\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = t.shape.slice();\n  outDims[dimIdx] = 1;\n  cSqueezeTrailing(outDims);\n  const out = mtoc2_tensor_alloc_nd(outDims.length, outDims);\n  for (let aft = 0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let res = emptyResult;\n      for (let k = 0; k < axis; k++) {\n        const off = base + k * before;\n        const x = t.data[off] !== 0 || cReduceLaneIm(t, off) !== 0;\n        if (shortPredicate(x)) {\n          res = emptyResult === 1 ? 0 : 1;\n          break;\n        }\n      }\n      out.data[aft * before + bef] = res;\n    }\n  }\n  return out;\n}\n\nconst cAnyShort = x => x;\nconst cAllShort = x => !x;\nconst mtoc2_any_complex_all = t => complexLogicalAll(t, 0, cAnyShort);\nconst mtoc2_any_complex_dim = (t, d) =>\n  complexLogicalDim(t, d, 0, cAnyShort);\nconst mtoc2_all_complex_all = t => complexLogicalAll(t, 1, cAllShort);\nconst mtoc2_all_complex_dim = (t, d) =>\n  complexLogicalDim(t, d, 1, cAllShort);\n',
+  "tensor_reduce_complex.js": '// JS sibling of `tensor_reduce_complex.h`. Complex-tensor reductions.\n// Mirrors the real reducer shape (sum/prod/mean \u2192 complex; min/max\n// \u2192 complex via magnitude+atan2 tiebreak; any/all \u2192 real).\n\n\n\nfunction cSqueezeTrailing(dims) {\n  while (dims.length > 2 && dims[dims.length - 1] === 1) dims.pop();\n  return dims;\n}\n\nfunction cReduceLaneIm(t, i) {\n  return t.imag !== undefined ? t.imag[i] : 0;\n}\n\n// True when the tensor carries no imaginary content (no lane, or every\n// element zero). Such a tensor is real in value \u2014 min/max must order by\n// value, not magnitude, to match the interpreter and MATLAB on real data.\nfunction cReduceAllImagZero(t) {\n  if (t.imag === undefined) return true;\n  for (let i = 0; i < t.imag.length; i++) {\n    if (t.imag[i] !== 0) return false;\n  }\n  return true;\n}\n\n// Numeric (sum/prod/mean) \u2014 complex accumulator { re, im }.\nfunction complexAccumAll(t, init, accum, finalize) {\n  let acc = { ...init };\n  for (let i = 0; i < t.data.length; i++) {\n    acc = accum(acc, { re: t.data[i], im: cReduceLaneIm(t, i) });\n  }\n  return finalize(acc, t.data.length);\n}\n\nfunction complexAccumDim(t, dim, init, accum, finalize) {\n  if (dim < 1) throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  if (dim > t.shape.length) {\n    // No-op axis: return a fresh complex copy.\n    const out = mtoc2_tensor_alloc_nd_complex(t.shape.length, t.shape.slice());\n    out.data.set(t.data);\n    if (t.imag !== undefined) out.imag.set(t.imag);\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = t.shape.slice();\n  outDims[dimIdx] = 1;\n  cSqueezeTrailing(outDims);\n  const out = mtoc2_tensor_alloc_nd_complex(outDims.length, outDims);\n  for (let aft = 
0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let acc = { ...init };\n      for (let k = 0; k < axis; k++) {\n        const off = base + k * before;\n        acc = accum(acc, { re: t.data[off], im: cReduceLaneIm(t, off) });\n      }\n      const fin = finalize(acc, axis);\n      const dst = aft * before + bef;\n      out.data[dst] = fin.re;\n      out.imag[dst] = fin.im;\n    }\n  }\n  return out;\n}\n\nconst cSumInit = { re: 0, im: 0 };\nconst cProdInit = { re: 1, im: 0 };\nconst cSumAccum = (a, x) => ({ re: a.re + x.re, im: a.im + x.im });\nconst cProdAccum = (a, x) => ({\n  re: a.re * x.re - a.im * x.im,\n  im: a.re * x.im + a.im * x.re,\n});\nconst cIdFinalize = a => a;\nconst cMeanFinalize = (a, n) =>\n  n === 0 ? { re: NaN, im: NaN } : { re: a.re / n, im: a.im / n };\n\nconst mtoc2_sum_complex_all = t =>\n  complexAccumAll(t, cSumInit, cSumAccum, cIdFinalize);\nconst mtoc2_sum_complex_dim = (t, d) =>\n  complexAccumDim(t, d, cSumInit, cSumAccum, cIdFinalize);\nconst mtoc2_prod_complex_all = t =>\n  complexAccumAll(t, cProdInit, cProdAccum, cIdFinalize);\nconst mtoc2_prod_complex_dim = (t, d) =>\n  complexAccumDim(t, d, cProdInit, cProdAccum, cIdFinalize);\nconst mtoc2_mean_complex_all = t =>\n  complexAccumAll(t, cSumInit, cSumAccum, cMeanFinalize);\nconst mtoc2_mean_complex_dim = (t, d) =>\n  complexAccumDim(t, d, cSumInit, cSumAccum, cMeanFinalize);\n\n// Min / max \u2014 magnitude compare with atan2 tiebreak (numbl\'s\n// complexIsBetter). Skip NaN-lane elements; result is complex.\nfunction complexMinmaxAll(t, cmp) {\n  const realMode = cReduceAllImagZero(t);\n  let found = false;\n  let mRe = NaN;\n  let mIm = 0;\n  for (let i = 0; i < t.data.length; i++) {\n    const xr = t.data[i];\n    const xi = cReduceLaneIm(t, i);\n    if (xr !== xr || xi !== xi) continue;\n    const better = realMode\n      ? cmp === "<"\n        ? 
xr < mRe\n        : xr > mRe\n      : complexBetter(xr, xi, mRe, mIm, cmp);\n    if (!found || better) {\n      mRe = xr;\n      mIm = xi;\n      found = true;\n    }\n  }\n  return { re: mRe, im: mIm };\n}\n\nfunction complexBetter(aRe, aIm, bRe, bIm, cmp) {\n  const absA = Math.hypot(aRe, aIm);\n  const absB = Math.hypot(bRe, bIm);\n  if (absA !== absB) return cmp === "<" ? absA < absB : absA > absB;\n  return cmp === "<"\n    ? Math.atan2(aIm, aRe) < Math.atan2(bIm, bRe)\n    : Math.atan2(aIm, aRe) > Math.atan2(bIm, bRe);\n}\n\nfunction complexMinmaxDim(t, dim, cmp) {\n  if (dim < 1) throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  if (dim > t.shape.length) {\n    const out = mtoc2_tensor_alloc_nd_complex(t.shape.length, t.shape.slice());\n    out.data.set(t.data);\n    if (t.imag !== undefined) out.imag.set(t.imag);\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const realMode = cReduceAllImagZero(t);\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = t.shape.slice();\n  outDims[dimIdx] = 1;\n  cSqueezeTrailing(outDims);\n  const out = mtoc2_tensor_alloc_nd_complex(outDims.length, outDims);\n  for (let aft = 0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let found = false;\n      let mRe = NaN;\n      let mIm = 0;\n      for (let k = 0; k < axis; k++) {\n        const off = base + k * before;\n        const xr = t.data[off];\n        const xi = cReduceLaneIm(t, off);\n        if (xr !== xr || xi !== xi) continue;\n        const better = realMode\n          ? cmp === "<"\n            ? 
xr < mRe\n            : xr > mRe\n          : complexBetter(xr, xi, mRe, mIm, cmp);\n        if (!found || better) {\n          mRe = xr;\n          mIm = xi;\n          found = true;\n        }\n      }\n      const dst = aft * before + bef;\n      out.data[dst] = mRe;\n      out.imag[dst] = mIm;\n    }\n  }\n  return out;\n}\n\nconst mtoc2_min_complex_all = t => complexMinmaxAll(t, "<");\nconst mtoc2_min_complex_dim = (t, d) => complexMinmaxDim(t, d, "<");\nconst mtoc2_max_complex_all = t => complexMinmaxAll(t, ">");\nconst mtoc2_max_complex_dim = (t, d) => complexMinmaxDim(t, d, ">");\n\n// any / all \u2014 real result; toBool per element (either lane nonzero).\nfunction complexLogicalAll(t, emptyResult, shortPredicate) {\n  if (t.data.length === 0) return emptyResult;\n  for (let i = 0; i < t.data.length; i++) {\n    const xr = t.data[i];\n    const xi = cReduceLaneIm(t, i);\n    const x = xr !== 0 || xi !== 0;\n    if (shortPredicate(x)) return emptyResult === 1 ? 0 : 1;\n  }\n  return emptyResult;\n}\n\nfunction complexLogicalDim(t, dim, emptyResult, shortPredicate) {\n  if (dim < 1) throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  if (dim > t.shape.length) {\n    const out = mtoc2_tensor_alloc_nd(t.shape.length, t.shape.slice());\n    for (let i = 0; i < t.data.length; i++) {\n      const xr = t.data[i];\n      const xi = cReduceLaneIm(t, i);\n      out.data[i] = xr !== 0 || xi !== 0 ? 
1 : 0;\n    }\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = t.shape.slice();\n  outDims[dimIdx] = 1;\n  cSqueezeTrailing(outDims);\n  const out = mtoc2_tensor_alloc_nd(outDims.length, outDims);\n  for (let aft = 0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let res = emptyResult;\n      for (let k = 0; k < axis; k++) {\n        const off = base + k * before;\n        const x = t.data[off] !== 0 || cReduceLaneIm(t, off) !== 0;\n        if (shortPredicate(x)) {\n          res = emptyResult === 1 ? 0 : 1;\n          break;\n        }\n      }\n      out.data[aft * before + bef] = res;\n    }\n  }\n  return out;\n}\n\nconst cAnyShort = x => x;\nconst cAllShort = x => !x;\nconst mtoc2_any_complex_all = t => complexLogicalAll(t, 0, cAnyShort);\nconst mtoc2_any_complex_dim = (t, d) =>\n  complexLogicalDim(t, d, 0, cAnyShort);\nconst mtoc2_all_complex_all = t => complexLogicalAll(t, 1, cAllShort);\nconst mtoc2_all_complex_dim = (t, d) =>\n  complexLogicalDim(t, d, 1, cAllShort);\n',
   "tensor_reduce_real.js": '// JS sibling of `tensor_reduce_real.h`. Real-tensor reductions \u2014\n// `_all` returns a scalar; `_dim` returns a freshly-allocated tensor\n// reduced along the 1-based `dim` axis. Mirrors numbl\'s\n// `forEachSlice` semantics with column-major (before \xD7 axis \xD7 after)\n// traversal.\n//\n// Output shape rule for `_dim`: input dims with `dims[dim-1] = 1`,\n// then trailing singletons stripped subject to a 2-axis floor.\n\n\nfunction squeeze_trailing(dims) {\n  while (dims.length > 2 && dims[dims.length - 1] === 1) dims.pop();\n  return dims;\n}\n\n// Accumulator-based reducer (`sum`, `prod`, `mean`). `init` seeds\n// the running value; `accum(a, x)` is the per-element step;\n// `finalize(a, n)` is the post-loop transform.\nfunction accum_all(t, init, accum, finalize) {\n  let acc = init;\n  for (let i = 0; i < t.data.length; i++) acc = accum(acc, t.data[i]);\n  return finalize(acc, t.data.length);\n}\n\nfunction accum_dim(t, dim, init, accum, finalize) {\n  if (dim < 1) {\n    throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  }\n  if (dim > t.shape.length) {\n    // Reducing along a trailing singleton axis is the identity: every\n    // fiber has length 1, so sum/prod/mean each yield that single\n    // element unchanged. 
Return a same-shape COPY of the data (the C\n    // kernel memcpy\'s here too); allocating without copying left a\n    // zero-filled tensor \u2014 the original opt1 bug.\n    const out = mtoc2_tensor_alloc_nd(t.shape.length, t.shape.slice());\n    out.data.set(t.data);\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = squeeze_trailing(t.shape.slice());\n  outDims[dimIdx] = 1;\n  // Re-squeeze after the in-place axis update (the original `out_dims\n  // = a.shape.slice(); out_dims[dimIdx] = 1` then squeeze pattern).\n  squeeze_trailing(outDims);\n  const out = mtoc2_tensor_alloc_nd(outDims.length, outDims);\n  for (let aft = 0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let acc = init;\n      for (let k = 0; k < axis; k++) {\n        acc = accum(acc, t.data[base + k * before]);\n      }\n      out.data[aft * before + bef] = finalize(acc, axis);\n    }\n  }\n  return out;\n}\n\n// Min/max reducer. Ignores NaN like numbl/MATLAB: NaN entries are\n// skipped, and the result is NaN only when every element is NaN.\n// Mirrors the interpreter\'s `minMaxScan` (helpers/reduction/min-max.ts)\n// and the C kernel \u2014 seeding with data[0] would let a *leading* NaN\n// poison the result (`x < NaN` / `x > NaN` are always false).\nfunction minmax_all(t, op /* "min" | "max" */) {\n  if (t.data.length === 0) return op === "min" ? Infinity : -Infinity;\n  let best = NaN;\n  let found = false;\n  for (let i = 0; i < t.data.length; i++) {\n    const x = t.data[i];\n    if (x !== x) continue; // skip NaN\n    if (!found || (op === "min" ? x < best : x > best)) {\n      best = x;\n      found = true;\n    }\n  }\n  return found ? 
best : NaN;\n}\n\nfunction minmax_dim(t, dim, op) {\n  if (dim < 1) throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  if (dim > t.shape.length) {\n    const out = mtoc2_tensor_alloc_nd(t.shape.length, t.shape.slice());\n    out.data.set(t.data);\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = t.shape.slice();\n  outDims[dimIdx] = 1;\n  squeeze_trailing(outDims);\n  const out = mtoc2_tensor_alloc_nd(outDims.length, outDims);\n  for (let aft = 0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let best = NaN;\n      let found = false;\n      for (let k = 0; k < axis; k++) {\n        const x = t.data[base + k * before];\n        if (x !== x) continue; // skip NaN\n        if (!found || (op === "min" ? x < best : x > best)) {\n          best = x;\n          found = true;\n        }\n      }\n      out.data[aft * before + bef] = found ? best : NaN;\n    }\n  }\n  return out;\n}\n\n// Logical reducer (`any`, `all`). `emptyResult` is the value for a\n// 0-element reduction; `short` is the early-exit predicate.\nfunction logical_all(t, emptyResult, shortPredicate) {\n  if (t.data.length === 0) return emptyResult;\n  for (let i = 0; i < t.data.length; i++) {\n    if (shortPredicate(t.data[i])) return emptyResult === 1 ? 
0 : 1;\n  }\n  return emptyResult;\n}\n\nfunction logical_dim(t, dim, emptyResult, shortPredicate) {\n  if (dim < 1) throw new Error(`reducer _dim: dim must be >= 1 (got ${dim})`);\n  if (dim > t.shape.length) {\n    // No-op axis: emit a logical cast of the input (each element \u2192\n    // 1 if nonzero, 0 otherwise).\n    const out = mtoc2_tensor_alloc_nd(t.shape.length, t.shape.slice());\n    for (let i = 0; i < t.data.length; i++) {\n      out.data[i] = t.data[i] !== 0 ? 1 : 0;\n    }\n    return out;\n  }\n  const dimIdx = dim - 1;\n  const axis = t.shape[dimIdx];\n  let before = 1;\n  for (let i = 0; i < dimIdx; i++) before *= t.shape[i];\n  let after = 1;\n  for (let i = dimIdx + 1; i < t.shape.length; i++) after *= t.shape[i];\n  const outDims = t.shape.slice();\n  outDims[dimIdx] = 1;\n  squeeze_trailing(outDims);\n  const out = mtoc2_tensor_alloc_nd(outDims.length, outDims);\n  for (let aft = 0; aft < after; aft++) {\n    for (let bef = 0; bef < before; bef++) {\n      const base = aft * before * axis + bef;\n      let res = emptyResult;\n      for (let k = 0; k < axis; k++) {\n        if (shortPredicate(t.data[base + k * before])) {\n          res = emptyResult === 1 ? 
0 : 1;\n          break;\n        }\n      }\n      out.data[aft * before + bef] = res;\n    }\n  }\n  return out;\n}\n\n// \u2500\u2500 Sum \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nconst sumInit = 0;\nconst sumAccum = (a, x) => a + x;\nconst idFinalize = a => a;\nconst mtoc2_sum_all = t => accum_all(t, sumInit, sumAccum, idFinalize);\nconst mtoc2_sum_dim = (t, d) =>\n  accum_dim(t, d, sumInit, sumAccum, idFinalize);\n\n// \u2500\u2500 Prod \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nconst prodInit = 1;\nconst prodAccum = (a, x) => a * x;\nconst mtoc2_prod_all = t =>\n  accum_all(t, prodInit, prodAccum, idFinalize);\nconst mtoc2_prod_dim = (t, d) =>\n  accum_dim(t, d, prodInit, prodAccum, idFinalize);\n\n// \u2500\u2500 Mean \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nconst meanFinalize = (a, n) => (n === 0 ? 
NaN : a / n);\nconst mtoc2_mean_all = t =>\n  accum_all(t, sumInit, sumAccum, meanFinalize);\nconst mtoc2_mean_dim = (t, d) =>\n  accum_dim(t, d, sumInit, sumAccum, meanFinalize);\n\n// \u2500\u2500 Min / max \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nconst mtoc2_min_all = t => minmax_all(t, "min");\nconst mtoc2_min_dim = (t, d) => minmax_dim(t, d, "min");\nconst mtoc2_max_all = t => minmax_all(t, "max");\nconst mtoc2_max_dim = (t, d) => minmax_dim(t, d, "max");\n\n// \u2500\u2500 Any / all \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n// any: short-circuits on a non-NaN nonzero; emptyResult = 0. NaN is\n// ignored (MATLAB: any(NaN) is 0, any([0 NaN]) is 0), so `x === x`\n// excludes it \u2014 without that, NaN wrongly short-circuited to true.\n// (`all` needs no such guard: allShort tests `x === 0`, which NaN\n// already fails, so a NaN simply doesn\'t force all to false.)\nconst anyShort = x => x !== 0 && x === x;\nconst mtoc2_any_all = t => logical_all(t, 0, anyShort);\nconst mtoc2_any_dim = (t, d) => logical_dim(t, d, 0, anyShort);\n// all: short-circuits on zero; emptyResult = 1.\nconst allShort = x => x === 0;\nconst mtoc2_all_all = t => logical_all(t, 1, allShort);\nconst mtoc2_all_dim = (t, d) => logical_dim(t, d, 1, allShort);\n',
   "tensor_repmat.js": "// JS sibling of `tensor_repmat.h`. Two helpers:\n//   - `mtoc2_tensor_repmat(in, nreps, reps)` \u2014 tile a real tensor.\n//   - `mtoc2_tensor_repmat_complex(in, nreps, reps)` \u2014 tile both\n//     lanes of a complex tensor (zero imag when input is real).\n// Negative reps clamp to 0; input shape and reps are right-padded\n// with 1s to a common rank.\n\n\n\nfunction mtoc2_tensor_repmat(input, nreps, repsIn) {\n  const reps = [];\n  for (let i = 0; i < nreps; i++) {\n    const r = repsIn[i] < 0 ? 0 : repsIn[i];\n    reps.push(r);\n  }\n  const inShape = input.shape;\n  const inNdim = inShape.length;\n  const outNdim = Math.max(nreps, inNdim);\n  const padShape = [];\n  const padReps = [];\n  const outDims = [];\n  for (let i = 0; i < outNdim; i++) {\n    padShape.push(i < inNdim ? inShape[i] : 1);\n    padReps.push(i < nreps ? reps[i] : 1);\n    outDims.push(padShape[i] * padReps[i]);\n  }\n  const out = mtoc2_tensor_alloc_nd(outNdim, outDims);\n  let outTotal = 1;\n  for (const d of outDims) outTotal *= d;\n  if (outTotal === 0) return out;\n  let inTotal = 1;\n  for (const d of inShape) inTotal *= d;\n  if (inTotal === 0) return out;\n\n  // Initial copy: trailing-1 padding doesn't change column-major layout.\n  out.data.set(input.data.subarray(0, inTotal), 0);\n\n  const curShape = padShape.slice();\n  let curTotal = inTotal;\n\n  for (let d = 0; d < outNdim; d++) {\n    const rep = padReps[d];\n    if (rep === 1) continue;\n    let blockSize = 1;\n    for (let i = 0; i <= d; i++) blockSize *= curShape[i];\n    if (rep === 0 || blockSize === 0) {\n      // outTotal will be 0; the alloc above already produced an empty\n      // tensor. 
Done.\n      return out;\n    }\n    const numBlocks = curTotal / blockSize;\n    // Walk blocks in reverse to avoid overwriting source data.\n    for (let b = numBlocks - 1; b >= 0; b--) {\n      const srcOff = b * blockSize;\n      const dstBase = b * blockSize * rep;\n      if (dstBase !== srcOff) {\n        // copyWithin handles overlapping moves correctly.\n        out.data.copyWithin(dstBase, srcOff, srcOff + blockSize);\n      }\n      for (let r = 1; r < rep; r++) {\n        out.data.copyWithin(\n          dstBase + r * blockSize,\n          dstBase,\n          dstBase + blockSize\n        );\n      }\n    }\n    curShape[d] *= rep;\n    curTotal *= rep;\n  }\n  return out;\n}\n\nfunction mtoc2_tensor_repmat_complex(input, nreps, repsIn) {\n  const reps = [];\n  for (let i = 0; i < nreps; i++) {\n    const r = repsIn[i] < 0 ? 0 : repsIn[i];\n    reps.push(r);\n  }\n  const inShape = input.shape;\n  const inNdim = inShape.length;\n  const outNdim = Math.max(nreps, inNdim);\n  const padShape = [];\n  const padReps = [];\n  const outDims = [];\n  for (let i = 0; i < outNdim; i++) {\n    padShape.push(i < inNdim ? inShape[i] : 1);\n    padReps.push(i < nreps ? 
reps[i] : 1);\n    outDims.push(padShape[i] * padReps[i]);\n  }\n  const out = mtoc2_tensor_alloc_nd_complex(outNdim, outDims);\n  let outTotal = 1;\n  for (const d of outDims) outTotal *= d;\n  if (outTotal === 0) return out;\n  let inTotal = 1;\n  for (const d of inShape) inTotal *= d;\n  if (inTotal === 0) return out;\n\n  const im = input.imag;\n  out.data.set(input.data.subarray(0, inTotal), 0);\n  if (im !== undefined) out.imag.set(im.subarray(0, inTotal), 0);\n\n  const curShape = padShape.slice();\n  let curTotal = inTotal;\n\n  for (let d = 0; d < outNdim; d++) {\n    const rep = padReps[d];\n    if (rep === 1) continue;\n    let blockSize = 1;\n    for (let i = 0; i <= d; i++) blockSize *= curShape[i];\n    if (rep === 0 || blockSize === 0) return out;\n    const numBlocks = curTotal / blockSize;\n    for (let b = numBlocks - 1; b >= 0; b--) {\n      const srcOff = b * blockSize;\n      const dstBase = b * blockSize * rep;\n      if (dstBase !== srcOff) {\n        out.data.copyWithin(dstBase, srcOff, srcOff + blockSize);\n        out.imag.copyWithin(dstBase, srcOff, srcOff + blockSize);\n      }\n      for (let r = 1; r < rep; r++) {\n        out.data.copyWithin(\n          dstBase + r * blockSize,\n          dstBase,\n          dstBase + blockSize\n        );\n        out.imag.copyWithin(\n          dstBase + r * blockSize,\n          dstBase,\n          dstBase + blockSize\n        );\n      }\n    }\n    curShape[d] *= rep;\n    curTotal *= rep;\n  }\n  return out;\n}\n",
   "tensor_reshape_nd.js": "// JS sibling of `tensor_reshape_nd.h`. Reshape a real tensor to an\n// N-D shape, supporting one `-1` auto-infer slot. Same error\n// behaviour as the C side: throws (instead of `abort()`) on bad\n// inputs.\n\n\nfunction mtoc2_reshape_nd(input, ndim, dims) {\n  let inTotal = 1;\n  for (const d of input.shape) inTotal *= d;\n  let inferIdx = -1;\n  let explicitProd = 1;\n  for (let i = 0; i < ndim; i++) {\n    if (dims[i] === -1) {\n      if (inferIdx !== -1) {\n        throw new Error(\"reshape: at most one '[]' auto-infer slot allowed\");\n      }\n      inferIdx = i;\n    } else if (dims[i] < 0) {\n      throw new Error(\n        `reshape: dim ${i + 1} must be a non-negative integer (got ${dims[i]})`\n      );\n    } else {\n      explicitProd *= dims[i];\n    }\n  }\n  const resolved = new Array(ndim);\n  for (let i = 0; i < ndim; i++) resolved[i] = dims[i];\n  let outTotal;\n  if (inferIdx !== -1) {\n    if (explicitProd === 0 && inTotal !== 0) {\n      throw new Error(\n        `reshape: input has ${inTotal} elements but explicit dims around '[]' multiply to 0`\n      );\n    }\n    if (explicitProd > 0 && inTotal % explicitProd !== 0) {\n      throw new Error(\n        `reshape: input has ${inTotal} elements, not divisible by ${explicitProd}`\n      );\n    }\n    resolved[inferIdx] = explicitProd === 0 ? 0 : inTotal / explicitProd;\n    outTotal = inTotal;\n  } else {\n    outTotal = explicitProd;\n    if (inTotal !== outTotal) {\n      throw new Error(\n        `reshape: number of elements must not change (in=${inTotal}, out=${outTotal})`\n      );\n    }\n  }\n  const out = mtoc2_tensor_alloc_nd(ndim, resolved);\n  if (outTotal > 0) out.data.set(input.data.subarray(0, outTotal));\n  return out;\n}\n",
   "tensor_reshape_nd_complex.js": "// JS sibling of `tensor_reshape_nd_complex.h`. Reshape a complex\n// tensor to an N-D shape. Same auto-infer / element-count rules as\n// the real reshape; both lanes are reinterpreted (no rearrangement).\n\n\nfunction mtoc2_reshape_nd_complex(input, ndim, dims) {\n  let inTotal = 1;\n  for (const d of input.shape) inTotal *= d;\n  let inferIdx = -1;\n  let explicitProd = 1;\n  for (let i = 0; i < ndim; i++) {\n    if (dims[i] === -1) {\n      if (inferIdx !== -1) {\n        throw new Error(\"reshape: at most one '[]' auto-infer slot allowed\");\n      }\n      inferIdx = i;\n    } else if (dims[i] < 0) {\n      throw new Error(\n        `reshape: dim ${i + 1} must be a non-negative integer (got ${dims[i]})`\n      );\n    } else {\n      explicitProd *= dims[i];\n    }\n  }\n  const resolved = new Array(ndim);\n  for (let i = 0; i < ndim; i++) resolved[i] = dims[i];\n  let outTotal;\n  if (inferIdx !== -1) {\n    if (explicitProd === 0 && inTotal !== 0) {\n      throw new Error(\n        `reshape: input has ${inTotal} elements but explicit dims around '[]' multiply to 0`\n      );\n    }\n    if (explicitProd > 0 && inTotal % explicitProd !== 0) {\n      throw new Error(\n        `reshape: input has ${inTotal} elements, not divisible by ${explicitProd}`\n      );\n    }\n    resolved[inferIdx] = explicitProd === 0 ? 0 : inTotal / explicitProd;\n    outTotal = inTotal;\n  } else {\n    outTotal = explicitProd;\n    if (inTotal !== outTotal) {\n      throw new Error(\n        `reshape: number of elements must not change (in=${inTotal}, out=${outTotal})`\n      );\n    }\n  }\n  const out = mtoc2_tensor_alloc_nd_complex(ndim, resolved);\n  if (outTotal > 0) {\n    out.data.set(input.data.subarray(0, outTotal));\n    if (input.imag !== undefined) {\n      out.imag.set(input.imag.subarray(0, outTotal));\n    }\n  }\n  return out;\n}\n",
   "tensor_size.js": "// JS sibling of `tensor_size.h`. Build a 1\xD7ndim row tensor whose\n// elements are the input's dim sizes.\n\n\nfunction mtoc2_tensor_size_row(a) {\n  const n = a.shape.length;\n  const r = mtoc2_tensor_alloc(1, n);\n  for (let i = 0; i < n; i++) r.data[i] = a.shape[i];\n  return r;\n}\n",
-  "tensor_sort_real.js": "// JS sibling of `tensor_sort_real.h`. Stable sort on real and\n// complex tensors. The descending flag flips the comparator while\n// keeping the tie-break on ascending original index. Complex sort\n// orders by magnitude then phase (matches numbl).\n\n\n\nfunction pair_sort_indices(a, descending) {\n  const n = a.data.length;\n  const idx = new Array(n);\n  for (let i = 0; i < n; i++) idx[i] = i;\n  idx.sort((p, q) => {\n    const av = a.data[p];\n    const bv = a.data[q];\n    // NaN ranks as the maximum (MATLAB): last when ascending, first\n    // when descending. Without this, NaN compares false both ways and\n    // falls through to the index tie-break, making the comparator\n    // non-transitive and corrupting the whole array.\n    const an = av !== av;\n    const bn = bv !== bv;\n    if (an || bn) {\n      if (an && bn) return p - q;\n      if (an) return descending ? -1 : 1;\n      return descending ? 1 : -1;\n    }\n    if (av < bv) return descending ? 1 : -1;\n    if (av > bv) return descending ? -1 : 1;\n    return p - q;\n  });\n  return idx;\n}\n\nfunction complex_sort_indices(a, descending) {\n  const n = a.data.length;\n  const im = a.imag;\n  const idx = new Array(n);\n  const mag = new Float64Array(n);\n  const ph = new Float64Array(n);\n  for (let i = 0; i < n; i++) {\n    const re = a.data[i];\n    const xi = im !== undefined ? im[i] : 0;\n    mag[i] = Math.hypot(re, xi);\n    ph[i] = Math.atan2(xi, re);\n    idx[i] = i;\n  }\n  idx.sort((p, q) => {\n    if (mag[p] < mag[q]) return descending ? 1 : -1;\n    if (mag[p] > mag[q]) return descending ? -1 : 1;\n    if (ph[p] < ph[q]) return descending ? 1 : -1;\n    if (ph[p] > ph[q]) return descending ? 
-1 : 1;\n    return p - q;\n  });\n  return idx;\n}\n\nfunction mtoc2_sort_real(a, descending) {\n  const v = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  if (a.data.length === 0) return v;\n  const sorted = pair_sort_indices(a, descending);\n  for (let i = 0; i < sorted.length; i++) v.data[i] = a.data[sorted[i]];\n  return v;\n}\n\nfunction mtoc2_sort_real_2(a, descending) {\n  const v = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  const ix = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  if (a.data.length === 0) return { v, ix };\n  const sorted = pair_sort_indices(a, descending);\n  for (let i = 0; i < sorted.length; i++) {\n    v.data[i] = a.data[sorted[i]];\n    ix.data[i] = sorted[i] + 1;\n  }\n  return { v, ix };\n}\n\nfunction mtoc2_sort_complex(a, descending) {\n  const v = mtoc2_tensor_alloc_nd_complex(a.shape.length, a.shape);\n  if (a.data.length === 0) return v;\n  const sorted = complex_sort_indices(a, descending);\n  const im = a.imag;\n  for (let i = 0; i < sorted.length; i++) {\n    v.data[i] = a.data[sorted[i]];\n    if (im !== undefined) v.imag[i] = im[sorted[i]];\n  }\n  return v;\n}\n\nfunction mtoc2_sort_complex_2(a, descending) {\n  const v = mtoc2_tensor_alloc_nd_complex(a.shape.length, a.shape);\n  const ix = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  if (a.data.length === 0) return { v, ix };\n  const sorted = complex_sort_indices(a, descending);\n  const im = a.imag;\n  for (let i = 0; i < sorted.length; i++) {\n    v.data[i] = a.data[sorted[i]];\n    if (im !== undefined) v.imag[i] = im[sorted[i]];\n    ix.data[i] = sorted[i] + 1;\n  }\n  return { v, ix };\n}\n",
+  "tensor_sort_real.js": "// JS sibling of `tensor_sort_real.h`. Stable sort on real and\n// complex tensors. The descending flag flips the comparator while\n// keeping the tie-break on ascending original index. Complex sort\n// orders by magnitude then phase (matches numbl).\n\n\n\nfunction pair_sort_indices(a, descending) {\n  const n = a.data.length;\n  const idx = new Array(n);\n  for (let i = 0; i < n; i++) idx[i] = i;\n  idx.sort((p, q) => {\n    const av = a.data[p];\n    const bv = a.data[q];\n    // NaN ranks as the maximum (MATLAB): last when ascending, first\n    // when descending. Without this, NaN compares false both ways and\n    // falls through to the index tie-break, making the comparator\n    // non-transitive and corrupting the whole array.\n    const an = av !== av;\n    const bn = bv !== bv;\n    if (an || bn) {\n      if (an && bn) return p - q;\n      if (an) return descending ? -1 : 1;\n      return descending ? 1 : -1;\n    }\n    if (av < bv) return descending ? 1 : -1;\n    if (av > bv) return descending ? -1 : 1;\n    return p - q;\n  });\n  return idx;\n}\n\nfunction complex_sort_indices(a, descending) {\n  const n = a.data.length;\n  const im = a.imag;\n  const idx = new Array(n);\n  for (let i = 0; i < n; i++) idx[i] = i;\n\n  // All-zero imaginary lane \u2192 order by signed real value (matches the\n  // interpreter and MATLAB on real data), not by magnitude. NaNs sort\n  // last when ascending, first when descending.\n  let realMode = true;\n  if (im !== undefined) {\n    for (let i = 0; i < n; i++) {\n      if (im[i] !== 0) {\n        realMode = false;\n        break;\n      }\n    }\n  }\n  if (realMode) {\n    const re = a.data;\n    idx.sort((p, q) => {\n      const rp = re[p];\n      const rq = re[q];\n      const pNaN = rp !== rp;\n      const qNaN = rq !== rq;\n      if (pNaN && qNaN) return 0;\n      if (descending) {\n        if (pNaN) return -1;\n        if (qNaN) return 1;\n        return rp < rq ? 1 : rp > rq ? 
-1 : 0;\n      }\n      if (pNaN) return 1;\n      if (qNaN) return -1;\n      return rp < rq ? -1 : rp > rq ? 1 : 0;\n    });\n    return idx;\n  }\n\n  const mag = new Float64Array(n);\n  const ph = new Float64Array(n);\n  for (let i = 0; i < n; i++) {\n    const re = a.data[i];\n    const xi = im !== undefined ? im[i] : 0;\n    mag[i] = Math.hypot(re, xi);\n    ph[i] = Math.atan2(xi, re);\n  }\n  idx.sort((p, q) => {\n    if (mag[p] < mag[q]) return descending ? 1 : -1;\n    if (mag[p] > mag[q]) return descending ? -1 : 1;\n    if (ph[p] < ph[q]) return descending ? 1 : -1;\n    if (ph[p] > ph[q]) return descending ? -1 : 1;\n    return p - q;\n  });\n  return idx;\n}\n\nfunction mtoc2_sort_real(a, descending) {\n  const v = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  if (a.data.length === 0) return v;\n  const sorted = pair_sort_indices(a, descending);\n  for (let i = 0; i < sorted.length; i++) v.data[i] = a.data[sorted[i]];\n  return v;\n}\n\nfunction mtoc2_sort_real_2(a, descending) {\n  const v = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  const ix = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  if (a.data.length === 0) return { v, ix };\n  const sorted = pair_sort_indices(a, descending);\n  for (let i = 0; i < sorted.length; i++) {\n    v.data[i] = a.data[sorted[i]];\n    ix.data[i] = sorted[i] + 1;\n  }\n  return { v, ix };\n}\n\nfunction mtoc2_sort_complex(a, descending) {\n  const v = mtoc2_tensor_alloc_nd_complex(a.shape.length, a.shape);\n  if (a.data.length === 0) return v;\n  const sorted = complex_sort_indices(a, descending);\n  const im = a.imag;\n  for (let i = 0; i < sorted.length; i++) {\n    v.data[i] = a.data[sorted[i]];\n    if (im !== undefined) v.imag[i] = im[sorted[i]];\n  }\n  return v;\n}\n\nfunction mtoc2_sort_complex_2(a, descending) {\n  const v = mtoc2_tensor_alloc_nd_complex(a.shape.length, a.shape);\n  const ix = mtoc2_tensor_alloc_nd(a.shape.length, a.shape);\n  if (a.data.length === 0) return { v, ix };\n  
const sorted = complex_sort_indices(a, descending);\n  const im = a.imag;\n  for (let i = 0; i < sorted.length; i++) {\n    v.data[i] = a.data[sorted[i]];\n    if (im !== undefined) v.imag[i] = im[sorted[i]];\n    ix.data[i] = sorted[i] + 1;\n  }\n  return { v, ix };\n}\n",
   "tensor_transpose.js": "// JS sibling of `tensor_transpose.h`. Real 2-D non-conjugate\n// transpose. Mirrors `transposeCore` semantics: column-major in,\n// column-major out.\n\n\nfunction mtoc2_tensor_transpose(a) {\n  const m = a.shape[0];\n  const n = a.shape[1];\n  const r = mtoc2_tensor_alloc(n, m);\n  for (let sc = 0; sc < n; sc++) {\n    for (let sr = 0; sr < m; sr++) {\n      r.data[sc + sr * n] = a.data[sr + sc * m];\n    }\n  }\n  return r;\n}\n",
   "tensor_transpose_complex.js": "// JS sibling of `tensor_transpose_complex.h`. Real 2-D non-conjugate\n// transpose for a complex tensor \u2014 both lanes get the same index\n// permutation. `'` (conjugate transpose) lowers to\n// `transpose(conj(z))` upstream, so this helper isn't responsible\n// for negating the imag lane.\n\n\nfunction mtoc2_tensor_transpose_complex(a) {\n  const m = a.shape[0];\n  const n = a.shape[1];\n  const r = mtoc2_tensor_alloc_complex(n, m);\n  const aim = a.imag;\n  for (let sc = 0; sc < n; sc++) {\n    for (let sr = 0; sr < m; sr++) {\n      r.data[sc + sr * n] = a.data[sr + sc * m];\n      r.imag[sc + sr * n] = aim !== undefined ? aim[sr + sc * m] : 0;\n    }\n  }\n  return r;\n}\n",
   "tensor_triangular.js": "// JS sibling of `tensor_triangular.h`. Four helpers: `triu` / `tril`\n// keep entries where `j - i >= k` / `i - j >= -k`; their `_complex`\n// siblings walk both lanes. Mirrors `triPart` in numbl's\n// `interpreter/builtins/array-extras.ts`.\n\n\n\nfunction mtoc2_tensor_triu(a, k) {\n  const rows = a.shape[0];\n  const cols = a.shape[1];\n  const out = mtoc2_tensor_alloc(rows, cols);\n  for (let j = 0; j < cols; j++) {\n    for (let i = 0; i < rows; i++) {\n      if (j - i >= k) {\n        const idx = i + j * rows;\n        out.data[idx] = a.data[idx];\n      }\n    }\n  }\n  return out;\n}\n\nfunction mtoc2_tensor_tril(a, k) {\n  const rows = a.shape[0];\n  const cols = a.shape[1];\n  const out = mtoc2_tensor_alloc(rows, cols);\n  for (let j = 0; j < cols; j++) {\n    for (let i = 0; i < rows; i++) {\n      if (i - j >= -k) {\n        const idx = i + j * rows;\n        out.data[idx] = a.data[idx];\n      }\n    }\n  }\n  return out;\n}\n\nfunction mtoc2_tensor_triu_complex(a, k) {\n  const rows = a.shape[0];\n  const cols = a.shape[1];\n  const out = mtoc2_tensor_alloc_nd_complex(2, [rows, cols]);\n  const im = a.imag;\n  for (let j = 0; j < cols; j++) {\n    for (let i = 0; i < rows; i++) {\n      if (j - i >= k) {\n        const idx = i + j * rows;\n        out.data[idx] = a.data[idx];\n        if (im !== undefined) out.imag[idx] = im[idx];\n      }\n    }\n  }\n  return out;\n}\n\nfunction mtoc2_tensor_tril_complex(a, k) {\n  const rows = a.shape[0];\n  const cols = a.shape[1];\n  const out = mtoc2_tensor_alloc_nd_complex(2, [rows, cols]);\n  const im = a.imag;\n  for (let j = 0; j < cols; j++) {\n    for (let i = 0; i < rows; i++) {\n      if (i - j >= -k) {\n        const idx = i + j * rows;\n        out.data[idx] = a.data[idx];\n        if (im !== undefined) out.imag[idx] = im[idx];\n      }\n    }\n  }\n  return out;\n}\n",
@@ -66154,6 +66226,7 @@ var JS_IMPORTS = {
   "tensor_fill_nd.js": ["tensor_alloc_nd.js", "tensor_alloc_nd_complex.js"],
   "tensor_fill_square.js": ["tensor_fill_nd.js"],
   "tensor_flip.js": ["tensor_alloc_nd.js", "tensor_alloc_nd_complex.js"],
+  "tensor_imag_all_zero.js": [],
   "tensor_linspace.js": ["tensor_alloc.js"],
   "tensor_logical_real.js": ["tensor_alloc_nd.js"],
   "tensor_logspace.js": ["tensor_alloc.js"],
@@ -73776,7 +73849,7 @@ function checkArity2(argTypes, nargout) {
 function staticVerdict(t) {
   if (!isNumeric2(t)) return true;
   if (!t.isComplex) return true;
-  if (!isScalar(t)) return false;
+  if (!isScalar(t)) return "runtime";
   const ex = t.exact;
   if (ex !== void 0 && typeof ex === "object" && "im" in ex) {
     const im = ex.im;
@@ -73793,23 +73866,33 @@ var isreal = {
     return [scalarLogical(v)];
   },
   emitC({ argTypes, argsC, useRuntime }) {
-    const v = staticVerdict(argTypes[0]);
+    const t = argTypes[0];
+    const v = staticVerdict(t);
     if (v === true) return "1.0";
     if (v === false) return "0.0";
+    if (isNumeric2(t) && t.isComplex && !isScalar(t)) {
+      useRuntime("mtoc2_tensor_imag_all_zero");
+      return `mtoc2_tensor_imag_all_zero(${argsC[0]})`;
+    }
     useRuntime("mtoc2_cscalar");
     return `(cimag(${argsC[0]}) == 0.0)`;
   },
-  emitJs({ argTypes, argsJs }) {
-    const v = staticVerdict(argTypes[0]);
+  emitJs({ argTypes, argsJs, useRuntime }) {
+    const t = argTypes[0];
+    const v = staticVerdict(t);
     if (v === true) return "true";
     if (v === false) return "false";
+    if (isNumeric2(t) && t.isComplex && !isScalar(t)) {
+      useRuntime("mtoc2_tensor_imag_all_zero");
+      return `mtoc2_tensor_imag_all_zero(${argsJs[0]})`;
+    }
     return `(${argsJs[0]}.im === 0)`;
   },
   call({ args }) {
     const v = args[0];
     if (typeof v === "number" || typeof v === "boolean") return [true];
     if (isComplexValue(v)) return [v.im === 0];
-    if (isTensor(v)) return [!v.imag];
+    if (isTensor(v)) return [mtoc2_tensor_imag_all_zero(v)];
     return [true];
   }
 };
@@ -81969,6 +82052,13 @@ var REGISTRY2 = /* @__PURE__ */ new Map([
       "mtoc2_cdiv"
     ])
   ],
+  // `isreal` on a complex-typed tensor: scan the imag lane (true iff all
+  // zero). Lets the JIT report realness by value, matching the
+  // interpreter on tensors the type system could not prove real.
+  [
+    "mtoc2_tensor_imag_all_zero",
+    loadSnippet("tensor_imag_all_zero.h", ["mtoc2_tensor_t"])
+  ],
   // ── Elementwise binary/unary on real tensors ──────────────────────
   // One snippet covers all 11 funcs (4×_tt, 4×_ts, 2×_st, 1×uminus).
   // Builtins activate by op-specific synthetic name; all map to the