numbl 0.1.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/binding.gyp +59 -3
  2. package/dist-cli/cli.js +22538 -7936
  3. package/dist-lib/lib.js +34682 -20852
  4. package/dist-lib/numbl-core/executeCode.d.ts +13 -0
  5. package/dist-lib/numbl-core/fileIOAdapter.d.ts +2 -0
  6. package/dist-lib/numbl-core/helpers/reduction-helpers.d.ts +7 -2
  7. package/dist-lib/numbl-core/interpreter/builtins/datetime.d.ts +39 -0
  8. package/dist-lib/numbl-core/interpreter/builtins/index.d.ts +1 -0
  9. package/dist-lib/numbl-core/interpreter/builtins/time-system.d.ts +1 -0
  10. package/dist-lib/numbl-core/interpreter/builtins/types.d.ts +96 -5
  11. package/dist-lib/numbl-core/interpreter/interpreter.d.ts +41 -3
  12. package/dist-lib/numbl-core/interpreter/types.d.ts +1 -1
  13. package/dist-lib/numbl-core/jit/c/abi.d.ts +90 -0
  14. package/dist-lib/numbl-core/jit/c/assemble.d.ts +56 -0
  15. package/dist-lib/numbl-core/jit/c/classify.d.ts +70 -0
  16. package/dist-lib/numbl-core/jit/c/compile.d.ts +37 -0
  17. package/dist-lib/numbl-core/jit/c/context.d.ts +152 -0
  18. package/dist-lib/numbl-core/jit/c/emit/assign.d.ts +20 -0
  19. package/dist-lib/numbl-core/jit/c/emit/complexScalar.d.ts +18 -0
  20. package/dist-lib/numbl-core/jit/c/emit/fused.d.ts +42 -0
  21. package/dist-lib/numbl-core/jit/c/emit/helpers.d.ts +40 -0
  22. package/dist-lib/numbl-core/jit/c/emit/index.d.ts +14 -0
  23. package/dist-lib/numbl-core/jit/c/emit/scalar.d.ts +23 -0
  24. package/dist-lib/numbl-core/jit/c/emit/stmt.d.ts +25 -0
  25. package/dist-lib/numbl-core/jit/c/emit/tensor.d.ts +127 -0
  26. package/dist-lib/numbl-core/jit/c/emit/userCall.d.ts +58 -0
  27. package/dist-lib/numbl-core/jit/c/epilogue.d.ts +26 -0
  28. package/dist-lib/numbl-core/jit/c/feasibility.d.ts +44 -0
  29. package/dist-lib/numbl-core/jit/c/prelude.d.ts +37 -0
  30. package/dist-lib/numbl-core/jit/c/visit.d.ts +63 -0
  31. package/dist-lib/numbl-core/jit/e1/complexKernelEmit.d.ts +46 -0
  32. package/dist-lib/numbl-core/jit/e1/hash.d.ts +10 -0
  33. package/dist-lib/numbl-core/jit/e1/install.d.ts +13 -0
  34. package/dist-lib/numbl-core/jit/e1/kernelEmit.d.ts +54 -0
  35. package/dist-lib/numbl-core/jit/e1/multiReductionKernel.d.ts +66 -0
  36. package/dist-lib/numbl-core/jit/e1/openmpFlag.d.ts +13 -0
  37. package/dist-lib/numbl-core/jit/e1/scalarFnKernel.d.ts +44 -0
  38. package/dist-lib/numbl-core/jit/e2/assignKernel.d.ts +34 -0
  39. package/dist-lib/numbl-core/jit/e2/astToJitExpr.d.ts +25 -0
  40. package/dist-lib/numbl-core/jit/e2/cache.d.ts +80 -0
  41. package/dist-lib/numbl-core/jit/e2/chainKernelEmit.d.ts +55 -0
  42. package/dist-lib/numbl-core/jit/e2/classify.d.ts +119 -0
  43. package/dist-lib/numbl-core/jit/e2/compileFn.d.ts +16 -0
  44. package/dist-lib/numbl-core/jit/e2/complexChainKernelEmit.d.ts +79 -0
  45. package/dist-lib/numbl-core/jit/e2/emitShared.d.ts +71 -0
  46. package/dist-lib/numbl-core/jit/e2/install.d.ts +11 -0
  47. package/dist-lib/numbl-core/jit/e2/liveness.d.ts +29 -0
  48. package/dist-lib/numbl-core/jit/e2/loopKernel.d.ts +49 -0
  49. package/dist-lib/numbl-core/jit/e2/loopKernelEmit.d.ts +75 -0
  50. package/dist-lib/numbl-core/jit/e2/multiReductionDriver.d.ts +24 -0
  51. package/dist-lib/numbl-core/jit/e2/reductionKernelEmit.d.ts +72 -0
  52. package/dist-lib/numbl-core/jit/e2/scalarFnDriver.d.ts +29 -0
  53. package/dist-lib/numbl-core/jit/fusedChainHelpers.d.ts +65 -0
  54. package/dist-lib/numbl-core/jit/fusedScalarEmit.d.ts +69 -0
  55. package/dist-lib/numbl-core/jit/fusion.d.ts +71 -0
  56. package/dist-lib/numbl-core/jit/fusionOps.d.ts +25 -0
  57. package/dist-lib/numbl-core/jit/heavyOps.d.ts +15 -0
  58. package/dist-lib/numbl-core/{interpreter/jit → jit}/index.d.ts +2 -2
  59. package/dist-lib/numbl-core/jit/jitBailSafety.d.ts +41 -0
  60. package/dist-lib/numbl-core/{interpreter/jit → jit}/jitLoop.d.ts +2 -2
  61. package/dist-lib/numbl-core/{interpreter/jit → jit}/jitLoopAnalysis.d.ts +6 -1
  62. package/dist-lib/numbl-core/jit/jitLower.d.ts +122 -0
  63. package/dist-lib/numbl-core/jit/jitLowerExpr.d.ts +27 -0
  64. package/dist-lib/numbl-core/jit/jitLowerStmt.d.ts +9 -0
  65. package/dist-lib/numbl-core/{interpreter/jit → jit}/jitLowerTypes.d.ts +7 -3
  66. package/dist-lib/numbl-core/jit/jitTopLevel.d.ts +22 -0
  67. package/dist-lib/numbl-core/{interpreter/jit → jit}/jitTypes.d.ts +133 -1
  68. package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitCodegen.d.ts +2 -2
  69. package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitCodegenHoist.d.ts +19 -1
  70. package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitHelpers.d.ts +15 -3
  71. package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitHelpersIndex.d.ts +7 -0
  72. package/dist-lib/numbl-core/jit/js/jitHelpersTensor.d.ts +34 -0
  73. package/dist-lib/numbl-core/jit/js/jsFusedCodegen.d.ts +17 -0
  74. package/dist-lib/numbl-core/jit/js/jsMultiReduction.d.ts +70 -0
  75. package/dist-lib/numbl-core/jit/scalarEmit.d.ts +58 -0
  76. package/dist-lib/numbl-core/lexer/types.d.ts +2 -1
  77. package/dist-lib/numbl-core/native/lapack-bridge.d.ts +39 -1
  78. package/dist-lib/numbl-core/ops/bessel.d.ts +18 -0
  79. package/dist-lib/numbl-core/ops/comparison.d.ts +11 -0
  80. package/dist-lib/numbl-core/ops/complexBinaryElemwise.d.ts +10 -0
  81. package/dist-lib/numbl-core/ops/complexUnaryElemwise.d.ts +8 -0
  82. package/dist-lib/numbl-core/ops/dispatch.d.ts +26 -0
  83. package/dist-lib/numbl-core/ops/index.d.ts +8 -0
  84. package/dist-lib/numbl-core/ops/opCodes.d.ts +70 -0
  85. package/dist-lib/numbl-core/ops/realBinaryElemwise.d.ts +8 -0
  86. package/dist-lib/numbl-core/ops/realUnaryElemwise.d.ts +5 -0
  87. package/dist-lib/numbl-core/ops/reduce.d.ts +6 -0
  88. package/dist-lib/numbl-core/parser/types.d.ts +6 -0
  89. package/dist-lib/numbl-core/runtime/alloc.d.ts +23 -0
  90. package/dist-lib/numbl-core/runtime/runtime.d.ts +1 -0
  91. package/dist-lib/numbl-core/version.d.ts +1 -1
  92. package/native/jit_runtime/jit_runtime.c +261 -0
  93. package/native/jit_runtime/jit_runtime.h +204 -0
  94. package/native/numbl_addon.cpp +62 -1
  95. package/native/ops/bessel.c +572 -0
  96. package/native/ops/comparison.c +150 -0
  97. package/native/ops/complex_binary_elemwise.c +192 -0
  98. package/native/ops/complex_unary_elemwise.c +152 -0
  99. package/native/ops/numbl_ops.c +66 -0
  100. package/native/ops/numbl_ops.h +262 -0
  101. package/native/ops/real_binary_elemwise.c +85 -0
  102. package/native/ops/real_unary_elemwise.c +104 -0
  103. package/native/ops/reduce.c +162 -0
  104. package/native/ops_napi.cpp +320 -0
  105. package/package.json +8 -9
  106. package/dist-lib/numbl-core/interpreter/jit/jitHelpersTensor.d.ts +0 -28
  107. package/dist-lib/numbl-core/interpreter/jit/jitLower.d.ts +0 -23
  108. package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitHelpersComplex.d.ts +0 -0
@@ -0,0 +1,79 @@
1
+ /**
2
+ * e2 — complex multi-LHS fused chain C kernel emission (paired-buffer).
3
+ *
4
+ * Sister to [chainKernelEmit.ts](./chainKernelEmit.ts) for chains that
5
+ * produce at least one complex tensor. Mirrors the codegen shape of
6
+ * [e1/complexKernelEmit.ts](../e1/complexKernelEmit.ts) and uses the
7
+ * same fusion envelope (+ - * .* unary +/- conj real imag, real/complex
8
+ * widening, ImagLiteral). Anything outside that subset (`./`,
9
+ * `abs(complex)`, transcendentals on complex) is rejected at the e2
10
+ * lowerer level, which causes the driver to bail to the interpreter
11
+ * per-op complex path — matching e1's fusion fallthrough behavior.
12
+ *
13
+ * void e2cc_<hash>(int64_t n,
14
+ * const double *in_<cta>_re, const double *in_<cta>_im,
15
+ * const double *in_<rtb>,
16
+ * [in_<lhs_input>_re/_im or in_<lhs_input> ...,]
17
+ * double s_<csc>_re, double s_<csc>_im,
18
+ * double s_<rsc>,
19
+ * [out_<lhs>_re, out_<lhs>_im or out_<lhs> ...])
20
+ * {
21
+ * #pragma omp simd
22
+ * for (int64_t i = 0; i < n; i++) {
23
+ * double <clhs1>_re, <clhs1>_im, ..., <rlhs1>, ...;
24
+ * <clhs1>_re = ...; <clhs1>_im = ...;
25
+ * <rlhs1> = ...;
26
+ * out_<clhs>_re[i] = <clhs>_re; out_<clhs>_im[i] = <clhs>_im;
27
+ * out_<rlhs>[i] = <rlhs>;
28
+ * }
29
+ * }
30
+ *
31
+ * Complex chains deliberately stick to `#pragma omp simd` regardless of
32
+ * `--par`: per-element bodies are ~6 flops spread across paired re/im
33
+ * buffers (memory-bandwidth-bound), and thread-spawn overhead dominates
34
+ * the compute win at realistic N. Matches e1's stance.
35
+ */
36
+ import type { ChainAssignSpec } from "./emitShared.js";
37
+ export interface E2ComplexKernelInputs {
38
+ /** Env tensor names, split by complex-ness. */
39
+ complexTensorNames: string[];
40
+ realTensorNames: string[];
41
+ /** Env scalar names, split by complex-ness. */
42
+ complexScalarNames: string[];
43
+ realScalarNames: string[];
44
+ /** Chain LHS names that need `in_<name>` because they're read before
45
+ * being written. Split by complex-ness of the LHS. */
46
+ complexInputLhsNames: string[];
47
+ realInputLhsNames: string[];
48
+ /** Chain LHS names that escape the chain. Split by complex-ness. */
49
+ complexEscapeLhsNames: string[];
50
+ realEscapeLhsNames: string[];
51
+ }
52
+ export interface E2ComplexChainEmitResult {
53
+ kernelName: string;
54
+ cSource: string;
55
+ koffiSig: string;
56
+ hash: string;
57
+ /** In signature order: complex tensors, real tensors. */
58
+ complexInputTensors: string[];
59
+ realInputTensors: string[];
60
+ /** In signature order: complex input LHSs, real input LHSs. */
61
+ complexInputLhsNames: string[];
62
+ realInputLhsNames: string[];
63
+ /** In signature order: complex scalars, real scalars. */
64
+ complexInputScalars: string[];
65
+ realInputScalars: string[];
66
+ /** In signature order: complex escape LHSs, real escape LHSs. */
67
+ complexEscapeLhsNames: string[];
68
+ realEscapeLhsNames: string[];
69
+ chainLength: number;
70
+ }
71
+ /** Per-assign LHS info — complex-ness of the RHS determines whether
72
+ * this stmt emits a paired (re/im) local or a single real local. */
73
+ export interface ComplexChainAssignSpec extends ChainAssignSpec {
74
+ /** True when THIS stmt's RHS is complex. Chain-LHS type can differ
75
+ * per reassignment; we track per-stmt so a `a = real; a = complex;`
76
+ * sequence sees `a` as complex only after the second assign. */
77
+ rhsIsComplex: boolean;
78
+ }
79
+ export declare function emitE2ComplexChainKernel(assigns: ComplexChainAssignSpec[], inputs: E2ComplexKernelInputs): E2ComplexChainEmitResult;
@@ -0,0 +1,71 @@
1
+ /**
2
+ * e2 — shared kernel-emission helpers used by both the chain emitter
3
+ * and the reduction emitter. The two emitters build the same kernel
4
+ * shape up to a few trailing differences (reduction init / combine /
5
+ * out_acc output), so everything that's identical lives here.
6
+ */
7
+ import type { JitExpr } from "../jitTypes.js";
8
+ import { type FusedTarget } from "../fusedScalarEmit.js";
9
+ import type { ScalarOpTarget } from "../scalarEmit.js";
10
+ /**
11
+ * C helper included in every e2 kernel prologue.
12
+ *
13
+ * `-ffast-math` implies `-ffinite-math-only`, which lets the compiler
14
+ * assume no NaN/Inf values and constant-fold `x != x` to 0 and `x == x`
15
+ * to 1. The NaN-detection idiom `mask = x ~= x` would silently return all
16
+ * zeros. `numbl_is_nan_fp` inspects the IEEE-754 bit pattern directly —
17
+ * the optimizer cannot look through `memcpy` to apply finite-math
18
+ * assumptions, so this survives the flag.
19
+ */
20
+ export declare const E2_C_PROLOGUE: string;
21
+ /**
22
+ * e2-specific scalar op target.
23
+ *
24
+ * Identical to `C_SCALAR_TARGET` except `binEq` / `binNe`: when both
25
+ * operand strings are the same C expression (i.e. the source was `x == x`
26
+ * or `x ~= x`), the standard `==` / `!=` forms are constant-folded to
27
+ * 1 / 0 by `-ffinite-math-only`. Replace them with the bit-pattern NaN
28
+ * helper so the self-comparison gives the correct IEEE 754 result.
29
+ */
30
+ export declare const E2_C_SCALAR_TARGET: ScalarOpTarget;
31
+ export interface ChainAssignSpec {
32
+ lhsName: string;
33
+ rhs: JitExpr;
34
+ }
35
+ export interface KernelInputs {
36
+ /** Regular env input tensor names (NOT chain LHSs). */
37
+ tensorNames: string[];
38
+ /** Scalar env input names. */
39
+ scalarNames: string[];
40
+ /** Chain LHS names that need `in_<name>` because they're read before
41
+ * being written. */
42
+ inputLhsNames: string[];
43
+ /** Chain LHS names that escape the chain (materialized via
44
+ * `out_<name>`). Does NOT include a reduce-target name — that one
45
+ * is always chain-local by construction. */
46
+ escapeLhsNames: string[];
47
+ }
48
+ export declare const cInputPtr: (name: string) => string;
49
+ export declare const cOutputPtr: (name: string) => string;
50
+ export declare const cScalarParam: (name: string) => string;
51
+ /** FusedTarget for the per-element body. Resolves Var reads to either
52
+ * the chain-local stack name (once the corresponding assign has run)
53
+ * or `in_<name>[i]` (before that point), mangles scalar param names,
54
+ * and dispatches whitelisted builtins through their `jitEmitC`. */
55
+ export declare function makeFusedTarget(locallyAssigned: ReadonlySet<string>): FusedTarget;
56
+ /** Unique chain LHS names in source order — for the leading
57
+ * `double <a>, <b>;` declaration. */
58
+ export declare function uniqueLhsOrdered(chain: ChainAssignSpec[]): string[];
59
+ /** All tensor-typed names visible to emitFusedScalarExpr: regular env
60
+ * tensors, `in_<lhs>` tensors, and chain LHSs. */
61
+ export declare function allTensorVarsFor(inputs: KernelInputs, chain: ChainAssignSpec[]): Set<string>;
62
+ /** Emit one `<lhs> = <rhsC>;` line per chain assign, growing
63
+ * `locallyAssigned` as we go so later stmts resolve earlier LHSs to
64
+ * the stack-local. */
65
+ export declare function emitChainAssignLines(chain: ChainAssignSpec[], allTensorVars: ReadonlySet<string>, ft: FusedTarget, locallyAssigned: Set<string>): string[];
66
+ /** Kernel param list (tensor → inputLhs → scalar → escapeLhs). Callers
67
+ * append any trailing params (e.g. `double *out_acc`). */
68
+ export declare function buildParamList(inputs: KernelInputs): string[];
69
+ /** koffi type list in the same order as `buildParamList`. Callers
70
+ * append any trailing entries. */
71
+ export declare function buildKoffiParts(inputs: KernelInputs): string[];
@@ -0,0 +1,11 @@
1
+ /**
2
+ * e2 — Node-only install hook.
3
+ *
4
+ * Sets the module-level `e2CompileFn` to the real `compileAndLoad`
5
+ * driver from `c/compile.ts`. The browser bundle never imports this
6
+ * file, so `e2CompileFn` stays at the throwing stub and any attempt
7
+ * to use `--opt e2` from the web fails with a clear message.
8
+ *
9
+ * Idempotent: re-importing in tests doesn't re-install.
10
+ */
11
+ export {};
@@ -0,0 +1,29 @@
1
+ /**
2
+ * e2 — AST liveness helpers.
3
+ *
4
+ * Used by the chain classifier to decide whether a chain LHS is
5
+ * actually used outside the chain's own stmts. If not, it can be
6
+ * compiled as a per-element stack-local instead of being materialized
7
+ * as a tensor output buffer.
8
+ *
9
+ * The "scope" passed in is the innermost enclosing function body or
10
+ * top-level script body — chosen so that for-bodies, if-bodies, etc.
11
+ * are scanned recursively (MATLAB has flat function-level scoping for
12
+ * locals, so a name introduced inside a for-loop is visible to other
13
+ * stmts in the same function body).
14
+ *
15
+ * The walk excludes the chain's own stmts (and the trailing-reduction
16
+ * stmt if any) from the scan, applying the exclusion at every nesting
17
+ * level — so a chain inside a for-body whose LHS is read by another
18
+ * stmt in the same function body counts as referenced, but the chain
19
+ * stmts themselves don't trigger a false positive.
20
+ */
21
+ import type { Stmt } from "../../parser/types.js";
22
+ /**
23
+ * True iff `name` appears anywhere in `scopeBody` outside the stmts
24
+ * listed in `excludeStmts`. The exclusion is by reference identity
25
+ * and is applied at every nesting level — pass the chain stmts (and
26
+ * the trailing-reduction stmt if any) so they don't trigger false
27
+ * positives.
28
+ */
29
+ export declare function isNameReferencedOutsideStmts(scopeBody: readonly Stmt[], excludeStmts: ReadonlySet<Stmt>, name: string): boolean;
@@ -0,0 +1,49 @@
1
+ /**
2
+ * e2 whole-loop C JIT.
3
+ *
4
+ * For a `for varName = lo:hi <body> end` where the body fits a supported
5
+ * shape, emit a single C function that runs all n iterations and call it
6
+ * once, instead of walking the AST on every iteration.
7
+ *
8
+ * Without this path, `--opt e2` pays ~70–100 ns per iter on a trivial
9
+ * `s = s + i` just for AST dispatch; a compiled C loop runs it in <1 ns.
10
+ *
11
+ * Current supported body shapes (all may mix in one loop):
12
+ * - scalar assign `s = s + sin(i) * cos(i) + sqrt(i*0.01)`
13
+ * - scalar indexed read `s = s + x(i)` (real tensor x)
14
+ * - scalar indexed write `y(i) = sin(i*0.01)` (preallocated y)
15
+ * - tensor local (elemwise) `c = a.*b + i*0.001` (per-element
16
+ * expression is
17
+ * inlined into any
18
+ * consuming sum();
19
+ * last-iter value
20
+ * is also written
21
+ * back to the env
22
+ * for MATLAB
23
+ * post-loop
24
+ * visibility)
25
+ * - reductions `s = s + sum(c)` (c is a
26
+ * tensor_local —
27
+ * chained
28
+ * tensor_locals
29
+ * fuse through)
30
+ *
31
+ * Not supported (falls through to the interpreter / other JIT paths):
32
+ * - non-`lo:hi` loop shapes (stepped ranges, `for i = v`)
33
+ * - complex or logical tensor inputs
34
+ * - matrix-matrix / matrix-vector multiplication
35
+ * - bsxfun / broadcast across shapes
36
+ * - function-handle calls, user-function calls
37
+ * - control flow inside the body (if / while / return)
38
+ * - multi-dimensional tensor access
39
+ */
40
+ import type { Stmt } from "../../parser/types.js";
41
+ import type { Interpreter } from "../../interpreter/interpreter.js";
42
+ /**
43
+ * Attempt to compile and execute a for-loop as one C kernel under
44
+ * `--opt e2`. Returns true on success, false to fall back to the regular
45
+ * interpreter path (the caller will run the loop normally).
46
+ */
47
+ export declare function tryE2Loop(interp: Interpreter, stmt: Stmt & {
48
+ type: "For";
49
+ }): boolean;
@@ -0,0 +1,75 @@
1
+ /**
2
+ * e2 whole-loop C emission.
3
+ *
4
+ * Given a classified loop body (`BodyStmt[]`) plus the parameter lists
5
+ * that describe how env values flow in and out, emit a single C
6
+ * function that runs the whole `for varName = lo:hi` loop in one call.
7
+ *
8
+ * Three BodyStmt shapes are supported:
9
+ *
10
+ * scalar_assign `s = s + sin(i)` → one C statement per iter
11
+ * tensor_write `y(i) = sin(i*0.01)` → `v_y[(int64_t)idx-1] = ...`
12
+ * tensor_local `c = a.*b + i*0.001` → no code emitted here; its
13
+ * per-element expression is
14
+ * substituted into whichever
15
+ * reduction consumes it
16
+ *
17
+ * Reductions: a `scalar_assign` carries a list of `sum(<tensor_local>)`
18
+ * rewrites that were pulled out of its RHS upstream. Each is emitted as
19
+ * an inline inner `for __j` loop that accumulates the tensor_local's
20
+ * per-element expression into a fresh local. Chained tensor_locals
21
+ * (`d = sqrt(c+1)` where c is itself a tensor_local) fuse through
22
+ * recursively, so no intermediate buffer is materialized.
23
+ */
24
+ import type { JitExpr } from "../jitTypes.js";
25
+ /** Scalar math builtins we emit as direct C library calls. We bypass
26
+ * each IBuiltin's `jitEmitC` here because some of those reject based
27
+ * on type narrowing (e.g. `sqrt` requires `isNonneg` and we don't
28
+ * propagate sign through Binary ops) — but in a pure-real scalar loop
29
+ * the C semantics (NaN on negative sqrt, etc.) match what a MATLAB
30
+ * user gets from `sqrt` on real numeric input.
31
+ *
32
+ * Exported so the driver's pre-lowering analysis can treat these
33
+ * names as non-env references. */
34
+ export declare const LOOP_SCALAR_BUILTINS: Record<string, string>;
35
+ /** A fused reduction lifted out of a `scalar_assign`'s RHS.
36
+ * `sum(<tensorLocal>)` in the source becomes a synthetic scalar ident
37
+ * `synthName`; the emitter materializes it as an inline inner loop
38
+ * that accumulates `tensorLocal`'s per-element expression. */
39
+ export interface Reduction {
40
+ synthName: string;
41
+ tensorLocal: string;
42
+ op: "sum";
43
+ }
44
+ /** A body statement in a form ready for C emission. */
45
+ export type BodyStmt = {
46
+ kind: "scalar_assign";
47
+ name: string;
48
+ rhs: JitExpr;
49
+ reductions: Reduction[];
50
+ } | {
51
+ kind: "tensor_write";
52
+ name: string;
53
+ idxRhs: JitExpr;
54
+ rhs: JitExpr;
55
+ } | {
56
+ kind: "tensor_local";
57
+ name: string;
58
+ elemExpr: JitExpr;
59
+ lengthTensor: string;
60
+ };
61
+ /** Mangle a MATLAB scalar name to a C local-variable name. Prefix keeps
62
+ * it out of the way of our bookkeeping locals (`lo`, `hi`, `__iv`). */
63
+ export declare function v(name: string): string;
64
+ /** Name for the `int64_t` length companion that travels alongside each
65
+ * tensor param so inner reductions can bound their inline `__j` loop. */
66
+ export declare function lenN(name: string): string;
67
+ /** Names of all tensor_locals in the body, in body-declaration order.
68
+ * Callers use this to allocate matching output buffers in the same
69
+ * order as the kernel's param list. */
70
+ export declare function tensorLocalNames(body: BodyStmt[]): string[];
71
+ export declare function emitLoopKernel(scalarInputVars: string[], tensorInputVars: string[], tensorInoutVars: string[], inoutVars: string[], loopVar: string, body: BodyStmt[]): {
72
+ cSource: string;
73
+ kernelName: string;
74
+ koffiSig: string;
75
+ };
@@ -0,0 +1,24 @@
1
+ /**
2
+ * e2 — multi-reduction driver.
3
+ *
4
+ * Handles a scalar `Assign` whose RHS contains TWO or more reduction
5
+ * calls (`sum`, `prod`, `max`, `min`, `mean`) over the same single
6
+ * tensor variable, e.g.
7
+ *
8
+ * red_acc = red_acc + sum(x) + mean(x) + max(x) + min(x);
9
+ *
10
+ * The default interpreter path makes one pass through the tensor per
11
+ * reduction (4× the memory traffic of the optimal). The e2 driver
12
+ * detects the pattern, compiles ONE kernel that computes every
13
+ * requested reduction in a single pass, and substitutes the reduction
14
+ * subtrees in the RHS with the kernel's scalar outputs before
15
+ * evaluating the residual expression.
16
+ *
17
+ * Reuses [e1/multiReductionKernel.ts](../e1/multiReductionKernel.ts)
18
+ * for the C emission (same shape works for both backends).
19
+ */
20
+ import type { Stmt } from "../../parser/types.js";
21
+ import type { Interpreter } from "../../interpreter/interpreter.js";
22
+ export declare function tryE2MultiReduction(interp: Interpreter, stmt: Stmt & {
23
+ type: "Assign";
24
+ }): boolean;
@@ -0,0 +1,72 @@
1
+ /**
2
+ * e2 — reduction kernel emission.
3
+ *
4
+ * Handles two related patterns in a single emitter:
5
+ *
6
+ * (A) Standalone reduction:
7
+ * acc = [acc OP] reduce(elemwiseExpr)
8
+ * Empty chain prefix; the kernel walks the inputs once and
9
+ * accumulates `reduce(per-element-expr)` into a scalar buffer.
10
+ *
11
+ * (B) Chain + trailing reduction:
12
+ * lhs1 = ...; lhs2 = ...; ...; lhsK = ...;
13
+ * acc = [acc OP] reduce(lhsK)
14
+ * The chain runs in the same per-element loop; lhsK is purely
15
+ * chain-local (never materialized) — the kernel accumulates
16
+ * reduce(lhsK) into the scalar buffer. Other chain LHSs may
17
+ * still escape (extra `out_<name>` outputs).
18
+ *
19
+ * Both cases use the same kernel shape:
20
+ *
21
+ * void e2r_<hash>(int64_t n,
22
+ * ..in_*.., ..in_lhs_input.., ..s_*..,
23
+ * ..out_escape.., double *out_acc)
24
+ * {
25
+ * double acc = <init>;
26
+ * #pragma omp simd
27
+ * for (int64_t i = 0; i < n; i++) {
28
+ * double <chain_lhs1>, ..., <chain_lhsK>;
29
+ * <chain_lhs1> = <stmt0_rhs_C>;
30
+ * ...
31
+ * <chain_lhsK> = <stmtK_rhs_C>;
32
+ * out_<escape>[i] = <escape>;
33
+ * <reduce-combine>(acc, <reduce_value_expr>);
34
+ * }
35
+ * *out_acc = acc;
36
+ * }
37
+ *
38
+ * For "mean": the JS side divides by n (`acc /= n`) after reading the buffer back.
39
+ * For "max"/"min": uses if-update inside the loop (works under
40
+ * `-ffast-math` + `#pragma omp simd`).
41
+ */
42
+ import type { JitExpr } from "../jitTypes.js";
43
+ import { type ChainAssignSpec, type KernelInputs } from "./emitShared.js";
44
+ export interface ReductionEmitSpec {
45
+ /** Chain prefix (length 0 for standalone-reduction). */
46
+ chain: ChainAssignSpec[];
47
+ /** Reduction op name: sum, prod, max, min, mean, any, all. */
48
+ reduceName: string;
49
+ /** Per-element value expression to feed the reduction.
50
+ * - For (A) standalone: the elemwise expression `reduce(...)` was
51
+ * given.
52
+ * - For (B) chain + trailing: a `Var(lastChainLhsName)` JitExpr —
53
+ * the emitter resolves it to the stack-local. */
54
+ reduceValueExpr: JitExpr;
55
+ inputs: KernelInputs;
56
+ }
57
+ export interface E2ReductionEmitResult {
58
+ kernelName: string;
59
+ cSource: string;
60
+ koffiSig: string;
61
+ hash: string;
62
+ inputTensors: string[];
63
+ inputLhsNames: string[];
64
+ inputScalars: string[];
65
+ escapeLhsNames: string[];
66
+ /** True when the kernel produces a scalar reduction output (always
67
+ * true for this emitter; here for symmetry with other entries). */
68
+ hasReductionOutput: true;
69
+ reduceName: string;
70
+ chainLength: number;
71
+ }
72
+ export declare function emitE2ReductionKernel(spec: ReductionEmitSpec, par?: boolean): E2ReductionEmitResult;
@@ -0,0 +1,29 @@
1
+ /**
2
+ * e2 — whole-function scalar C-kernel driver.
3
+ *
4
+ * Mirrors what e1 does for pure-scalar functions (benchmarks/scalar_bench.m's
5
+ * `run_bench` is the motivating case) but triggers straight from the
6
+ * interpreter's `callUserFunction` entry, not through the JS-JIT outer.
7
+ * Under `--opt e2` the JS-JIT is disabled (optimization clamped to 0),
8
+ * so we can't lean on `tryEmitScalarFnKernel` + the `$h.compileKernel`
9
+ * plumbing; instead we invoke the shared lowering + C-emit pipeline
10
+ * directly and call the resulting koffi function with plain scalar
11
+ * args and Float64Array(1) out-buffers per output.
12
+ *
13
+ * Scope:
14
+ * - All args are scalar `number` or `boolean` RuntimeValues.
15
+ * - Declared outputs (the first `nargout || 1` of them) all lower to
16
+ * scalar / boolean types.
17
+ * - The body survives `checkCFeasibility` (no tic/toc, no Index
18
+ * writes, no disp, etc.).
19
+ *
20
+ * Outside this envelope we return `E2_SKIP` and the caller proceeds
21
+ * with the interpreter path. Compilation failures are HARD errors —
22
+ * mirrors the e2 multi-reduction/chain drivers' policy.
23
+ */
24
+ import type { Interpreter } from "../../interpreter/interpreter.js";
25
+ import type { FunctionDef } from "../../interpreter/types.js";
26
+ export declare const E2_SKIP: unique symbol;
27
+ /** Try to run `fn(args)` via a whole-function C kernel. Returns
28
+ * `E2_SKIP` to fall through to the interpreter. */
29
+ export declare function tryE2ScalarFn(interp: Interpreter, fn: FunctionDef, args: unknown[], nargout: number): unknown | typeof E2_SKIP;
@@ -0,0 +1,65 @@
1
+ /**
2
+ * Chain-level helpers shared by the JS and C fused-codegen backends.
3
+ *
4
+ * The per-element scalar expression walker lives in `fusedScalarEmit.ts`;
5
+ * this module covers the surrounding logic that decides which chain dests
6
+ * need a write-back to their tensor buffer, and the reduction-accumulator
7
+ * init/combine snippets for inline reductions.
8
+ *
9
+ * Reductions are parameterized over a small `ReductionLiterals` record so
10
+ * each backend supplies its own spelling of `0` vs `0.0`, `===` vs `==`,
11
+ * `-Infinity` vs `(-1.0/0.0)`, etc. — the control structure is identical.
12
+ */
13
+ import { BinaryOperation } from "../parser/types.js";
14
+ import type { FusibleChain } from "./fusion.js";
15
+ /**
16
+ * Compute the set of distinct dest names in a fused chain and which of
17
+ * them require a write-back into their tensor buffer.
18
+ *
19
+ * A dest normally needs write-back; the exception is the chain's last
20
+ * tensor if it is fully consumed by a trailing reduction (in which case
21
+ * the scalar reduction accumulator is the only output — materialising
22
+ * the tensor buffer would be wasted work). If that last-dest tensor is
23
+ * ALSO a named output of the enclosing function, the write-back is kept
24
+ * so the caller sees the updated buffer.
25
+ */
26
+ export declare function determineWriteBack(chain: FusibleChain, outputTensorNames: ReadonlySet<string>): {
27
+ destNames: Set<string>;
28
+ writeBack: Set<string>;
29
+ reductionConsumes: boolean;
30
+ };
31
+ /**
32
+ * Target-specific literal spellings used by the reduction helpers.
33
+ *
34
+ * The structure of the reduction snippets is identical between JS and
35
+ * C, but the literals differ: JS uses `1`, `-Infinity`, `===`/`!==`,
36
+ * while C uses `1.0`, `(-1.0/0.0)`, `==`/`!=`. The caller picks a
37
+ * record for its target and reuses it.
38
+ */
39
+ export interface ReductionLiterals {
40
+ /** Additive identity (`0` for JS, `0.0` for C). */
41
+ zero: string;
42
+ /** Multiplicative identity / truthy (`1` or `1.0`). */
43
+ one: string;
44
+ /** Positive infinity literal (`Infinity` or `(1.0/0.0)`). */
45
+ posInf: string;
46
+ /** Negative infinity literal (`-Infinity` or `(-1.0/0.0)`). */
47
+ negInf: string;
48
+ /** Strict-equality operator (`===` for JS, `==` for C). */
49
+ eq: string;
50
+ /** Strict-inequality operator (`!==` for JS, `!=` for C). */
51
+ neq: string;
52
+ }
53
+ export declare const JS_REDUCTION_LITERALS: ReductionLiterals;
54
+ export declare const C_REDUCTION_LITERALS: ReductionLiterals;
55
+ /** Initial value expression for a reduction accumulator. */
56
+ export declare function reductionInit(reduceName: string, lits: ReductionLiterals): string;
57
+ /** Statement that folds a per-element `valueExpr` into the accumulator. */
58
+ export declare function reductionCombine(reduceName: string, accVar: string, valueExpr: string, lits: ReductionLiterals): string;
59
+ /**
60
+ * Statement that folds a per-chain `val` into an enclosing accumulator
61
+ * `dest` via the outer-loop op (e.g. `ir_acc = ir_acc + sum(...)`).
62
+ *
63
+ * Target-neutral: `+=` / `-=` / `*=` have identical syntax in JS and C.
64
+ */
65
+ export declare function accumulateOp(op: BinaryOperation, dest: string, val: string): string;
@@ -0,0 +1,69 @@
1
+ /**
2
+ * Shared per-element scalar-expression emission for fused loops.
3
+ *
4
+ * Both the JS-JIT and C-JIT fused-chain emitters walk the chain's
5
+ * expression trees and emit each sub-expression in "per-element"
6
+ * form — tensor Vars become `data[__i]` reads (or a scalar local for
7
+ * chain-produced intermediates), Binary/Unary/Call map to scalar
8
+ * operations that will run once per element of the fused loop.
9
+ *
10
+ * The walk itself is identical between the two backends; only the
11
+ * leaf syntax differs (JS `Math.sin` vs C `sin`, integer literal
12
+ * formatting, mangling prefix). A backend supplies a `FusedTarget`
13
+ * describing those leaves and a value-form `ScalarOpTarget` for the
14
+ * arithmetic/comparison/logical switches.
15
+ *
16
+ * Note: the op target used here must emit comparison / logical ops
17
+ * in *numeric* form (result is a double 0.0/1.0 suitable for tensor
18
+ * write-back). For C this coincides with the regular value target;
19
+ * for JS a second target instance is needed because value-form
20
+ * comparisons return a JS boolean.
21
+ */
22
+ import type { JitExpr } from "./jitTypes.js";
23
+ import type { FusibleChain } from "./fusion.js";
24
+ import { type ScalarOpTarget } from "./scalarEmit.js";
25
+ /** Scalar local name for a chain-produced tensor intermediate. */
26
+ export declare function fusedLocal(name: string): string;
27
+ export interface FusedTarget {
28
+ /** Format a numeric literal (e.g. `1` for JS, `1.0` for C). */
29
+ formatNumber(v: number): string;
30
+ /** Mangle a scalar variable reference (non-tensor). */
31
+ mangle(name: string): string;
32
+ /**
33
+ * Emit a per-element read of tensor var `name` — i.e. the expression
34
+ * that yields `data[__i]` for that tensor. The backend decides how
35
+ * the data pointer is named and whether it's aliased locally.
36
+ */
37
+ tensorElemRead(name: string): string;
38
+ /**
39
+ * Emit a read of tensor `name` at a runtime 1-based scalar index
40
+ * `idxC` — i.e. `data[(int64_t)idxC - 1]`. Used by the e2 whole-loop
41
+ * kernel (scalar-context access; elemwise backends can leave this
42
+ * undefined; the emitter will then throw on an Index node). Returns `null`
43
+ * to reject.
44
+ */
45
+ tensorScalarIndexRead?(name: string, idxC: string): string | null;
46
+ /**
47
+ * Emit a call to a scalar math builtin. The backend decides which
48
+ * builtins it supports and how they map to library functions (e.g.
49
+ * JS `Math.sin` vs C `sin`). Return `null` to reject.
50
+ *
51
+ * `name` is the builtin name (e.g. `"sin"`, `"mod"`, `"rem"`);
52
+ * `args` are already-emitted per-element scalar expressions.
53
+ */
54
+ emitBuiltinCall(name: string, args: string[]): string | null;
55
+ }
56
+ /** Shared walker: emit a JitExpr as a per-element scalar expression. */
57
+ export declare function emitFusedScalarExpr(expr: JitExpr, chainLocals: ReadonlySet<string>, allTensorVars: ReadonlySet<string>, opTarget: ScalarOpTarget, fusedTarget: FusedTarget): string;
58
+ /**
59
+ * Find the first tensor-param name referenced in a chain's assigns.
60
+ * Used by both backends to pick the length-determining tensor.
61
+ */
62
+ export declare function findTensorParamInChain(chain: FusibleChain, paramTensors: ReadonlySet<string>, allTensorVars: ReadonlySet<string>): string | null;
63
+ /**
64
+ * Collect distinct tensor names referenced in the chain's expression
65
+ * trees that are NOT produced by the chain itself (i.e. read from
66
+ * outside: params or pre-existing locals). Both backends need this to
67
+ * pick a length-reference tensor when no formal param is in the chain.
68
+ */
69
+ export declare function collectInputTensors(chain: FusibleChain, allTensorVars: ReadonlySet<string>): Set<string>;
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Fusion analysis for JIT backends (shared by JS-JIT and C-JIT).
3
+ *
4
+ * Scans a statement list for runs of tensor element-wise assigns that
5
+ * can be collapsed into a single per-element `for` loop. Each such run
6
+ * is a "fusible chain."
7
+ *
8
+ * A chain breaks on:
9
+ * - control flow (If/For/While)
10
+ * - any non-Assign statement
11
+ * - a tensor assign whose RHS references a tensor that is NOT an input
12
+ * param and NOT previously assigned within the same chain
13
+ * - a scalar assign (left for the per-op emitter)
14
+ *
15
+ * An optional **trailing reduction** is absorbed when the statement
16
+ * immediately after a tensor chain is of the form
17
+ * `acc = acc OP reduce(lastChainVar)` or
18
+ * `acc = reduce(lastChainVar)`
19
+ * where `reduce` is sum/prod/max/min/mean/any/all. Absorbing the
20
+ * reduction lets the fused loop emit an inline accumulator instead of
21
+ * materialising the intermediate buffer.
22
+ */
23
+ import type { JitExpr, JitStmt } from "./jitTypes.js";
24
+ import { BinaryOperation } from "../parser/types.js";
25
+ /** One tensor assign inside a fusible chain. */
26
+ export interface FusedAssign {
27
+ /** Destination tensor variable name. */
28
+ destName: string;
29
+ /** RHS expression tree (all tensor ops are element-wise). */
30
+ expr: JitExpr;
31
+ }
32
+ /** A trailing reduction absorbed into the fused loop. */
33
+ export interface FusedReduction {
34
+ /** Scalar accumulator variable name (e.g. `chain_acc`). */
35
+ accName: string;
36
+ /** Reduction builtin name (e.g. `sum`). */
37
+ reduceName: string;
38
+ /** The tensor variable being reduced (last chain dest). */
39
+ tensorName: string;
40
+ /**
41
+ * When true, the scalar statement is `acc = acc OP reduce(tensor)`,
42
+ * and `accOp` says which binary op combines the old accumulator with
43
+ * the reduction result. When false, it's a plain `acc = reduce(tensor)`.
44
+ */
45
+ hasAccumulate: boolean;
46
+ accOp?: BinaryOperation;
47
+ }
48
+ /** Describes one fusible chain found in a statement list. */
49
+ export interface FusibleChain {
50
+ /** Index of the first statement in the chain (within the parent list). */
51
+ startIdx: number;
52
+ /** Number of statements consumed (tensor assigns + optional reduction). */
53
+ length: number;
54
+ /** The tensor assigns to fuse. */
55
+ assigns: FusedAssign[];
56
+ /** Optional trailing reduction. */
57
+ reduction?: FusedReduction;
58
+ }
59
+ /**
60
+ * Scan a statement list and return all fusible chains.
61
+ *
62
+ * `paramTensors` is the set of tensor parameter names (input data that
63
+ * will be read via `data[i]` in the fused loop).
64
+ * `allTensorVars` is the full set of tensor-typed variables (params +
65
+ * locals + outputs).
66
+ * `allowedUnaryOps` optionally restricts which tensor unary Call names
67
+ * are fusible. Defaults to `FUSIBLE_TENSOR_UNARY_OPS` (full set).
68
+ * The JS backend passes a restricted set that excludes transcendentals
69
+ * (V8 can't vectorize them, so fusing them is slower than per-op calls).
70
+ */
71
+ export declare function findFusibleChains(stmts: JitStmt[], paramTensors: ReadonlySet<string>, allTensorVars: ReadonlySet<string>, allowedUnaryOps?: ReadonlySet<string>): FusibleChain[];