numbl 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/binding.gyp +8 -3
  2. package/dist-cli/cli.js +6676 -3695
  3. package/dist-lib/lib.js +6703 -3341
  4. package/dist-lib/numbl-core/executeCode.d.ts +3 -10
  5. package/dist-lib/numbl-core/fileIOAdapter.d.ts +2 -0
  6. package/dist-lib/numbl-core/interpreter/interpreter.d.ts +28 -24
  7. package/dist-lib/numbl-core/jit/e1/complexKernelEmit.d.ts +46 -0
  8. package/dist-lib/numbl-core/jit/e1/hash.d.ts +10 -0
  9. package/dist-lib/numbl-core/jit/e1/kernelEmit.d.ts +1 -1
  10. package/dist-lib/numbl-core/jit/e1/multiReductionKernel.d.ts +66 -0
  11. package/dist-lib/numbl-core/jit/e2/assignKernel.d.ts +34 -0
  12. package/dist-lib/numbl-core/jit/e2/astToJitExpr.d.ts +25 -0
  13. package/dist-lib/numbl-core/jit/e2/cache.d.ts +80 -0
  14. package/dist-lib/numbl-core/jit/e2/chainKernelEmit.d.ts +55 -0
  15. package/dist-lib/numbl-core/jit/e2/classify.d.ts +119 -0
  16. package/dist-lib/numbl-core/jit/e2/compileFn.d.ts +16 -0
  17. package/dist-lib/numbl-core/jit/e2/complexChainKernelEmit.d.ts +79 -0
  18. package/dist-lib/numbl-core/jit/e2/emitShared.d.ts +71 -0
  19. package/dist-lib/numbl-core/jit/e2/install.d.ts +11 -0
  20. package/dist-lib/numbl-core/jit/e2/liveness.d.ts +29 -0
  21. package/dist-lib/numbl-core/jit/e2/loopKernel.d.ts +49 -0
  22. package/dist-lib/numbl-core/jit/e2/loopKernelEmit.d.ts +75 -0
  23. package/dist-lib/numbl-core/jit/e2/multiReductionDriver.d.ts +24 -0
  24. package/dist-lib/numbl-core/jit/e2/reductionKernelEmit.d.ts +72 -0
  25. package/dist-lib/numbl-core/jit/e2/scalarFnDriver.d.ts +29 -0
  26. package/dist-lib/numbl-core/jit/fusedScalarEmit.d.ts +8 -0
  27. package/dist-lib/numbl-core/jit/heavyOps.d.ts +15 -0
  28. package/dist-lib/numbl-core/jit/js/jitCodegen.d.ts +1 -1
  29. package/dist-lib/numbl-core/jit/js/jsFusedCodegen.d.ts +1 -1
  30. package/dist-lib/numbl-core/jit/js/jsMultiReduction.d.ts +70 -0
  31. package/dist-lib/numbl-core/version.d.ts +1 -1
  32. package/native/numbl_addon.cpp +9 -0
  33. package/package.json +2 -4
  34. package/dist-lib/numbl-core/jit/c/hybrid.d.ts +0 -42
  35. package/dist-lib/numbl-core/jit/c/install.d.ts +0 -15
  36. package/dist-lib/numbl-core/jit/c/parityError.d.ts +0 -26
  37. package/dist-lib/numbl-core/jit/c/registry.d.ts +0 -51
@@ -22,8 +22,8 @@ export interface ExecOptions {
22
22
  profile?: boolean;
23
23
  /** Called each time a JIT function is compiled, with a description and the generated JS. */
24
24
  onJitCompile?: (description: string, jsCode: string) => void;
25
- /** Called each time the C-JIT compiles a function, with a description and the generated C. */
26
- onCJitCompile?: (description: string, cSource: string) => void;
25
+ /** Called each time an e2 C kernel is compiled, with a description and the generated C source. */
26
+ onCCompile?: (description: string, cCode: string) => void;
27
27
  /** Initial hold state for plotting (persisted across REPL executions). */
28
28
  initialHoldState?: boolean;
29
29
  /** Override or add builtins for this execution only. */
@@ -43,16 +43,8 @@ export interface ExecOptions {
43
43
  * `optimization` is still the base level (typically 1).
44
44
  */
45
45
  experimental?: string;
46
- /** Emit fused per-element loops in C-JIT (requires --opt 2). */
47
- fuse?: boolean;
48
46
  /** Parallelize fused loops with OpenMP threads (--par flag). */
49
47
  par?: boolean;
50
- /**
51
- * Diagnostic mode (`--check-c-jit-parity`, only meaningful with `--opt 2`):
52
- * throw on any C-JIT miss where JS-JIT would have compiled. Lets us
53
- * enumerate parity gaps as hard errors rather than silent fallbacks.
54
- */
55
- checkCJitParity?: boolean;
56
48
  /**
57
49
  * Initial implicit cwd path for the MATLAB-style "cwd is the first search path" feature.
58
50
  * - undefined → auto-detect from `system.cwd()` and scan its files.
@@ -93,6 +85,7 @@ export interface ProfileData {
93
85
  export interface ExecResult {
94
86
  output: string[];
95
87
  generatedJS: string;
88
+ /** Concatenated C-kernel source for all e2 compilations during the run. */
96
89
  generatedC: string;
97
90
  plotInstructions: PlotInstruction[];
98
91
  returnValue: RuntimeValue;
@@ -62,6 +62,8 @@ export interface FileIOAdapter {
62
62
  unzip?(zipfilename: string, outputfolder: string): string[];
63
63
  /** Return the temporary directory path. Optional. */
64
64
  tempdir?(): string;
65
+ /** Return the numbl user/config directory path. Optional. */
66
+ userpath?(): string;
65
67
  /** List directory entries. Returns array of {name, folder, bytes, isdir, mtimeMs}. Optional. */
66
68
  listDir?(dirPath: string): {
67
69
  name: string;
@@ -53,49 +53,53 @@ export declare class Interpreter {
53
53
  fn: (...args: unknown[]) => unknown;
54
54
  source: string;
55
55
  } | null>;
56
- /** @internal Per-instance cache for C-JIT-compiled loops (parallel to loopJitCache). */
57
- loopCJitCache: Map<string, {
58
- fn: (...args: unknown[]) => unknown;
59
- } | null>;
60
56
  /** @internal Progressive type widening for loop JIT: location -> last unified input types. */
61
57
  loopLastInputTypes: Map<string, import("../jit/jitTypes.js").JitType[]>;
62
58
  /** @internal Sibling stmts of the currently-executing stmt (set by execStmts). */
63
59
  _postSiblings: import("../parser/types.js").Stmt[] | null;
64
60
  /** @internal Index in _postSiblings of the next stmt after the current one. */
65
61
  _postSiblingsIdx: number;
62
+ /** @internal Number of EXTRA sibling stmts that the current execStmt
63
+ * consumed beyond the one passed in. The surrounding sibling loop
64
+ * reads this after each execStmt and advances its index by this
65
+ * many. Used by `--opt e2` chain fusion to atomically execute a run
66
+ * of consecutive Assigns as one C kernel. The interpreter must
67
+ * reset this to 0 before each execStmt call. */
68
+ _e2ChainAdvance: number;
69
+ /** @internal The stmt list of the innermost enclosing function body
70
+ * (or top-level script body). Used by `--opt e2` chain liveness
71
+ * analysis to decide whether a chain LHS is actually referenced
72
+ * outside the chain — if not, the LHS becomes a per-element stack-
73
+ * local rather than a materialized output buffer. Pushed on call
74
+ * frame entry, popped on exit. */
75
+ _currentScopeBody: import("../parser/types.js").Stmt[] | null;
76
+ /** @internal Names that "escape" the current scope regardless of
77
+ * textual usage. For functions: the declared output names (plus
78
+ * `varargout`). For top-level scripts: `null`, meaning every name
79
+ * escapes (the surrounding caller can read all script-level vars
80
+ * via `result.variableValues`). Pushed/popped alongside
81
+ * `_currentScopeBody`. */
82
+ _currentScopeExports: Set<string> | null;
66
83
  /**
67
84
  * Optimization level:
68
85
  * 0 — pure AST interpreter, no JIT.
69
86
  * 1 — JS-JIT (default): type-specialize hot functions/loops to JS via `new Function()`.
70
- * 2 — C-JIT: additionally emit C for feasible scalar specializations,
71
- * compile to a native `.node` module, and invoke via N-API.
72
- * Infeasible IR transparently falls back to the JS-JIT path.
73
87
  */
74
88
  optimization: number;
75
89
  /**
76
- * Experimental opt variant selector — e.g. `"e1"` for the prototype
77
- * that keeps JS-JIT as the outer and emits on-demand C kernels for
78
- * fusible tensor chains. Undefined for the standard `--opt <n>` path.
90
+ * Experimental opt variant selector — e.g. `"e1"` for the mode that
91
+ * keeps JS-JIT as the outer and emits on-demand C kernels for fusible
92
+ * tensor chains and pure-scalar user functions. Undefined for the
93
+ * standard `--opt <n>` path.
79
94
  */
80
95
  experimental?: string;
81
- /** Emit fused per-element loops in C-JIT (--fuse flag). */
82
- fuse: boolean;
83
96
  /** Parallelize fused loops with OpenMP threads (--par flag). */
84
97
  par: boolean;
85
- /**
86
- * Diagnostic mode (`--check-c-jit-parity`, only meaningful with `--opt 2`).
87
- * When set, any C-JIT miss where JS-JIT would have compiled throws a
88
- * `CJitParityError` instead of silently falling back — surfacing parity
89
- * gaps as a punch list of features to implement in the C-JIT. Env
90
- * failures (missing `cc`, compile failure) also throw, since the user
91
- * explicitly asked to audit C-JIT coverage.
92
- */
93
- checkCJitParity: boolean;
94
98
  /** Callback for JIT compilation logging (JS codegen). */
95
99
  onJitCompile?: (description: string, jsCode: string) => void;
96
- /** Callback for C-JIT compilation logging (--dump-c). */
97
- onCJitCompile?: (description: string, cSource: string) => void;
98
- /** Verbose log sink (plumbed from ExecOptions.log; used by C-JIT for diagnostics). */
100
+ /** Callback for C-kernel compilation logging (--opt e2 / --dump-c). */
101
+ onCCompile?: (description: string, cCode: string) => void;
102
+ /** Verbose log sink (plumbed from ExecOptions.log). */
99
103
  log?: (message: string) => void;
100
104
  constructor(rt: Runtime, ctx: LoweringContext, functionIndex: FunctionIndex, mainFileName: string, initialVariableValues?: Record<string, RuntimeValue>);
101
105
  /** Clear all JIT and function resolution caches. Called after addpath/rmpath. */
@@ -0,0 +1,46 @@
1
+ /**
2
+ * e1 (experimental) — complex-tensor standalone C-kernel emission.
3
+ *
4
+ * Sister module to `kernelEmit.ts` (which handles real-tensor chains).
5
+ * Given a FusibleChain that produces at least one complex tensor, emit
6
+ * a paired-buffer C kernel of the form
7
+ *
8
+ * void k_<hash>(int64_t n,
9
+ * const double *in_<a>_re, const double *in_<a>_im, // complex tensor
10
+ * const double *in_<b>, // real tensor (widened)
11
+ * double s_<c>_re, double s_<c>_im, // complex scalar
12
+ * double s_<d>, // real scalar
13
+ * double *out_<y>_re, double *out_<y>_im) // complex output
14
+ * {
15
+ * #pragma omp simd
16
+ * for (int64_t i = 0; i < n; i++) {
17
+ * double __f_y_re = ...;
18
+ * double __f_y_im = ...;
19
+ * out_<y>_re[i] = __f_y_re;
20
+ * out_<y>_im[i] = __f_y_im;
21
+ * }
22
+ * }
23
+ *
24
+ * Supports the same fusion envelope as emitComplexPerElem in
25
+ * `c/emit/fused.ts`:
26
+ * - Binary: + - * .*
27
+ * - Unary: + -
28
+ * - Call: conj, real, imag
29
+ * - Operand widening: real tensor / real scalar read with im = 0
30
+ * - ImagLiteral: (0.0, 1.0) pair
31
+ *
32
+ * Complex chains do NOT carry a trailing reduction — `fusion.ts` drops
33
+ * the absorption for complex chains because the inline scalar
34
+ * accumulator can't hold a complex value. Kernels emitted here have no
35
+ * reduction output.
36
+ */
37
+ import type { FusibleChain } from "../fusion.js";
38
+ import type { KernelEmitResult } from "./kernelEmit.js";
39
+ /**
40
+ * Emit a complex-tensor fused chain as a standalone C kernel.
41
+ *
42
+ * Returns null when the chain contains an expression the per-element
43
+ * walker doesn't support (abs, complex divide, transcendental on
44
+ * complex, etc.) — the caller falls back to the JS-JIT per-op path.
45
+ */
46
+ export declare function emitComplexChainKernel(chain: FusibleChain, allTensorVars: ReadonlySet<string>, complexTensorNames: ReadonlySet<string>, complexScalarVars: ReadonlySet<string>, outputTensorNames: ReadonlySet<string>): KernelEmitResult | null;
@@ -0,0 +1,10 @@
1
+ /**
2
+ * Shared content-hash helper for the e1 codegen.
3
+ *
4
+ * 64-bit FNV-1a over UTF-8 code units, returned as 16 hex chars.
5
+ * Deterministic, fully self-contained, and browser-safe (no Node
6
+ * `crypto` dependency). Cryptographic strength isn't needed — the
7
+ * hash is a content-addressed suffix for kernel names and
8
+ * `$h.$kernels[...]` cache keys.
9
+ */
10
+ export declare function fnv1a64Hex(s: string): string;
@@ -51,4 +51,4 @@ export interface KernelEmitResult {
51
51
  * caller emits something like `$h.<kernelName>(${jsCallArgs.join(",")})`. */
52
52
  jsCallArgs: string[];
53
53
  }
54
- export declare function emitChainKernel(chain: FusibleChain, allTensorVars: ReadonlySet<string>, outputTensorNames: ReadonlySet<string>): KernelEmitResult | null;
54
+ export declare function emitChainKernel(chain: FusibleChain, allTensorVars: ReadonlySet<string>, outputTensorNames: ReadonlySet<string>, par: boolean): KernelEmitResult | null;
@@ -0,0 +1,66 @@
1
+ /**
2
+ * e1 — C kernel for multi-reduction scalar assigns.
3
+ *
4
+ * A MATLAB line like
5
+ *
6
+ * red_acc = red_acc + (sum(x) + mean(x) + max(x) + min(x));
7
+ *
8
+ * has four reductions over the same vector. The default JS-JIT path
9
+ * emits four `$h.tSum` / `$h.ib_*` helper calls, each of which scans
10
+ * the whole vector. This module emits a single-pass C kernel that
11
+ * computes every requested reduction in one loop and writes results
12
+ * into caller-allocated scalar slots.
13
+ *
14
+ * Specialised per op-set: a group of `{sum, max, min}` compiles to a
15
+ * different kernel than `{sum, mean, max, min}`. Source-addressed by
16
+ * FNV-1a hash so the JS `$h.$kernels[...]` cache dedupes repeated call
17
+ * sites.
18
+ *
19
+ * NaN handling: `-ffast-math` is on for the compile (matches the other
20
+ * e1 kernels), so naive `isnan` is folded to `false`. The kernel uses
21
+ * an inline bit-pattern NaN check to drive MATLAB's omit-NaN semantics
22
+ * for `max`/`min` and records an `any_non_nan` flag the JS side uses
23
+ * to map an all-NaN input to NaN.
24
+ */
25
+ /** Reductions we can fuse into one pass. `any` / `all` are excluded
26
+ * because their short-circuit `break` would prematurely stop the
27
+ * other accumulators. */
28
+ export type MultiReduceOp = "sum" | "prod" | "max" | "min" | "mean";
29
+ export interface MultiReductionKernelInfo {
30
+ /** Hash-derived C function name, e.g. `mr_3a7f81b2...`. */
31
+ kernelName: string;
32
+ /** Full C source string. */
33
+ cSource: string;
34
+ /** koffi function signature. */
35
+ koffiSig: string;
36
+ /** Content hash. */
37
+ hash: string;
38
+ /**
39
+ * Output slot layout. Each reduction in the kernel writes to its own
40
+ * Float64 slot, in the order of this array. `any_non_nan` (a 0/1 flag
41
+ * stored as double) is at the end when `hasAnyNonNan` is true.
42
+ * The JS caller allocates a `Float64Array(slotCount)` and reads slots
43
+ * by index after the call.
44
+ */
45
+ slotNames: string[];
46
+ /** True when the kernel emits an `any_non_nan` slot at index
47
+ * `slotNames.length - 1`. */
48
+ hasAnyNonNan: boolean;
49
+ }
50
+ /**
51
+ * Build a multi-reduction kernel for the given op set. `ops` should be
52
+ * a deduplicated list of reductions to compute (e.g. ["sum", "max"]).
53
+ * The returned `slotNames` preserves insertion order for indexing; if
54
+ * the op set contains `max`/`min`, an extra `any_non_nan` slot is
55
+ * appended (the JS side uses it to override the sentinel max/min with
56
+ * NaN when every input element was NaN).
57
+ *
58
+ * When `par` is true, the per-element loop is emitted as
59
+ * `#pragma omp parallel for simd reduction(...)` with one reduction
60
+ * clause per accumulator and an `if(n >= T)` gate that falls back to
61
+ * serial below the threshold. Requires the caller to link with
62
+ * `-fopenmp`; e1's `install.ts` already does this when libgomp is
63
+ * available. When `par` is false, the loop is emitted as plain
64
+ * `#pragma omp simd` (SIMD-only, single-threaded).
65
+ */
66
+ export declare function emitMultiReductionKernel(ops: readonly MultiReduceOp[], par?: boolean): MultiReductionKernelInfo;
@@ -0,0 +1,34 @@
1
+ /**
2
+ * e2 — per-assign / chain kernel driver.
3
+ *
4
+ * Entry point `tryE2Assign` is called from `interpreterExec.ts` for
5
+ * every `Assign` statement when `interp.experimental === "e2"`.
6
+ *
7
+ * Multi-LHS chain detection: scans consecutive suppressed Assigns
8
+ * regardless of LHS name. For each chain LHS, uses scope-body liveness
9
+ * (via `interp._currentScopeBody`) to decide whether the LHS escapes
10
+ * (materializes as an `out_<name>` buffer) or is purely chain-local
11
+ * (kept as a per-element stack-local). Reads of a chain LHS before
12
+ * its first assign in the chain become `in_<name>` parameters.
13
+ *
14
+ * On success:
15
+ * - Single chain assign: handled like a one-stmt chain.
16
+ * - Multi-stmt chain: one C kernel runs all assigns, only escape
17
+ * LHSs materialize back to env. `interp._e2ChainAdvance` is set
18
+ * so the surrounding loop skips the consumed sibling stmts.
19
+ *
20
+ * Compilation failures are hard errors (RuntimeError). Classification
21
+ * bails (non-classifiable RHS, mismatched lengths, etc.) silently fall
22
+ * through to the regular interpreter path.
23
+ */
24
+ import type { Stmt } from "../../parser/types.js";
25
+ import type { Interpreter } from "../../interpreter/interpreter.js";
26
+ /**
27
+ * Try to compile a chain (1+ stmts) starting at `stmt`. Returns true
28
+ * on success — `interp._e2ChainAdvance` is set to the count of EXTRA
29
+ * sibling stmts the kernel consumed (0 for a single-stmt chain).
30
+ * Returns false to fall back to the regular interpreter path.
31
+ */
32
+ export declare function tryE2Assign(interp: Interpreter, stmt: Stmt & {
33
+ type: "Assign";
34
+ }): boolean;
@@ -0,0 +1,25 @@
1
+ /**
2
+ * e2 — minimal AST `Expr` → `JitExpr` lowerer.
3
+ *
4
+ * Only handles the whitelist that `classify.ts` accepts: Number, Ident,
5
+ * whitelisted Binary/Unary, whitelisted FuncCall. Types are read from
6
+ * the live runtime environment (the caller passes in a per-name
7
+ * `JitType` lookup), so there's no cross-branch unification.
8
+ *
9
+ * The classifier already replaced opaque subtrees with synthetic Ident
10
+ * nodes whose `name` is also in `envTypes`, so this lowerer doesn't need
11
+ * to know about opacity — every Ident it sees has a known runtime type.
12
+ */
13
+ import type { Expr } from "../../parser/types.js";
14
+ import type { JitExpr, JitType } from "../jitTypes.js";
15
+ export declare class E2LowerError extends Error {
16
+ }
17
+ export interface LowerOptions {
18
+ /** When a `FuncCall{name, args}` has `name` in `envTypes` as a tensor,
19
+ * treat it as tensor indexing and lower to an `Index` node instead
20
+ * of looking up a builtin. Used by the e2 whole-loop kernel — the
21
+ * chain emitters don't set this (their classifier has already
22
+ * marked tensor-access FuncCalls as opaque). */
23
+ resolveFuncCallAsTensorIndex?: boolean;
24
+ }
25
+ export declare function lowerAstToJitExpr(expr: Expr, envTypes: ReadonlyMap<string, JitType>, options?: LowerOptions): JitExpr;
@@ -0,0 +1,80 @@
1
+ /**
2
+ * e2 — per-AST-node compiled-kernel cache.
3
+ *
4
+ * Each AST `Expr` (the RHS of an `Assign` we've seen at least once) maps
5
+ * to a per-signature cache: the same expression visited with a different
6
+ * runtime type signature produces a different specialization. The
7
+ * signature includes input names, scalar-vs-tensor, complex-or-not, and
8
+ * the LHS name (since the kernel hard-codes which output to write).
9
+ *
10
+ * The Map is keyed by the AST node identity, not by source text — two
11
+ * identical-looking `r = r .* y` statements at different file:line
12
+ * positions get separate cache entries, so a recompile from a
13
+ * different call-site doesn't poison earlier ones.
14
+ *
15
+ * The cache holds either an `E2CacheEntry` or the `E2_BAILED` sentinel
16
+ * indicating that classification or compilation failed for this
17
+ * signature; the sentinel prevents re-attempting the same hopeless
18
+ * lowering on every invocation.
19
+ */
20
+ import type { Stmt, BinaryOperation } from "../../parser/types.js";
21
+ export type CompiledKernelFn = (...args: unknown[]) => unknown;
22
+ export declare const E2_BAILED: unique symbol;
23
+ export interface E2ReductionInfo {
24
+ /** Reduction op name (sum / prod / max / min / mean / any / all). */
25
+ reduceName: string;
26
+ /** Accumulator variable name in env. */
27
+ accName: string;
28
+ /** When true, the source pattern was `acc = acc OP reduce(...)`;
29
+ * the driver applies the same OP to combine the kernel's scalar
30
+ * output with the existing env value of `acc`. When false, the
31
+ * source pattern was `acc = reduce(...)` and the kernel output is
32
+ * written directly. */
33
+ hasAccumulate: boolean;
34
+ /** Only meaningful when `hasAccumulate` is true. */
35
+ accOp?: BinaryOperation;
36
+ }
37
+ /** Complex-path partitioning info. Present iff the kernel was compiled
38
+ * via the paired-buffer complex emitter. The driver uses these lists
39
+ * to marshal complex tensors (two pointers per tensor), complex
40
+ * scalars (two doubles per scalar), and to allocate complex output
41
+ * buffers (data + imag Float64Arrays). */
42
+ export interface E2ComplexInfo {
43
+ complexTensorNames: string[];
44
+ realTensorNames: string[];
45
+ complexInputLhsNames: string[];
46
+ realInputLhsNames: string[];
47
+ complexScalarNames: string[];
48
+ realScalarNames: string[];
49
+ complexEscapeLhsNames: string[];
50
+ realEscapeLhsNames: string[];
51
+ }
52
+ export interface E2CacheEntry {
53
+ fn: CompiledKernelFn;
54
+ /** Env tensor input names (combined — for diagnostics). When
55
+ * `complex` is defined, the complex marshaling code uses
56
+ * `complex.complexTensorNames` and `complex.realTensorNames`
57
+ * instead of this. */
58
+ tensorNames: string[];
59
+ /** Chain LHS names that need `in_<name>` (ordered between the tensor inputs and the scalar inputs in the kernel arguments). */
60
+ inputLhsNames: string[];
61
+ /** Ordered scalar input names. */
62
+ scalarNames: string[];
63
+ /** Chain LHS names that materialize via `out_<name>` (escape names). */
64
+ escapeLhsNames: string[];
65
+ /** Number of chain assigns this entry encodes (0 for a standalone
66
+ * reduction kernel, 1 for a single-assign chain kernel, >=2 for
67
+ * multi-stmt chains). */
68
+ chainLength: number;
69
+ /** Set when the kernel produces a trailing scalar reduction output.
70
+ * The driver allocates a `Float64Array(1)` for `out_acc`, calls the
71
+ * kernel, then combines the result with `env[accName]` per the
72
+ * `accOp` and `hasAccumulate` fields. Complex chains never set this
73
+ * — the complex emitter rejects trailing reductions. */
74
+ reduction?: E2ReductionInfo;
75
+ /** Paired-buffer complex path info. When set, the marshaling code
76
+ * takes the complex branch. */
77
+ complex?: E2ComplexInfo;
78
+ }
79
+ export declare function chainCacheGet(firstStmt: Stmt, sig: string): E2CacheEntry | typeof E2_BAILED | undefined;
80
+ export declare function chainCacheSet(firstStmt: Stmt, sig: string, entry: E2CacheEntry | typeof E2_BAILED): void;
@@ -0,0 +1,55 @@
1
+ /**
2
+ * e2 — multi-LHS fused chain C kernel emission.
3
+ *
4
+ * Given a sequence of `JitExpr` RHSs each writing to a (possibly
5
+ * distinct) chain LHS, produces one C function that runs every assign
6
+ * in a single per-element loop. Each chain LHS becomes a stack-local
7
+ * `double <name>` declared once at the top of the loop body. Within
8
+ * the body, references to a chain-LHS name resolve to the stack-local
9
+ * once the corresponding assign has run; before that point they
10
+ * resolve to `in_<name>[i]` (so the kernel signature includes
11
+ * `in_<lhsName>` for any chain LHS that's read before being written).
12
+ *
13
+ * After the per-iter assigns, every "escape" LHS (one that's actually
14
+ * referenced by the rest of the function body) gets written to its
15
+ * `out_<name>[i]` pointer. Chain-locals (only used inside the chain)
16
+ * are dropped at the end of the iteration with no buffer materialized.
17
+ *
18
+ * void e2c_<hash>(int64_t n,
19
+ * const double *in_<input1>, ...,
20
+ * [const double *in_<lhs_needing_input>, ...,]
21
+ * double s_<scalar1>, ...,
22
+ * double *out_<escape_lhs1>, ...)
23
+ * {
24
+ * #pragma omp simd
25
+ * for (int64_t i = 0; i < n; i++) {
26
+ * double <chain_lhs1>, <chain_lhs2>, ...;
27
+ * <chain_lhs1> = <stmt0_rhs_C>;
28
+ * <chain_lhs2> = <stmt1_rhs_C>;
29
+ * ...
30
+ * out_<escape_lhs1>[i] = <escape_lhs1>;
31
+ * out_<escape_lhs2>[i] = <escape_lhs2>;
32
+ * }
33
+ * }
34
+ */
35
+ import { type ChainAssignSpec, type KernelInputs } from "./emitShared.js";
36
+ export type { ChainAssignSpec } from "./emitShared.js";
37
+ export interface E2ChainEmitResult {
38
+ kernelName: string;
39
+ cSource: string;
40
+ koffiSig: string;
41
+ hash: string;
42
+ /** Tensor input names in signature order — does NOT include any
43
+ * in_<lhs> entries. */
44
+ inputTensors: string[];
45
+ /** Chain LHS names that appear as `in_<name>` in the signature, in
46
+ * order. */
47
+ inputLhsNames: string[];
48
+ /** Scalar input names in signature order. */
49
+ inputScalars: string[];
50
+ /** Chain LHS names that appear as `out_<name>` in the signature,
51
+ * in order. */
52
+ escapeLhsNames: string[];
53
+ chainLength: number;
54
+ }
55
+ export declare function emitE2ChainKernel(assigns: ChainAssignSpec[], inputs: KernelInputs, par?: boolean): E2ChainEmitResult;
@@ -0,0 +1,119 @@
1
+ /**
2
+ * e2 (experimental) — per-assign expression classifier.
3
+ *
4
+ * Walks an AST `Expr` and decides whether it can be compiled into a
5
+ * single per-element C kernel. The classifier never evaluates anything
6
+ * — it only inspects the AST shape and the names referenced.
7
+ *
8
+ * Whitelist:
9
+ * - Number, Ident
10
+ * - Binary with arithmetic / comparison ops
11
+ * - Unary Plus, Minus, Not
12
+ * - FuncCall to a whitelisted scalar math builtin
13
+ *
14
+ * Anything outside the whitelist is recorded as an "opaque root": the
15
+ * driver is expected to evaluate that subtree via the interpreter and
16
+ * bind the result to a fresh synthetic name, then re-classify with that
17
+ * name in scope.
18
+ *
19
+ * The classifier returns a list of opaque-root subtrees and the
20
+ * "rewritten" expression that uses synthetic names where the opaque
21
+ * roots used to be. The driver is responsible for runtime type checks
22
+ * and for actually evaluating the opaque subtrees.
23
+ */
24
+ import type { Expr } from "../../parser/types.js";
25
+ import { BinaryOperation } from "../../parser/types.js";
26
+ export { BinaryOperation } from "../../parser/types.js";
27
+ /** Scalar math builtins that map cleanly to C99. Mirrors the JS-JIT
28
+ * Math.* table plus pow / hypot / atan2 / etc. */
29
+ export declare const E2_BUILTIN_WHITELIST: ReadonlySet<string>;
30
+ /** One opaque subtree the driver must evaluate before invoking the
31
+ * kernel. The classifier replaces it in `emittableExpr` with an Ident
32
+ * named `syntheticName`. */
33
+ export interface OpaqueRoot {
34
+ syntheticName: string;
35
+ expr: Expr;
36
+ }
37
+ export interface ClassifyResult {
38
+ /** AST with opaque subtrees replaced by Ident(syntheticName) nodes. */
39
+ emittableExpr: Expr;
40
+ /** Subtrees the driver must evaluate via the interpreter. */
41
+ opaqueRoots: OpaqueRoot[];
42
+ /** Identifiers referenced in `emittableExpr` that originated from the
43
+ * user's environment (i.e. NOT synthetic opaque-root bindings). The
44
+ * driver looks these up in the env to determine input types. */
45
+ envIdents: Set<string>;
46
+ }
47
+ /** Classify an expression. Always succeeds — the worst case is the
48
+ * whole expression becomes a single opaque root, which the driver
49
+ * will reject. */
50
+ export declare function classifyExpr(expr: Expr): ClassifyResult;
51
+ /** Heuristic gate: an expression is "worth" JIT'ing only when it does
52
+ * some work — a bare Ident or Number is not. The driver also gates
53
+ * on tensor size at runtime; this is just a structural pre-filter to
54
+ * skip the cost of lowering trivial expressions. */
55
+ export declare function isWorthCompiling(emittableExpr: Expr): boolean;
56
+ /** A single classification entry for one assign in a chain. The chain
57
+ * emitter consumes one of these per assign in order. */
58
+ export interface ChainAssignClassification {
59
+ /** The original AST stmt (kept so the cache can key on it). */
60
+ stmt: import("../../parser/types.js").Stmt & {
61
+ type: "Assign";
62
+ };
63
+ /** The classifier's rewritten RHS — opaque subtrees replaced by
64
+ * Ident(syntheticName). */
65
+ emittableExpr: Expr;
66
+ /** Opaque subtrees this assign contributed (driver evaluates these
67
+ * before the kernel call). */
68
+ opaqueRoots: OpaqueRoot[];
69
+ /** Identifiers referenced by this assign's emittableExpr. Includes
70
+ * synthetic opaque-root names. */
71
+ envIdents: Set<string>;
72
+ /** True if the assign reads its own LHS (e.g. `r = r + x`). For the
73
+ * first stmt of a chain this means the kernel needs `in_<lhs>` as
74
+ * an input pointer; for later stmts it just means a chain-local
75
+ * read. */
76
+ selfReadsLhs: boolean;
77
+ }
78
+ export interface ChainClassification {
79
+ /** Chain assigns, in source order. May have multiple distinct LHSs. */
80
+ assigns: ChainAssignClassification[];
81
+ }
82
+ /** Detect a chain of consecutive suppressed classifiable Assigns
83
+ * starting at `stmts[startIdx]`. Each LHS may be a different name;
84
+ * the driver decides — using full-scope liveness — whether each LHS
85
+ * becomes a chain-local or a materialized output buffer.
86
+ *
87
+ * The chain ends at the first non-Assign, the first unsuppressed
88
+ * Assign, or the first Assign whose RHS classification is not worth
89
+ * compiling.
90
+ *
91
+ * Returns null if the very first stmt isn't a chainable Assign. */
92
+ export declare function classifyAssignChain(stmts: import("../../parser/types.js").Stmt[], startIdx: number): ChainClassification | null;
93
+ /** Reduction op names whose semantics the e2 reduction emitter knows. */
94
+ export declare const E2_REDUCTION_OPS: ReadonlySet<string>;
95
+ export interface TrailingReductionMatch {
96
+ /** The original Assign stmt — pinned for cache identity. */
97
+ stmt: import("../../parser/types.js").Stmt & {
98
+ type: "Assign";
99
+ };
100
+ /** LHS = accumulator name. */
101
+ accName: string;
102
+ /** Reduction op name. */
103
+ reduceName: string;
104
+ /** AST expression argument to the reduction call. The driver decides
105
+ * whether to treat it as a Var-targeting-chain-local (for trailing-
106
+ * after-chain) or as a standalone elemwise expression. */
107
+ targetExpr: Expr;
108
+ /** True for `acc = acc OP reduce(...)`; false for `acc = reduce(...)`. */
109
+ hasAccumulate: boolean;
110
+ /** The accumulate op (only meaningful when `hasAccumulate` is true). */
111
+ accOp?: BinaryOperation;
112
+ }
113
+ /** Match an Assign of the form:
114
+ * acc = reduce(arg)
115
+ * acc = acc OP reduce(arg)
116
+ * acc = reduce(arg) OP acc (commutative ops only)
117
+ * where `reduce` is a single-argument call to a known reduction op.
118
+ * Returns the matched details or null. */
119
+ export declare function matchTrailingReduction(stmt: import("../../parser/types.js").Stmt): TrailingReductionMatch | null;
@@ -0,0 +1,16 @@
1
+ /**
2
+ * e2 — browser-safe indirection for the C compile driver.
3
+ *
4
+ * The driver in `c/compile.ts` is Node-only (it shells out to `cc` and
5
+ * loads via koffi). The browser bundle includes the e2 modules but
6
+ * NOT this driver — `setE2CompileFn` from `e2/install.ts` (Node only)
7
+ * swaps in the real implementation. In the browser, the stub throws.
8
+ */
9
+ export type E2CompileFn = (cSource: string, koffiSig: string, kernelName: string, log?: (msg: string) => void) => ((...args: unknown[]) => unknown) | null;
10
+ export declare function setE2CompileFn(fn: E2CompileFn): void;
11
+ export declare function getE2CompileFn(): E2CompileFn;
12
+ /** Minimum element count of the largest tensor input before we'll
13
+ * consider compiling an e2 kernel. Below this, koffi overhead dwarfs
14
+ * the work and falling through to the interpreter is faster.
15
+ * Overridable via `NUMBL_E2_MIN_ELEMS`. */
16
+ export declare function e2MinElems(): number;