numbl 0.1.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +59 -3
- package/dist-cli/cli.js +22538 -7936
- package/dist-lib/lib.js +34682 -20852
- package/dist-lib/numbl-core/executeCode.d.ts +13 -0
- package/dist-lib/numbl-core/fileIOAdapter.d.ts +2 -0
- package/dist-lib/numbl-core/helpers/reduction-helpers.d.ts +7 -2
- package/dist-lib/numbl-core/interpreter/builtins/datetime.d.ts +39 -0
- package/dist-lib/numbl-core/interpreter/builtins/index.d.ts +1 -0
- package/dist-lib/numbl-core/interpreter/builtins/time-system.d.ts +1 -0
- package/dist-lib/numbl-core/interpreter/builtins/types.d.ts +96 -5
- package/dist-lib/numbl-core/interpreter/interpreter.d.ts +41 -3
- package/dist-lib/numbl-core/interpreter/types.d.ts +1 -1
- package/dist-lib/numbl-core/jit/c/abi.d.ts +90 -0
- package/dist-lib/numbl-core/jit/c/assemble.d.ts +56 -0
- package/dist-lib/numbl-core/jit/c/classify.d.ts +70 -0
- package/dist-lib/numbl-core/jit/c/compile.d.ts +37 -0
- package/dist-lib/numbl-core/jit/c/context.d.ts +152 -0
- package/dist-lib/numbl-core/jit/c/emit/assign.d.ts +20 -0
- package/dist-lib/numbl-core/jit/c/emit/complexScalar.d.ts +18 -0
- package/dist-lib/numbl-core/jit/c/emit/fused.d.ts +42 -0
- package/dist-lib/numbl-core/jit/c/emit/helpers.d.ts +40 -0
- package/dist-lib/numbl-core/jit/c/emit/index.d.ts +14 -0
- package/dist-lib/numbl-core/jit/c/emit/scalar.d.ts +23 -0
- package/dist-lib/numbl-core/jit/c/emit/stmt.d.ts +25 -0
- package/dist-lib/numbl-core/jit/c/emit/tensor.d.ts +127 -0
- package/dist-lib/numbl-core/jit/c/emit/userCall.d.ts +58 -0
- package/dist-lib/numbl-core/jit/c/epilogue.d.ts +26 -0
- package/dist-lib/numbl-core/jit/c/feasibility.d.ts +44 -0
- package/dist-lib/numbl-core/jit/c/prelude.d.ts +37 -0
- package/dist-lib/numbl-core/jit/c/visit.d.ts +63 -0
- package/dist-lib/numbl-core/jit/e1/complexKernelEmit.d.ts +46 -0
- package/dist-lib/numbl-core/jit/e1/hash.d.ts +10 -0
- package/dist-lib/numbl-core/jit/e1/install.d.ts +13 -0
- package/dist-lib/numbl-core/jit/e1/kernelEmit.d.ts +54 -0
- package/dist-lib/numbl-core/jit/e1/multiReductionKernel.d.ts +66 -0
- package/dist-lib/numbl-core/jit/e1/openmpFlag.d.ts +13 -0
- package/dist-lib/numbl-core/jit/e1/scalarFnKernel.d.ts +44 -0
- package/dist-lib/numbl-core/jit/e2/assignKernel.d.ts +34 -0
- package/dist-lib/numbl-core/jit/e2/astToJitExpr.d.ts +25 -0
- package/dist-lib/numbl-core/jit/e2/cache.d.ts +80 -0
- package/dist-lib/numbl-core/jit/e2/chainKernelEmit.d.ts +55 -0
- package/dist-lib/numbl-core/jit/e2/classify.d.ts +119 -0
- package/dist-lib/numbl-core/jit/e2/compileFn.d.ts +16 -0
- package/dist-lib/numbl-core/jit/e2/complexChainKernelEmit.d.ts +79 -0
- package/dist-lib/numbl-core/jit/e2/emitShared.d.ts +71 -0
- package/dist-lib/numbl-core/jit/e2/install.d.ts +11 -0
- package/dist-lib/numbl-core/jit/e2/liveness.d.ts +29 -0
- package/dist-lib/numbl-core/jit/e2/loopKernel.d.ts +49 -0
- package/dist-lib/numbl-core/jit/e2/loopKernelEmit.d.ts +75 -0
- package/dist-lib/numbl-core/jit/e2/multiReductionDriver.d.ts +24 -0
- package/dist-lib/numbl-core/jit/e2/reductionKernelEmit.d.ts +72 -0
- package/dist-lib/numbl-core/jit/e2/scalarFnDriver.d.ts +29 -0
- package/dist-lib/numbl-core/jit/fusedChainHelpers.d.ts +65 -0
- package/dist-lib/numbl-core/jit/fusedScalarEmit.d.ts +69 -0
- package/dist-lib/numbl-core/jit/fusion.d.ts +71 -0
- package/dist-lib/numbl-core/jit/fusionOps.d.ts +25 -0
- package/dist-lib/numbl-core/jit/heavyOps.d.ts +15 -0
- package/dist-lib/numbl-core/{interpreter/jit → jit}/index.d.ts +2 -2
- package/dist-lib/numbl-core/jit/jitBailSafety.d.ts +41 -0
- package/dist-lib/numbl-core/{interpreter/jit → jit}/jitLoop.d.ts +2 -2
- package/dist-lib/numbl-core/{interpreter/jit → jit}/jitLoopAnalysis.d.ts +6 -1
- package/dist-lib/numbl-core/jit/jitLower.d.ts +122 -0
- package/dist-lib/numbl-core/jit/jitLowerExpr.d.ts +27 -0
- package/dist-lib/numbl-core/jit/jitLowerStmt.d.ts +9 -0
- package/dist-lib/numbl-core/{interpreter/jit → jit}/jitLowerTypes.d.ts +7 -3
- package/dist-lib/numbl-core/jit/jitTopLevel.d.ts +22 -0
- package/dist-lib/numbl-core/{interpreter/jit → jit}/jitTypes.d.ts +133 -1
- package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitCodegen.d.ts +2 -2
- package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitCodegenHoist.d.ts +19 -1
- package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitHelpers.d.ts +15 -3
- package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitHelpersIndex.d.ts +7 -0
- package/dist-lib/numbl-core/jit/js/jitHelpersTensor.d.ts +34 -0
- package/dist-lib/numbl-core/jit/js/jsFusedCodegen.d.ts +17 -0
- package/dist-lib/numbl-core/jit/js/jsMultiReduction.d.ts +70 -0
- package/dist-lib/numbl-core/jit/scalarEmit.d.ts +58 -0
- package/dist-lib/numbl-core/lexer/types.d.ts +2 -1
- package/dist-lib/numbl-core/native/lapack-bridge.d.ts +39 -1
- package/dist-lib/numbl-core/ops/bessel.d.ts +18 -0
- package/dist-lib/numbl-core/ops/comparison.d.ts +11 -0
- package/dist-lib/numbl-core/ops/complexBinaryElemwise.d.ts +10 -0
- package/dist-lib/numbl-core/ops/complexUnaryElemwise.d.ts +8 -0
- package/dist-lib/numbl-core/ops/dispatch.d.ts +26 -0
- package/dist-lib/numbl-core/ops/index.d.ts +8 -0
- package/dist-lib/numbl-core/ops/opCodes.d.ts +70 -0
- package/dist-lib/numbl-core/ops/realBinaryElemwise.d.ts +8 -0
- package/dist-lib/numbl-core/ops/realUnaryElemwise.d.ts +5 -0
- package/dist-lib/numbl-core/ops/reduce.d.ts +6 -0
- package/dist-lib/numbl-core/parser/types.d.ts +6 -0
- package/dist-lib/numbl-core/runtime/alloc.d.ts +23 -0
- package/dist-lib/numbl-core/runtime/runtime.d.ts +1 -0
- package/dist-lib/numbl-core/version.d.ts +1 -1
- package/native/jit_runtime/jit_runtime.c +261 -0
- package/native/jit_runtime/jit_runtime.h +204 -0
- package/native/numbl_addon.cpp +62 -1
- package/native/ops/bessel.c +572 -0
- package/native/ops/comparison.c +150 -0
- package/native/ops/complex_binary_elemwise.c +192 -0
- package/native/ops/complex_unary_elemwise.c +152 -0
- package/native/ops/numbl_ops.c +66 -0
- package/native/ops/numbl_ops.h +262 -0
- package/native/ops/real_binary_elemwise.c +85 -0
- package/native/ops/real_unary_elemwise.c +104 -0
- package/native/ops/reduce.c +162 -0
- package/native/ops_napi.cpp +320 -0
- package/package.json +8 -9
- package/dist-lib/numbl-core/interpreter/jit/jitHelpersTensor.d.ts +0 -28
- package/dist-lib/numbl-core/interpreter/jit/jitLower.d.ts +0 -23
- /package/dist-lib/numbl-core/{interpreter/jit → jit/js}/jitHelpersComplex.d.ts +0 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight IR traversal helpers shared across the C-JIT subsystem.
|
|
3
|
+
*
|
|
4
|
+
* Several places in `jit/c/` need to walk a lowered IR body observing
|
|
5
|
+
* (but not transforming) expressions and statements: the feasibility
|
|
6
|
+
* fall-through paths, tensor-classification, hybrid-loop live-in/out
|
|
7
|
+
* analysis, and the shape-propagation / callee-discovery / complex-
|
|
8
|
+
* scalar scans in [assemble.ts](./assemble.ts). Each used to
|
|
9
|
+
* reimplement the same switch-on-tag descent.
|
|
10
|
+
*
|
|
11
|
+
* This module centralizes the descent. Three primitives, composable:
|
|
12
|
+
*
|
|
13
|
+
* - `walkExprNodes(expr, visit)` — post-order walk of every sub-node
|
|
14
|
+
* of `expr` (including `expr` itself). `visit` is called on every leaf
|
|
15
|
+
* once; nothing is skipped. Adding a new JitExpr tag means editing
|
|
16
|
+
* this one function.
|
|
17
|
+
*
|
|
18
|
+
* - `walkStmts(body, visit)` — pre-order walk of every statement in
|
|
19
|
+
* `body`, recursing into If/For/While nested bodies. Does NOT
|
|
20
|
+
* traverse expressions inside the stmt — callers that need that
|
|
21
|
+
* compose with `walkStmtExprs` + `walkExprNodes`.
|
|
22
|
+
*
|
|
23
|
+
* - `walkStmtExprs(stmt, visit)` — call `visit` on each top-level
|
|
24
|
+
* expression attached to `stmt` (the `expr` in an Assign, the
|
|
25
|
+
* `cond` in an If, the `start`/`end`/`step` in a For, etc.). Does
|
|
26
|
+
* NOT recurse into nested expression sub-nodes (use `walkExprNodes`
|
|
27
|
+
* for that) and does NOT walk into nested stmt bodies.
|
|
28
|
+
*
|
|
29
|
+
* The dispatchers in `feasibility.ts`, `emit/stmt.ts`, and
|
|
30
|
+
* `emit/fused.ts` keep their native switches — they produce structured
|
|
31
|
+
* results (feasibility verdicts, emitted C lines), so a callback-based
|
|
32
|
+
* observer doesn't fit their shape.
|
|
33
|
+
*/
|
|
34
|
+
import type { JitExpr, JitStmt } from "../jitTypes.js";
|
|
35
|
+
/**
|
|
36
|
+
* Walk every sub-node of `expr` in post-order (children first, then
|
|
37
|
+
* `expr` itself), calling `visit` on each. Leaves (NumberLiteral,
|
|
38
|
+
* ImagLiteral, Var, StringLiteral, MemberRead) are still visited once.
|
|
39
|
+
*
|
|
40
|
+
* Adding a new JitExpr tag: add a case here. Observer callers (which
|
|
41
|
+
* are all of them) don't need to know about tag-specific sub-node
|
|
42
|
+
* fields — this is the one place those are encoded.
|
|
43
|
+
*/
|
|
44
|
+
export declare function walkExprNodes(expr: JitExpr, visit: (e: JitExpr) => void): void;
|
|
45
|
+
/**
|
|
46
|
+
* Walk every statement in `body`, recursing into nested If / For /
|
|
47
|
+
* While bodies. Pre-order: `visit` is called on each stmt before
|
|
48
|
+
* descending. Does NOT traverse expressions inside the stmt.
|
|
49
|
+
*/
|
|
50
|
+
export declare function walkStmts(body: JitStmt[], visit: (s: JitStmt) => void): void;
|
|
51
|
+
/**
|
|
52
|
+
* Call `visit` on every top-level expression attached to `stmt` — the
|
|
53
|
+
* RHS of an Assign, the indices + value of an AssignIndex, the start /
|
|
54
|
+
* end / step of a For, the cond of an If / While, and so on. Does
|
|
55
|
+
* NOT recurse into expression sub-nodes (compose with `walkExprNodes`)
|
|
56
|
+
* and does NOT descend into nested stmt bodies (compose with
|
|
57
|
+
* `walkStmts`).
|
|
58
|
+
*
|
|
59
|
+
* For If, the `cond` of the primary branch AND each elseif is visited;
|
|
60
|
+
* the bodies themselves are stmt-trees, not exprs, and are reached via
|
|
61
|
+
* `walkStmts` recursion.
|
|
62
|
+
*/
|
|
63
|
+
export declare function walkStmtExprs(stmt: JitStmt, visit: (e: JitExpr) => void): void;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e1 (experimental) — complex-tensor standalone C-kernel emission.
|
|
3
|
+
*
|
|
4
|
+
* Sister module to `kernelEmit.ts` (which handles real-tensor chains).
|
|
5
|
+
* Given a FusibleChain that produces at least one complex tensor, emit
|
|
6
|
+
* a paired-buffer C kernel of the form
|
|
7
|
+
*
|
|
8
|
+
* void k_<hash>(int64_t n,
|
|
9
|
+
* const double *in_<a>_re, const double *in_<a>_im, // complex tensor
|
|
10
|
+
* const double *in_<b>, // real tensor (widened)
|
|
11
|
+
* double s_<c>_re, double s_<c>_im, // complex scalar
|
|
12
|
+
* double s_<d>, // real scalar
|
|
13
|
+
* double *out_<y>_re, double *out_<y>_im) // complex output
|
|
14
|
+
* {
|
|
15
|
+
* #pragma omp simd
|
|
16
|
+
* for (int64_t i = 0; i < n; i++) {
|
|
17
|
+
* double __f_y_re = ...;
|
|
18
|
+
* double __f_y_im = ...;
|
|
19
|
+
* out_<y>_re[i] = __f_y_re;
|
|
20
|
+
* out_<y>_im[i] = __f_y_im;
|
|
21
|
+
* }
|
|
22
|
+
* }
|
|
23
|
+
*
|
|
24
|
+
* Supports the same fusion envelope as emitComplexPerElem in
|
|
25
|
+
* `c/emit/fused.ts`:
|
|
26
|
+
* - Binary: + - * .*
|
|
27
|
+
* - Unary: + -
|
|
28
|
+
* - Call: conj, real, imag
|
|
29
|
+
* - Operand widening: real tensor / real scalar read with im = 0
|
|
30
|
+
* - ImagLiteral: (0.0, 1.0) pair
|
|
31
|
+
*
|
|
32
|
+
* Complex chains do NOT carry a trailing reduction — `fusion.ts` drops
|
|
33
|
+
* the absorption for complex chains because the inline scalar
|
|
34
|
+
* accumulator can't hold a complex value. Kernels emitted here have no
|
|
35
|
+
* reduction output.
|
|
36
|
+
*/
|
|
37
|
+
import type { FusibleChain } from "../fusion.js";
|
|
38
|
+
import type { KernelEmitResult } from "./kernelEmit.js";
|
|
39
|
+
/**
|
|
40
|
+
* Emit a complex-tensor fused chain as a standalone C kernel.
|
|
41
|
+
*
|
|
42
|
+
* Returns null when the chain contains an expression the per-element
|
|
43
|
+
* walker doesn't support (abs, complex divide, transcendental on
|
|
44
|
+
* complex, etc.) — the caller falls back to the JS-JIT per-op path.
|
|
45
|
+
*/
|
|
46
|
+
export declare function emitComplexChainKernel(chain: FusibleChain, allTensorVars: ReadonlySet<string>, complexTensorNames: ReadonlySet<string>, complexScalarVars: ReadonlySet<string>, outputTensorNames: ReadonlySet<string>): KernelEmitResult | null;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared content-hash helper for the e1 codegen.
|
|
3
|
+
*
|
|
4
|
+
* 64-bit FNV-1a over UTF-8 code units, returned as 16 hex chars.
|
|
5
|
+
* Deterministic, fully self-contained, and browser-safe (no Node
|
|
6
|
+
* `crypto` dependency). Cryptographic strength isn't needed — the
|
|
7
|
+
* hash is a content-addressed suffix for kernel names and
|
|
8
|
+
* `$h.$kernels[...]` cache keys.
|
|
9
|
+
*/
|
|
10
|
+
export declare function fnv1a64Hex(s: string): string;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Node-only install shim for the e1 (experimental) kernel pipeline.
|
|
3
|
+
*
|
|
4
|
+
* Imported by `cli.ts` purely for its side effects. Replaces the `compileKernel` stub on
|
|
5
|
+
* the module-level `jitHelpers` object with a real implementation that
|
|
6
|
+
* shells out to `cc` via `compile.ts` and loads the result through koffi.
|
|
7
|
+
*
|
|
8
|
+
* Registration is idempotent — re-importing this module in tests won't
|
|
9
|
+
* re-install. The kernel cache on `jitHelpers.$kernels` is shared across
|
|
10
|
+
* all specializations in the process so the same fused chain used from
|
|
11
|
+
* two different JIT'd functions compiles only once.
|
|
12
|
+
*/
|
|
13
|
+
export {};
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e1 (experimental) — standalone C-kernel emission for fusible tensor
|
|
3
|
+
* chains used by the JS-JIT.
|
|
4
|
+
*
|
|
5
|
+
* Given a `FusibleChain` the normal JS fused codegen would emit, this
|
|
6
|
+
* module produces an equivalent standalone C function of the form
|
|
7
|
+
*
|
|
8
|
+
* void k_<hash>(int64_t n,
|
|
9
|
+
* const double *in_<x>, ...,
|
|
10
|
+
* double s_<scalar>, ...,
|
|
11
|
+
* double *out_<y>, ...)
|
|
12
|
+
* {
|
|
13
|
+
* #pragma omp simd
|
|
14
|
+
* for (int64_t i = 0; i < n; i++) {
|
|
15
|
+
* double f_tmp1 = <expr>;
|
|
16
|
+
* ...
|
|
17
|
+
* out_<y>[i] = <final>;
|
|
18
|
+
* }
|
|
19
|
+
* }
|
|
20
|
+
*
|
|
21
|
+
* It returns the full C source, a koffi signature string, a content-
|
|
22
|
+
* addressed hash, and the ordered list of JS expressions the generated
|
|
23
|
+
* code should pass as arguments — everything the JS codegen needs to
|
|
24
|
+
* emit a `$h.compileKernel(source, sig); kernel(n, x_data, y_data)`
|
|
25
|
+
* dispatch.
|
|
26
|
+
*
|
|
27
|
+
* The prototype deliberately handles only the common real-tensor chain
|
|
28
|
+
* shape: no reductions, no complex tensors, no dynamic-shape outputs.
|
|
29
|
+
* Any chain that falls outside that envelope causes `emitChainKernel`
|
|
30
|
+
* to return `null`, which signals the caller to fall back to the plain
|
|
31
|
+
* inline JS fused loop.
|
|
32
|
+
*/
|
|
33
|
+
import type { FusibleChain } from "../fusion.js";
|
|
34
|
+
/**
|
|
35
|
+
* A fused chain compiled to a standalone C kernel. The caller (the JS
|
|
36
|
+
* codegen) combines this with a runtime size threshold to emit
|
|
37
|
+
*
|
|
38
|
+
* if (n >= THRESHOLD) $h.<kernelName>(n, x_data, y_data)
|
|
39
|
+
* else <plain JS fused loop>
|
|
40
|
+
*/
|
|
41
|
+
export interface KernelEmitResult {
|
|
42
|
+
/** Hash-derived C function name, e.g. `nk_3a7f81b2`. */
|
|
43
|
+
kernelName: string;
|
|
44
|
+
/** Full C source: `#include` + function definition. */
|
|
45
|
+
cSource: string;
|
|
46
|
+
/** koffi function signature, e.g. `"void nk_3a7f81b2(int64_t, ...)"`. */
|
|
47
|
+
koffiSig: string;
|
|
48
|
+
/** Content hash over the final C source (stable id for caching). */
|
|
49
|
+
hash: string;
|
|
50
|
+
/** Ordered list of JS expressions to pass as call arguments. The
|
|
51
|
+
* caller emits something like `$h.<kernelName>(${jsCallArgs.join(",")})`. */
|
|
52
|
+
jsCallArgs: string[];
|
|
53
|
+
}
|
|
54
|
+
export declare function emitChainKernel(chain: FusibleChain, allTensorVars: ReadonlySet<string>, outputTensorNames: ReadonlySet<string>, par: boolean): KernelEmitResult | null;
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e1 — C kernel for multi-reduction scalar assigns.
|
|
3
|
+
*
|
|
4
|
+
* A MATLAB line like
|
|
5
|
+
*
|
|
6
|
+
* red_acc = red_acc + (sum(x) + mean(x) + max(x) + min(x));
|
|
7
|
+
*
|
|
8
|
+
* has four reductions over the same vector. The default JS-JIT path
|
|
9
|
+
* emits four `$h.tSum` / `$h.ib_*` helper calls, each of which scans
|
|
10
|
+
* the whole vector. This module emits a single-pass C kernel that
|
|
11
|
+
* computes every requested reduction in one loop and writes results
|
|
12
|
+
* into caller-allocated scalar slots.
|
|
13
|
+
*
|
|
14
|
+
* Specialised per op-set: a group of `{sum, max, min}` compiles to a
|
|
15
|
+
* different kernel than `{sum, mean, max, min}`. Source-addressed by
|
|
16
|
+
* FNV-1a hash so the JS `$h.$kernels[...]` cache dedupes repeated call
|
|
17
|
+
* sites.
|
|
18
|
+
*
|
|
19
|
+
* NaN handling: `-ffast-math` is on for the compile (matches the other
|
|
20
|
+
* e1 kernels), so naive `isnan` is folded to `false`. The kernel uses
|
|
21
|
+
* an inline bit-pattern NaN check to drive MATLAB's omit-NaN semantics
|
|
22
|
+
* for `max`/`min` and records an `any_non_nan` flag the JS side uses
|
|
23
|
+
* to map an all-NaN input to NaN.
|
|
24
|
+
*/
|
|
25
|
+
/** Reductions we can fuse into one pass. `any` / `all` are excluded
|
|
26
|
+
* because their short-circuit `break` would prematurely stop the
|
|
27
|
+
* other accumulators. */
|
|
28
|
+
export type MultiReduceOp = "sum" | "prod" | "max" | "min" | "mean";
|
|
29
|
+
export interface MultiReductionKernelInfo {
|
|
30
|
+
/** Hash-derived C function name, e.g. `mr_3a7f81b2...`. */
|
|
31
|
+
kernelName: string;
|
|
32
|
+
/** Full C source string. */
|
|
33
|
+
cSource: string;
|
|
34
|
+
/** koffi function signature. */
|
|
35
|
+
koffiSig: string;
|
|
36
|
+
/** Content hash. */
|
|
37
|
+
hash: string;
|
|
38
|
+
/**
|
|
39
|
+
* Output slot layout. Each reduction in the kernel writes to its own
|
|
40
|
+
* Float64 slot, in the order of this array. `any_non_nan` (a 0/1 flag
|
|
41
|
+
* stored as double) is at the end when `hasAnyNonNan` is true.
|
|
42
|
+
* The JS caller allocates a `Float64Array(slotNames.length)` and reads slots
|
|
43
|
+
* by index after the call.
|
|
44
|
+
*/
|
|
45
|
+
slotNames: string[];
|
|
46
|
+
/** True when the kernel emits an `any_non_nan` slot at index
|
|
47
|
+
* `slotNames.length - 1`. */
|
|
48
|
+
hasAnyNonNan: boolean;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Build a multi-reduction kernel for the given op set. `ops` should be
|
|
52
|
+
* a deduplicated list of reductions to compute (e.g. ["sum", "max"]).
|
|
53
|
+
* The returned `slotNames` preserves insertion order for indexing; if
|
|
54
|
+
* the op set contains `max`/`min`, an extra `any_non_nan` slot is
|
|
55
|
+
* appended (the JS side uses it to override the sentinel max/min with
|
|
56
|
+
* NaN when every input element was NaN).
|
|
57
|
+
*
|
|
58
|
+
* When `par` is true, the per-element loop is emitted as
|
|
59
|
+
* `#pragma omp parallel for simd reduction(...)` with one reduction
|
|
60
|
+
* clause per accumulator and an `if(n >= T)` gate that falls back to
|
|
61
|
+
* serial below the threshold. Requires the caller to link with
|
|
62
|
+
* `-fopenmp`; e1's `install.ts` already does this when libgomp is
|
|
63
|
+
* available. When `par` is false, the loop is emitted as plain
|
|
64
|
+
* `#pragma omp simd` (SIMD-only, single-threaded).
|
|
65
|
+
*/
|
|
66
|
+
export declare function emitMultiReductionKernel(ops: readonly MultiReduceOp[], par?: boolean): MultiReductionKernelInfo;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Runtime-overridable OpenMP availability flag for the e1 codegen path.
|
|
3
|
+
*
|
|
4
|
+
* `scalarFnKernel.ts` is transitively reachable from the JS-JIT module
|
|
5
|
+
* graph that Vite bundles for the web REPL, but `c/compile.ts` is
|
|
6
|
+
* Node-only (child_process, fs, ...). Importing `cJitOpenmpAvailable`
|
|
7
|
+
* directly from `compile.ts` would drag all of that into the browser
|
|
8
|
+
* bundle. Instead we default to `false` here and let Node-only
|
|
9
|
+
* `e1/install.ts` override the getter at install time — the same
|
|
10
|
+
* pattern used for the `compileKernel` stub in `jitHelpers.ts`.
|
|
11
|
+
*/
|
|
12
|
+
export declare function setOpenmpAvailableGetter(fn: () => boolean): void;
|
|
13
|
+
export declare function isOpenmpAvailable(): boolean;
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e1 (experimental) — whole-function scalar kernel emission.
|
|
3
|
+
*
|
|
4
|
+
* Complements [kernelEmit.ts](./kernelEmit.ts) (which handles tensor
|
|
5
|
+
* fusible chains) by covering the other big win case: a user function
|
|
6
|
+
* that is entirely scalar arithmetic — e.g. the inner loop of a
|
|
7
|
+
* Horner-style series, a Runge-Kutta step on a handful of doubles,
|
|
8
|
+
* benchmarks/scalar_bench.m's `run_bench(N, M)`.
|
|
9
|
+
*
|
|
10
|
+
* Under `--opt e1`, when a JIT-able function's signature and body are
|
|
11
|
+
* purely scalar, we call `generateC()` (the same emitter the C-JIT
|
|
12
|
+
* uses at `--opt 2`) and wrap its output with a thin inline JS
|
|
13
|
+
* function that shells out to `$h.compileKernel(...)`. The C source
|
|
14
|
+
* and koffi signature are inlined as JS string literals, so
|
|
15
|
+
* `--dump-js` shows the complete picture.
|
|
16
|
+
*
|
|
17
|
+
* Scope for the prototype:
|
|
18
|
+
* - All params are scalar doubles / booleans (CParamDesc.kind === "scalar")
|
|
19
|
+
* - All outputs are scalar / boolean (COutputDesc.kind === "scalar" | "boolean")
|
|
20
|
+
* - No tic/toc, no Index reads (no errFlag), no disp(...) calls
|
|
21
|
+
*
|
|
22
|
+
* Anything outside that envelope returns `null` and the caller falls
|
|
23
|
+
* back to the plain JS-JIT path, which still benefits from e1's
|
|
24
|
+
* per-chain tensor kernels.
|
|
25
|
+
*/
|
|
26
|
+
import type { FunctionDef } from "../../interpreter/types.js";
|
|
27
|
+
import type { JitStmt, JitType } from "../jitTypes.js";
|
|
28
|
+
import type { GeneratedFn } from "../jitLower.js";
|
|
29
|
+
import type { Interpreter } from "../../interpreter/interpreter.js";
|
|
30
|
+
export interface ScalarFnKernelResult {
|
|
31
|
+
/** The inline-compileKernel JS source. The JIT caller splices this
|
|
32
|
+
* in place of the normal JS-JIT body. */
|
|
33
|
+
jsSource: string;
|
|
34
|
+
/** Content-addressed kernel name from generateC, for logging. */
|
|
35
|
+
kernelName: string;
|
|
36
|
+
/** Raw C source (also embedded in `jsSource` as a string literal).
|
|
37
|
+
* Exposed for `--dump-c` / logging. */
|
|
38
|
+
cSource: string;
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* Try to emit a whole-function scalar kernel for the given lowered IR.
|
|
42
|
+
* Returns null when the function is not a pure-scalar candidate.
|
|
43
|
+
*/
|
|
44
|
+
export declare function tryEmitScalarFnKernel(interp: Interpreter, fn: FunctionDef, body: JitStmt[], outputNames: string[], localVars: Set<string>, outputType: JitType | null, outputTypes: JitType[], argTypes: JitType[], nargout: number, generatedIRBodies: Map<string, GeneratedFn>): ScalarFnKernelResult | null;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — per-assign / chain kernel driver.
|
|
3
|
+
*
|
|
4
|
+
* Entry point `tryE2Assign` is called from `interpreterExec.ts` for
|
|
5
|
+
* every `Assign` statement when `interp.experimental === "e2"`.
|
|
6
|
+
*
|
|
7
|
+
* Multi-LHS chain detection: scans consecutive suppressed Assigns
|
|
8
|
+
* regardless of LHS name. For each chain LHS, uses scope-body liveness
|
|
9
|
+
* (via `interp._currentScopeBody`) to decide whether the LHS escapes
|
|
10
|
+
* (materializes as an `out_<name>` buffer) or is purely chain-local
|
|
11
|
+
* (kept as a per-element stack-local). Reads of a chain LHS before
|
|
12
|
+
* its first assign in the chain become `in_<name>` parameters.
|
|
13
|
+
*
|
|
14
|
+
* On success:
|
|
15
|
+
* - Single chain assign: handled like a one-stmt chain.
|
|
16
|
+
* - Multi-stmt chain: one C kernel runs all assigns, only escape
|
|
17
|
+
* LHSs materialize back to env. `interp._e2ChainAdvance` is set
|
|
18
|
+
* so the surrounding loop skips the consumed sibling stmts.
|
|
19
|
+
*
|
|
20
|
+
* Compilation failures are hard errors (RuntimeError). Classification
|
|
21
|
+
* bails (non-classifiable RHS, mismatched lengths, etc.) silently fall
|
|
22
|
+
* through to the regular interpreter path.
|
|
23
|
+
*/
|
|
24
|
+
import type { Stmt } from "../../parser/types.js";
|
|
25
|
+
import type { Interpreter } from "../../interpreter/interpreter.js";
|
|
26
|
+
/**
|
|
27
|
+
* Try to compile a chain (1+ stmts) starting at `stmt`. Returns true
|
|
28
|
+
* on success — `interp._e2ChainAdvance` is set to the count of EXTRA
|
|
29
|
+
* sibling stmts the kernel consumed (0 for a single-stmt chain).
|
|
30
|
+
* Returns false to fall back to the regular interpreter path.
|
|
31
|
+
*/
|
|
32
|
+
export declare function tryE2Assign(interp: Interpreter, stmt: Stmt & {
|
|
33
|
+
type: "Assign";
|
|
34
|
+
}): boolean;
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — minimal AST `Expr` → `JitExpr` lowerer.
|
|
3
|
+
*
|
|
4
|
+
* Only handles the whitelist that `classify.ts` accepts: Number, Ident,
|
|
5
|
+
* whitelisted Binary/Unary, whitelisted FuncCall. Types are read from
|
|
6
|
+
* the live runtime environment (the caller passes in a per-name
|
|
7
|
+
* `JitType` lookup), so there's no cross-branch unification.
|
|
8
|
+
*
|
|
9
|
+
* The classifier already replaced opaque subtrees with synthetic Ident
|
|
10
|
+
* nodes whose `name` is also in `envTypes`, so this lowerer doesn't need
|
|
11
|
+
* to know about opacity — every Ident it sees has a known runtime type.
|
|
12
|
+
*/
|
|
13
|
+
import type { Expr } from "../../parser/types.js";
|
|
14
|
+
import type { JitExpr, JitType } from "../jitTypes.js";
|
|
15
|
+
export declare class E2LowerError extends Error {
|
|
16
|
+
}
|
|
17
|
+
export interface LowerOptions {
|
|
18
|
+
/** When a `FuncCall{name, args}` has `name` in `envTypes` as a tensor,
|
|
19
|
+
* treat it as tensor indexing and lower to an `Index` node instead
|
|
20
|
+
* of looking up a builtin. Used by the e2 whole-loop kernel — the
|
|
21
|
+
* chain emitters don't set this (their classifier has already
|
|
22
|
+
* marked tensor-access FuncCalls as opaque). */
|
|
23
|
+
resolveFuncCallAsTensorIndex?: boolean;
|
|
24
|
+
}
|
|
25
|
+
export declare function lowerAstToJitExpr(expr: Expr, envTypes: ReadonlyMap<string, JitType>, options?: LowerOptions): JitExpr;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — per-AST-node compiled-kernel cache.
|
|
3
|
+
*
|
|
4
|
+
* Each AST `Expr` (the RHS of an `Assign` we've seen at least once) maps
|
|
5
|
+
* to a per-signature cache: the same expression visited with a different
|
|
6
|
+
* runtime type signature produces a different specialization. The
|
|
7
|
+
* signature includes input names, scalar-vs-tensor, complex-or-not, and
|
|
8
|
+
* the LHS name (since the kernel hard-codes which output to write).
|
|
9
|
+
*
|
|
10
|
+
* The Map is keyed by the AST node identity, not by source text — two
|
|
11
|
+
* identical-looking `r = r .* y` statements at different file:line
|
|
12
|
+
* positions get separate cache entries, so a recompile from a
|
|
13
|
+
* different call-site doesn't poison earlier ones.
|
|
14
|
+
*
|
|
15
|
+
* The cache holds either an `E2CacheEntry` or the `E2_BAILED` sentinel
|
|
16
|
+
* indicating that classification or compilation failed for this
|
|
17
|
+
* signature; the sentinel prevents re-attempting the same hopeless
|
|
18
|
+
* lowering on every invocation.
|
|
19
|
+
*/
|
|
20
|
+
import type { Stmt, BinaryOperation } from "../../parser/types.js";
|
|
21
|
+
export type CompiledKernelFn = (...args: unknown[]) => unknown;
|
|
22
|
+
export declare const E2_BAILED: unique symbol;
|
|
23
|
+
export interface E2ReductionInfo {
|
|
24
|
+
/** Reduction op name (sum / prod / max / min / mean / any / all). */
|
|
25
|
+
reduceName: string;
|
|
26
|
+
/** Accumulator variable name in env. */
|
|
27
|
+
accName: string;
|
|
28
|
+
/** When true, the source pattern was `acc = acc OP reduce(...)`;
|
|
29
|
+
* the driver applies the same OP to combine the kernel's scalar
|
|
30
|
+
* output with the existing env value of `acc`. When false, the
|
|
31
|
+
* source pattern was `acc = reduce(...)` and the kernel output is
|
|
32
|
+
* written directly. */
|
|
33
|
+
hasAccumulate: boolean;
|
|
34
|
+
/** Only meaningful when `hasAccumulate` is true. */
|
|
35
|
+
accOp?: BinaryOperation;
|
|
36
|
+
}
|
|
37
|
+
/** Complex-path partitioning info. Present iff the kernel was compiled
|
|
38
|
+
* via the paired-buffer complex emitter. The driver uses these lists
|
|
39
|
+
* to marshal complex tensors (two pointers per tensor), complex
|
|
40
|
+
* scalars (two doubles per scalar), and to allocate complex output
|
|
41
|
+
* buffers (data + imag Float64Arrays). */
|
|
42
|
+
export interface E2ComplexInfo {
|
|
43
|
+
complexTensorNames: string[];
|
|
44
|
+
realTensorNames: string[];
|
|
45
|
+
complexInputLhsNames: string[];
|
|
46
|
+
realInputLhsNames: string[];
|
|
47
|
+
complexScalarNames: string[];
|
|
48
|
+
realScalarNames: string[];
|
|
49
|
+
complexEscapeLhsNames: string[];
|
|
50
|
+
realEscapeLhsNames: string[];
|
|
51
|
+
}
|
|
52
|
+
export interface E2CacheEntry {
|
|
53
|
+
fn: CompiledKernelFn;
|
|
54
|
+
/** Env tensor input names (combined — for diagnostics). When
|
|
55
|
+
* `complex` is defined, the complex marshaling code uses
|
|
56
|
+
* `complex.complexTensorNames` and `complex.realTensorNames`
|
|
57
|
+
* instead of this. */
|
|
58
|
+
tensorNames: string[];
|
|
59
|
+
/** Chain LHS names that need an `in_<name>` parameter (placed between the tensor inputs and the scalar inputs in the kernel signature). */
|
|
60
|
+
inputLhsNames: string[];
|
|
61
|
+
/** Ordered scalar input names. */
|
|
62
|
+
scalarNames: string[];
|
|
63
|
+
/** Chain LHS names that materialize via `out_<name>` (escape names). */
|
|
64
|
+
escapeLhsNames: string[];
|
|
65
|
+
/** Number of chain assigns this entry encodes (0 for a standalone
|
|
66
|
+
* reduction kernel, 1 for a single-assign chain kernel, >=2 for
|
|
67
|
+
* multi-stmt chains). */
|
|
68
|
+
chainLength: number;
|
|
69
|
+
/** Set when the kernel produces a trailing scalar reduction output.
|
|
70
|
+
* The driver allocates a `Float64Array(1)` for `out_acc`, calls the
|
|
71
|
+
* kernel, then combines the result with `env[accName]` per the
|
|
72
|
+
* `accOp` and `hasAccumulate` fields. Complex chains never set this
|
|
73
|
+
* — the complex emitter rejects trailing reductions. */
|
|
74
|
+
reduction?: E2ReductionInfo;
|
|
75
|
+
/** Paired-buffer complex path info. When set, the marshaling code
|
|
76
|
+
* takes the complex branch. */
|
|
77
|
+
complex?: E2ComplexInfo;
|
|
78
|
+
}
|
|
79
|
+
export declare function chainCacheGet(firstStmt: Stmt, sig: string): E2CacheEntry | typeof E2_BAILED | undefined;
|
|
80
|
+
export declare function chainCacheSet(firstStmt: Stmt, sig: string, entry: E2CacheEntry | typeof E2_BAILED): void;
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — multi-LHS fused chain C kernel emission.
|
|
3
|
+
*
|
|
4
|
+
* Given a sequence of `JitExpr` RHSs each writing to a (possibly
|
|
5
|
+
* distinct) chain LHS, produces one C function that runs every assign
|
|
6
|
+
* in a single per-element loop. Each chain LHS becomes a stack-local
|
|
7
|
+
* `double <name>` declared once at the top of the loop body. Within
|
|
8
|
+
* the body, references to a chain-LHS name resolve to the stack-local
|
|
9
|
+
* once the corresponding assign has run; before that point they
|
|
10
|
+
* resolve to `in_<name>[i]` (so the kernel signature includes
|
|
11
|
+
* `in_<lhsName>` for any chain LHS that's read before being written).
|
|
12
|
+
*
|
|
13
|
+
* After the per-iter assigns, every "escape" LHS (one that's actually
|
|
14
|
+
* referenced by the rest of the function body) gets written to its
|
|
15
|
+
* `out_<name>[i]` pointer. Chain-locals (only used inside the chain)
|
|
16
|
+
* are dropped at the end of the iteration with no buffer materialized.
|
|
17
|
+
*
|
|
18
|
+
* void e2c_<hash>(int64_t n,
|
|
19
|
+
* const double *in_<input1>, ...,
|
|
20
|
+
* [const double *in_<lhs_needing_input>, ...,]
|
|
21
|
+
* double s_<scalar1>, ...,
|
|
22
|
+
* double *out_<escape_lhs1>, ...)
|
|
23
|
+
* {
|
|
24
|
+
* #pragma omp simd
|
|
25
|
+
* for (int64_t i = 0; i < n; i++) {
|
|
26
|
+
* double <chain_lhs1>, <chain_lhs2>, ...;
|
|
27
|
+
* <chain_lhs1> = <stmt0_rhs_C>;
|
|
28
|
+
* <chain_lhs2> = <stmt1_rhs_C>;
|
|
29
|
+
* ...
|
|
30
|
+
* out_<escape_lhs1>[i] = <escape_lhs1>;
|
|
31
|
+
* out_<escape_lhs2>[i] = <escape_lhs2>;
|
|
32
|
+
* }
|
|
33
|
+
* }
|
|
34
|
+
*/
|
|
35
|
+
import { type ChainAssignSpec, type KernelInputs } from "./emitShared.js";
|
|
36
|
+
export type { ChainAssignSpec } from "./emitShared.js";
|
|
37
|
+
export interface E2ChainEmitResult {
|
|
38
|
+
kernelName: string;
|
|
39
|
+
cSource: string;
|
|
40
|
+
koffiSig: string;
|
|
41
|
+
hash: string;
|
|
42
|
+
/** Tensor input names in signature order — does NOT include any
|
|
43
|
+
* in_<lhs> entries. */
|
|
44
|
+
inputTensors: string[];
|
|
45
|
+
/** Chain LHS names that appear as `in_<name>` in the signature, in
|
|
46
|
+
* order. */
|
|
47
|
+
inputLhsNames: string[];
|
|
48
|
+
/** Scalar input names in signature order. */
|
|
49
|
+
inputScalars: string[];
|
|
50
|
+
/** Chain LHS names that appear as `out_<name>` in the signature,
|
|
51
|
+
* in order. */
|
|
52
|
+
escapeLhsNames: string[];
|
|
53
|
+
chainLength: number;
|
|
54
|
+
}
|
|
55
|
+
export declare function emitE2ChainKernel(assigns: ChainAssignSpec[], inputs: KernelInputs, par?: boolean): E2ChainEmitResult;
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 (experimental) — per-assign expression classifier.
|
|
3
|
+
*
|
|
4
|
+
* Walks an AST `Expr` and decides whether it can be compiled into a
|
|
5
|
+
* single per-element C kernel. The classifier never evaluates anything
|
|
6
|
+
* — it only inspects the AST shape and the names referenced.
|
|
7
|
+
*
|
|
8
|
+
* Whitelist:
|
|
9
|
+
* - Number, Ident
|
|
10
|
+
* - Binary with arithmetic / comparison ops
|
|
11
|
+
* - Unary Plus, Minus, Not
|
|
12
|
+
* - FuncCall to a whitelisted scalar math builtin
|
|
13
|
+
*
|
|
14
|
+
* Anything outside the whitelist is recorded as an "opaque root": the
|
|
15
|
+
* driver is expected to evaluate that subtree via the interpreter and
|
|
16
|
+
* bind the result to a fresh synthetic name, then re-classify with that
|
|
17
|
+
* name in scope.
|
|
18
|
+
*
|
|
19
|
+
* The classifier returns a list of opaque-root subtrees and the
|
|
20
|
+
* "rewritten" expression that uses synthetic names where the opaque
|
|
21
|
+
* roots used to be. The driver is responsible for runtime type checks
|
|
22
|
+
* and for actually evaluating the opaque subtrees.
|
|
23
|
+
*/
|
|
24
|
+
import type { Expr } from "../../parser/types.js";
|
|
25
|
+
import { BinaryOperation } from "../../parser/types.js";
|
|
26
|
+
export { BinaryOperation } from "../../parser/types.js";
|
|
27
|
+
/** Scalar math builtins that map cleanly to C99. Mirrors the JS-JIT
|
|
28
|
+
* Math.* table, plus pow, hypot, atan2, etc. */
|
|
29
|
+
export declare const E2_BUILTIN_WHITELIST: ReadonlySet<string>;
|
|
30
|
+
/** One opaque subtree the driver must evaluate before invoking the
|
|
31
|
+
* kernel. The classifier replaces it in `emittableExpr` with an Ident
|
|
32
|
+
* named `syntheticName`. */
|
|
33
|
+
export interface OpaqueRoot {
|
|
34
|
+
syntheticName: string;
|
|
35
|
+
expr: Expr;
|
|
36
|
+
}
|
|
37
|
+
export interface ClassifyResult {
|
|
38
|
+
/** AST with opaque subtrees replaced by Ident(syntheticName) nodes. */
|
|
39
|
+
emittableExpr: Expr;
|
|
40
|
+
/** Subtrees the driver must evaluate via the interpreter. */
|
|
41
|
+
opaqueRoots: OpaqueRoot[];
|
|
42
|
+
/** Identifiers referenced in `emittableExpr` that originated from the
|
|
43
|
+
* user's environment (i.e. NOT synthetic opaque-root bindings). The
|
|
44
|
+
* driver looks these up in the env to determine input types. */
|
|
45
|
+
envIdents: Set<string>;
|
|
46
|
+
}
|
|
47
|
+
/** Classify an expression. Always succeeds — the worst case is the
|
|
48
|
+
* whole expression becomes a single opaque root, which the driver
|
|
49
|
+
* will reject. */
|
|
50
|
+
export declare function classifyExpr(expr: Expr): ClassifyResult;
|
|
51
|
+
/** Heuristic gate: an expression is "worth" JIT'ing only when it does
|
|
52
|
+
* some work — a bare Ident or Number is not. The driver also gates
|
|
53
|
+
* on tensor size at runtime; this is just a structural pre-filter to
|
|
54
|
+
* skip the cost of lowering trivial expressions. */
|
|
55
|
+
export declare function isWorthCompiling(emittableExpr: Expr): boolean;
|
|
56
|
+
/** A single classification entry for one assign in a chain. The chain
|
|
57
|
+
* emitter consumes one of these per assign in order. */
|
|
58
|
+
export interface ChainAssignClassification {
|
|
59
|
+
/** The original AST stmt (kept so the cache can key on it). */
|
|
60
|
+
stmt: import("../../parser/types.js").Stmt & {
|
|
61
|
+
type: "Assign";
|
|
62
|
+
};
|
|
63
|
+
/** The classifier's rewritten RHS — opaque subtrees replaced by
|
|
64
|
+
* Ident(syntheticName). */
|
|
65
|
+
emittableExpr: Expr;
|
|
66
|
+
/** Opaque subtrees this assign contributed (driver evaluates these
|
|
67
|
+
* before the kernel call). */
|
|
68
|
+
opaqueRoots: OpaqueRoot[];
|
|
69
|
+
/** Identifiers referenced by this assign's emittableExpr. Includes
|
|
70
|
+
* synthetic opaque-root names. */
|
|
71
|
+
envIdents: Set<string>;
|
|
72
|
+
/** True if the assign reads its own LHS (e.g. `r = r + x`). For the
|
|
73
|
+
* first stmt of a chain this means the kernel needs `in_<lhs>` as
|
|
74
|
+
* an input pointer; for later stmts it just means a chain-local
|
|
75
|
+
* read. */
|
|
76
|
+
selfReadsLhs: boolean;
|
|
77
|
+
}
|
|
78
|
+
export interface ChainClassification {
|
|
79
|
+
/** Chain assigns, in source order. May have multiple distinct LHSs. */
|
|
80
|
+
assigns: ChainAssignClassification[];
|
|
81
|
+
}
|
|
82
|
+
/** Detect a chain of consecutive suppressed classifiable Assigns
|
|
83
|
+
* starting at `stmts[startIdx]`. Each LHS may be a different name;
|
|
84
|
+
* the driver decides — using full-scope liveness — whether each LHS
|
|
85
|
+
* becomes a chain-local or a materialized output buffer.
|
|
86
|
+
*
|
|
87
|
+
* The chain ends at the first non-Assign, the first unsuppressed
|
|
88
|
+
* Assign, or the first Assign whose RHS classification is not worth
|
|
89
|
+
* compiling.
|
|
90
|
+
*
|
|
91
|
+
* Returns null if the very first stmt isn't a chainable Assign. */
|
|
92
|
+
export declare function classifyAssignChain(stmts: import("../../parser/types.js").Stmt[], startIdx: number): ChainClassification | null;
|
|
93
|
+
/** Reduction op names whose semantics the e2 reduction emitter knows. */
|
|
94
|
+
export declare const E2_REDUCTION_OPS: ReadonlySet<string>;
|
|
95
|
+
export interface TrailingReductionMatch {
|
|
96
|
+
/** The original Assign stmt — pinned for cache identity. */
|
|
97
|
+
stmt: import("../../parser/types.js").Stmt & {
|
|
98
|
+
type: "Assign";
|
|
99
|
+
};
|
|
100
|
+
/** LHS = accumulator name. */
|
|
101
|
+
accName: string;
|
|
102
|
+
/** Reduction op name. */
|
|
103
|
+
reduceName: string;
|
|
104
|
+
/** AST expression argument to the reduction call. The driver decides
|
|
105
|
+
* whether to treat it as a Var-targeting-chain-local (for trailing-
|
|
106
|
+
* after-chain) or as a standalone elemwise expression. */
|
|
107
|
+
targetExpr: Expr;
|
|
108
|
+
/** True for `acc = acc OP reduce(...)`; false for `acc = reduce(...)`. */
|
|
109
|
+
hasAccumulate: boolean;
|
|
110
|
+
/** The accumulate op (only meaningful when `hasAccumulate` is true). */
|
|
111
|
+
accOp?: BinaryOperation;
|
|
112
|
+
}
|
|
113
|
+
/** Match an Assign of the form:
|
|
114
|
+
* acc = reduce(arg)
|
|
115
|
+
* acc = acc OP reduce(arg)
|
|
116
|
+
* acc = reduce(arg) OP acc (commutative ops only)
|
|
117
|
+
* where `reduce` is a single-argument call to a known reduction op.
|
|
118
|
+
* Returns the matched details or null. */
|
|
119
|
+
export declare function matchTrailingReduction(stmt: import("../../parser/types.js").Stmt): TrailingReductionMatch | null;
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — browser-safe indirection for the C compile driver.
|
|
3
|
+
*
|
|
4
|
+
* The driver in `c/compile.ts` is Node-only (it shells out to `cc` and
|
|
5
|
+
* loads via koffi). The browser bundle includes the e2 modules but
|
|
6
|
+
* NOT this driver — calling `setE2CompileFn` from `e2/install.ts` (Node only)
|
|
7
|
+
* swaps in the real implementation. In the browser, the stub throws.
|
|
8
|
+
*/
|
|
9
|
+
export type E2CompileFn = (cSource: string, koffiSig: string, kernelName: string, log?: (msg: string) => void) => ((...args: unknown[]) => unknown) | null;
|
|
10
|
+
export declare function setE2CompileFn(fn: E2CompileFn): void;
|
|
11
|
+
export declare function getE2CompileFn(): E2CompileFn;
|
|
12
|
+
/** Minimum element count of the largest tensor input before we'll
|
|
13
|
+
* consider compiling an e2 kernel. Below this, koffi overhead dwarfs
|
|
14
|
+
* the work and falling through to the interpreter is faster.
|
|
15
|
+
* Overridable via `NUMBL_E2_MIN_ELEMS`. */
|
|
16
|
+
export declare function e2MinElems(): number;
|