numbl 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/binding.gyp +8 -3
- package/dist-cli/cli.js +6676 -3695
- package/dist-lib/lib.js +6703 -3341
- package/dist-lib/numbl-core/executeCode.d.ts +3 -10
- package/dist-lib/numbl-core/fileIOAdapter.d.ts +2 -0
- package/dist-lib/numbl-core/interpreter/interpreter.d.ts +28 -24
- package/dist-lib/numbl-core/jit/e1/complexKernelEmit.d.ts +46 -0
- package/dist-lib/numbl-core/jit/e1/hash.d.ts +10 -0
- package/dist-lib/numbl-core/jit/e1/kernelEmit.d.ts +1 -1
- package/dist-lib/numbl-core/jit/e1/multiReductionKernel.d.ts +66 -0
- package/dist-lib/numbl-core/jit/e2/assignKernel.d.ts +34 -0
- package/dist-lib/numbl-core/jit/e2/astToJitExpr.d.ts +25 -0
- package/dist-lib/numbl-core/jit/e2/cache.d.ts +80 -0
- package/dist-lib/numbl-core/jit/e2/chainKernelEmit.d.ts +55 -0
- package/dist-lib/numbl-core/jit/e2/classify.d.ts +119 -0
- package/dist-lib/numbl-core/jit/e2/compileFn.d.ts +16 -0
- package/dist-lib/numbl-core/jit/e2/complexChainKernelEmit.d.ts +79 -0
- package/dist-lib/numbl-core/jit/e2/emitShared.d.ts +71 -0
- package/dist-lib/numbl-core/jit/e2/install.d.ts +11 -0
- package/dist-lib/numbl-core/jit/e2/liveness.d.ts +29 -0
- package/dist-lib/numbl-core/jit/e2/loopKernel.d.ts +49 -0
- package/dist-lib/numbl-core/jit/e2/loopKernelEmit.d.ts +75 -0
- package/dist-lib/numbl-core/jit/e2/multiReductionDriver.d.ts +24 -0
- package/dist-lib/numbl-core/jit/e2/reductionKernelEmit.d.ts +72 -0
- package/dist-lib/numbl-core/jit/e2/scalarFnDriver.d.ts +29 -0
- package/dist-lib/numbl-core/jit/fusedScalarEmit.d.ts +8 -0
- package/dist-lib/numbl-core/jit/heavyOps.d.ts +15 -0
- package/dist-lib/numbl-core/jit/js/jitCodegen.d.ts +1 -1
- package/dist-lib/numbl-core/jit/js/jsFusedCodegen.d.ts +1 -1
- package/dist-lib/numbl-core/jit/js/jsMultiReduction.d.ts +70 -0
- package/dist-lib/numbl-core/version.d.ts +1 -1
- package/native/numbl_addon.cpp +9 -0
- package/package.json +2 -4
- package/dist-lib/numbl-core/jit/c/hybrid.d.ts +0 -42
- package/dist-lib/numbl-core/jit/c/install.d.ts +0 -15
- package/dist-lib/numbl-core/jit/c/parityError.d.ts +0 -26
- package/dist-lib/numbl-core/jit/c/registry.d.ts +0 -51
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — complex multi-LHS fused chain C kernel emission (paired-buffer).
|
|
3
|
+
*
|
|
4
|
+
* Sister to [chainKernelEmit.ts](./chainKernelEmit.ts) for chains that
|
|
5
|
+
* produce at least one complex tensor. Mirrors the codegen shape of
|
|
6
|
+
* [e1/complexKernelEmit.ts](../e1/complexKernelEmit.ts) and uses the
|
|
7
|
+
* same fusion envelope (+ - * .* unary +/- conj real imag, real/complex
|
|
8
|
+
* widening, ImagLiteral). Anything outside that subset (`./`,
|
|
9
|
+
* `abs(complex)`, transcendentals on complex) is rejected at the e2
|
|
10
|
+
* lowerer level, which causes the driver to bail to the interpreter
|
|
11
|
+
* per-op complex path — matching e1's fusion fallthrough behavior.
|
|
12
|
+
*
|
|
13
|
+
* void e2cc_<hash>(int64_t n,
|
|
14
|
+
* const double *in_<cta>_re, const double *in_<cta>_im,
|
|
15
|
+
* const double *in_<rtb>,
|
|
16
|
+
* [in_<lhs_input>_re/_im or in_<lhs_input> ...,]
|
|
17
|
+
* double s_<csc>_re, double s_<csc>_im,
|
|
18
|
+
* double s_<rsc>,
|
|
19
|
+
* [out_<lhs>_re, out_<lhs>_im or out_<lhs> ...])
|
|
20
|
+
* {
|
|
21
|
+
* #pragma omp simd
|
|
22
|
+
* for (int64_t i = 0; i < n; i++) {
|
|
23
|
+
* double <clhs1>_re, <clhs1>_im, ..., <rlhs1>, ...;
|
|
24
|
+
* <clhs1>_re = ...; <clhs1>_im = ...;
|
|
25
|
+
* <rlhs1> = ...;
|
|
26
|
+
* out_<clhs>_re[i] = <clhs>_re; out_<clhs>_im[i] = <clhs>_im;
|
|
27
|
+
* out_<rlhs>[i] = <rlhs>;
|
|
28
|
+
* }
|
|
29
|
+
* }
|
|
30
|
+
*
|
|
31
|
+
* Complex chains deliberately stick to `#pragma omp simd` regardless of
|
|
32
|
+
* `--par`: per-element bodies are ~6 flops spread across paired re/im
|
|
33
|
+
* buffers (memory-bandwidth-bound), and thread-spawn overhead dominates
|
|
34
|
+
* the compute win at realistic N. Matches e1's stance.
|
|
35
|
+
*/
|
|
36
|
+
import type { ChainAssignSpec } from "./emitShared.js";
|
|
37
|
+
export interface E2ComplexKernelInputs {
|
|
38
|
+
/** Env tensor names, split by complex-ness. */
|
|
39
|
+
complexTensorNames: string[];
|
|
40
|
+
realTensorNames: string[];
|
|
41
|
+
/** Env scalar names, split by complex-ness. */
|
|
42
|
+
complexScalarNames: string[];
|
|
43
|
+
realScalarNames: string[];
|
|
44
|
+
/** Chain LHS names that need `in_<name>` because they're read before
|
|
45
|
+
* being written. Split by complex-ness of the LHS. */
|
|
46
|
+
complexInputLhsNames: string[];
|
|
47
|
+
realInputLhsNames: string[];
|
|
48
|
+
/** Chain LHS names that escape the chain. Split by complex-ness. */
|
|
49
|
+
complexEscapeLhsNames: string[];
|
|
50
|
+
realEscapeLhsNames: string[];
|
|
51
|
+
}
|
|
52
|
+
export interface E2ComplexChainEmitResult {
|
|
53
|
+
kernelName: string;
|
|
54
|
+
cSource: string;
|
|
55
|
+
koffiSig: string;
|
|
56
|
+
hash: string;
|
|
57
|
+
/** In signature order: complex tensors, real tensors. */
|
|
58
|
+
complexInputTensors: string[];
|
|
59
|
+
realInputTensors: string[];
|
|
60
|
+
/** In signature order: complex input LHSs, real input LHSs. */
|
|
61
|
+
complexInputLhsNames: string[];
|
|
62
|
+
realInputLhsNames: string[];
|
|
63
|
+
/** In signature order: complex scalars, real scalars. */
|
|
64
|
+
complexInputScalars: string[];
|
|
65
|
+
realInputScalars: string[];
|
|
66
|
+
/** In signature order: complex escape LHSs, real escape LHSs. */
|
|
67
|
+
complexEscapeLhsNames: string[];
|
|
68
|
+
realEscapeLhsNames: string[];
|
|
69
|
+
chainLength: number;
|
|
70
|
+
}
|
|
71
|
+
/** Per-assign LHS info — complex-ness of the RHS determines whether
|
|
72
|
+
* this stmt emits a paired (re/im) local or a single real local. */
|
|
73
|
+
export interface ComplexChainAssignSpec extends ChainAssignSpec {
|
|
74
|
+
/** True when THIS stmt's RHS is complex. Chain-LHS type can differ
|
|
75
|
+
* per reassignment; we track per-stmt so a `a = real; a = complex;`
|
|
76
|
+
* sequence sees `a` as complex only after the second assign. */
|
|
77
|
+
rhsIsComplex: boolean;
|
|
78
|
+
}
|
|
79
|
+
export declare function emitE2ComplexChainKernel(assigns: ComplexChainAssignSpec[], inputs: E2ComplexKernelInputs): E2ComplexChainEmitResult;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — shared kernel-emission helpers used by both the chain emitter
|
|
3
|
+
* and the reduction emitter. The two emitters build the same kernel
|
|
4
|
+
* shape up to a few trailing differences (reduction init / combine /
|
|
5
|
+
* out_acc output), so everything that's identical lives here.
|
|
6
|
+
*/
|
|
7
|
+
import type { JitExpr } from "../jitTypes.js";
|
|
8
|
+
import { type FusedTarget } from "../fusedScalarEmit.js";
|
|
9
|
+
import type { ScalarOpTarget } from "../scalarEmit.js";
|
|
10
|
+
/**
|
|
11
|
+
* C helper included in every e2 kernel prologue.
|
|
12
|
+
*
|
|
13
|
+
* `-ffast-math` implies `-ffinite-math-only`, which lets the compiler
|
|
14
|
+
* assume no NaN/Inf values and constant-fold `x != x` to 0 and `x == x`
|
|
15
|
+
* to 1. The NaN-detection idiom `mask = x ~= x` would silently return all
|
|
16
|
+
* zeros. `numbl_is_nan_fp` inspects the IEEE-754 bit pattern directly —
|
|
17
|
+
* the optimizer cannot look through `memcpy` to apply finite-math
|
|
18
|
+
* assumptions, so this survives the flag.
|
|
19
|
+
*/
|
|
20
|
+
export declare const E2_C_PROLOGUE: string;
|
|
21
|
+
/**
|
|
22
|
+
* e2-specific scalar op target.
|
|
23
|
+
*
|
|
24
|
+
* Identical to `C_SCALAR_TARGET` except `binEq` / `binNe`: when both
|
|
25
|
+
* operand strings are the same C expression (i.e. the source was `x == x`
|
|
26
|
+
* or `x ~= x`), the standard `==` / `!=` forms are constant-folded to
|
|
27
|
+
* 1 / 0 by `-ffinite-math-only`. Replace them with the bit-pattern NaN
|
|
28
|
+
* helper so the self-comparison gives the correct IEEE 754 result.
|
|
29
|
+
*/
|
|
30
|
+
export declare const E2_C_SCALAR_TARGET: ScalarOpTarget;
|
|
31
|
+
export interface ChainAssignSpec {
|
|
32
|
+
lhsName: string;
|
|
33
|
+
rhs: JitExpr;
|
|
34
|
+
}
|
|
35
|
+
export interface KernelInputs {
|
|
36
|
+
/** Regular env input tensor names (NOT chain LHSs). */
|
|
37
|
+
tensorNames: string[];
|
|
38
|
+
/** Scalar env input names. */
|
|
39
|
+
scalarNames: string[];
|
|
40
|
+
/** Chain LHS names that need `in_<name>` because they're read before
|
|
41
|
+
* being written. */
|
|
42
|
+
inputLhsNames: string[];
|
|
43
|
+
/** Chain LHS names that escape the chain (materialized via
|
|
44
|
+
* `out_<name>`). Does NOT include a reduce-target name — that one
|
|
45
|
+
* is always chain-local by construction. */
|
|
46
|
+
escapeLhsNames: string[];
|
|
47
|
+
}
|
|
48
|
+
export declare const cInputPtr: (name: string) => string;
|
|
49
|
+
export declare const cOutputPtr: (name: string) => string;
|
|
50
|
+
export declare const cScalarParam: (name: string) => string;
|
|
51
|
+
/** FusedTarget for the per-element body. Resolves Var reads to either
|
|
52
|
+
* the chain-local stack name (once the corresponding assign has run)
|
|
53
|
+
* or `in_<name>[i]` (before that point), mangles scalar param names,
|
|
54
|
+
* and dispatches whitelisted builtins through their `jitEmitC`. */
|
|
55
|
+
export declare function makeFusedTarget(locallyAssigned: ReadonlySet<string>): FusedTarget;
|
|
56
|
+
/** Unique chain LHS names in source order — for the leading
|
|
57
|
+
* `double <a>, <b>;` declaration. */
|
|
58
|
+
export declare function uniqueLhsOrdered(chain: ChainAssignSpec[]): string[];
|
|
59
|
+
/** All tensor-typed names visible to emitFusedScalarExpr: regular env
|
|
60
|
+
* tensors, `in_<lhs>` tensors, and chain LHSs. */
|
|
61
|
+
export declare function allTensorVarsFor(inputs: KernelInputs, chain: ChainAssignSpec[]): Set<string>;
|
|
62
|
+
/** Emit one `<lhs> = <rhsC>;` line per chain assign, growing
|
|
63
|
+
* `locallyAssigned` as we go so later stmts resolve earlier LHSs to
|
|
64
|
+
* the stack-local. */
|
|
65
|
+
export declare function emitChainAssignLines(chain: ChainAssignSpec[], allTensorVars: ReadonlySet<string>, ft: FusedTarget, locallyAssigned: Set<string>): string[];
|
|
66
|
+
/** Kernel param list (tensor → inputLhs → scalar → escapeLhs). Callers
|
|
67
|
+
* append any trailing params (e.g. `double *out_acc`). */
|
|
68
|
+
export declare function buildParamList(inputs: KernelInputs): string[];
|
|
69
|
+
/** koffi type list in the same order as `buildParamList`. Callers
|
|
70
|
+
* append any trailing entries. */
|
|
71
|
+
export declare function buildKoffiParts(inputs: KernelInputs): string[];
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — Node-only install hook.
|
|
3
|
+
*
|
|
4
|
+
* Sets the module-level `e2CompileFn` to the real `compileAndLoad`
|
|
5
|
+
* driver from `c/compile.ts`. The browser bundle never imports this
|
|
6
|
+
* file, so `e2CompileFn` stays at the throwing stub and any attempt
|
|
7
|
+
* to use `--opt e2` from the web fails with a clear message.
|
|
8
|
+
*
|
|
9
|
+
* Idempotent: re-importing in tests doesn't re-install.
|
|
10
|
+
*/
|
|
11
|
+
export {};
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — AST liveness helpers.
|
|
3
|
+
*
|
|
4
|
+
* Used by the chain classifier to decide whether a chain LHS is
|
|
5
|
+
* actually used outside the chain's own stmts. If not, it can be
|
|
6
|
+
* compiled as a per-element stack-local instead of being materialized
|
|
7
|
+
* as a tensor output buffer.
|
|
8
|
+
*
|
|
9
|
+
* The "scope" passed in is the innermost enclosing function body or
|
|
10
|
+
* top-level script body — chosen so that for-bodies, if-bodies, etc.
|
|
11
|
+
* are scanned recursively (MATLAB has flat function-level scoping for
|
|
12
|
+
* locals, so a name introduced inside a for-loop is visible to other
|
|
13
|
+
* stmts in the same function body).
|
|
14
|
+
*
|
|
15
|
+
* The walk excludes the chain's own stmts (and the trailing-reduction
|
|
16
|
+
* stmt if any) from the scan, applying the exclusion at every nesting
|
|
17
|
+
* level — so a chain inside a for-body whose LHS is read by another
|
|
18
|
+
* stmt in the same function body counts as referenced, but the chain
|
|
19
|
+
* stmts themselves don't trigger a false positive.
|
|
20
|
+
*/
|
|
21
|
+
import type { Stmt } from "../../parser/types.js";
|
|
22
|
+
/**
|
|
23
|
+
* True iff `name` appears anywhere in `scopeBody` outside the stmts
|
|
24
|
+
* listed in `excludeStmts`. The exclusion is by reference identity
|
|
25
|
+
* and is applied at every nesting level — pass the chain stmts (and
|
|
26
|
+
* the trailing-reduction stmt if any) so they don't trigger false
|
|
27
|
+
* positives.
|
|
28
|
+
*/
|
|
29
|
+
export declare function isNameReferencedOutsideStmts(scopeBody: readonly Stmt[], excludeStmts: ReadonlySet<Stmt>, name: string): boolean;
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 whole-loop C JIT.
|
|
3
|
+
*
|
|
4
|
+
* For a `for varName = lo:hi <body> end` where the body fits a supported
|
|
5
|
+
* shape, emit a single C function that runs all n iterations and call it
|
|
6
|
+
* once, instead of walking the AST on every iteration.
|
|
7
|
+
*
|
|
8
|
+
* Without this path, `--opt e2` pays ~70–100 ns per iter on a trivial
|
|
9
|
+
* `s = s + i` just for AST dispatch; a compiled C loop runs it in <1 ns.
|
|
10
|
+
*
|
|
11
|
+
* Current supported body shapes (all may mix in one loop):
|
|
12
|
+
* - scalar assign `s = s + sin(i) * cos(i) + sqrt(i*0.01)`
|
|
13
|
+
* - scalar indexed read `s = s + x(i)` (real tensor x)
|
|
14
|
+
* - scalar indexed write `y(i) = sin(i*0.01)` (preallocated y)
|
|
15
|
+
* - tensor local (elemwise) `c = a.*b + i*0.001` (per-element
|
|
16
|
+
* expression is
|
|
17
|
+
* inlined into any
|
|
18
|
+
* consuming sum();
|
|
19
|
+
* last-iter value
|
|
20
|
+
* is also written
|
|
21
|
+
* back to the env
|
|
22
|
+
* for MATLAB
|
|
23
|
+
* post-loop
|
|
24
|
+
* visibility)
|
|
25
|
+
* - reductions `s = s + sum(c)` (c is a
|
|
26
|
+
* tensor_local —
|
|
27
|
+
* chained
|
|
28
|
+
* tensor_locals
|
|
29
|
+
* fuse through)
|
|
30
|
+
*
|
|
31
|
+
* Not supported (falls through to the interpreter / other JIT paths):
|
|
32
|
+
* - non-`lo:hi` loop shapes (stepped ranges, `for i = v`)
|
|
33
|
+
* - complex or logical tensor inputs
|
|
34
|
+
* - matrix-matrix / matrix-vector multiplication
|
|
35
|
+
* - bsxfun / broadcast across shapes
|
|
36
|
+
* - function-handle calls, user-function calls
|
|
37
|
+
* - control flow inside the body (if / while / return)
|
|
38
|
+
* - multi-dimensional tensor access
|
|
39
|
+
*/
|
|
40
|
+
import type { Stmt } from "../../parser/types.js";
|
|
41
|
+
import type { Interpreter } from "../../interpreter/interpreter.js";
|
|
42
|
+
/**
|
|
43
|
+
* Attempt to compile and execute a for-loop as one C kernel under
|
|
44
|
+
* `--opt e2`. Returns true on success, false to fall back to the regular
|
|
45
|
+
* interpreter path (the caller will run the loop normally).
|
|
46
|
+
*/
|
|
47
|
+
export declare function tryE2Loop(interp: Interpreter, stmt: Stmt & {
|
|
48
|
+
type: "For";
|
|
49
|
+
}): boolean;
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 whole-loop C emission.
|
|
3
|
+
*
|
|
4
|
+
* Given a classified loop body (`BodyStmt[]`) plus the parameter lists
|
|
5
|
+
* that describe how env values flow in and out, emit a single C
|
|
6
|
+
* function that runs the whole `for varName = lo:hi` loop in one call.
|
|
7
|
+
*
|
|
8
|
+
* Three BodyStmt shapes are supported:
|
|
9
|
+
*
|
|
10
|
+
* scalar_assign `s = s + sin(i)` → one C statement per iter
|
|
11
|
+
* tensor_write `y(i) = sin(i*0.01)` → `v_y[(int64_t)idx-1] = ...`
|
|
12
|
+
* tensor_local `c = a.*b + i*0.001` → no code emitted here; its
|
|
13
|
+
* per-element expression is
|
|
14
|
+
* substituted into whichever
|
|
15
|
+
* reduction consumes it
|
|
16
|
+
*
|
|
17
|
+
* Reductions: a `scalar_assign` carries a list of `sum(<tensor_local>)`
|
|
18
|
+
* rewrites that were pulled out of its RHS upstream. Each is emitted as
|
|
19
|
+
* an inline inner `for __j` loop that accumulates the tensor_local's
|
|
20
|
+
* per-element expression into a fresh local. Chained tensor_locals
|
|
21
|
+
* (`d = sqrt(c+1)` where c is itself a tensor_local) fuse through
|
|
22
|
+
* recursively, so no intermediate buffer is materialized.
|
|
23
|
+
*/
|
|
24
|
+
import type { JitExpr } from "../jitTypes.js";
|
|
25
|
+
/** Scalar math builtins we emit as direct C library calls. We bypass
|
|
26
|
+
* each IBuiltin's `jitEmitC` here because some of those reject based
|
|
27
|
+
* on type narrowing (e.g. `sqrt` requires `isNonneg` and we don't
|
|
28
|
+
* propagate sign through Binary ops) — but in a pure-real scalar loop
|
|
29
|
+
* the C semantics (NaN on negative sqrt, etc.) match what a MATLAB
|
|
30
|
+
* user gets from `sqrt` on real numeric input.
|
|
31
|
+
*
|
|
32
|
+
* Exported so the driver's pre-lowering analysis can treat these
|
|
33
|
+
* names as non-env references. */
|
|
34
|
+
export declare const LOOP_SCALAR_BUILTINS: Record<string, string>;
|
|
35
|
+
/** A fused reduction lifted out of a `scalar_assign`'s RHS.
|
|
36
|
+
* `sum(<tensorLocal>)` in the source becomes a synthetic scalar ident
|
|
37
|
+
* `synthName`; the emitter materializes it as an inline inner loop
|
|
38
|
+
* that accumulates `tensorLocal`'s per-element expression. */
|
|
39
|
+
export interface Reduction {
|
|
40
|
+
synthName: string;
|
|
41
|
+
tensorLocal: string;
|
|
42
|
+
op: "sum";
|
|
43
|
+
}
|
|
44
|
+
/** A body statement in a form ready for C emission. */
|
|
45
|
+
export type BodyStmt = {
|
|
46
|
+
kind: "scalar_assign";
|
|
47
|
+
name: string;
|
|
48
|
+
rhs: JitExpr;
|
|
49
|
+
reductions: Reduction[];
|
|
50
|
+
} | {
|
|
51
|
+
kind: "tensor_write";
|
|
52
|
+
name: string;
|
|
53
|
+
idxRhs: JitExpr;
|
|
54
|
+
rhs: JitExpr;
|
|
55
|
+
} | {
|
|
56
|
+
kind: "tensor_local";
|
|
57
|
+
name: string;
|
|
58
|
+
elemExpr: JitExpr;
|
|
59
|
+
lengthTensor: string;
|
|
60
|
+
};
|
|
61
|
+
/** Mangle a MATLAB scalar name to a C local-variable name. Prefix keeps
|
|
62
|
+
* it out of the way of our bookkeeping locals (`lo`, `hi`, `__iv`). */
|
|
63
|
+
export declare function v(name: string): string;
|
|
64
|
+
/** Name for the `int64_t` length companion that travels alongside each
|
|
65
|
+
* tensor param so inner reductions can bound their inline `__j` loop. */
|
|
66
|
+
export declare function lenN(name: string): string;
|
|
67
|
+
/** Names of all tensor_locals in the body, in body-declaration order.
|
|
68
|
+
* Callers use this to allocate matching output buffers in the same
|
|
69
|
+
* order as the kernel's param list. */
|
|
70
|
+
export declare function tensorLocalNames(body: BodyStmt[]): string[];
|
|
71
|
+
export declare function emitLoopKernel(scalarInputVars: string[], tensorInputVars: string[], tensorInoutVars: string[], inoutVars: string[], loopVar: string, body: BodyStmt[]): {
|
|
72
|
+
cSource: string;
|
|
73
|
+
kernelName: string;
|
|
74
|
+
koffiSig: string;
|
|
75
|
+
};
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — multi-reduction driver.
|
|
3
|
+
*
|
|
4
|
+
* Handles a scalar `Assign` whose RHS contains TWO or more reduction
|
|
5
|
+
* calls (`sum`, `prod`, `max`, `min`, `mean`) over the same single
|
|
6
|
+
* tensor variable, e.g.
|
|
7
|
+
*
|
|
8
|
+
* red_acc = red_acc + sum(x) + mean(x) + max(x) + min(x);
|
|
9
|
+
*
|
|
10
|
+
* The default interpreter path makes one pass through the tensor per
|
|
11
|
+
* reduction (4× the memory traffic of the optimal). The e2 driver
|
|
12
|
+
* detects the pattern, compiles ONE kernel that computes every
|
|
13
|
+
* requested reduction in a single pass, and substitutes the reduction
|
|
14
|
+
* subtrees in the RHS with the kernel's scalar outputs before
|
|
15
|
+
* evaluating the residual expression.
|
|
16
|
+
*
|
|
17
|
+
* Reuses [e1/multiReductionKernel.ts](../e1/multiReductionKernel.ts)
|
|
18
|
+
* for the C emission (same shape works for both backends).
|
|
19
|
+
*/
|
|
20
|
+
import type { Stmt } from "../../parser/types.js";
|
|
21
|
+
import type { Interpreter } from "../../interpreter/interpreter.js";
|
|
22
|
+
export declare function tryE2MultiReduction(interp: Interpreter, stmt: Stmt & {
|
|
23
|
+
type: "Assign";
|
|
24
|
+
}): boolean;
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — reduction kernel emission.
|
|
3
|
+
*
|
|
4
|
+
* Handles two related patterns in a single emitter:
|
|
5
|
+
*
|
|
6
|
+
* (A) Standalone reduction:
|
|
7
|
+
* acc = [acc OP] reduce(elemwiseExpr)
|
|
8
|
+
* Empty chain prefix; the kernel walks the inputs once and
|
|
9
|
+
* accumulates `reduce(per-element-expr)` into a scalar buffer.
|
|
10
|
+
*
|
|
11
|
+
* (B) Chain + trailing reduction:
|
|
12
|
+
* lhs1 = ...; lhs2 = ...; ...; lhsK = ...;
|
|
13
|
+
* acc = [acc OP] reduce(lhsK)
|
|
14
|
+
* The chain runs in the same per-element loop; lhsK is purely
|
|
15
|
+
* chain-local (never materialized) — the kernel accumulates
|
|
16
|
+
* reduce(lhsK) into the scalar buffer. Other chain LHSs may
|
|
17
|
+
* still escape (extra `out_<name>` outputs).
|
|
18
|
+
*
|
|
19
|
+
* Both cases use the same kernel shape:
|
|
20
|
+
*
|
|
21
|
+
* void e2r_<hash>(int64_t n,
|
|
22
|
+
* ..in_*.., ..in_lhs_input.., ..s_*..,
|
|
23
|
+
* ..out_escape.., double *out_acc)
|
|
24
|
+
* {
|
|
25
|
+
* double acc = <init>;
|
|
26
|
+
* #pragma omp simd
|
|
27
|
+
* for (int64_t i = 0; i < n; i++) {
|
|
28
|
+
* double <chain_lhs1>, ..., <chain_lhsK>;
|
|
29
|
+
* <chain_lhs1> = <stmt0_rhs_C>;
|
|
30
|
+
* ...
|
|
31
|
+
* <chain_lhsK> = <stmtK_rhs_C>;
|
|
32
|
+
* out_<escape>[i] = <escape>;
|
|
33
|
+
* <reduce-combine>(acc, <reduce_value_expr>);
|
|
34
|
+
* }
|
|
35
|
+
* *out_acc = acc;
|
|
36
|
+
* }
|
|
37
|
+
*
|
|
38
|
+
* For "mean": JS combines `acc /= n` after reading the buffer back.
|
|
39
|
+
* For "max"/"min": uses if-update inside the loop (works under
|
|
40
|
+
* `-ffast-math` + `#pragma omp simd`).
|
|
41
|
+
*/
|
|
42
|
+
import type { JitExpr } from "../jitTypes.js";
|
|
43
|
+
import { type ChainAssignSpec, type KernelInputs } from "./emitShared.js";
|
|
44
|
+
export interface ReductionEmitSpec {
|
|
45
|
+
/** Chain prefix (length 0 for standalone-reduction). */
|
|
46
|
+
chain: ChainAssignSpec[];
|
|
47
|
+
/** Reduction op name: sum, prod, max, min, mean, any, all. */
|
|
48
|
+
reduceName: string;
|
|
49
|
+
/** Per-element value expression to feed the reduction.
|
|
50
|
+
* - For (A) standalone: the elemwise expression `reduce(...)` was
|
|
51
|
+
* given.
|
|
52
|
+
* - For (B) chain + trailing: a `Var(lastChainLhsName)` JitExpr —
|
|
53
|
+
* the emitter resolves it to the stack-local. */
|
|
54
|
+
reduceValueExpr: JitExpr;
|
|
55
|
+
inputs: KernelInputs;
|
|
56
|
+
}
|
|
57
|
+
export interface E2ReductionEmitResult {
|
|
58
|
+
kernelName: string;
|
|
59
|
+
cSource: string;
|
|
60
|
+
koffiSig: string;
|
|
61
|
+
hash: string;
|
|
62
|
+
inputTensors: string[];
|
|
63
|
+
inputLhsNames: string[];
|
|
64
|
+
inputScalars: string[];
|
|
65
|
+
escapeLhsNames: string[];
|
|
66
|
+
/** True when the kernel produces a scalar reduction output (always
|
|
67
|
+
* true for this emitter; here for symmetry with other entries). */
|
|
68
|
+
hasReductionOutput: true;
|
|
69
|
+
reduceName: string;
|
|
70
|
+
chainLength: number;
|
|
71
|
+
}
|
|
72
|
+
export declare function emitE2ReductionKernel(spec: ReductionEmitSpec, par?: boolean): E2ReductionEmitResult;
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* e2 — whole-function scalar C-kernel driver.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors what e1 does for pure-scalar functions (benchmarks/scalar_bench.m's
|
|
5
|
+
* `run_bench` is the motivating case) but triggers straight from the
|
|
6
|
+
* interpreter's `callUserFunction` entry, not through the JS-JIT outer.
|
|
7
|
+
* Under `--opt e2` the JS-JIT is disabled (optimization clamped to 0),
|
|
8
|
+
* so we can't lean on `tryEmitScalarFnKernel` + the `$h.compileKernel`
|
|
9
|
+
* plumbing; instead we invoke the shared lowering + C-emit pipeline
|
|
10
|
+
* directly and call the resulting koffi function with plain scalar
|
|
11
|
+
* args and Float64Array(1) out-buffers per output.
|
|
12
|
+
*
|
|
13
|
+
* Scope:
|
|
14
|
+
* - All args are scalar `number` or `boolean` RuntimeValues.
|
|
15
|
+
* - Declared outputs (the first `nargout || 1` of them) all lower to
|
|
16
|
+
* scalar / boolean types.
|
|
17
|
+
* - The body survives `checkCFeasibility` (no tic/toc, no Index
|
|
18
|
+
* writes, no disp, etc.).
|
|
19
|
+
*
|
|
20
|
+
* Outside this envelope we return `E2_SKIP` and the caller proceeds
|
|
21
|
+
* with the interpreter path. Compilation failures are HARD errors —
|
|
22
|
+
* mirrors the e2 multi-reduction/chain drivers' policy.
|
|
23
|
+
*/
|
|
24
|
+
import type { Interpreter } from "../../interpreter/interpreter.js";
|
|
25
|
+
import type { FunctionDef } from "../../interpreter/types.js";
|
|
26
|
+
export declare const E2_SKIP: unique symbol;
|
|
27
|
+
/** Try to run `fn(args)` via a whole-function C kernel. Returns
|
|
28
|
+
* `E2_SKIP` to fall through to the interpreter. */
|
|
29
|
+
export declare function tryE2ScalarFn(interp: Interpreter, fn: FunctionDef, args: unknown[], nargout: number): unknown | typeof E2_SKIP;
|
|
@@ -35,6 +35,14 @@ export interface FusedTarget {
|
|
|
35
35
|
* the data pointer is named and whether it's aliased locally.
|
|
36
36
|
*/
|
|
37
37
|
tensorElemRead(name: string): string;
|
|
38
|
+
/**
|
|
39
|
+
* Emit a read of tensor `name` at a runtime 1-based scalar index
|
|
40
|
+
* `idxC` — i.e. `data[(int64_t)idx - 1]`. Used by the e2 whole-loop
|
|
41
|
+
* kernel (scalar-context access; elemwise backends can leave this
|
|
42
|
+
* undefined, the emitter will throw on an Index node). Returns `null`
|
|
43
|
+
* to reject.
|
|
44
|
+
*/
|
|
45
|
+
tensorScalarIndexRead?(name: string, idxC: string): string | null;
|
|
38
46
|
/**
|
|
39
47
|
* Emit a call to a scalar math builtin. The backend decides which
|
|
40
48
|
* builtins it supports and how they map to library functions (e.g.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared heavy-op heuristic for fused-loop emitters (e1 and e2).
|
|
3
|
+
*
|
|
4
|
+
* Counts the number of "expensive" math operations in a JitExpr — the
|
|
5
|
+
* kind of work that's heavy enough per element that OpenMP thread-
|
|
6
|
+
* spawn overhead pays off at N >= 100k. Arithmetic-only chains skip
|
|
7
|
+
* the parallel-for pragma because threads slow them down: the body
|
|
8
|
+
* becomes memory-bandwidth-bound and adding threads only adds overhead.
|
|
9
|
+
*/
|
|
10
|
+
import type { JitExpr } from "./jitTypes.js";
|
|
11
|
+
export declare function countHeavyOps(expr: JitExpr): number;
|
|
12
|
+
/** Minimum element count before `#pragma omp parallel for simd` kicks
|
|
13
|
+
* in. Below this the thread-spawn cost dominates the work.
|
|
14
|
+
* Overridable via `NUMBL_OMP_THRESHOLD` for benchmarks. */
|
|
15
|
+
export declare function ompParallelThreshold(): number;
|
|
@@ -4,4 +4,4 @@
|
|
|
4
4
|
* IR walkers for hoist-pass data collection are in jitCodegenHoist.ts.
|
|
5
5
|
*/
|
|
6
6
|
import { type JitStmt } from "../jitTypes.js";
|
|
7
|
-
export declare function generateJS(body: JitStmt[], params: string[], outputs: string[], nargout: number, localVars: Set<string>, fileName?: string,
|
|
7
|
+
export declare function generateJS(body: JitStmt[], params: string[], outputs: string[], nargout: number, localVars: Set<string>, fileName?: string, experimental?: string, par?: boolean): string;
|
|
@@ -14,4 +14,4 @@
|
|
|
14
14
|
* inside the same loop.
|
|
15
15
|
*/
|
|
16
16
|
import type { FusibleChain } from "../fusion.js";
|
|
17
|
-
export declare function emitJsFusedChain(lines: string[], indent: string, chain: FusibleChain, allTensorVars: ReadonlySet<string>, paramTensors: ReadonlySet<string>, outputTensorNames: ReadonlySet<string>, _localTensorNames: ReadonlySet<string>, mangle: (n: string) => string, experimental?: string): void;
|
|
17
|
+
export declare function emitJsFusedChain(lines: string[], indent: string, chain: FusibleChain, allTensorVars: ReadonlySet<string>, paramTensors: ReadonlySet<string>, outputTensorNames: ReadonlySet<string>, _localTensorNames: ReadonlySet<string>, complexTensorNames: ReadonlySet<string>, complexScalarVars: ReadonlySet<string>, mangle: (n: string) => string, experimental?: string, par?: boolean): void;
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* JS-side multi-reduction block emission.
|
|
3
|
+
*
|
|
4
|
+
* Detection + dispatch for scalar Assigns of the form
|
|
5
|
+
*
|
|
6
|
+
* acc = <expr tree with >=2 reductions over a single tensor Var>
|
|
7
|
+
*
|
|
8
|
+
* On match, the JS codegen emits a block that runs a single-pass
|
|
9
|
+
* reduction loop (either an inline JS loop, or a compiled C kernel
|
|
10
|
+
* under --opt e1), stashes each reduction result in a local, and then
|
|
11
|
+
* emits the Assign's RHS with each reduction Call substituted by the
|
|
12
|
+
* local that holds its value. The RHS-substitution contract lives in
|
|
13
|
+
* `jitCodegen._multiReductionSubst`, which this module installs and
|
|
14
|
+
* clears around the `emitExpr` call the caller provides.
|
|
15
|
+
*
|
|
16
|
+
* Limited to:
|
|
17
|
+
* - sum / prod / max / min / mean (short-circuit any/all excluded)
|
|
18
|
+
* - a single tensor Var argument to every reduction
|
|
19
|
+
* - all reductions read the same tensor name
|
|
20
|
+
* - the tensor is real-typed
|
|
21
|
+
* - at least two reductions in the RHS
|
|
22
|
+
*
|
|
23
|
+
* NaN semantics for max/min are matched to MATLAB omit-NaN behaviour:
|
|
24
|
+
* IEEE unordered compare already skips NaN, and an `any_non_nan` flag
|
|
25
|
+
* drives the all-NaN → NaN fallback.
|
|
26
|
+
*/
|
|
27
|
+
import type { JitExpr, JitStmt } from "../jitTypes.js";
|
|
28
|
+
import { type MultiReduceOp } from "../e1/multiReductionKernel.js";
|
|
29
|
+
export interface MultiReductionMatch {
|
|
30
|
+
/** The scalar Assign being emitted (target + original RHS). */
|
|
31
|
+
stmt: JitStmt & {
|
|
32
|
+
tag: "Assign";
|
|
33
|
+
};
|
|
34
|
+
/** The tensor variable name all reductions read. */
|
|
35
|
+
tensorName: string;
|
|
36
|
+
/** Distinct ops that actually appear, in first-occurrence order. Used
|
|
37
|
+
* to size the kernel's output buffer and decide which accumulators
|
|
38
|
+
* to declare on the JS fallback path. */
|
|
39
|
+
ops: MultiReduceOp[];
|
|
40
|
+
/** Every reduction Call node found in the RHS, paired with its op.
|
|
41
|
+
* The JS emitter builds a substitution map from each Call to the
|
|
42
|
+
* local that holds its result. */
|
|
43
|
+
sites: {
|
|
44
|
+
call: JitExpr & {
|
|
45
|
+
tag: "Call";
|
|
46
|
+
};
|
|
47
|
+
op: MultiReduceOp;
|
|
48
|
+
}[];
|
|
49
|
+
}
|
|
50
|
+
/** Try to match the multi-reduction pattern on a single statement. */
|
|
51
|
+
export declare function tryMatchMultiReduction(stmt: JitStmt): MultiReductionMatch | null;
|
|
52
|
+
/**
|
|
53
|
+
* Emit the multi-reduction block. Writes into `lines`.
|
|
54
|
+
*
|
|
55
|
+
* The block:
|
|
56
|
+
* 1. Aliases `<tensorName>.data` to a local and reads its length.
|
|
57
|
+
* 2. Under e1, compiles/dispatches a single-pass C kernel that fills
|
|
58
|
+
* a Float64Array scratch with the accumulator values, with an
|
|
59
|
+
* inline JS fallback at small `n`. Under non-e1, emits just the
|
|
60
|
+
* JS loop.
|
|
61
|
+
* 3. Post-loop: `mean = sum / n`, and a NaN fixup for max/min when
|
|
62
|
+
* every input element was NaN.
|
|
63
|
+
* 4. Installs `_multiReductionSubst` pointing each reduction Call at
|
|
64
|
+
* its local, emits the Assign's RHS via the caller-provided
|
|
65
|
+
* `emitExpr`, and writes the final `<target> = <rhs>;`.
|
|
66
|
+
*/
|
|
67
|
+
export declare function emitMultiReductionBlock(lines: string[], indent: string, match: MultiReductionMatch, mangleName: (n: string) => string, emitExprWithSubst: (expr: JitExpr, subst: Map<JitExpr, string>) => string, experimental: string | undefined, par: boolean): void;
|
|
68
|
+
/** Reset per-function counter so generated names stay stable between
|
|
69
|
+
* compiles of the same IR. Called from `generateJS`. */
|
|
70
|
+
export declare function resetMultiReductionState(): void;
|
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
/** Numbl version, used for JIT disk cache invalidation. */
|
|
2
|
-
export declare const NUMBL_VERSION = "0.2.0";
|
|
2
|
+
export declare const NUMBL_VERSION = "0.3.0";
|
package/native/numbl_addon.cpp
CHANGED
|
@@ -22,9 +22,11 @@
|
|
|
22
22
|
#include "numbl_addon_common.h"
|
|
23
23
|
#include <cstdlib>
|
|
24
24
|
|
|
25
|
+
#ifndef __APPLE__
|
|
25
26
|
extern "C" {
|
|
26
27
|
void openblas_set_num_threads(int num_threads);
|
|
27
28
|
}
|
|
29
|
+
#endif
|
|
28
30
|
|
|
29
31
|
// ── Addon version ────────────────────────────────────────────────────────────
|
|
30
32
|
// Bump this integer whenever the addon's API changes (new functions, signature
|
|
@@ -58,9 +60,16 @@ static Napi::Value AddonVersion(const Napi::CallbackInfo& info) {
|
|
|
58
60
|
Napi::Object Init(Napi::Env env, Napi::Object exports) {
|
|
59
61
|
// Use single-threaded BLAS unless the user explicitly set the env var.
|
|
60
62
|
// Multi-threaded BLAS adds overhead for the many small matmuls in numbl.
|
|
63
|
+
#ifdef __APPLE__
|
|
64
|
+
// Accelerate reads VECLIB_MAXIMUM_THREADS from the environment on first use.
|
|
65
|
+
if (!std::getenv("VECLIB_MAXIMUM_THREADS")) {
|
|
66
|
+
setenv("VECLIB_MAXIMUM_THREADS", "1", 0);
|
|
67
|
+
}
|
|
68
|
+
#else
|
|
61
69
|
if (!std::getenv("OPENBLAS_NUM_THREADS")) {
|
|
62
70
|
openblas_set_num_threads(1);
|
|
63
71
|
}
|
|
72
|
+
#endif
|
|
64
73
|
exports.Set(Napi::String::New(env, "addonVersion"),
|
|
65
74
|
Napi::Function::New(env, AddonVersion));
|
|
66
75
|
exports.Set(Napi::String::New(env, "inv"),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "numbl",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Run .m source files in the browser and on the command line by compiling to JavaScript",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"type": "module",
|
|
@@ -61,9 +61,7 @@
|
|
|
61
61
|
"test:coverage:all": "bash scripts/coverage-all.sh",
|
|
62
62
|
"test:scripts": "bash numbl_test_scripts/run_all.sh",
|
|
63
63
|
"test:browser": "playwright test",
|
|
64
|
-
"test:scripts:
|
|
65
|
-
"test:scripts:c-jit-fuse": "bash numbl_test_scripts/run_c_jit_fuse.sh",
|
|
66
|
-
"test:scripts:c-jit-parity": "bash numbl_test_scripts/run_c_jit_parity.sh",
|
|
64
|
+
"test:scripts:e1": "bash numbl_test_scripts/run_e1.sh",
|
|
67
65
|
"build:wasm": "bash numbl_test_scripts/build_wasm.sh",
|
|
68
66
|
"update-readme": "tsx scripts/update-readme-usage.ts",
|
|
69
67
|
"check-readme": "tsx scripts/update-readme-usage.ts --check",
|