@vorionsys/atsf-core 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/basis/parser.d.ts +74 -74
- package/dist/basis/parser.js +3 -3
- package/dist/basis/parser.js.map +1 -1
- package/dist/common/config.d.ts +16 -16
- package/dist/enforce/fast-path.d.ts +134 -0
- package/dist/enforce/fast-path.d.ts.map +1 -0
- package/dist/enforce/fast-path.js +257 -0
- package/dist/enforce/fast-path.js.map +1 -0
- package/dist/enforce/pipeline-optimizer.d.ts +111 -0
- package/dist/enforce/pipeline-optimizer.d.ts.map +1 -0
- package/dist/enforce/pipeline-optimizer.js +370 -0
- package/dist/enforce/pipeline-optimizer.js.map +1 -0
- package/dist/enforce/policy-cache.d.ts +92 -0
- package/dist/enforce/policy-cache.d.ts.map +1 -0
- package/dist/enforce/policy-cache.js +186 -0
- package/dist/enforce/policy-cache.js.map +1 -0
- package/dist/enforce/trust-cache.d.ts +118 -0
- package/dist/enforce/trust-cache.d.ts.map +1 -0
- package/dist/enforce/trust-cache.js +218 -0
- package/dist/enforce/trust-cache.js.map +1 -0
- package/dist/paramesphere/gpu-svd.d.ts +102 -0
- package/dist/paramesphere/gpu-svd.d.ts.map +1 -0
- package/dist/paramesphere/gpu-svd.js +668 -0
- package/dist/paramesphere/gpu-svd.js.map +1 -0
- package/dist/paramesphere/index.d.ts +2 -0
- package/dist/paramesphere/index.d.ts.map +1 -1
- package/dist/paramesphere/index.js +1 -0
- package/dist/paramesphere/index.js.map +1 -1
- package/dist/paramesphere/paramesphere-engine.d.ts +40 -3
- package/dist/paramesphere/paramesphere-engine.d.ts.map +1 -1
- package/dist/paramesphere/paramesphere-engine.js +133 -6
- package/dist/paramesphere/paramesphere-engine.js.map +1 -1
- package/dist/paramesphere/scheduled-verifier.d.ts +136 -0
- package/dist/paramesphere/scheduled-verifier.d.ts.map +1 -0
- package/dist/paramesphere/scheduled-verifier.js +338 -0
- package/dist/paramesphere/scheduled-verifier.js.map +1 -0
- package/dist/paramesphere/svd-worker-pool.d.ts +37 -0
- package/dist/paramesphere/svd-worker-pool.d.ts.map +1 -0
- package/dist/paramesphere/svd-worker-pool.js +144 -0
- package/dist/paramesphere/svd-worker-pool.js.map +1 -0
- package/dist/paramesphere/svd-worker.d.ts +2 -0
- package/dist/paramesphere/svd-worker.d.ts.map +1 -0
- package/dist/paramesphere/svd-worker.js +103 -0
- package/dist/paramesphere/svd-worker.js.map +1 -0
- package/dist/paramesphere/types.d.ts +14 -0
- package/dist/paramesphere/types.d.ts.map +1 -1
- package/dist/paramesphere/types.js.map +1 -1
- package/dist/phase6/types.d.ts +257 -257
- package/dist/phase6/types.js +1 -1
- package/dist/phase6/types.js.map +1 -1
- package/package.json +2 -2
package/dist/paramesphere/gpu-svd.js (new file)
@@ -0,0 +1,668 @@
// SPDX-License-Identifier: Apache-2.0
// Copyright 2024-2026 Vorion LLC
/**
 * GPU-Offloaded SVD Computation for ParameSphere
 *
 * Provides optional GPU-accelerated singular value decomposition for
 * T5-T7 scale models where CPU power iteration becomes expensive on
 * large weight matrices.
 *
 * Three provider implementations:
 * - **WebGpuSvdProvider** — Uses WebGPU compute shaders for browser/Deno
 *   environments that expose `navigator.gpu`.
 * - **OnnxSvdProvider** — Uses ONNX Runtime with CUDA execution provider
 *   for Node.js server environments.
 * - **CpuFallbackProvider** — Wraps the existing CPU power-iteration
 *   implementation for environments where no GPU is available.
 *
 * All providers satisfy the same `GpuSvdProvider` interface and produce
 * results within 1e-6 tolerance of the CPU baseline.
 *
 * @packageDocumentation
 */
import { topKSingularValues } from './paramesphere-engine.js';
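// Editor's note: the `GpuSvdProvider` contract mentioned above is declared in
// the accompanying gpu-svd.d.ts (listed in this diff, content not shown).
// Judging from the call sites below, it is roughly:
//
//   interface GpuSvdProvider {
//     available(): Promise<boolean>;
//     topKSingularValues(data: Float64Array, rows: number, cols: number,
//                        k: number): Promise<Float64Array>;
//   }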
// ---------------------------------------------------------------------------
// CPU Fallback Provider
// ---------------------------------------------------------------------------
/**
 * CPU fallback provider that delegates to the existing power-iteration
 * implementation. This is the baseline — all other providers must match
 * its output within 1e-6 tolerance.
 */
export class CpuFallbackProvider {
    async available() {
        return true; // CPU is always available
    }
    async topKSingularValues(data, rows, cols, k) {
        return topKSingularValues(data, rows, cols, k);
    }
}
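// Editor's illustration of the contract (values are an assumed example): the
// singular values of the diagonal matrix diag(3, 2) are exactly [3, 2], so
// the baseline provider should return them within the documented tolerance:
//
//   const provider = new CpuFallbackProvider();
//   const sigmas = await provider.topKSingularValues(
//       new Float64Array([3, 0, 0, 2]), 2, 2, 2);
//   // sigmas ≈ Float64Array [3, 2]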
// ---------------------------------------------------------------------------
// WebGPU SVD Provider
// ---------------------------------------------------------------------------
/**
 * WGSL compute shader for the matrix-vector product A^T * A * v.
 *
 * This fuses the two matrix-vector multiplications (u = A*v, then
 * v_new = A^T*u) into a single dispatch, halving the number of GPU
 * round-trips per power-iteration step.
 *
 * Bindings:
 *   @group(0) @binding(0) — matrix A (row-major, f32 for GPU efficiency)
 *   @group(0) @binding(1) — input vector v (f32)
 *   @group(0) @binding(2) — output vector result (f32)
 *   @group(0) @binding(3) — dimensions uniform (rows, cols as u32)
 */
const WGSL_ATA_SHADER = /* wgsl */ `
struct Dims {
    rows: u32,
    cols: u32,
};

@group(0) @binding(0) var<storage, read> matrix_a: array<f32>;
@group(0) @binding(1) var<storage, read> vec_in: array<f32>;
@group(0) @binding(2) var<storage, read_write> vec_out: array<f32>;
@group(0) @binding(3) var<uniform> dims: Dims;

// Conceptually two stages:
//   Stage 1: u = A * v        (u has 'rows' elements)
//   Stage 2: result = A^T * u (result has 'cols' elements)
// Both stages are folded into a single shader, so no intermediate buffer
// or second dispatch is needed.

// For simplicity and correctness, this shader computes one element of
// the output (A^T * A * v) per invocation. Each invocation j computes:
//   result[j] = sum_i( A[i][j] * sum_k( A[i][k] * v[k] ) )
// which is the j-th element of A^T * (A * v).

@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let j = gid.x;
    let rows = dims.rows;
    let cols = dims.cols;

    if (j >= cols) {
        return;
    }

    var acc: f32 = 0.0;

    // For each row i, compute dot(A[i,:], v), then accumulate A[i][j] * dot
    for (var i: u32 = 0u; i < rows; i = i + 1u) {
        var row_dot: f32 = 0.0;
        let row_off = i * cols;
        for (var k: u32 = 0u; k < cols; k = k + 1u) {
            row_dot = row_dot + matrix_a[row_off + k] * vec_in[k];
        }
        acc = acc + matrix_a[row_off + j] * row_dot;
    }

    vec_out[j] = acc;
}
`;
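// Editor's note: the reference semantics of the shader above, expressed with
// the CPU helpers defined at the bottom of this file, are one fused dispatch
// per power-iteration step:
//
//   const u = matVecMulLocal(A, rows, cols, v);      // stage 1: u = A * v
//   const w = matTransVecMulLocal(A, rows, cols, u); // stage 2: w = A^T * u
//   // vec_out[j] === w[j], up to f32 rounding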
/**
 * WebGPU-based SVD provider.
 *
 * Uses a compute shader for the expensive A^T*A*v products in the power-
 * iteration loop. Falls back to CPU if WebGPU is unavailable.
 *
 * Note: this provider is primarily useful in browser or Deno environments
 * that expose the WebGPU API. In Node.js, prefer `OnnxSvdProvider`.
 */
export class WebGpuSvdProvider {
    device = null;
    pipeline = null;
    fallback = new CpuFallbackProvider();
    initPromise = null;
    /** Power-iteration parameters matching the CPU implementation. */
    static POWER_ITER_MAX = 300;
    static POWER_ITER_TOL = 1e-10;
    async available() {
        if (this.initPromise)
            return this.initPromise;
        this.initPromise = this.initGpu();
        return this.initPromise;
    }
    async topKSingularValues(data, rows, cols, k) {
        const gpuReady = await this.available();
        if (!gpuReady || !this.device || !this.pipeline) {
            return this.fallback.topKSingularValues(data, rows, cols, k);
        }
        try {
            return await this.computeOnGpu(data, rows, cols, k);
        }
        catch {
            // Any GPU error falls back to CPU gracefully
            return this.fallback.topKSingularValues(data, rows, cols, k);
        }
    }
    // -------------------------------------------------------------------------
    // GPU Initialization
    // -------------------------------------------------------------------------
    async initGpu() {
        try {
            // Check for WebGPU availability (globalThis for cross-env compat)
            const nav = globalThis.navigator;
            if (!nav?.gpu)
                return false;
            const adapter = await nav.gpu.requestAdapter();
            if (!adapter)
                return false;
            this.device = await adapter.requestDevice();
            // Compile the compute shader
            const shaderModule = this.device.createShaderModule({
                code: WGSL_ATA_SHADER,
            });
            this.pipeline = this.device.createComputePipeline({
                layout: 'auto',
                compute: {
                    module: shaderModule,
                    entryPoint: 'main',
                },
            });
            return true;
        }
        catch {
            this.device = null;
            this.pipeline = null;
            return false;
        }
    }
    // -------------------------------------------------------------------------
    // GPU Computation
    // -------------------------------------------------------------------------
    async computeOnGpu(data, rows, cols, k) {
        const device = this.device;
        const pipeline = this.pipeline;
        const rank = Math.min(rows, cols, k);
        const sigmas = new Float64Array(rank);
        // Work on a copy for deflation (same as CPU path)
        const A = new Float64Array(data);
        for (let s = 0; s < rank; s++) {
            // Convert current matrix to Float32 for GPU
            const matF32 = new Float32Array(A.length);
            for (let i = 0; i < A.length; i++)
                matF32[i] = A[i];
            // Initialize v with deterministic pseudo-random vector (matching CPU)
            const v = new Float64Array(cols);
            for (let j = 0; j < cols; j++) {
                v[j] = Math.sin((s + 1) * (j + 1) * 0.7071);
            }
            normalizeVecLocal(v);
            let sigma = 0;
            // Create GPU buffers
            const matBuffer = device.createBuffer({
                size: matF32.byteLength,
                usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
            });
            const vecInBuffer = device.createBuffer({
                size: cols * 4, // f32
                usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
            });
            const vecOutBuffer = device.createBuffer({
                size: cols * 4, // f32
                usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
            });
            const dimsBuffer = device.createBuffer({
                size: 8, // 2x u32
                usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
            });
            const readbackBuffer = device.createBuffer({
                size: cols * 4,
                usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
            });
            // Upload matrix and dimensions (only once per singular value)
            device.queue.writeBuffer(matBuffer, 0, matF32);
            device.queue.writeBuffer(dimsBuffer, 0, new Uint32Array([rows, cols]));
            const bindGroup = device.createBindGroup({
                layout: pipeline.getBindGroupLayout(0),
                entries: [
                    { binding: 0, resource: { buffer: matBuffer } },
                    { binding: 1, resource: { buffer: vecInBuffer } },
                    { binding: 2, resource: { buffer: vecOutBuffer } },
                    { binding: 3, resource: { buffer: dimsBuffer } },
                ],
            });
            // Power iteration loop
            for (let iter = 0; iter < WebGpuSvdProvider.POWER_ITER_MAX; iter++) {
                // Upload current v as f32
                const vF32 = new Float32Array(cols);
                for (let j = 0; j < cols; j++)
                    vF32[j] = v[j];
                device.queue.writeBuffer(vecInBuffer, 0, vF32);
                // Dispatch compute shader: A^T * A * v
                const commandEncoder = device.createCommandEncoder();
                const pass = commandEncoder.beginComputePass();
                pass.setPipeline(pipeline);
                pass.setBindGroup(0, bindGroup);
                pass.dispatchWorkgroups(Math.ceil(cols / 64));
                pass.end();
                // Copy result to readback buffer
                commandEncoder.copyBufferToBuffer(vecOutBuffer, 0, readbackBuffer, 0, cols * 4);
                device.queue.submit([commandEncoder.finish()]);
                // Read back result
                await readbackBuffer.mapAsync(GPUMapMode.READ);
                const resultF32 = new Float32Array(readbackBuffer.getMappedRange().slice(0));
                readbackBuffer.unmap();
                // Convert back to Float64 and extract sigma
                const vNew = new Float64Array(cols);
                for (let j = 0; j < cols; j++)
                    vNew[j] = resultF32[j];
                const sigmaNew = vecNormLocal(vNew);
                if (sigmaNew < 1e-15)
                    break;
                scaleVecLocal(vNew, 1 / sigmaNew);
                // sigma for A^T*A*v is sigma^2 of A, so take sqrt
                const sigmaEst = Math.sqrt(sigmaNew);
                const diff = Math.abs(sigma - sigmaEst);
                for (let j = 0; j < cols; j++)
                    v[j] = vNew[j];
                sigma = sigmaEst;
                if (diff < WebGpuSvdProvider.POWER_ITER_TOL * sigma && iter > 0)
                    break;
            }
            sigmas[s] = sigma;
            // Deflate: A <- A - sigma * u * v^T (on CPU — deflation is cheap)
            const u = matVecMulLocal(A, rows, cols, v);
            const uNorm = vecNormLocal(u);
            if (uNorm > 1e-15) {
                scaleVecLocal(u, 1 / uNorm);
                for (let i = 0; i < rows; i++) {
                    for (let j = 0; j < cols; j++) {
                        A[i * cols + j] -= sigma * u[i] * v[j];
                    }
                }
            }
            // Cleanup GPU buffers for this singular value
            matBuffer.destroy();
            vecInBuffer.destroy();
            vecOutBuffer.destroy();
            dimsBuffer.destroy();
            readbackBuffer.destroy();
        }
        return sigmas;
    }
}
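// Editor's recap of the math used in computeOnGpu above: power iteration on
// A^T*A converges to the dominant right singular vector,
//
//   v_{k+1} = (A^T A v_k) / ||A^T A v_k||,  with  ||A^T A v_k|| -> sigma_1^2,
//
// which is why sigmaEst takes Math.sqrt(sigmaNew). The rank-1 deflation
// A <- A - sigma * u * v^T then removes that component so the next pass
// converges to sigma_2, and so on down to the requested k values.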
// ---------------------------------------------------------------------------
// ONNX Runtime SVD Provider
// ---------------------------------------------------------------------------
/**
 * ONNX Runtime-based SVD provider for Node.js server environments.
 *
 * Uses ONNX Runtime's CUDA execution provider when available, falling
 * back to the CPU execution provider. This is the recommended GPU path
 * for Node.js, where WebGPU is not yet available.
 *
 * Implementation strategy: since ONNX doesn't have a direct SVD op,
 * we implement the same power-iteration algorithm but offload the
 * matrix-vector multiplications (the expensive inner loop) to ONNX
 * Runtime, which can run them on GPU via CUDA/cuDNN.
 */
export class OnnxSvdProvider {
    session = null;
    onnxAvailable = false;
    executionProvider = 'cpu';
    fallback = new CpuFallbackProvider();
    initPromise = null;
    /** Power-iteration parameters matching the CPU implementation. */
    static POWER_ITER_MAX = 300;
    static POWER_ITER_TOL = 1e-10;
    async available() {
        if (this.initPromise)
            return this.initPromise;
        this.initPromise = this.initOnnx();
        return this.initPromise;
    }
    async topKSingularValues(data, rows, cols, k) {
        const onnxReady = await this.available();
        if (!onnxReady || !this.session) {
            return this.fallback.topKSingularValues(data, rows, cols, k);
        }
        try {
            return await this.computeWithOnnx(data, rows, cols, k);
        }
        catch {
            // Any ONNX error falls back to CPU gracefully
            return this.fallback.topKSingularValues(data, rows, cols, k);
        }
    }
    /**
     * Returns the execution provider requested at session creation ('cuda'
     * or 'cpu'). ONNX Runtime may still fall back internally, so this
     * reflects the requested provider rather than where kernels actually ran.
     */
    getExecutionProvider() {
        return this.executionProvider;
    }
    // -------------------------------------------------------------------------
    // ONNX Initialization
    // -------------------------------------------------------------------------
    async initOnnx() {
        try {
            // Dynamic import to avoid a hard dependency on onnxruntime-node
            const ort = await importOnnxRuntime();
            if (!ort)
                return false;
            // Request CUDA first; ONNX Runtime falls back to the CPU provider
            // internally if CUDA is not available on this machine.
            const providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'];
            // Create a minimal MatMul model in-memory for GPU-accelerated mat-vec
            const modelBytes = buildMatMulOnnxModel();
            this.session = await ort.InferenceSession.create(modelBytes.buffer, { executionProviders: providers });
            // onnxruntime-node does not report which provider was actually
            // selected, so record the strongest one we requested.
            this.executionProvider = providers.includes('CUDAExecutionProvider')
                ? 'cuda'
                : 'cpu';
            this.onnxAvailable = true;
            return true;
        }
        catch {
            this.onnxAvailable = false;
            return false;
        }
    }
    // -------------------------------------------------------------------------
    // ONNX Computation
    // -------------------------------------------------------------------------
    async computeWithOnnx(data, rows, cols, k) {
        const rank = Math.min(rows, cols, k);
        const sigmas = new Float64Array(rank);
        const A = new Float64Array(data);
        for (let s = 0; s < rank; s++) {
            let v = new Float64Array(cols);
            for (let j = 0; j < cols; j++) {
                v[j] = Math.sin((s + 1) * (j + 1) * 0.7071);
            }
            normalizeVecLocal(v);
            let sigma = 0;
            for (let iter = 0; iter < OnnxSvdProvider.POWER_ITER_MAX; iter++) {
                // u = A * v (via ONNX MatMul or CPU)
                const u = await this.onnxMatVecMul(A, rows, cols, v);
                sigma = vecNormLocal(u);
                if (sigma < 1e-15)
                    break;
                scaleVecLocal(u, 1 / sigma);
                // v_new = A^T * u (via ONNX MatMul or CPU)
                const vNew = await this.onnxMatTransVecMul(A, rows, cols, u);
                const sigmaNew = vecNormLocal(vNew);
                if (sigmaNew < 1e-15)
                    break;
                scaleVecLocal(vNew, 1 / sigmaNew);
                const diff = Math.abs(sigma - sigmaNew);
                v = vNew;
                sigma = sigmaNew;
                if (diff < OnnxSvdProvider.POWER_ITER_TOL * sigma)
                    break;
            }
            sigmas[s] = sigma;
            // Deflate
            const u = matVecMulLocal(A, rows, cols, v);
            const uNorm = vecNormLocal(u);
            if (uNorm > 1e-15) {
                scaleVecLocal(u, 1 / uNorm);
                for (let i = 0; i < rows; i++) {
                    for (let j = 0; j < cols; j++) {
                        A[i * cols + j] -= sigma * u[i] * v[j];
                    }
                }
            }
        }
        return sigmas;
    }
    /**
     * GPU-accelerated matrix-vector multiply via ONNX Runtime.
     * Falls back to the CPU implementation if the ONNX session is unavailable.
     */
    async onnxMatVecMul(A, rows, cols, v) {
        if (!this.session) {
            return matVecMulLocal(A, rows, cols, v);
        }
        try {
            const ort = await importOnnxRuntime();
            if (!ort)
                return matVecMulLocal(A, rows, cols, v);
            // Shapes: A is [rows, cols], v is [cols, 1] → result is [rows, 1]
            const matData = new Float32Array(A.length);
            for (let i = 0; i < A.length; i++)
                matData[i] = A[i];
            const vecData = new Float32Array(cols);
            for (let i = 0; i < cols; i++)
                vecData[i] = v[i];
            const matTensor = new ort.Tensor('float32', matData, [rows, cols]);
            const vecTensor = new ort.Tensor('float32', vecData, [cols, 1]);
            const results = await this.session.run({ A: matTensor, B: vecTensor });
            const outputData = results['Y'].data;
            const result = new Float64Array(rows);
            for (let i = 0; i < rows; i++)
                result[i] = outputData[i];
            return result;
        }
        catch {
            return matVecMulLocal(A, rows, cols, v);
        }
    }
    /**
     * GPU-accelerated A^T * u via ONNX Runtime.
     */
    async onnxMatTransVecMul(A, rows, cols, u) {
        if (!this.session) {
            return matTransVecMulLocal(A, rows, cols, u);
        }
        try {
            const ort = await importOnnxRuntime();
            if (!ort)
                return matTransVecMulLocal(A, rows, cols, u);
            // A^T is [cols, rows], u is [rows, 1] → result is [cols, 1].
            // Instead of transposing, compute u^T * A, which gives the same values.
            const matData = new Float32Array(A.length);
            for (let i = 0; i < A.length; i++)
                matData[i] = A[i];
            const vecData = new Float32Array(rows);
            for (let i = 0; i < rows; i++)
                vecData[i] = u[i];
            // Shaped as [1, rows] x [rows, cols] = [1, cols]
            const vecTensor = new ort.Tensor('float32', vecData, [1, rows]);
            const matTensor = new ort.Tensor('float32', matData, [rows, cols]);
            const results = await this.session.run({ A: vecTensor, B: matTensor });
            const outputData = results['Y'].data;
            const result = new Float64Array(cols);
            for (let i = 0; i < cols; i++)
                result[i] = outputData[i];
            return result;
        }
        catch {
            return matTransVecMulLocal(A, rows, cols, u);
        }
    }
}
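// Editor's note on the transpose trick in onnxMatTransVecMul above: since
// (A^T u)^T = u^T A, a [1, rows] x [rows, cols] MatMul yields the same cols
// values as A^T * u without ever materializing the transposed matrix:
//
//   A^T * u  =  ( u^T * A )^T   // identical entries; shape [cols] vs [1, cols]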
/**
 * Dynamically import onnxruntime-node. Returns null if it is not installed.
 */
async function importOnnxRuntime() {
    try {
        const ort = await import('onnxruntime-node');
        return ort;
    }
    catch {
        return null;
    }
}
/**
 * Build a minimal ONNX protobuf for a MatMul node (A × B = Y).
 *
 * This constructs the raw protobuf bytes for the simplest possible
 * ONNX model: a single MatMul node with two dynamic-shaped inputs
 * and one output. The model is opset 13 compatible.
 *
 * We build the protobuf manually to avoid depending on onnx-proto
 * or protobufjs at runtime.
 */
function buildMatMulOnnxModel() {
    // ONNX uses protobuf encoding. We construct a minimal valid model
    // with a single MatMul node. This is a well-known minimal structure.
    //
    // The model graph:
    //   inputs:  A (float, dynamic shape), B (float, dynamic shape)
    //   node:    MatMul(A, B) -> Y
    //   outputs: Y (float, dynamic shape)
    // Helper to encode a protobuf varint
    function encodeVarint(value) {
        const bytes = [];
        let v = value >>> 0; // ensure unsigned
        while (v > 0x7f) {
            bytes.push((v & 0x7f) | 0x80);
            v >>>= 7;
        }
        bytes.push(v);
        return bytes;
    }
    // Helper to encode a length-delimited protobuf field
    function field(fieldNum, data) {
        const tag = (fieldNum << 3) | 2; // wire type 2 = length-delimited
        const dataArr = data instanceof Uint8Array ? Array.from(data) : data;
        return [...encodeVarint(tag), ...encodeVarint(dataArr.length), ...dataArr];
    }
    // Helper to encode a varint field
    function varintField(fieldNum, value) {
        const tag = (fieldNum << 3) | 0; // wire type 0 = varint
        return [...encodeVarint(tag), ...encodeVarint(value)];
    }
    // Helper to encode a string
    function encodeString(s) {
        return Array.from(new TextEncoder().encode(s));
    }
    // TensorTypeProto for float with unknown shape: elem_type = 1 (FLOAT)
    const tensorTypeFloat = [...varintField(1, 1)]; // elem_type = FLOAT
    // TypeProto with tensor_type
    const typeProtoFloat = field(1, tensorTypeFloat); // field 1 = tensor_type
    // ValueInfoProto for input A
    const valueInfoA = [
        ...field(1, encodeString('A')), // name
        ...field(2, typeProtoFloat), // type
    ];
    // ValueInfoProto for input B
    const valueInfoB = [
        ...field(1, encodeString('B')),
        ...field(2, typeProtoFloat),
    ];
    // ValueInfoProto for output Y
    const valueInfoY = [
        ...field(1, encodeString('Y')),
        ...field(2, typeProtoFloat),
    ];
    // NodeProto: MatMul(A, B) -> Y
    const matMulNode = [
        ...field(1, encodeString('A')), // input
        ...field(1, encodeString('B')), // input
        ...field(2, encodeString('Y')), // output
        ...field(4, encodeString('MatMul')), // op_type
    ];
    // GraphProto
    const graphProto = [
        ...field(1, matMulNode), // node (repeated, field 1)
        ...field(2, encodeString('svd_matmul')), // name (field 2 in GraphProto; field 5 is initializer)
        ...field(11, valueInfoA), // input
        ...field(11, valueInfoB), // input
        ...field(12, valueInfoY), // output
    ];
    // OperatorSetIdProto for opset 13
    const opsetImport = [...varintField(2, 13)]; // version = 13
    // ModelProto
    const modelProto = [
        ...varintField(1, 7), // ir_version = 7
        ...field(8, opsetImport), // opset_import
        ...field(2, encodeString('vorion-svd-matmul')), // producer_name
        ...field(7, graphProto), // graph
    ];
    return new Uint8Array(modelProto);
}
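// Editor's worked example for the protobuf helpers above (bytes are easy to
// verify by hand): encoding the op_type field of the NodeProto,
//
//   field(4, encodeString('MatMul'))
//   // tag   = (4 << 3) | 2 = 0x22      (field 4, wire type 2)
//   // len   = 6
//   // bytes = [0x22, 0x06, 'M', 'a', 't', 'M', 'u', 'l']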
// ---------------------------------------------------------------------------
// Factory Function
// ---------------------------------------------------------------------------
/**
 * Create the best available SVD provider.
 *
 * Auto-detection order:
 *   1. WebGPU (if `preference` is 'gpu' or unset, and WebGPU is available)
 *   2. ONNX Runtime with CUDA (if `preference` is 'onnx' or unset, and onnxruntime-node is installed)
 *   3. CPU fallback (always available)
 *
 * @param preference - Force a specific provider. Omit to auto-detect.
 * @returns The best available provider.
 */
export async function createSvdProvider(preference) {
    if (preference === 'cpu') {
        return new CpuFallbackProvider();
    }
    if (preference === 'gpu') {
        const provider = new WebGpuSvdProvider();
        if (await provider.available())
            return provider;
        // Explicit GPU request but unavailable — fall through to ONNX, then CPU
    }
    if (preference === 'onnx') {
        const provider = new OnnxSvdProvider();
        if (await provider.available())
            return provider;
        // Explicit ONNX request but unavailable — fall back to CPU
        return new CpuFallbackProvider();
    }
    // Auto-detect: try WebGPU first, then ONNX, then CPU
    const webgpu = new WebGpuSvdProvider();
    if (await webgpu.available())
        return webgpu;
    const onnx = new OnnxSvdProvider();
    if (await onnx.available())
        return onnx;
    return new CpuFallbackProvider();
}
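// Editor's usage sketch (matrix contents and sizes are placeholder values):
//
//   const provider = await createSvdProvider();   // auto-detect: gpu → onnx → cpu
//   const sigmas = await provider.topKSingularValues(weights, 4096, 4096, 8);
//   // sigmas: Float64Array of the top-8 singular values, largest first
//
// Passing 'cpu', 'gpu', or 'onnx' as `preference` forces a specific path.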
// ---------------------------------------------------------------------------
// Local Linear Algebra Helpers (duplicated to avoid circular deps)
// ---------------------------------------------------------------------------
/** Matrix-vector multiply A * v. */
function matVecMulLocal(A, m, n, v) {
    const result = new Float64Array(m);
    for (let i = 0; i < m; i++) {
        let sum = 0;
        const rowOff = i * n;
        for (let j = 0; j < n; j++) {
            sum += A[rowOff + j] * v[j];
        }
        result[i] = sum;
    }
    return result;
}
/** Matrix-transpose-vector multiply A^T * u. */
function matTransVecMulLocal(A, m, n, u) {
    const result = new Float64Array(n);
    for (let j = 0; j < n; j++) {
        let sum = 0;
        for (let i = 0; i < m; i++) {
            sum += A[i * n + j] * u[i];
        }
        result[j] = sum;
    }
    return result;
}
/** Euclidean norm of a vector. */
function vecNormLocal(v) {
    let sum = 0;
    for (let i = 0; i < v.length; i++)
        sum += v[i] * v[i];
    return Math.sqrt(sum);
}
/** In-place normalisation. */
function normalizeVecLocal(v) {
    const n = vecNormLocal(v);
    if (n > 1e-15)
        scaleVecLocal(v, 1 / n);
}
/** In-place scalar multiply. */
function scaleVecLocal(v, s) {
    for (let i = 0; i < v.length; i++)
        v[i] *= s;
}
//# sourceMappingURL=gpu-svd.js.map