@vorionsys/atsf-core 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/dist/basis/parser.d.ts +20 -20
  2. package/dist/basis/parser.js +2 -2
  3. package/dist/basis/parser.js.map +1 -1
  4. package/dist/common/config.d.ts +2 -2
  5. package/dist/index.d.ts +1 -0
  6. package/dist/index.d.ts.map +1 -1
  7. package/dist/index.js +2 -0
  8. package/dist/index.js.map +1 -1
  9. package/dist/paramesphere/gpu-svd.d.ts +102 -0
  10. package/dist/paramesphere/gpu-svd.d.ts.map +1 -0
  11. package/dist/paramesphere/gpu-svd.js +668 -0
  12. package/dist/paramesphere/gpu-svd.js.map +1 -0
  13. package/dist/paramesphere/index.d.ts +2 -0
  14. package/dist/paramesphere/index.d.ts.map +1 -1
  15. package/dist/paramesphere/index.js +1 -0
  16. package/dist/paramesphere/index.js.map +1 -1
  17. package/dist/paramesphere/paramesphere-engine.d.ts +67 -4
  18. package/dist/paramesphere/paramesphere-engine.d.ts.map +1 -1
  19. package/dist/paramesphere/paramesphere-engine.js +215 -7
  20. package/dist/paramesphere/paramesphere-engine.js.map +1 -1
  21. package/dist/paramesphere/types.d.ts +14 -0
  22. package/dist/paramesphere/types.d.ts.map +1 -1
  23. package/dist/paramesphere/types.js.map +1 -1
  24. package/dist/phase6/types.d.ts +44 -44
  25. package/dist/trust-engine/index.d.ts +30 -0
  26. package/dist/trust-engine/index.d.ts.map +1 -1
  27. package/dist/trust-engine/index.js +74 -4
  28. package/dist/trust-engine/index.js.map +1 -1
  29. package/dist/trust-engine/scheduled-verifier.d.ts +127 -0
  30. package/dist/trust-engine/scheduled-verifier.d.ts.map +1 -0
  31. package/dist/trust-engine/scheduled-verifier.js +257 -0
  32. package/dist/trust-engine/scheduled-verifier.js.map +1 -0
  33. package/package.json +1 -1
@@ -0,0 +1,668 @@
1
+ // SPDX-License-Identifier: Apache-2.0
2
+ // Copyright 2024-2026 Vorion LLC
3
+ /**
4
+ * GPU-Offloaded SVD Computation for ParameSphere
5
+ *
6
+ * Provides optional GPU-accelerated singular value decomposition for
7
+ * T5-T7 scale models where CPU power iteration becomes expensive on
8
+ * large weight matrices.
9
+ *
10
+ * Three provider implementations:
11
+ * - **WebGpuSvdProvider** — Uses WebGPU compute shaders for browser/Deno
12
+ * environments that expose `navigator.gpu`.
13
+ * - **OnnxSvdProvider** — Uses ONNX Runtime with CUDA execution provider
14
+ * for Node.js server environments.
15
+ * - **CpuFallbackProvider** — Wraps the existing CPU power-iteration
16
+ * implementation for environments where no GPU is available.
17
+ *
18
+ * All providers satisfy the same `GpuSvdProvider` interface and produce
19
+ * results within 1e-6 tolerance of the CPU baseline.
20
+ *
21
+ * @packageDocumentation
22
+ */
23
+ import { topKSingularValues } from './paramesphere-engine.js';
24
+ // ---------------------------------------------------------------------------
25
+ // CPU Fallback Provider
26
+ // ---------------------------------------------------------------------------
27
+ /**
28
+ * CPU fallback provider that delegates to the existing power-iteration
29
+ * implementation. This is the baseline — all other providers must match
30
+ * its output within 1e-6 tolerance.
31
+ */
32
+ export class CpuFallbackProvider {
33
+ async available() {
34
+ return true; // CPU is always available
35
+ }
36
+ async topKSingularValues(data, rows, cols, k) {
37
+ return topKSingularValues(data, rows, cols, k);
38
+ }
39
+ }
40
+ // ---------------------------------------------------------------------------
41
+ // WebGPU SVD Provider
42
+ // ---------------------------------------------------------------------------
43
+ /**
44
+ * WGSL compute shader for matrix-vector product A^T * A * v.
45
+ *
46
+ * This fuses the two matrix-vector multiplications (u = A*v, then
47
+ * v_new = A^T*u) into a single dispatch, halving the number of GPU
48
+ * round-trips per power iteration step.
49
+ *
50
+ * Bindings:
51
+ * @group(0) @binding(0) — matrix A (row-major, f32 for GPU efficiency)
52
+ * @group(0) @binding(1) — input vector v (f32)
53
+ * @group(0) @binding(2) — output vector result (f32)
54
+ * @group(0) @binding(3) — dimensions uniform (rows, cols as u32)
55
+ */
56
+ const WGSL_ATA_SHADER = /* wgsl */ `
57
+ struct Dims {
58
+ rows: u32,
59
+ cols: u32,
60
+ };
61
+
62
+ @group(0) @binding(0) var<storage, read> matrix_a: array<f32>;
63
+ @group(0) @binding(1) var<storage, read> vec_in: array<f32>;
64
+ @group(0) @binding(2) var<storage, read_write> vec_out: array<f32>;
65
+ @group(0) @binding(3) var<uniform> dims: Dims;
66
+
67
+ // Stage 1: Compute u = A * v (result has 'rows' elements)
68
+ // Stage 2: Compute result = A^T * u (result has 'cols' elements)
69
+ // We do both in one shader by using a workgroup-shared intermediate.
70
+
71
+ // For simplicity and correctness, this shader computes one element of
72
+ // the output (A^T * A * v) per invocation. Each invocation j computes:
73
+ // result[j] = sum_i( A[i][j] * sum_k( A[i][k] * v[k] ) )
74
+ // which is the j-th element of A^T * (A * v).
75
+
76
+ @compute @workgroup_size(64)
77
+ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
78
+ let j = gid.x;
79
+ let rows = dims.rows;
80
+ let cols = dims.cols;
81
+
82
+ if (j >= cols) {
83
+ return;
84
+ }
85
+
86
+ var acc: f32 = 0.0;
87
+
88
+ // For each row i, compute dot(A[i,:], v) then accumulate A[i][j] * dot
89
+ for (var i: u32 = 0u; i < rows; i = i + 1u) {
90
+ var row_dot: f32 = 0.0;
91
+ let row_off = i * cols;
92
+ for (var k: u32 = 0u; k < cols; k = k + 1u) {
93
+ row_dot = row_dot + matrix_a[row_off + k] * vec_in[k];
94
+ }
95
+ acc = acc + matrix_a[row_off + j] * row_dot;
96
+ }
97
+
98
+ vec_out[j] = acc;
99
+ }
100
+ `;
101
+ /**
102
+ * WebGPU-based SVD provider.
103
+ *
104
+ * Uses a compute shader for the expensive A^T*A*v products in the power
105
+ * iteration loop. Falls back to CPU if WebGPU is unavailable.
106
+ *
107
+ * Note: This provider is primarily useful in browser or Deno environments
108
+ * that expose the WebGPU API. In Node.js, prefer `OnnxSvdProvider`.
109
+ */
110
+ export class WebGpuSvdProvider {
111
+ device = null;
112
+ pipeline = null;
113
+ fallback = new CpuFallbackProvider();
114
+ initPromise = null;
115
+ /** Power iteration parameters matching the CPU implementation. */
116
+ static POWER_ITER_MAX = 300;
117
+ static POWER_ITER_TOL = 1e-10;
118
+ async available() {
119
+ if (this.initPromise)
120
+ return this.initPromise;
121
+ this.initPromise = this.initGpu();
122
+ return this.initPromise;
123
+ }
124
+ async topKSingularValues(data, rows, cols, k) {
125
+ const gpuReady = await this.available();
126
+ if (!gpuReady || !this.device || !this.pipeline) {
127
+ return this.fallback.topKSingularValues(data, rows, cols, k);
128
+ }
129
+ try {
130
+ return await this.computeOnGpu(data, rows, cols, k);
131
+ }
132
+ catch {
133
+ // Any GPU error falls back to CPU gracefully
134
+ return this.fallback.topKSingularValues(data, rows, cols, k);
135
+ }
136
+ }
137
+ // -------------------------------------------------------------------------
138
+ // GPU Initialization
139
+ // -------------------------------------------------------------------------
140
+ async initGpu() {
141
+ try {
142
+ // Check for WebGPU availability (globalThis for cross-env compat)
143
+ const nav = globalThis.navigator;
144
+ if (!nav?.gpu)
145
+ return false;
146
+ const adapter = await nav.gpu.requestAdapter();
147
+ if (!adapter)
148
+ return false;
149
+ this.device = await adapter.requestDevice();
150
+ // Compile the compute shader
151
+ const shaderModule = this.device.createShaderModule({
152
+ code: WGSL_ATA_SHADER,
153
+ });
154
+ this.pipeline = this.device.createComputePipeline({
155
+ layout: 'auto',
156
+ compute: {
157
+ module: shaderModule,
158
+ entryPoint: 'main',
159
+ },
160
+ });
161
+ return true;
162
+ }
163
+ catch {
164
+ this.device = null;
165
+ this.pipeline = null;
166
+ return false;
167
+ }
168
+ }
169
+ // -------------------------------------------------------------------------
170
+ // GPU Computation
171
+ // -------------------------------------------------------------------------
172
+ async computeOnGpu(data, rows, cols, k) {
173
+ const device = this.device;
174
+ const pipeline = this.pipeline;
175
+ const rank = Math.min(rows, cols, k);
176
+ const sigmas = new Float64Array(rank);
177
+ // Work on a copy for deflation (same as CPU path)
178
+ const A = new Float64Array(data);
179
+ for (let s = 0; s < rank; s++) {
180
+ // Convert current matrix to Float32 for GPU
181
+ const matF32 = new Float32Array(A.length);
182
+ for (let i = 0; i < A.length; i++)
183
+ matF32[i] = A[i];
184
+ // Initialize v with deterministic pseudo-random vector (matching CPU)
185
+ const v = new Float64Array(cols);
186
+ for (let j = 0; j < cols; j++) {
187
+ v[j] = Math.sin((s + 1) * (j + 1) * 0.7071);
188
+ }
189
+ normalizeVecLocal(v);
190
+ let sigma = 0;
191
+ // Create GPU buffers
192
+ const matBuffer = device.createBuffer({
193
+ size: matF32.byteLength,
194
+ usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
195
+ });
196
+ const vecInBuffer = device.createBuffer({
197
+ size: cols * 4, // f32
198
+ usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
199
+ });
200
+ const vecOutBuffer = device.createBuffer({
201
+ size: cols * 4, // f32
202
+ usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
203
+ });
204
+ const dimsBuffer = device.createBuffer({
205
+ size: 8, // 2x u32
206
+ usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
207
+ });
208
+ const readbackBuffer = device.createBuffer({
209
+ size: cols * 4,
210
+ usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
211
+ });
212
+ // Upload matrix and dimensions (only once per singular value)
213
+ device.queue.writeBuffer(matBuffer, 0, matF32);
214
+ device.queue.writeBuffer(dimsBuffer, 0, new Uint32Array([rows, cols]));
215
+ const bindGroup = device.createBindGroup({
216
+ layout: pipeline.getBindGroupLayout(0),
217
+ entries: [
218
+ { binding: 0, resource: { buffer: matBuffer } },
219
+ { binding: 1, resource: { buffer: vecInBuffer } },
220
+ { binding: 2, resource: { buffer: vecOutBuffer } },
221
+ { binding: 3, resource: { buffer: dimsBuffer } },
222
+ ],
223
+ });
224
+ // Power iteration loop
225
+ for (let iter = 0; iter < WebGpuSvdProvider.POWER_ITER_MAX; iter++) {
226
+ // Upload current v as f32
227
+ const vF32 = new Float32Array(cols);
228
+ for (let j = 0; j < cols; j++)
229
+ vF32[j] = v[j];
230
+ device.queue.writeBuffer(vecInBuffer, 0, vF32);
231
+ // Dispatch compute shader: A^T * A * v
232
+ const commandEncoder = device.createCommandEncoder();
233
+ const pass = commandEncoder.beginComputePass();
234
+ pass.setPipeline(pipeline);
235
+ pass.setBindGroup(0, bindGroup);
236
+ pass.dispatchWorkgroups(Math.ceil(cols / 64));
237
+ pass.end();
238
+ // Copy result to readback buffer
239
+ commandEncoder.copyBufferToBuffer(vecOutBuffer, 0, readbackBuffer, 0, cols * 4);
240
+ device.queue.submit([commandEncoder.finish()]);
241
+ // Read back result
242
+ await readbackBuffer.mapAsync(GPUMapMode.READ);
243
+ const resultF32 = new Float32Array(readbackBuffer.getMappedRange().slice(0));
244
+ readbackBuffer.unmap();
245
+ // Convert back to Float64 and extract sigma
246
+ const vNew = new Float64Array(cols);
247
+ for (let j = 0; j < cols; j++)
248
+ vNew[j] = resultF32[j];
249
+ const sigmaNew = vecNormLocal(vNew);
250
+ if (sigmaNew < 1e-15)
251
+ break;
252
+ scaleVecLocal(vNew, 1 / sigmaNew);
253
+ // sigma for A^T*A*v is sigma^2 of A, so take sqrt
254
+ const sigmaEst = Math.sqrt(sigmaNew);
255
+ const diff = Math.abs(sigma - sigmaEst);
256
+ for (let j = 0; j < cols; j++)
257
+ v[j] = vNew[j];
258
+ sigma = sigmaEst;
259
+ if (diff < WebGpuSvdProvider.POWER_ITER_TOL * sigma && iter > 0)
260
+ break;
261
+ }
262
+ sigmas[s] = sigma;
263
+ // Deflate: A <- A - sigma * u * v^T (on CPU — deflation is cheap)
264
+ const u = matVecMulLocal(A, rows, cols, v);
265
+ const uNorm = vecNormLocal(u);
266
+ if (uNorm > 1e-15) {
267
+ scaleVecLocal(u, 1 / uNorm);
268
+ for (let i = 0; i < rows; i++) {
269
+ for (let j = 0; j < cols; j++) {
270
+ A[i * cols + j] -= sigma * u[i] * v[j];
271
+ }
272
+ }
273
+ }
274
+ // Cleanup GPU buffers for this singular value
275
+ matBuffer.destroy();
276
+ vecInBuffer.destroy();
277
+ vecOutBuffer.destroy();
278
+ dimsBuffer.destroy();
279
+ readbackBuffer.destroy();
280
+ }
281
+ return sigmas;
282
+ }
283
+ }
284
+ // ---------------------------------------------------------------------------
285
+ // ONNX Runtime SVD Provider
286
+ // ---------------------------------------------------------------------------
287
+ /**
288
+ * ONNX Runtime-based SVD provider for Node.js server environments.
289
+ *
290
+ * Uses ONNX Runtime's CUDA execution provider when available, falling
291
+ * back to the CPU execution provider. This is the recommended GPU path
292
+ * for Node.js where WebGPU is not yet available.
293
+ *
294
+ * Implementation strategy: Since ONNX doesn't have a direct SVD op,
295
+ * we implement the same power-iteration algorithm but offload the
296
+ * matrix-vector multiplications (the expensive inner loop) to ONNX
297
+ * Runtime which can run them on GPU via CUDA/cuDNN.
298
+ */
299
+ export class OnnxSvdProvider {
300
+ session = null;
301
+ onnxAvailable = false;
302
+ executionProvider = 'cpu';
303
+ fallback = new CpuFallbackProvider();
304
+ initPromise = null;
305
+ /** Power iteration parameters matching the CPU implementation. */
306
+ static POWER_ITER_MAX = 300;
307
+ static POWER_ITER_TOL = 1e-10;
308
+ async available() {
309
+ if (this.initPromise)
310
+ return this.initPromise;
311
+ this.initPromise = this.initOnnx();
312
+ return this.initPromise;
313
+ }
314
+ async topKSingularValues(data, rows, cols, k) {
315
+ const onnxReady = await this.available();
316
+ if (!onnxReady || !this.session) {
317
+ return this.fallback.topKSingularValues(data, rows, cols, k);
318
+ }
319
+ try {
320
+ return await this.computeWithOnnx(data, rows, cols, k);
321
+ }
322
+ catch {
323
+ // Any ONNX error falls back to CPU gracefully
324
+ return this.fallback.topKSingularValues(data, rows, cols, k);
325
+ }
326
+ }
327
+ /** Returns the active execution provider name ('cuda' or 'cpu'). */
328
+ getExecutionProvider() {
329
+ return this.executionProvider;
330
+ }
331
+ // -------------------------------------------------------------------------
332
+ // ONNX Initialization
333
+ // -------------------------------------------------------------------------
334
+ async initOnnx() {
335
+ try {
336
+ // Dynamic import to avoid hard dependency on onnxruntime-node
337
+ const ort = await importOnnxRuntime();
338
+ if (!ort)
339
+ return false;
340
+ // Try CUDA first, then fall back to CPU
341
+ const providers = [];
342
+ try {
343
+ // Probe for CUDA availability
344
+ providers.push('CUDAExecutionProvider');
345
+ }
346
+ catch {
347
+ // CUDA not available
348
+ }
349
+ providers.push('CPUExecutionProvider');
350
+ // Create a minimal MatMul model in-memory for GPU-accelerated mat-vec
351
+ const modelBytes = buildMatMulOnnxModel();
352
+ this.session = await ort.InferenceSession.create(modelBytes.buffer, { executionProviders: providers });
353
+ // Detect which provider was actually selected
354
+ this.executionProvider = providers.includes('CUDAExecutionProvider')
355
+ ? 'cuda'
356
+ : 'cpu';
357
+ this.onnxAvailable = true;
358
+ return true;
359
+ }
360
+ catch {
361
+ this.onnxAvailable = false;
362
+ return false;
363
+ }
364
+ }
365
+ // -------------------------------------------------------------------------
366
+ // ONNX Computation
367
+ // -------------------------------------------------------------------------
368
+ async computeWithOnnx(data, rows, cols, k) {
369
+ const rank = Math.min(rows, cols, k);
370
+ const sigmas = new Float64Array(rank);
371
+ const A = new Float64Array(data);
372
+ for (let s = 0; s < rank; s++) {
373
+ let v = new Float64Array(cols);
374
+ for (let j = 0; j < cols; j++) {
375
+ v[j] = Math.sin((s + 1) * (j + 1) * 0.7071);
376
+ }
377
+ normalizeVecLocal(v);
378
+ let sigma = 0;
379
+ for (let iter = 0; iter < OnnxSvdProvider.POWER_ITER_MAX; iter++) {
380
+ // u = A * v (via ONNX MatMul or CPU)
381
+ const u = await this.onnxMatVecMul(A, rows, cols, v);
382
+ sigma = vecNormLocal(u);
383
+ if (sigma < 1e-15)
384
+ break;
385
+ scaleVecLocal(u, 1 / sigma);
386
+ // v_new = A^T * u (via ONNX MatMul or CPU)
387
+ const vNew = await this.onnxMatTransVecMul(A, rows, cols, u);
388
+ const sigmaNew = vecNormLocal(vNew);
389
+ if (sigmaNew < 1e-15)
390
+ break;
391
+ scaleVecLocal(vNew, 1 / sigmaNew);
392
+ const diff = Math.abs(sigma - sigmaNew);
393
+ v = vNew;
394
+ sigma = sigmaNew;
395
+ if (diff < OnnxSvdProvider.POWER_ITER_TOL * sigma)
396
+ break;
397
+ }
398
+ sigmas[s] = sigma;
399
+ // Deflate
400
+ const u = matVecMulLocal(A, rows, cols, v);
401
+ const uNorm = vecNormLocal(u);
402
+ if (uNorm > 1e-15) {
403
+ scaleVecLocal(u, 1 / uNorm);
404
+ for (let i = 0; i < rows; i++) {
405
+ for (let j = 0; j < cols; j++) {
406
+ A[i * cols + j] -= sigma * u[i] * v[j];
407
+ }
408
+ }
409
+ }
410
+ }
411
+ return sigmas;
412
+ }
413
+ /**
414
+ * GPU-accelerated matrix-vector multiply via ONNX Runtime.
415
+ * Falls back to CPU implementation if ONNX session is unavailable.
416
+ */
417
+ async onnxMatVecMul(A, rows, cols, v) {
418
+ if (!this.session) {
419
+ return matVecMulLocal(A, rows, cols, v);
420
+ }
421
+ try {
422
+ const ort = await importOnnxRuntime();
423
+ if (!ort)
424
+ return matVecMulLocal(A, rows, cols, v);
425
+ // Reshape: A is [rows, cols], v is [cols, 1] → result is [rows, 1]
426
+ const matData = new Float32Array(A.length);
427
+ for (let i = 0; i < A.length; i++)
428
+ matData[i] = A[i];
429
+ const vecData = new Float32Array(cols);
430
+ for (let i = 0; i < cols; i++)
431
+ vecData[i] = v[i];
432
+ const matTensor = new ort.Tensor('float32', matData, [rows, cols]);
433
+ const vecTensor = new ort.Tensor('float32', vecData, [cols, 1]);
434
+ const results = await this.session.run({ A: matTensor, B: vecTensor });
435
+ const outputData = results['Y'].data;
436
+ const result = new Float64Array(rows);
437
+ for (let i = 0; i < rows; i++)
438
+ result[i] = outputData[i];
439
+ return result;
440
+ }
441
+ catch {
442
+ return matVecMulLocal(A, rows, cols, v);
443
+ }
444
+ }
445
+ /**
446
+ * GPU-accelerated A^T * u via ONNX Runtime.
447
+ */
448
+ async onnxMatTransVecMul(A, rows, cols, u) {
449
+ if (!this.session) {
450
+ return matTransVecMulLocal(A, rows, cols, u);
451
+ }
452
+ try {
453
+ const ort = await importOnnxRuntime();
454
+ if (!ort)
455
+ return matTransVecMulLocal(A, rows, cols, u);
456
+ // A^T is [cols, rows], u is [rows, 1] → result is [cols, 1]
457
+ // Instead of transposing, pass u^T * A which gives same result
458
+ const matData = new Float32Array(A.length);
459
+ for (let i = 0; i < A.length; i++)
460
+ matData[i] = A[i];
461
+ const vecData = new Float32Array(rows);
462
+ for (let i = 0; i < rows; i++)
463
+ vecData[i] = u[i];
464
+ // Reshape as [1, rows] x [rows, cols] = [1, cols]
465
+ const vecTensor = new ort.Tensor('float32', vecData, [1, rows]);
466
+ const matTensor = new ort.Tensor('float32', matData, [rows, cols]);
467
+ const results = await this.session.run({ A: vecTensor, B: matTensor });
468
+ const outputData = results['Y'].data;
469
+ const result = new Float64Array(cols);
470
+ for (let i = 0; i < cols; i++)
471
+ result[i] = outputData[i];
472
+ return result;
473
+ }
474
+ catch {
475
+ return matTransVecMulLocal(A, rows, cols, u);
476
+ }
477
+ }
478
+ }
479
+ /**
480
+ * Dynamically import onnxruntime-node. Returns null if not installed.
481
+ */
482
+ async function importOnnxRuntime() {
483
+ try {
484
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
485
+ const ort = await import('onnxruntime-node');
486
+ return ort;
487
+ }
488
+ catch {
489
+ return null;
490
+ }
491
+ }
492
+ /**
493
+ * Build a minimal ONNX protobuf for a MatMul node (A × B = Y).
494
+ *
495
+ * This constructs the raw protobuf bytes for the simplest possible
496
+ * ONNX model: a single MatMul node with two dynamic-shaped inputs
497
+ * and one output. The model is opset 13 compatible.
498
+ *
499
+ * We build the protobuf manually to avoid depending on onnx-proto
500
+ * or protobufjs at runtime.
501
+ */
502
+ function buildMatMulOnnxModel() {
503
+ // ONNX uses protobuf encoding. We construct a minimal valid model
504
+ // with a single MatMul node. This is a well-known minimal structure.
505
+ //
506
+ // The model graph:
507
+ // inputs: A (float, dynamic shape), B (float, dynamic shape)
508
+ // node: MatMul(A, B) -> Y
509
+ // outputs: Y (float, dynamic shape)
510
+ // Helper to encode a protobuf varint
511
+ function encodeVarint(value) {
512
+ const bytes = [];
513
+ let v = value >>> 0; // ensure unsigned
514
+ while (v > 0x7f) {
515
+ bytes.push((v & 0x7f) | 0x80);
516
+ v >>>= 7;
517
+ }
518
+ bytes.push(v);
519
+ return bytes;
520
+ }
521
+ // Helper to encode a length-delimited protobuf field
522
+ function field(fieldNum, data) {
523
+ const tag = (fieldNum << 3) | 2; // wire type 2 = length-delimited
524
+ const dataArr = data instanceof Uint8Array ? Array.from(data) : data;
525
+ return [...encodeVarint(tag), ...encodeVarint(dataArr.length), ...dataArr];
526
+ }
527
+ // Helper to encode a varint field
528
+ function varintField(fieldNum, value) {
529
+ const tag = (fieldNum << 3) | 0; // wire type 0 = varint
530
+ return [...encodeVarint(tag), ...encodeVarint(value)];
531
+ }
532
+ // Helper to encode a string
533
+ function encodeString(s) {
534
+ return Array.from(new TextEncoder().encode(s));
535
+ }
536
+ // TensorTypeProto for float with unknown shape
537
+ // elem_type = 1 (FLOAT)
538
+ const tensorTypeFloat = [...varintField(1, 1)]; // elem_type = FLOAT
539
+ // TypeProto with tensor_type
540
+ const typeProtoFloat = field(1, tensorTypeFloat); // field 1 = tensor_type
541
+ // ValueInfoProto for input A
542
+ const valueInfoA = [
543
+ ...field(1, encodeString('A')), // name
544
+ ...field(2, typeProtoFloat), // type
545
+ ];
546
+ // ValueInfoProto for input B
547
+ const valueInfoB = [
548
+ ...field(1, encodeString('B')),
549
+ ...field(2, typeProtoFloat),
550
+ ];
551
+ // ValueInfoProto for output Y
552
+ const valueInfoY = [
553
+ ...field(1, encodeString('Y')),
554
+ ...field(2, typeProtoFloat),
555
+ ];
556
+ // NodeProto: MatMul(A, B) -> Y
557
+ const matMulNode = [
558
+ ...field(1, encodeString('A')), // input
559
+ ...field(1, encodeString('B')), // input
560
+ ...field(2, encodeString('Y')), // output
561
+ ...field(4, encodeString('MatMul')), // op_type
562
+ ];
563
+ // GraphProto
564
+ const graphProto = [
565
+ ...field(1, matMulNode), // node (repeated, field 1)
566
+ ...field(5, encodeString('svd_matmul')), // name
567
+ ...field(11, valueInfoA), // input
568
+ ...field(11, valueInfoB), // input
569
+ ...field(12, valueInfoY), // output
570
+ ];
571
+ // OperatorSetIdProto for opset 13
572
+ const opsetImport = [...varintField(2, 13)]; // version = 13
573
+ // ModelProto
574
+ const modelProto = [
575
+ ...varintField(1, 7), // ir_version = 7
576
+ ...field(8, opsetImport), // opset_import
577
+ ...field(2, encodeString('vorion-svd-matmul')), // producer_name
578
+ ...field(7, graphProto), // graph
579
+ ];
580
+ return new Uint8Array(modelProto);
581
+ }
582
+ // ---------------------------------------------------------------------------
583
+ // Factory Function
584
+ // ---------------------------------------------------------------------------
585
+ /**
586
+ * Create the best available SVD provider.
587
+ *
588
+ * Auto-detection order:
589
+ * 1. WebGPU (if `preference` is 'gpu' or unset and WebGPU is available)
590
+ * 2. ONNX Runtime with CUDA (if `preference` is 'onnx' or unset and onnxruntime-node is installed)
591
+ * 3. CPU fallback (always available)
592
+ *
593
+ * @param preference - Force a specific provider. Omit to auto-detect.
594
+ * @returns The best available provider.
595
+ */
596
+ export async function createSvdProvider(preference) {
597
+ if (preference === 'cpu') {
598
+ return new CpuFallbackProvider();
599
+ }
600
+ if (preference === 'gpu') {
601
+ const provider = new WebGpuSvdProvider();
602
+ if (await provider.available())
603
+ return provider;
604
+ // Explicit GPU request but unavailable — fall through to ONNX then CPU
605
+ }
606
+ if (preference === 'onnx') {
607
+ const provider = new OnnxSvdProvider();
608
+ if (await provider.available())
609
+ return provider;
610
+ // Explicit ONNX request but unavailable — fall through to CPU
611
+ return new CpuFallbackProvider();
612
+ }
613
+ // Auto-detect: try WebGPU first, then ONNX, then CPU
614
+ const webgpu = new WebGpuSvdProvider();
615
+ if (await webgpu.available())
616
+ return webgpu;
617
+ const onnx = new OnnxSvdProvider();
618
+ if (await onnx.available())
619
+ return onnx;
620
+ return new CpuFallbackProvider();
621
+ }
622
+ // ---------------------------------------------------------------------------
623
+ // Local Linear Algebra Helpers (duplicated to avoid circular deps)
624
+ // ---------------------------------------------------------------------------
625
/**
 * Dense matrix-vector product: result = A * v.
 * A is an m×n matrix stored row-major in a flat array; result has length m.
 */
function matVecMulLocal(A, m, n, v) {
    const out = new Float64Array(m);
    for (let row = 0; row < m; row++) {
        const base = row * n;
        let acc = 0;
        for (let col = 0; col < n; col++) {
            acc += A[base + col] * v[col];
        }
        out[row] = acc;
    }
    return out;
}
638
/**
 * Transpose-matrix-vector product: result = A^T * u.
 * A is an m×n row-major matrix; u has length m; result has length n.
 * Accumulates over rows in ascending order, matching the column-sum form.
 */
function matTransVecMulLocal(A, m, n, u) {
    const out = new Float64Array(n);
    for (let i = 0; i < m; i++) {
        const base = i * n;
        const weight = u[i];
        for (let j = 0; j < n; j++) {
            out[j] += A[base + j] * weight;
        }
    }
    return out;
}
650
/** Euclidean (L2) norm: sqrt of the sum of squared elements. */
function vecNormLocal(v) {
    let sumSq = 0;
    for (const x of v) {
        sumSq += x * x;
    }
    return Math.sqrt(sumSq);
}
657
/**
 * Normalise v to unit length, in place.
 * Vectors with norm at or below 1e-15 are left untouched to avoid
 * dividing by (near-)zero.
 */
function normalizeVecLocal(v) {
    const norm = vecNormLocal(v);
    if (norm > 1e-15) {
        scaleVecLocal(v, 1 / norm);
    }
}
663
/** Multiply every element of v by the scalar s, in place. */
function scaleVecLocal(v, s) {
    let i = 0;
    while (i < v.length) {
        v[i] = v[i] * s;
        i += 1;
    }
}
668
+ //# sourceMappingURL=gpu-svd.js.map