rtmlib-ts 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/.gitattributes +1 -0
  2. package/README.md +202 -0
  3. package/dist/core/base.d.ts +20 -0
  4. package/dist/core/base.d.ts.map +1 -0
  5. package/dist/core/base.js +40 -0
  6. package/dist/core/file.d.ts +11 -0
  7. package/dist/core/file.d.ts.map +1 -0
  8. package/dist/core/file.js +111 -0
  9. package/dist/core/modelCache.d.ts +35 -0
  10. package/dist/core/modelCache.d.ts.map +1 -0
  11. package/dist/core/modelCache.js +161 -0
  12. package/dist/core/posePostprocessing.d.ts +12 -0
  13. package/dist/core/posePostprocessing.d.ts.map +1 -0
  14. package/dist/core/posePostprocessing.js +76 -0
  15. package/dist/core/postprocessing.d.ts +10 -0
  16. package/dist/core/postprocessing.d.ts.map +1 -0
  17. package/dist/core/postprocessing.js +70 -0
  18. package/dist/core/preprocessing.d.ts +14 -0
  19. package/dist/core/preprocessing.d.ts.map +1 -0
  20. package/dist/core/preprocessing.js +79 -0
  21. package/dist/index.d.ts +27 -0
  22. package/dist/index.d.ts.map +1 -0
  23. package/dist/index.js +31 -0
  24. package/dist/models/rtmpose.d.ts +25 -0
  25. package/dist/models/rtmpose.d.ts.map +1 -0
  26. package/dist/models/rtmpose.js +185 -0
  27. package/dist/models/rtmpose3d.d.ts +28 -0
  28. package/dist/models/rtmpose3d.d.ts.map +1 -0
  29. package/dist/models/rtmpose3d.js +184 -0
  30. package/dist/models/yolo12.d.ts +23 -0
  31. package/dist/models/yolo12.d.ts.map +1 -0
  32. package/dist/models/yolo12.js +165 -0
  33. package/dist/models/yolox.d.ts +18 -0
  34. package/dist/models/yolox.d.ts.map +1 -0
  35. package/dist/models/yolox.js +167 -0
  36. package/dist/solution/animalDetector.d.ts +229 -0
  37. package/dist/solution/animalDetector.d.ts.map +1 -0
  38. package/dist/solution/animalDetector.js +663 -0
  39. package/dist/solution/body.d.ts +16 -0
  40. package/dist/solution/body.d.ts.map +1 -0
  41. package/dist/solution/body.js +52 -0
  42. package/dist/solution/bodyWithFeet.d.ts +16 -0
  43. package/dist/solution/bodyWithFeet.d.ts.map +1 -0
  44. package/dist/solution/bodyWithFeet.js +52 -0
  45. package/dist/solution/customDetector.d.ts +137 -0
  46. package/dist/solution/customDetector.d.ts.map +1 -0
  47. package/dist/solution/customDetector.js +342 -0
  48. package/dist/solution/hand.d.ts +14 -0
  49. package/dist/solution/hand.d.ts.map +1 -0
  50. package/dist/solution/hand.js +20 -0
  51. package/dist/solution/index.d.ts +10 -0
  52. package/dist/solution/index.d.ts.map +1 -0
  53. package/dist/solution/index.js +9 -0
  54. package/dist/solution/objectDetector.d.ts +172 -0
  55. package/dist/solution/objectDetector.d.ts.map +1 -0
  56. package/dist/solution/objectDetector.js +606 -0
  57. package/dist/solution/pose3dDetector.d.ts +145 -0
  58. package/dist/solution/pose3dDetector.d.ts.map +1 -0
  59. package/dist/solution/pose3dDetector.js +611 -0
  60. package/dist/solution/poseDetector.d.ts +198 -0
  61. package/dist/solution/poseDetector.d.ts.map +1 -0
  62. package/dist/solution/poseDetector.js +622 -0
  63. package/dist/solution/poseTracker.d.ts +22 -0
  64. package/dist/solution/poseTracker.d.ts.map +1 -0
  65. package/dist/solution/poseTracker.js +106 -0
  66. package/dist/solution/wholebody.d.ts +19 -0
  67. package/dist/solution/wholebody.d.ts.map +1 -0
  68. package/dist/solution/wholebody.js +82 -0
  69. package/dist/solution/wholebody3d.d.ts +22 -0
  70. package/dist/solution/wholebody3d.d.ts.map +1 -0
  71. package/dist/solution/wholebody3d.js +75 -0
  72. package/dist/types/index.d.ts +52 -0
  73. package/dist/types/index.d.ts.map +1 -0
  74. package/dist/types/index.js +5 -0
  75. package/dist/visualization/draw.d.ts +57 -0
  76. package/dist/visualization/draw.d.ts.map +1 -0
  77. package/dist/visualization/draw.js +400 -0
  78. package/dist/visualization/skeleton/coco133.d.ts +350 -0
  79. package/dist/visualization/skeleton/coco133.d.ts.map +1 -0
  80. package/dist/visualization/skeleton/coco133.js +120 -0
  81. package/dist/visualization/skeleton/coco17.d.ts +180 -0
  82. package/dist/visualization/skeleton/coco17.d.ts.map +1 -0
  83. package/dist/visualization/skeleton/coco17.js +48 -0
  84. package/dist/visualization/skeleton/halpe26.d.ts +278 -0
  85. package/dist/visualization/skeleton/halpe26.d.ts.map +1 -0
  86. package/dist/visualization/skeleton/halpe26.js +70 -0
  87. package/dist/visualization/skeleton/hand21.d.ts +196 -0
  88. package/dist/visualization/skeleton/hand21.d.ts.map +1 -0
  89. package/dist/visualization/skeleton/hand21.js +51 -0
  90. package/dist/visualization/skeleton/index.d.ts +10 -0
  91. package/dist/visualization/skeleton/index.d.ts.map +1 -0
  92. package/dist/visualization/skeleton/index.js +9 -0
  93. package/dist/visualization/skeleton/openpose134.d.ts +357 -0
  94. package/dist/visualization/skeleton/openpose134.d.ts.map +1 -0
  95. package/dist/visualization/skeleton/openpose134.js +116 -0
  96. package/dist/visualization/skeleton/openpose18.d.ts +177 -0
  97. package/dist/visualization/skeleton/openpose18.d.ts.map +1 -0
  98. package/dist/visualization/skeleton/openpose18.js +47 -0
  99. package/docs/ANIMAL_DETECTOR.md +450 -0
  100. package/docs/CUSTOM_DETECTOR.md +568 -0
  101. package/docs/OBJECT_DETECTOR.md +373 -0
  102. package/docs/POSE3D_DETECTOR.md +458 -0
  103. package/docs/POSE_DETECTOR.md +442 -0
  104. package/examples/README.md +119 -0
  105. package/examples/index.html +746 -0
  106. package/package.json +51 -0
  107. package/playground/README.md +114 -0
  108. package/playground/app/favicon.ico +0 -0
  109. package/playground/app/globals.css +17 -0
  110. package/playground/app/layout.tsx +19 -0
  111. package/playground/app/page.tsx +1338 -0
  112. package/playground/eslint.config.mjs +18 -0
  113. package/playground/next.config.ts +34 -0
  114. package/playground/package-lock.json +6723 -0
  115. package/playground/package.json +27 -0
  116. package/playground/postcss.config.mjs +7 -0
  117. package/playground/tsconfig.json +34 -0
  118. package/src/core/base.ts +66 -0
  119. package/src/core/file.ts +141 -0
  120. package/src/core/modelCache.ts +189 -0
  121. package/src/core/posePostprocessing.ts +91 -0
  122. package/src/core/postprocessing.ts +93 -0
  123. package/src/core/preprocessing.ts +127 -0
  124. package/src/index.ts +69 -0
  125. package/src/models/rtmpose.ts +265 -0
  126. package/src/models/rtmpose3d.ts +289 -0
  127. package/src/models/yolo12.ts +220 -0
  128. package/src/models/yolox.ts +214 -0
  129. package/src/solution/animalDetector.ts +955 -0
  130. package/src/solution/body.ts +89 -0
  131. package/src/solution/bodyWithFeet.ts +89 -0
  132. package/src/solution/customDetector.ts +474 -0
  133. package/src/solution/hand.ts +52 -0
  134. package/src/solution/index.ts +10 -0
  135. package/src/solution/objectDetector.ts +816 -0
  136. package/src/solution/pose3dDetector.ts +890 -0
  137. package/src/solution/poseDetector.ts +892 -0
  138. package/src/solution/poseTracker.ts +172 -0
  139. package/src/solution/wholebody.ts +130 -0
  140. package/src/solution/wholebody3d.ts +125 -0
  141. package/src/types/index.ts +62 -0
  142. package/src/visualization/draw.ts +543 -0
  143. package/src/visualization/skeleton/coco133.ts +131 -0
  144. package/src/visualization/skeleton/coco17.ts +49 -0
  145. package/src/visualization/skeleton/halpe26.ts +71 -0
  146. package/src/visualization/skeleton/hand21.ts +52 -0
  147. package/src/visualization/skeleton/index.ts +10 -0
  148. package/src/visualization/skeleton/openpose134.ts +125 -0
  149. package/src/visualization/skeleton/openpose18.ts +48 -0
  150. package/tsconfig.json +32 -0
@@ -0,0 +1,890 @@
1
+ /**
2
+ * Pose3DDetector - 3D Pose Estimation API
3
+ * Combines YOLOX detector with RTMW3D 3D pose model
4
+ *
5
+ * @example
6
+ * ```typescript
7
+ * // Initialize with default models
8
+ * const detector = new Pose3DDetector();
9
+ * await detector.init();
10
+ *
11
+ * // From canvas
12
+ * const result = await detector.detectFromCanvas(canvas);
13
+ * console.log(result.keypoints[0][0]); // [x, y, z] - 3D coordinates
14
+ *
15
+ * // With custom models
16
+ * const detector2 = new Pose3DDetector({
17
+ * detModel: 'path/to/yolox.onnx',
18
+ * poseModel: 'path/to/rtmw3d.onnx',
19
+ * });
20
+ * ```
21
+ */
22
+
23
+ import * as ort from 'onnxruntime-web';
24
+ import { getCachedModel, isModelCached } from '../core/modelCache';
25
+ import { Wholebody3DResult } from './wholebody3d';
26
+
27
+ // Configure ONNX Runtime Web
28
+ ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.0/dist/';
29
+ ort.env.wasm.simd = true;
30
+ ort.env.wasm.proxy = false;
31
+
32
+ /**
33
+ * Configuration options for Pose3DDetector
34
+ */
35
+ export interface Pose3DDetectorConfig {
36
+ /** Path to YOLOX detection model (optional - uses default if not specified) */
37
+ detModel?: string;
38
+ /** Path to RTMW3D 3D pose estimation model (optional - uses default if not specified) */
39
+ poseModel?: string;
40
+ /** Detection input size (default: [640, 640]) */
41
+ detInputSize?: [number, number];
42
+ /** Pose input size (default: [384, 288]) */
43
+ poseInputSize?: [number, number];
44
+ /** Detection confidence threshold (default: 0.45) */
45
+ detConfidence?: number;
46
+ /** NMS IoU threshold (default: 0.7) */
47
+ nmsThreshold?: number;
48
+ /** Pose keypoint confidence threshold (default: 0.3) */
49
+ poseConfidence?: number;
50
+ /** Execution backend (default: 'webgpu') */
51
+ backend?: 'wasm' | 'webgpu';
52
+ /** Enable model caching (default: true) */
53
+ cache?: boolean;
54
+ /** Z-axis range in meters (default: 2.1744869) */
55
+ zRange?: number;
56
+ }
57
+
58
+ /**
59
+ * 3D Person result with 3D keypoints
60
+ */
61
+ export interface Person3D {
62
+ /** Bounding box coordinates */
63
+ bbox: {
64
+ x1: number;
65
+ y1: number;
66
+ x2: number;
67
+ y2: number;
68
+ confidence: number;
69
+ };
70
+ /** 17 3D keypoints [x, y, z] in meters */
71
+ keypoints: number[][];
72
+ /** Keypoint scores (0-1) */
73
+ scores: number[];
74
+ /** 2D projection of keypoints */
75
+ keypoints2d: number[][];
76
+ /** Normalized SimCC coordinates */
77
+ keypointsSimcc: number[][];
78
+ }
79
+
80
+ /**
81
+ * Detection statistics
82
+ */
83
+ export interface Pose3DStats {
84
+ /** Number of detected people */
85
+ personCount: number;
86
+ /** Detection inference time (ms) */
87
+ detTime: number;
88
+ /** 3D Pose estimation time (ms) */
89
+ poseTime: number;
90
+ /** Total processing time (ms) */
91
+ totalTime: number;
92
+ }
93
+
94
+ /**
95
+ * COCO17 keypoint names
96
+ */
97
+ const KEYPOINT_NAMES_3D = [
98
+ 'nose',
99
+ 'left_eye',
100
+ 'right_eye',
101
+ 'left_ear',
102
+ 'right_ear',
103
+ 'left_shoulder',
104
+ 'right_shoulder',
105
+ 'left_elbow',
106
+ 'right_elbow',
107
+ 'left_wrist',
108
+ 'right_wrist',
109
+ 'left_hip',
110
+ 'right_hip',
111
+ 'left_knee',
112
+ 'right_knee',
113
+ 'left_ankle',
114
+ 'right_ankle',
115
+ ];
116
+
117
+ /**
118
+ * Default configuration - uses HuggingFace models
119
+ */
120
+ const DEFAULT_CONFIG: Required<Pose3DDetectorConfig> = {
121
+ detModel: 'https://huggingface.co/demon2233/rtmlib-ts/resolve/main/yolo/yolov12n.onnx',
122
+ poseModel: 'https://huggingface.co/Soykaf/RTMW3D-x/resolve/main/onnx/rtmw3d-x_8xb64_cocktail14-384x288-b0a0eab7_20240626.onnx',
123
+ detInputSize: [640, 640],
124
+ poseInputSize: [288, 384], // [width=288, height=384] - creates tensor [1,3,384,288]
125
+ detConfidence: 0.45,
126
+ nmsThreshold: 0.7,
127
+ poseConfidence: 0.3,
128
+ backend: 'webgpu', // Default to WebGPU for better performance
129
+ cache: true,
130
+ zRange: 2.1744869,
131
+ };
132
+
133
+ export class Pose3DDetector {
134
+ private config: Required<Pose3DDetectorConfig>;
135
+ private detSession: ort.InferenceSession | null = null;
136
+ private poseSession: ort.InferenceSession | null = null;
137
+ private initialized = false;
138
+ private outputNamesLogged = false;
139
+
140
+ // Pre-allocated buffers for better performance
141
+ private canvas: HTMLCanvasElement | null = null;
142
+ private ctx: CanvasRenderingContext2D | null = null;
143
+ private poseCanvas: HTMLCanvasElement | null = null;
144
+ private poseCtx: CanvasRenderingContext2D | null = null;
145
+ private poseTensorBuffer: Float32Array | null = null;
146
+ private detInputSize: [number, number] = [640, 640];
147
+ private poseInputSize: [number, number] = [288, 384]; // [width=288, height=384]
148
+
149
+ // Pre-allocated source canvas for pose cropping (avoid recreation)
150
+ private srcPoseCanvas: HTMLCanvasElement | null = null;
151
+ private srcPoseCtx: CanvasRenderingContext2D | null = null;
152
+
153
/**
 * Create a detector with the given overrides merged onto the defaults.
 *
 * Fix: explicitly-undefined properties in `config` (e.g. `{ detModel: undefined }`)
 * previously clobbered the corresponding default when spread; they are now dropped.
 *
 * @param config partial configuration; omitted fields use DEFAULT_CONFIG.
 */
constructor(config: Pose3DDetectorConfig = {}) {
  // Drop keys whose value is undefined so they cannot overwrite defaults.
  const provided = Object.fromEntries(
    Object.entries(config).filter(([, value]) => value !== undefined)
  ) as Pose3DDetectorConfig;
  this.config = { ...DEFAULT_CONFIG, ...provided };
  // Large 3D models: disable caching unless the caller opted in explicitly.
  if (provided.cache === undefined) {
    this.config.cache = false;
  }
}
160
+
161
+ /**
162
+ * Initialize both detection and 3D pose models
163
+ */
164
+ async init(): Promise<void> {
165
+ if (this.initialized) return;
166
+
167
+ try {
168
+ // Load detection model
169
+ console.log(`[Pose3DDetector] Loading detection model from: ${this.config.detModel}`);
170
+ let detBuffer: ArrayBuffer;
171
+
172
+ if (this.config.cache) {
173
+ const detCached = await isModelCached(this.config.detModel);
174
+ console.log(`[Pose3DDetector] Det model cache ${detCached ? 'hit' : 'miss'}`);
175
+ detBuffer = await getCachedModel(this.config.detModel);
176
+ } else {
177
+ const detResponse = await fetch(this.config.detModel);
178
+ if (!detResponse.ok) {
179
+ throw new Error(`Failed to fetch det model: HTTP ${detResponse.status}`);
180
+ }
181
+ detBuffer = await detResponse.arrayBuffer();
182
+ }
183
+
184
+ this.detSession = await ort.InferenceSession.create(detBuffer, {
185
+ executionProviders: [this.config.backend],
186
+ graphOptimizationLevel: 'all',
187
+ });
188
+ console.log(`[Pose3DDetector] Detection model loaded, size: ${(detBuffer.byteLength / 1024 / 1024).toFixed(2)} MB`);
189
+
190
+ // Load 3D pose model
191
+ console.log(`[Pose3DDetector] Loading 3D pose model from: ${this.config.poseModel}`);
192
+ let poseBuffer: ArrayBuffer;
193
+
194
+ if (this.config.cache) {
195
+ const poseCached = await isModelCached(this.config.poseModel);
196
+ console.log(`[Pose3DDetector] 3D Pose model cache ${poseCached ? 'hit' : 'miss'}`);
197
+ poseBuffer = await getCachedModel(this.config.poseModel);
198
+ } else {
199
+ const poseResponse = await fetch(this.config.poseModel);
200
+ if (!poseResponse.ok) {
201
+ throw new Error(`Failed to fetch pose model: HTTP ${poseResponse.status}`);
202
+ }
203
+ poseBuffer = await poseResponse.arrayBuffer();
204
+ }
205
+
206
+ this.poseSession = await ort.InferenceSession.create(poseBuffer, {
207
+ executionProviders: [this.config.backend],
208
+ graphOptimizationLevel: 'all',
209
+ });
210
+ console.log(`[Pose3DDetector] 3D Pose model loaded, size: ${(poseBuffer.byteLength / 1024 / 1024).toFixed(2)} MB`);
211
+
212
+ // Pre-allocate resources
213
+ const [detW, detH] = this.config.detInputSize;
214
+ this.detInputSize = [detW, detH];
215
+
216
+ const [poseW, poseH] = this.config.poseInputSize;
217
+ this.poseInputSize = [poseW, poseH];
218
+
219
+ // Main canvas for detection
220
+ this.canvas = document.createElement('canvas');
221
+ this.canvas.width = detW;
222
+ this.canvas.height = detH;
223
+ this.ctx = this.canvas.getContext('2d', {
224
+ willReadFrequently: true,
225
+ alpha: false
226
+ })!;
227
+
228
+ // Pose crop canvas
229
+ this.poseCanvas = document.createElement('canvas');
230
+ this.poseCanvas.width = poseW;
231
+ this.poseCanvas.height = poseH;
232
+ this.poseCtx = this.poseCanvas.getContext('2d', {
233
+ willReadFrequently: true,
234
+ alpha: false
235
+ })!;
236
+
237
+ // Pre-allocate pose tensor buffer
238
+ this.poseTensorBuffer = new Float32Array(3 * poseW * poseH);
239
+
240
+ // Source canvas will be created on first use (dynamic size)
241
+ this.srcPoseCanvas = null;
242
+ this.srcPoseCtx = null;
243
+
244
+ this.initialized = true;
245
+ console.log(`[Pose3DDetector] ✅ Initialized (det:${detW}x${detH}, pose:${poseW}x${poseH}, 3D)`);
246
+ } catch (error) {
247
+ console.error('[Pose3DDetector] ❌ Initialization failed:', error);
248
+ throw error;
249
+ }
250
+ }
251
+
252
+ /**
253
+ * Detect 3D poses from HTMLCanvasElement
254
+ */
255
+ async detectFromCanvas(canvas: HTMLCanvasElement): Promise<Wholebody3DResult> {
256
+ const ctx = canvas.getContext('2d');
257
+ if (!ctx) {
258
+ throw new Error('Could not get 2D context from canvas');
259
+ }
260
+
261
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
262
+ return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
263
+ }
264
+
265
+ /**
266
+ * Detect 3D poses from HTMLVideoElement
267
+ */
268
+ async detectFromVideo(
269
+ video: HTMLVideoElement,
270
+ targetCanvas?: HTMLCanvasElement
271
+ ): Promise<Wholebody3DResult> {
272
+ if (video.readyState < 2) {
273
+ throw new Error('Video not ready. Ensure video is loaded and playing.');
274
+ }
275
+
276
+ const canvas = targetCanvas || document.createElement('canvas');
277
+ canvas.width = video.videoWidth;
278
+ canvas.height = video.videoHeight;
279
+
280
+ const ctx = canvas.getContext('2d');
281
+ if (!ctx) {
282
+ throw new Error('Could not get 2D context from canvas');
283
+ }
284
+
285
+ ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
286
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
287
+
288
+ return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
289
+ }
290
+
291
+ /**
292
+ * Detect 3D poses from HTMLImageElement
293
+ */
294
+ async detectFromImage(
295
+ image: HTMLImageElement,
296
+ targetCanvas?: HTMLCanvasElement
297
+ ): Promise<Wholebody3DResult> {
298
+ if (!image.complete || !image.naturalWidth) {
299
+ throw new Error('Image not loaded. Ensure image is fully loaded.');
300
+ }
301
+
302
+ const canvas = targetCanvas || document.createElement('canvas');
303
+ canvas.width = image.naturalWidth;
304
+ canvas.height = image.naturalHeight;
305
+
306
+ const ctx = canvas.getContext('2d');
307
+ if (!ctx) {
308
+ throw new Error('Could not get 2D context from canvas');
309
+ }
310
+
311
+ ctx.drawImage(image, 0, 0);
312
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
313
+
314
+ return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
315
+ }
316
+
317
+ /**
318
+ * Detect 3D poses from ImageBitmap
319
+ */
320
+ async detectFromBitmap(
321
+ bitmap: ImageBitmap,
322
+ targetCanvas?: HTMLCanvasElement
323
+ ): Promise<Wholebody3DResult> {
324
+ const canvas = targetCanvas || document.createElement('canvas');
325
+ canvas.width = bitmap.width;
326
+ canvas.height = bitmap.height;
327
+
328
+ const ctx = canvas.getContext('2d');
329
+ if (!ctx) {
330
+ throw new Error('Could not get 2D context from canvas');
331
+ }
332
+
333
+ ctx.drawImage(bitmap, 0, 0);
334
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
335
+
336
+ return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
337
+ }
338
+
339
+ /**
340
+ * Detect 3D poses from File
341
+ */
342
+ async detectFromFile(
343
+ file: File,
344
+ targetCanvas?: HTMLCanvasElement
345
+ ): Promise<Wholebody3DResult> {
346
+ return new Promise((resolve, reject) => {
347
+ const img = new Image();
348
+ img.onload = async () => {
349
+ try {
350
+ const results = await this.detectFromImage(img, targetCanvas);
351
+ resolve(results);
352
+ } catch (error) {
353
+ reject(error);
354
+ }
355
+ };
356
+ img.onerror = () => reject(new Error('Failed to load image from file'));
357
+ img.src = URL.createObjectURL(file);
358
+ });
359
+ }
360
+
361
+ /**
362
+ * Detect 3D poses from Blob
363
+ */
364
+ async detectFromBlob(
365
+ blob: Blob,
366
+ targetCanvas?: HTMLCanvasElement
367
+ ): Promise<Wholebody3DResult> {
368
+ const bitmap = await createImageBitmap(blob);
369
+ const results = await this.detectFromBitmap(bitmap, targetCanvas);
370
+ bitmap.close();
371
+ return results;
372
+ }
373
+
374
+ /**
375
+ * Detect 3D poses from raw image data
376
+ */
377
+ async detect(
378
+ imageData: Uint8Array,
379
+ width: number,
380
+ height: number
381
+ ): Promise<Wholebody3DResult> {
382
+ if (!this.initialized) {
383
+ await this.init();
384
+ }
385
+
386
+ const startTime = performance.now();
387
+
388
+ // Step 1: Detect people
389
+ const detStart = performance.now();
390
+ const bboxes = await this.detectPeople(imageData, width, height);
391
+ const detTime = performance.now() - detStart;
392
+
393
+ // Step 2: Estimate 3D poses for each person
394
+ const poseStart = performance.now();
395
+ const allKeypoints: number[][][] = [];
396
+ const allScores: number[][] = [];
397
+ const allKeypointsSimcc: number[][][] = [];
398
+ const allKeypoints2d: number[][][] = [];
399
+
400
+ // Reset source canvas for new image (will be recreated on first bbox)
401
+ this.srcPoseCanvas = null;
402
+ this.srcPoseCtx = null;
403
+
404
+ for (const bbox of bboxes) {
405
+ const poseResult = await this.estimatePose3D(imageData, width, height, bbox);
406
+ allKeypoints.push(poseResult.keypoints);
407
+ allScores.push(poseResult.scores);
408
+ allKeypointsSimcc.push(poseResult.keypointsSimcc);
409
+ allKeypoints2d.push(poseResult.keypoints2d);
410
+ }
411
+
412
+ const poseTime = performance.now() - poseStart;
413
+ const totalTime = performance.now() - startTime;
414
+
415
+ // Attach stats
416
+ const result: Wholebody3DResult = {
417
+ keypoints: allKeypoints,
418
+ scores: allScores,
419
+ keypointsSimcc: allKeypointsSimcc,
420
+ keypoints2d: allKeypoints2d,
421
+ };
422
+
423
+ (result as any).stats = {
424
+ personCount: allKeypoints.length,
425
+ detTime: Math.round(detTime),
426
+ poseTime: Math.round(poseTime),
427
+ totalTime: Math.round(totalTime),
428
+ } as Pose3DStats;
429
+
430
+ return result;
431
+ }
432
+
433
+ /**
434
+ * Detect people using YOLOX
435
+ */
436
+ private async detectPeople(
437
+ imageData: Uint8Array,
438
+ width: number,
439
+ height: number
440
+ ): Promise<Array<{ x1: number; y1: number; x2: number; y2: number; confidence: number }>> {
441
+ const [inputH, inputW] = this.config.detInputSize;
442
+
443
+ const { tensor, paddingX, paddingY, scaleX, scaleY } = this.preprocessYOLO(
444
+ imageData,
445
+ width,
446
+ height,
447
+ [inputW, inputH]
448
+ );
449
+
450
+ const inputTensor = new ort.Tensor('float32', tensor, [1, 3, inputH, inputW]);
451
+ const inputName = this.detSession!.inputNames[0];
452
+
453
+ const feeds: Record<string, ort.Tensor> = {};
454
+ feeds[inputName] = inputTensor;
455
+
456
+ const results = await this.detSession!.run(feeds);
457
+ const output = results[this.detSession!.outputNames[0]];
458
+
459
+ return this.postprocessYOLO(
460
+ output.data as Float32Array,
461
+ output.dims[1],
462
+ width,
463
+ height,
464
+ paddingX,
465
+ paddingY,
466
+ scaleX,
467
+ scaleY
468
+ );
469
+ }
470
+
471
+ /**
472
+ * Estimate 3D pose for a single person
473
+ */
474
+ private async estimatePose3D(
475
+ imageData: Uint8Array,
476
+ imgWidth: number,
477
+ imgHeight: number,
478
+ bbox: { x1: number; y1: number; x2: number; y2: number; confidence: number }
479
+ ): Promise<{
480
+ keypoints: number[][];
481
+ scores: number[];
482
+ keypointsSimcc: number[][];
483
+ keypoints2d: number[][];
484
+ }> {
485
+ const [inputW, inputH] = this.config.poseInputSize;
486
+
487
+ const { tensor, center, scale } = this.preprocessPose(
488
+ imageData,
489
+ imgWidth,
490
+ imgHeight,
491
+ bbox,
492
+ [inputW, inputH]
493
+ );
494
+
495
+ const inputTensor = new ort.Tensor('float32', tensor, [1, 3, inputH, inputW]);
496
+
497
+ // Use dynamic input name
498
+ const inputName = this.poseSession!.inputNames[0];
499
+ const feeds: Record<string, ort.Tensor> = {};
500
+ feeds[inputName] = inputTensor;
501
+
502
+ const results = await this.poseSession!.run(feeds);
503
+
504
+ // Debug output names on first run only
505
+ if (!this.outputNamesLogged) {
506
+ console.log('[Pose3DDetector] Output names:', this.poseSession!.outputNames);
507
+ console.log('[Pose3DDetector] Output shapes:', this.poseSession!.outputNames.map(k => results[k].dims));
508
+ this.outputNamesLogged = true;
509
+ }
510
+
511
+ // Get output tensors using session's outputNames
512
+ // Model input is [width=288, height=384], so:
513
+ // X output has dim 576 (288*2), Y output has dim 768 (384*2)
514
+ const outputNames = this.poseSession!.outputNames;
515
+ let simccX: ort.Tensor, simccY: ort.Tensor, simccZ: ort.Tensor;
516
+
517
+ // Find outputs by shape
518
+ const shape0 = results[outputNames[0]].dims[2];
519
+ const shape1 = results[outputNames[1]].dims[2];
520
+ const shape2 = results[outputNames[2]].dims[2];
521
+
522
+ // X has smaller shape (576), Y has larger (768)
523
+ if (shape0 === 576) simccX = results[outputNames[0]];
524
+ else if (shape1 === 576) simccX = results[outputNames[1]];
525
+ else simccX = results[outputNames[2]];
526
+
527
+ if (shape0 === 768) simccY = results[outputNames[0]];
528
+ else if (shape1 === 768) simccY = results[outputNames[1]];
529
+ else simccY = results[outputNames[2]];
530
+
531
+ // Z is the remaining one
532
+ const usedIndices = [
533
+ simccX === results[outputNames[0]] ? 0 : simccX === results[outputNames[1]] ? 1 : 2,
534
+ simccY === results[outputNames[0]] ? 0 : simccY === results[outputNames[1]] ? 1 : 2,
535
+ ];
536
+ simccZ = results[outputNames[3 - usedIndices[0] - usedIndices[1]]];
537
+
538
+ return this.postprocessPose3D(
539
+ simccX.data as Float32Array,
540
+ simccY.data as Float32Array,
541
+ simccZ.data as Float32Array,
542
+ simccX.dims as number[],
543
+ simccY.dims as number[],
544
+ simccZ.dims as number[],
545
+ center,
546
+ scale,
547
+ imgWidth,
548
+ imgHeight
549
+ );
550
+ }
551
+
552
+ private preprocessYOLO(
553
+ imageData: Uint8Array,
554
+ imgWidth: number,
555
+ imgHeight: number,
556
+ inputSize: [number, number]
557
+ ): {
558
+ tensor: Float32Array;
559
+ paddingX: number;
560
+ paddingY: number;
561
+ scaleX: number;
562
+ scaleY: number;
563
+ } {
564
+ const [inputW, inputH] = inputSize;
565
+
566
+ if (!this.canvas || !this.ctx) {
567
+ this.canvas = document.createElement('canvas');
568
+ this.canvas.width = inputW;
569
+ this.canvas.height = inputH;
570
+ this.ctx = this.canvas.getContext('2d', { willReadFrequently: true, alpha: false })!;
571
+ }
572
+
573
+ const ctx = this.ctx;
574
+ ctx.fillStyle = '#000000';
575
+ ctx.fillRect(0, 0, inputW, inputH);
576
+
577
+ const aspectRatio = imgWidth / imgHeight;
578
+ const targetAspectRatio = inputW / inputH;
579
+
580
+ let drawWidth: number, drawHeight: number, offsetX: number, offsetY: number;
581
+
582
+ if (aspectRatio > targetAspectRatio) {
583
+ drawWidth = inputW;
584
+ drawHeight = Math.floor(inputW / aspectRatio);
585
+ offsetX = 0;
586
+ offsetY = Math.floor((inputH - drawHeight) / 2);
587
+ } else {
588
+ drawHeight = inputH;
589
+ drawWidth = Math.floor(inputH * aspectRatio);
590
+ offsetX = Math.floor((inputW - drawWidth) / 2);
591
+ offsetY = 0;
592
+ }
593
+
594
+ const srcCanvas = document.createElement('canvas');
595
+ const srcCtx = srcCanvas.getContext('2d')!;
596
+ srcCanvas.width = imgWidth;
597
+ srcCanvas.height = imgHeight;
598
+
599
+ const srcImageData = srcCtx.createImageData(imgWidth, imgHeight);
600
+ srcImageData.data.set(imageData);
601
+ srcCtx.putImageData(srcImageData, 0, 0);
602
+
603
+ ctx.drawImage(srcCanvas, 0, 0, imgWidth, imgHeight, offsetX, offsetY, drawWidth, drawHeight);
604
+
605
+ const paddedData = ctx.getImageData(0, 0, inputW, inputH);
606
+ const tensor = new Float32Array(inputW * inputH * 3);
607
+
608
+ for (let i = 0; i < paddedData.data.length; i += 4) {
609
+ const pixelIdx = i / 4;
610
+ tensor[pixelIdx] = paddedData.data[i] / 255;
611
+ tensor[pixelIdx + inputW * inputH] = paddedData.data[i + 1] / 255;
612
+ tensor[pixelIdx + 2 * inputW * inputH] = paddedData.data[i + 2] / 255;
613
+ }
614
+
615
+ const scaleX = imgWidth / drawWidth;
616
+ const scaleY = imgHeight / drawHeight;
617
+
618
+ return { tensor, paddingX: offsetX, paddingY: offsetY, scaleX, scaleY };
619
+ }
620
+
621
+ private postprocessYOLO(
622
+ output: Float32Array,
623
+ numDetections: number,
624
+ imgWidth: number,
625
+ imgHeight: number,
626
+ paddingX: number,
627
+ paddingY: number,
628
+ scaleX: number,
629
+ scaleY: number
630
+ ): Array<{ x1: number; y1: number; x2: number; y2: number; confidence: number }> {
631
+ const detections: Array<{ x1: number; y1: number; x2: number; y2: number; confidence: number }> = [];
632
+
633
+ for (let i = 0; i < numDetections; i++) {
634
+ const idx = i * 6;
635
+ const x1 = output[idx];
636
+ const y1 = output[idx + 1];
637
+ const x2 = output[idx + 2];
638
+ const y2 = output[idx + 3];
639
+ const confidence = output[idx + 4];
640
+ const classId = Math.round(output[idx + 5]);
641
+
642
+ if (confidence < this.config.detConfidence || classId !== 0) continue;
643
+
644
+ const tx1 = (x1 - paddingX) * scaleX;
645
+ const ty1 = (y1 - paddingY) * scaleY;
646
+ const tx2 = (x2 - paddingX) * scaleX;
647
+ const ty2 = (y2 - paddingY) * scaleY;
648
+
649
+ detections.push({
650
+ x1: Math.max(0, tx1),
651
+ y1: Math.max(0, ty1),
652
+ x2: Math.min(imgWidth, tx2),
653
+ y2: Math.min(imgHeight, ty2),
654
+ confidence,
655
+ });
656
+ }
657
+
658
+ return this.applyNMS(detections, this.config.nmsThreshold);
659
+ }
660
+
661
+ private preprocessPose(
662
+ imageData: Uint8Array,
663
+ imgWidth: number,
664
+ imgHeight: number,
665
+ bbox: { x1: number; y1: number; x2: number; y2: number; confidence: number },
666
+ inputSize: [number, number]
667
+ ): { tensor: Float32Array; center: [number, number]; scale: [number, number] } {
668
+ const [inputW, inputH] = inputSize;
669
+ const bboxWidth = bbox.x2 - bbox.x1;
670
+ const bboxHeight = bbox.y2 - bbox.y1;
671
+
672
+ // Center of bbox (same as Python)
673
+ const center: [number, number] = [
674
+ bbox.x1 + bboxWidth / 2,
675
+ bbox.y1 + bboxHeight / 2,
676
+ ];
677
+
678
+ // Scale with padding (same as Python bbox_xyxy2cs with padding=1.25)
679
+ let scaleW = bboxWidth * 1.25;
680
+ let scaleH = bboxHeight * 1.25;
681
+
682
+ // Adjust scale to match model aspect ratio (same as top_down_affine)
683
+ const modelAspectRatio = inputW / inputH;
684
+ const bboxAspectRatio = scaleW / scaleH;
685
+
686
+ if (bboxAspectRatio > modelAspectRatio) {
687
+ scaleH = scaleW / modelAspectRatio;
688
+ } else {
689
+ scaleW = scaleH * modelAspectRatio;
690
+ }
691
+
692
+ const scale: [number, number] = [scaleW, scaleH];
693
+
694
+ // Reuse pose canvas
695
+ if (!this.poseCanvas || !this.poseCtx) {
696
+ this.poseCanvas = document.createElement('canvas');
697
+ this.poseCanvas.width = inputW;
698
+ this.poseCanvas.height = inputH;
699
+ this.poseCtx = this.poseCanvas.getContext('2d', {
700
+ willReadFrequently: true,
701
+ alpha: false
702
+ })!;
703
+ }
704
+
705
+ // Reuse source canvas for original image (avoid recreation per bbox)
706
+ if (!this.srcPoseCanvas || !this.srcPoseCtx) {
707
+ this.srcPoseCanvas = document.createElement('canvas');
708
+ this.srcPoseCanvas.width = imgWidth;
709
+ this.srcPoseCanvas.height = imgHeight;
710
+ this.srcPoseCtx = this.srcPoseCanvas.getContext('2d', {
711
+ willReadFrequently: true,
712
+ alpha: false
713
+ })!;
714
+ // Copy image data once
715
+ const srcImageData = this.srcPoseCtx.createImageData(imgWidth, imgHeight);
716
+ srcImageData.data.set(imageData);
717
+ this.srcPoseCtx.putImageData(srcImageData, 0, 0);
718
+ }
719
+
720
+ const ctx = this.poseCtx;
721
+ ctx.clearRect(0, 0, inputW, inputH);
722
+
723
+ // Crop and resize using drawImage (single GPU operation)
724
+ const srcX = center[0] - scaleW / 2;
725
+ const srcY = center[1] - scaleH / 2;
726
+ ctx.drawImage(this.srcPoseCanvas, srcX, srcY, scaleW, scaleH, 0, 0, inputW, inputH);
727
+
728
+ const croppedData = ctx.getImageData(0, 0, inputW, inputH);
729
+ const tensor = this.poseTensorBuffer!;
730
+ const len = croppedData.data.length;
731
+ const planeSize = inputW * inputH;
732
+
733
+ // Normalization constants
734
+ const mean0 = 123.675, mean1 = 116.28, mean2 = 103.53;
735
+ const stdInv0 = 1 / 58.395, stdInv1 = 1 / 57.12, stdInv2 = 1 / 57.375;
736
+
737
+ // Optimized normalization loop - process 4 pixels at once (SIMD-like)
738
+ for (let i = 0; i < len; i += 16) {
739
+ const p1 = i / 4, p2 = p1 + 1, p3 = p1 + 2, p4 = p1 + 3;
740
+
741
+ // R channel
742
+ tensor[p1] = (croppedData.data[i] - mean0) * stdInv0;
743
+ tensor[p2] = (croppedData.data[i + 4] - mean0) * stdInv0;
744
+ tensor[p3] = (croppedData.data[i + 8] - mean0) * stdInv0;
745
+ tensor[p4] = (croppedData.data[i + 12] - mean0) * stdInv0;
746
+
747
+ // G channel
748
+ tensor[p1 + planeSize] = (croppedData.data[i + 1] - mean1) * stdInv1;
749
+ tensor[p2 + planeSize] = (croppedData.data[i + 5] - mean1) * stdInv1;
750
+ tensor[p3 + planeSize] = (croppedData.data[i + 9] - mean1) * stdInv1;
751
+ tensor[p4 + planeSize] = (croppedData.data[i + 13] - mean1) * stdInv1;
752
+
753
+ // B channel
754
+ tensor[p1 + planeSize * 2] = (croppedData.data[i + 2] - mean2) * stdInv2;
755
+ tensor[p2 + planeSize * 2] = (croppedData.data[i + 6] - mean2) * stdInv2;
756
+ tensor[p3 + planeSize * 2] = (croppedData.data[i + 10] - mean2) * stdInv2;
757
+ tensor[p4 + planeSize * 2] = (croppedData.data[i + 14] - mean2) * stdInv2;
758
+ }
759
+
760
+ return { tensor, center, scale };
761
+ }
762
+
763
+ private postprocessPose3D(
764
+ simccX: Float32Array,
765
+ simccY: Float32Array,
766
+ simccZ: Float32Array,
767
+ shapeX: number[],
768
+ shapeY: number[],
769
+ shapeZ: number[],
770
+ center: [number, number],
771
+ scale: [number, number],
772
+ imgWidth: number,
773
+ imgHeight: number
774
+ ): {
775
+ keypoints: number[][];
776
+ scores: number[];
777
+ keypointsSimcc: number[][];
778
+ keypoints2d: number[][];
779
+ } {
780
+ const numKeypoints = shapeX[1];
781
+ const wx = shapeX[2];
782
+ const wy = shapeY[2];
783
+ const wz = shapeZ[2];
784
+
785
+ const keypoints: number[][] = [];
786
+ const scores: number[] = [];
787
+ const keypointsSimcc: number[][] = [];
788
+ const keypoints2d: number[][] = [];
789
+
790
+ for (let k = 0; k < numKeypoints; k++) {
791
+ let maxX = -Infinity, argmaxX = 0;
792
+ for (let i = 0; i < wx; i++) {
793
+ const val = simccX[k * wx + i];
794
+ if (val > maxX) { maxX = val; argmaxX = i; }
795
+ }
796
+
797
+ let maxY = -Infinity, argmaxY = 0;
798
+ for (let i = 0; i < wy; i++) {
799
+ const val = simccY[k * wy + i];
800
+ if (val > maxY) { maxY = val; argmaxY = i; }
801
+ }
802
+
803
+ let maxZ = -Infinity, argmaxZ = 0;
804
+ for (let i = 0; i < wz; i++) {
805
+ const val = simccZ[k * wz + i];
806
+ if (val > maxZ) { maxZ = val; argmaxZ = i; }
807
+ }
808
+
809
+ const score = maxX > maxY ? maxX : maxY;
810
+
811
+ // Normalize to [0, 1]
812
+ const normX = argmaxX / wx;
813
+ const normY = argmaxY / wy;
814
+ const normZ = argmaxZ / wz;
815
+
816
+ // 3D coordinates in model space
817
+ const kptX = (normX - 0.5) * 2.0;
818
+ const kptY = (normY - 0.5) * 2.0;
819
+ const kptZMetric = (normZ - 0.5) * this.config.zRange * 2;
820
+
821
+ keypoints.push([kptX, kptY, kptZMetric]);
822
+ keypointsSimcc.push([normX, normY, normZ]);
823
+
824
+ // 2D coordinates in original image space
825
+ // Convert from normalized SimCC coords [0, 1] to crop space, then to image space
826
+ // Formula: kpt = center - scale/2 + norm * scale (same as in rtmpose3d.ts)
827
+ const kpt2dX = normX * scale[0] + center[0] - 0.5 * scale[0];
828
+ const kpt2dY = normY * scale[1] + center[1] - 0.5 * scale[1];
829
+
830
+ // Clamp to image bounds
831
+ const clampedX = Math.max(0, Math.min(imgWidth, kpt2dX));
832
+ const clampedY = Math.max(0, Math.min(imgHeight, kpt2dY));
833
+
834
+ keypoints2d.push([clampedX, clampedY]);
835
+
836
+ scores.push(score);
837
+ }
838
+
839
+ return { keypoints, scores, keypointsSimcc, keypoints2d };
840
+ }
841
+
842
+ private applyNMS(
843
+ detections: Array<{ x1: number; y1: number; x2: number; y2: number; confidence: number }>,
844
+ iouThreshold: number
845
+ ): typeof detections {
846
+ if (detections.length === 0) return [];
847
+
848
+ detections.sort((a, b) => b.confidence - a.confidence);
849
+
850
+ const selected: typeof detections = [];
851
+ const used = new Set<number>();
852
+
853
+ for (let i = 0; i < detections.length; i++) {
854
+ if (used.has(i)) continue;
855
+
856
+ selected.push(detections[i]);
857
+ used.add(i);
858
+
859
+ for (let j = i + 1; j < detections.length; j++) {
860
+ if (used.has(j)) continue;
861
+
862
+ const iou = this.calculateIoU(detections[i], detections[j]);
863
+ if (iou > iouThreshold) {
864
+ used.add(j);
865
+ }
866
+ }
867
+ }
868
+
869
+ return selected;
870
+ }
871
+
872
+ private calculateIoU(
873
+ box1: { x1: number; y1: number; x2: number; y2: number },
874
+ box2: { x1: number; y1: number; x2: number; y2: number }
875
+ ): number {
876
+ const x1 = Math.max(box1.x1, box2.x1);
877
+ const y1 = Math.max(box1.y1, box2.y1);
878
+ const x2 = Math.min(box1.x2, box2.x2);
879
+ const y2 = Math.min(box1.y2, box2.y2);
880
+
881
+ if (x2 <= x1 || y2 <= y1) return 0;
882
+
883
+ const intersection = (x2 - x1) * (y2 - y1);
884
+ const area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
885
+ const area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
886
+ const union = area1 + area2 - intersection;
887
+
888
+ return intersection / union;
889
+ }
890
+ }