rtmlib-ts 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/.gitattributes +1 -0
  2. package/README.md +202 -0
  3. package/dist/core/base.d.ts +20 -0
  4. package/dist/core/base.d.ts.map +1 -0
  5. package/dist/core/base.js +40 -0
  6. package/dist/core/file.d.ts +11 -0
  7. package/dist/core/file.d.ts.map +1 -0
  8. package/dist/core/file.js +111 -0
  9. package/dist/core/modelCache.d.ts +35 -0
  10. package/dist/core/modelCache.d.ts.map +1 -0
  11. package/dist/core/modelCache.js +161 -0
  12. package/dist/core/posePostprocessing.d.ts +12 -0
  13. package/dist/core/posePostprocessing.d.ts.map +1 -0
  14. package/dist/core/posePostprocessing.js +76 -0
  15. package/dist/core/postprocessing.d.ts +10 -0
  16. package/dist/core/postprocessing.d.ts.map +1 -0
  17. package/dist/core/postprocessing.js +70 -0
  18. package/dist/core/preprocessing.d.ts +14 -0
  19. package/dist/core/preprocessing.d.ts.map +1 -0
  20. package/dist/core/preprocessing.js +79 -0
  21. package/dist/index.d.ts +27 -0
  22. package/dist/index.d.ts.map +1 -0
  23. package/dist/index.js +31 -0
  24. package/dist/models/rtmpose.d.ts +25 -0
  25. package/dist/models/rtmpose.d.ts.map +1 -0
  26. package/dist/models/rtmpose.js +185 -0
  27. package/dist/models/rtmpose3d.d.ts +28 -0
  28. package/dist/models/rtmpose3d.d.ts.map +1 -0
  29. package/dist/models/rtmpose3d.js +184 -0
  30. package/dist/models/yolo12.d.ts +23 -0
  31. package/dist/models/yolo12.d.ts.map +1 -0
  32. package/dist/models/yolo12.js +165 -0
  33. package/dist/models/yolox.d.ts +18 -0
  34. package/dist/models/yolox.d.ts.map +1 -0
  35. package/dist/models/yolox.js +167 -0
  36. package/dist/solution/animalDetector.d.ts +229 -0
  37. package/dist/solution/animalDetector.d.ts.map +1 -0
  38. package/dist/solution/animalDetector.js +663 -0
  39. package/dist/solution/body.d.ts +16 -0
  40. package/dist/solution/body.d.ts.map +1 -0
  41. package/dist/solution/body.js +52 -0
  42. package/dist/solution/bodyWithFeet.d.ts +16 -0
  43. package/dist/solution/bodyWithFeet.d.ts.map +1 -0
  44. package/dist/solution/bodyWithFeet.js +52 -0
  45. package/dist/solution/customDetector.d.ts +137 -0
  46. package/dist/solution/customDetector.d.ts.map +1 -0
  47. package/dist/solution/customDetector.js +342 -0
  48. package/dist/solution/hand.d.ts +14 -0
  49. package/dist/solution/hand.d.ts.map +1 -0
  50. package/dist/solution/hand.js +20 -0
  51. package/dist/solution/index.d.ts +10 -0
  52. package/dist/solution/index.d.ts.map +1 -0
  53. package/dist/solution/index.js +9 -0
  54. package/dist/solution/objectDetector.d.ts +172 -0
  55. package/dist/solution/objectDetector.d.ts.map +1 -0
  56. package/dist/solution/objectDetector.js +606 -0
  57. package/dist/solution/pose3dDetector.d.ts +145 -0
  58. package/dist/solution/pose3dDetector.d.ts.map +1 -0
  59. package/dist/solution/pose3dDetector.js +611 -0
  60. package/dist/solution/poseDetector.d.ts +198 -0
  61. package/dist/solution/poseDetector.d.ts.map +1 -0
  62. package/dist/solution/poseDetector.js +622 -0
  63. package/dist/solution/poseTracker.d.ts +22 -0
  64. package/dist/solution/poseTracker.d.ts.map +1 -0
  65. package/dist/solution/poseTracker.js +106 -0
  66. package/dist/solution/wholebody.d.ts +19 -0
  67. package/dist/solution/wholebody.d.ts.map +1 -0
  68. package/dist/solution/wholebody.js +82 -0
  69. package/dist/solution/wholebody3d.d.ts +22 -0
  70. package/dist/solution/wholebody3d.d.ts.map +1 -0
  71. package/dist/solution/wholebody3d.js +75 -0
  72. package/dist/types/index.d.ts +52 -0
  73. package/dist/types/index.d.ts.map +1 -0
  74. package/dist/types/index.js +5 -0
  75. package/dist/visualization/draw.d.ts +57 -0
  76. package/dist/visualization/draw.d.ts.map +1 -0
  77. package/dist/visualization/draw.js +400 -0
  78. package/dist/visualization/skeleton/coco133.d.ts +350 -0
  79. package/dist/visualization/skeleton/coco133.d.ts.map +1 -0
  80. package/dist/visualization/skeleton/coco133.js +120 -0
  81. package/dist/visualization/skeleton/coco17.d.ts +180 -0
  82. package/dist/visualization/skeleton/coco17.d.ts.map +1 -0
  83. package/dist/visualization/skeleton/coco17.js +48 -0
  84. package/dist/visualization/skeleton/halpe26.d.ts +278 -0
  85. package/dist/visualization/skeleton/halpe26.d.ts.map +1 -0
  86. package/dist/visualization/skeleton/halpe26.js +70 -0
  87. package/dist/visualization/skeleton/hand21.d.ts +196 -0
  88. package/dist/visualization/skeleton/hand21.d.ts.map +1 -0
  89. package/dist/visualization/skeleton/hand21.js +51 -0
  90. package/dist/visualization/skeleton/index.d.ts +10 -0
  91. package/dist/visualization/skeleton/index.d.ts.map +1 -0
  92. package/dist/visualization/skeleton/index.js +9 -0
  93. package/dist/visualization/skeleton/openpose134.d.ts +357 -0
  94. package/dist/visualization/skeleton/openpose134.d.ts.map +1 -0
  95. package/dist/visualization/skeleton/openpose134.js +116 -0
  96. package/dist/visualization/skeleton/openpose18.d.ts +177 -0
  97. package/dist/visualization/skeleton/openpose18.d.ts.map +1 -0
  98. package/dist/visualization/skeleton/openpose18.js +47 -0
  99. package/docs/ANIMAL_DETECTOR.md +450 -0
  100. package/docs/CUSTOM_DETECTOR.md +568 -0
  101. package/docs/OBJECT_DETECTOR.md +373 -0
  102. package/docs/POSE3D_DETECTOR.md +458 -0
  103. package/docs/POSE_DETECTOR.md +442 -0
  104. package/examples/README.md +119 -0
  105. package/examples/index.html +746 -0
  106. package/package.json +51 -0
  107. package/playground/README.md +114 -0
  108. package/playground/app/favicon.ico +0 -0
  109. package/playground/app/globals.css +17 -0
  110. package/playground/app/layout.tsx +19 -0
  111. package/playground/app/page.tsx +1338 -0
  112. package/playground/eslint.config.mjs +18 -0
  113. package/playground/next.config.ts +34 -0
  114. package/playground/package-lock.json +6723 -0
  115. package/playground/package.json +27 -0
  116. package/playground/postcss.config.mjs +7 -0
  117. package/playground/tsconfig.json +34 -0
  118. package/src/core/base.ts +66 -0
  119. package/src/core/file.ts +141 -0
  120. package/src/core/modelCache.ts +189 -0
  121. package/src/core/posePostprocessing.ts +91 -0
  122. package/src/core/postprocessing.ts +93 -0
  123. package/src/core/preprocessing.ts +127 -0
  124. package/src/index.ts +69 -0
  125. package/src/models/rtmpose.ts +265 -0
  126. package/src/models/rtmpose3d.ts +289 -0
  127. package/src/models/yolo12.ts +220 -0
  128. package/src/models/yolox.ts +214 -0
  129. package/src/solution/animalDetector.ts +955 -0
  130. package/src/solution/body.ts +89 -0
  131. package/src/solution/bodyWithFeet.ts +89 -0
  132. package/src/solution/customDetector.ts +474 -0
  133. package/src/solution/hand.ts +52 -0
  134. package/src/solution/index.ts +10 -0
  135. package/src/solution/objectDetector.ts +816 -0
  136. package/src/solution/pose3dDetector.ts +890 -0
  137. package/src/solution/poseDetector.ts +892 -0
  138. package/src/solution/poseTracker.ts +172 -0
  139. package/src/solution/wholebody.ts +130 -0
  140. package/src/solution/wholebody3d.ts +125 -0
  141. package/src/types/index.ts +62 -0
  142. package/src/visualization/draw.ts +543 -0
  143. package/src/visualization/skeleton/coco133.ts +131 -0
  144. package/src/visualization/skeleton/coco17.ts +49 -0
  145. package/src/visualization/skeleton/halpe26.ts +71 -0
  146. package/src/visualization/skeleton/hand21.ts +52 -0
  147. package/src/visualization/skeleton/index.ts +10 -0
  148. package/src/visualization/skeleton/openpose134.ts +125 -0
  149. package/src/visualization/skeleton/openpose18.ts +48 -0
  150. package/tsconfig.json +32 -0
@@ -0,0 +1,611 @@
1
+ /**
2
+ * Pose3DDetector - 3D Pose Estimation API
3
+ * Combines a YOLO person detector (YOLOv12n by default) with the RTMW3D 3D pose model
4
+ *
5
+ * @example
6
+ * ```typescript
7
+ * // Initialize with default models
8
+ * const detector = new Pose3DDetector();
9
+ * await detector.init();
10
+ *
11
+ * // From canvas
12
+ * const result = await detector.detectFromCanvas(canvas);
13
+ * console.log(result.keypoints[0][0]); // [x, y, z] - 3D coordinates
14
+ *
15
+ * // With custom models
16
+ * const detector2 = new Pose3DDetector({
17
+ * detModel: 'path/to/yolox.onnx',
18
+ * poseModel: 'path/to/rtmw3d.onnx',
19
+ * });
20
+ * ```
21
+ */
22
+ import * as ort from 'onnxruntime-web';
23
+ import { getCachedModel, isModelCached } from '../core/modelCache';
24
+ // Configure ONNX Runtime Web at module load (side effect of importing this file): set the WASM asset path prefix to the jsDelivr CDN copy of onnxruntime-web 1.23.0, set `simd` to true and `proxy` to false
25
+ ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.0/dist/';
26
+ ort.env.wasm.simd = true;
27
+ ort.env.wasm.proxy = false;
28
/**
 * COCO-17 body keypoint names, in index order.
 * NOTE(review): this list is not referenced anywhere in the visible part of
 * this file, and the pose model may emit more than 17 keypoints - confirm
 * before using it to label model output.
 */
const KEYPOINT_NAMES_3D = Object.freeze([
    'nose',
    'left_eye',
    'right_eye',
    'left_ear',
    'right_ear',
    'left_shoulder',
    'right_shoulder',
    'left_elbow',
    'right_elbow',
    'left_wrist',
    'right_wrist',
    'left_hip',
    'right_hip',
    'left_knee',
    'right_knee',
    'left_ankle',
    'right_ankle',
]);
/**
 * Default configuration - models are downloaded from HuggingFace.
 * Frozen because it is shared by every detector instance; the constructor
 * copies it before applying user overrides.
 */
const DEFAULT_CONFIG = Object.freeze({
    // Person detector (YOLOv12n ONNX export).
    detModel: 'https://huggingface.co/demon2233/rtmlib-ts/resolve/main/yolo/yolov12n.onnx',
    // RTMW3D-x 3D whole-body pose model.
    poseModel: 'https://huggingface.co/Soykaf/RTMW3D-x/resolve/main/onnx/rtmw3d-x_8xb64_cocktail14-384x288-b0a0eab7_20240626.onnx',
    // [width, height] of the detector input.
    detInputSize: [640, 640],
    // [width=288, height=384] - yields an input tensor of shape [1, 3, 384, 288].
    poseInputSize: [288, 384],
    detConfidence: 0.45,
    nmsThreshold: 0.7,
    poseConfidence: 0.3,
    // Execution provider handed to ONNX Runtime Web.
    backend: 'webgpu',
    // Model caching is off unless the caller opts in (matches the constructor,
    // which has always forced `false` when `cache` is omitted).
    cache: false,
    // Half-range used to map the normalized Z coordinate to model-space depth.
    zRange: 2.1744869,
});
65
/**
 * Two-stage 3D pose estimator: a YOLO person detector produces bounding
 * boxes, then a SimCC-style 3D pose model is run on each box crop.
 *
 * All image handling goes through DOM canvases, so the class must run in an
 * environment that provides `document`.
 */
export class Pose3DDetector {
    /**
     * @param {object} [config] - Partial configuration; missing keys fall back
     *   to DEFAULT_CONFIG. `cache` is forced to `false` when omitted.
     */
    constructor(config = {}) {
        this.detSession = null;
        this.poseSession = null;
        this.initialized = false;
        this.outputNamesLogged = false;
        // Pre-allocated buffers for better performance
        this.canvas = null;
        this.ctx = null;
        this.poseCanvas = null;
        this.poseCtx = null;
        this.poseTensorBuffer = null;
        this.detInputSize = [640, 640];
        this.poseInputSize = [288, 384]; // [width=288, height=384]
        // Source canvas for pose cropping, shared by all boxes of one image
        this.srcPoseCanvas = null;
        this.srcPoseCtx = null;
        this.config = { ...DEFAULT_CONFIG, ...config };
        // Disable caching for large 3D models unless explicitly requested
        if (config.cache === undefined) {
            this.config.cache = false;
        }
    }
    /**
     * Fetch a model file, through the model cache when `config.cache` is set.
     *
     * @param {string} url - Model location.
     * @param {string} cacheLabel - Label used in the cache hit/miss log line.
     * @param {string} fetchLabel - Label used in the HTTP failure message.
     * @returns {Promise<ArrayBuffer>} Raw model bytes.
     * @throws {Error} When the direct fetch answers with a non-OK status.
     */
    async loadModelBuffer(url, cacheLabel, fetchLabel) {
        if (this.config.cache) {
            const cached = await isModelCached(url);
            console.log(`[Pose3DDetector] ${cacheLabel} model cache ${cached ? 'hit' : 'miss'}`);
            return getCachedModel(url);
        }
        const response = await fetch(url);
        if (!response.ok) {
            throw new Error(`Failed to fetch ${fetchLabel} model: HTTP ${response.status}`);
        }
        return response.arrayBuffer();
    }
    /**
     * Initialize both detection and 3D pose models and pre-allocate the
     * working canvases and the pose tensor buffer. Returns immediately when
     * already initialized. Failures are logged and rethrown.
     */
    async init() {
        if (this.initialized) {
            return;
        }
        try {
            const sessionOptions = {
                executionProviders: [this.config.backend],
                graphOptimizationLevel: 'all',
            };
            // Load detection model
            console.log(`[Pose3DDetector] Loading detection model from: ${this.config.detModel}`);
            const detBuffer = await this.loadModelBuffer(this.config.detModel, 'Det', 'det');
            this.detSession = await ort.InferenceSession.create(detBuffer, sessionOptions);
            console.log(`[Pose3DDetector] Detection model loaded, size: ${(detBuffer.byteLength / 1024 / 1024).toFixed(2)} MB`);
            // Load 3D pose model
            console.log(`[Pose3DDetector] Loading 3D pose model from: ${this.config.poseModel}`);
            const poseBuffer = await this.loadModelBuffer(this.config.poseModel, '3D Pose', 'pose');
            this.poseSession = await ort.InferenceSession.create(poseBuffer, sessionOptions);
            console.log(`[Pose3DDetector] 3D Pose model loaded, size: ${(poseBuffer.byteLength / 1024 / 1024).toFixed(2)} MB`);
            // Pre-allocate resources; both sizes are [width, height]
            const [detW, detH] = this.config.detInputSize;
            this.detInputSize = [detW, detH];
            const [poseW, poseH] = this.config.poseInputSize;
            this.poseInputSize = [poseW, poseH];
            const contextOptions = { willReadFrequently: true, alpha: false };
            // Main canvas for detection (letterboxed detector input)
            this.canvas = document.createElement('canvas');
            this.canvas.width = detW;
            this.canvas.height = detH;
            this.ctx = this.canvas.getContext('2d', contextOptions);
            // Pose crop canvas
            this.poseCanvas = document.createElement('canvas');
            this.poseCanvas.width = poseW;
            this.poseCanvas.height = poseH;
            this.poseCtx = this.poseCanvas.getContext('2d', contextOptions);
            // Planar (CHW) float buffer reused for every pose crop
            this.poseTensorBuffer = new Float32Array(3 * poseW * poseH);
            // Source canvas is created on first use because its size follows the image
            this.srcPoseCanvas = null;
            this.srcPoseCtx = null;
            this.initialized = true;
            console.log(`[Pose3DDetector] ✅ Initialized (det:${detW}x${detH}, pose:${poseW}x${poseH}, 3D)`);
        }
        catch (error) {
            console.error('[Pose3DDetector] ❌ Initialization failed:', error);
            throw error;
        }
    }
    /**
     * Read back the full RGBA content of a 2D context and run detection on it.
     *
     * @param {CanvasRenderingContext2D} ctx - Context to read pixels from.
     * @param {number} width - Width of the region to read.
     * @param {number} height - Height of the region to read.
     */
    detectFromContext(ctx, width, height) {
        const imageData = ctx.getImageData(0, 0, width, height);
        return this.detect(new Uint8Array(imageData.data.buffer), width, height);
    }
    /**
     * Draw a drawable source onto a canvas resized to `width` x `height`,
     * then run detection on the resulting pixels.
     *
     * @param {CanvasImageSource} source - Image, video frame or bitmap.
     * @param {number} width - Source width in pixels.
     * @param {number} height - Source height in pixels.
     * @param {HTMLCanvasElement} [targetCanvas] - Canvas to reuse; it is resized
     *   and overwritten. A temporary canvas is created when omitted.
     * @throws {Error} When no 2D context can be obtained.
     */
    detectFromDrawable(source, width, height, targetCanvas) {
        const canvas = targetCanvas || document.createElement('canvas');
        canvas.width = width;
        canvas.height = height;
        const ctx = canvas.getContext('2d');
        if (!ctx) {
            throw new Error('Could not get 2D context from canvas');
        }
        ctx.drawImage(source, 0, 0, width, height);
        return this.detectFromContext(ctx, width, height);
    }
    /**
     * Detect 3D poses from HTMLCanvasElement
     *
     * @throws {Error} When the canvas has no 2D context.
     */
    async detectFromCanvas(canvas) {
        const ctx = canvas.getContext('2d');
        if (!ctx) {
            throw new Error('Could not get 2D context from canvas');
        }
        return this.detectFromContext(ctx, canvas.width, canvas.height);
    }
    /**
     * Detect 3D poses from HTMLVideoElement (current frame)
     *
     * @throws {Error} When the video has no current frame data (readyState < 2).
     */
    async detectFromVideo(video, targetCanvas) {
        if (video.readyState < 2) {
            throw new Error('Video not ready. Ensure video is loaded and playing.');
        }
        return this.detectFromDrawable(video, video.videoWidth, video.videoHeight, targetCanvas);
    }
    /**
     * Detect 3D poses from HTMLImageElement
     *
     * @throws {Error} When the image is not fully loaded.
     */
    async detectFromImage(image, targetCanvas) {
        if (!image.complete || !image.naturalWidth) {
            throw new Error('Image not loaded. Ensure image is fully loaded.');
        }
        return this.detectFromDrawable(image, image.naturalWidth, image.naturalHeight, targetCanvas);
    }
    /**
     * Detect 3D poses from ImageBitmap
     */
    async detectFromBitmap(bitmap, targetCanvas) {
        return this.detectFromDrawable(bitmap, bitmap.width, bitmap.height, targetCanvas);
    }
    /**
     * Detect 3D poses from File
     *
     * The temporary object URL is revoked once detection settles, whether it
     * succeeded or failed, so the file data is not kept alive by the URL.
     *
     * @throws {Error} When the file cannot be decoded as an image.
     */
    async detectFromFile(file, targetCanvas) {
        const objectUrl = URL.createObjectURL(file);
        try {
            const img = await new Promise((resolve, reject) => {
                const element = new Image();
                element.onload = () => resolve(element);
                element.onerror = () => reject(new Error('Failed to load image from file'));
                element.src = objectUrl;
            });
            return await this.detectFromImage(img, targetCanvas);
        }
        finally {
            URL.revokeObjectURL(objectUrl);
        }
    }
    /**
     * Detect 3D poses from Blob
     *
     * The decoded bitmap is always closed, including when detection throws.
     */
    async detectFromBlob(blob, targetCanvas) {
        const bitmap = await createImageBitmap(blob);
        try {
            return await this.detectFromBitmap(bitmap, targetCanvas);
        }
        finally {
            bitmap.close();
        }
    }
    /**
     * Detect 3D poses from raw image data. Initializes the models on first use.
     *
     * @param {Uint8Array} imageData - RGBA bytes, 4 per pixel, row-major.
     * @param {number} width - Image width in pixels.
     * @param {number} height - Image height in pixels.
     * @returns {Promise<object>} Per-person arrays `keypoints`, `scores`,
     *   `keypointsSimcc`, `keypoints2d`, plus `stats` with the person count and
     *   rounded timings in milliseconds.
     */
    async detect(imageData, width, height) {
        if (!this.initialized) {
            await this.init();
        }
        const startTime = performance.now();
        // Step 1: Detect people
        const bboxes = await this.detectPeople(imageData, width, height);
        const detTime = performance.now() - startTime;
        // Step 2: Estimate 3D poses for each person
        const poseStart = performance.now();
        const allKeypoints = [];
        const allScores = [];
        const allKeypointsSimcc = [];
        const allKeypoints2d = [];
        // Drop the cached source canvas so it is rebuilt from this image
        this.srcPoseCanvas = null;
        this.srcPoseCtx = null;
        for (const bbox of bboxes) {
            // Sequential on purpose: every crop reuses the same canvas and tensor buffer
            const poseResult = await this.estimatePose3D(imageData, width, height, bbox);
            allKeypoints.push(poseResult.keypoints);
            allScores.push(poseResult.scores);
            allKeypointsSimcc.push(poseResult.keypointsSimcc);
            allKeypoints2d.push(poseResult.keypoints2d);
        }
        const now = performance.now();
        return {
            keypoints: allKeypoints,
            scores: allScores,
            keypointsSimcc: allKeypointsSimcc,
            keypoints2d: allKeypoints2d,
            stats: {
                personCount: allKeypoints.length,
                detTime: Math.round(detTime),
                poseTime: Math.round(now - poseStart),
                totalTime: Math.round(now - startTime),
            },
        };
    }
    /**
     * Detect people with the YOLO detection model.
     *
     * `config.detInputSize` is read as [width, height], the same order `init()`
     * uses to size the detection canvas.
     *
     * @returns {Promise<Array<object>>} Boxes `{x1, y1, x2, y2, confidence}` in
     *   source-image pixels, after NMS.
     */
    async detectPeople(imageData, width, height) {
        const [inputW, inputH] = this.config.detInputSize;
        const { tensor, paddingX, paddingY, scaleX, scaleY } = this.preprocessYOLO(imageData, width, height, [inputW, inputH]);
        const inputTensor = new ort.Tensor('float32', tensor, [1, 3, inputH, inputW]);
        const feeds = { [this.detSession.inputNames[0]]: inputTensor };
        const results = await this.detSession.run(feeds);
        const output = results[this.detSession.outputNames[0]];
        // dims[1] is used as the number of detection rows
        return this.postprocessYOLO(output.data, output.dims[1], width, height, paddingX, paddingY, scaleX, scaleY);
    }
    /**
     * Decide which of the pose model's first three outputs are the X, Y and Z
     * SimCC tensors.
     *
     * Outputs are matched on their last dimension: X is expected to be
     * `inputW * 2` wide and Y `inputH * 2` wide (576 and 768 for the default
     * 288x384 input). When that does not identify both axes, the declared
     * output order (x, y, z) is used instead.
     *
     * @param {object} results - Map from output name to tensor (needs `dims`).
     * @param {string[]} outputNames - Output names in model order.
     * @param {number} inputW - Pose model input width.
     * @param {number} inputH - Pose model input height.
     * @returns {{simccX: object, simccY: object, simccZ: object}}
     * @throws {Error} When the model exposes fewer than three outputs.
     */
    pickSimccOutputs(results, outputNames, inputW, inputH) {
        if (outputNames.length < 3) {
            throw new Error(`Pose model must expose 3 SimCC outputs (x, y, z), got ${outputNames.length}`);
        }
        const candidates = outputNames.slice(0, 3).map((name) => results[name]);
        const widthX = inputW * 2;
        const widthY = inputH * 2;
        let xIdx = candidates.findIndex((t) => t.dims[2] === widthX);
        let yIdx = candidates.findIndex((t, i) => i !== xIdx && t.dims[2] === widthY);
        if (xIdx === -1 || yIdx === -1) {
            xIdx = 0;
            yIdx = 1;
        }
        // Indices 0, 1 and 2 sum to 3, so Z is whatever index is left over
        const zIdx = 3 - xIdx - yIdx;
        return {
            simccX: candidates[xIdx],
            simccY: candidates[yIdx],
            simccZ: candidates[zIdx],
        };
    }
    /**
     * Estimate 3D pose for a single person
     *
     * @param {Uint8Array} imageData - RGBA bytes of the full image.
     * @param {number} imgWidth - Image width in pixels.
     * @param {number} imgHeight - Image height in pixels.
     * @param {object} bbox - Person box `{x1, y1, x2, y2}` in image pixels.
     * @returns {Promise<object>} `{keypoints, scores, keypointsSimcc, keypoints2d}`.
     */
    async estimatePose3D(imageData, imgWidth, imgHeight, bbox) {
        const [inputW, inputH] = this.config.poseInputSize;
        const { tensor, center, scale } = this.preprocessPose(imageData, imgWidth, imgHeight, bbox, [inputW, inputH]);
        const inputTensor = new ort.Tensor('float32', tensor, [1, 3, inputH, inputW]);
        // Use dynamic input name
        const feeds = { [this.poseSession.inputNames[0]]: inputTensor };
        const results = await this.poseSession.run(feeds);
        const outputNames = this.poseSession.outputNames;
        // Debug output names on first run only
        if (!this.outputNamesLogged) {
            console.log('[Pose3DDetector] Output names:', outputNames);
            console.log('[Pose3DDetector] Output shapes:', outputNames.map(k => results[k].dims));
            this.outputNamesLogged = true;
        }
        const { simccX, simccY, simccZ } = this.pickSimccOutputs(results, outputNames, inputW, inputH);
        return this.postprocessPose3D(simccX.data, simccY.data, simccZ.data, simccX.dims, simccY.dims, simccZ.dims, center, scale, imgWidth, imgHeight);
    }
    /**
     * Letterbox the image onto a black `inputSize` canvas and convert it to a
     * planar (CHW) float tensor with values in [0, 1].
     *
     * @param {Uint8Array} imageData - RGBA bytes of the source image.
     * @param {number} imgWidth - Source width.
     * @param {number} imgHeight - Source height.
     * @param {number[]} inputSize - Detector input as [width, height].
     * @returns {object} `{tensor, paddingX, paddingY, scaleX, scaleY}`; padding is
     *   the letterbox offset in input pixels and scale maps input pixels back to
     *   source pixels.
     */
    preprocessYOLO(imageData, imgWidth, imgHeight, inputSize) {
        const [inputW, inputH] = inputSize;
        if (!this.canvas || !this.ctx) {
            this.canvas = document.createElement('canvas');
            this.canvas.width = inputW;
            this.canvas.height = inputH;
            this.ctx = this.canvas.getContext('2d', { willReadFrequently: true, alpha: false });
        }
        const ctx = this.ctx;
        ctx.fillStyle = '#000000';
        ctx.fillRect(0, 0, inputW, inputH);
        const aspectRatio = imgWidth / imgHeight;
        let drawWidth = inputW;
        let drawHeight = inputH;
        if (aspectRatio > inputW / inputH) {
            // Wider than the target: fit width, pad top and bottom.
            // At least 1px so the back-projection scale stays finite.
            drawHeight = Math.max(1, Math.floor(inputW / aspectRatio));
        }
        else {
            // Taller than (or equal to) the target: fit height, pad left and right
            drawWidth = Math.max(1, Math.floor(inputH * aspectRatio));
        }
        const offsetX = Math.floor((inputW - drawWidth) / 2);
        const offsetY = Math.floor((inputH - drawHeight) / 2);
        // Raw bytes cannot be scaled directly, so stage them on a canvas first
        const srcCanvas = document.createElement('canvas');
        srcCanvas.width = imgWidth;
        srcCanvas.height = imgHeight;
        const srcCtx = srcCanvas.getContext('2d');
        const srcImageData = srcCtx.createImageData(imgWidth, imgHeight);
        srcImageData.data.set(imageData);
        srcCtx.putImageData(srcImageData, 0, 0);
        ctx.drawImage(srcCanvas, 0, 0, imgWidth, imgHeight, offsetX, offsetY, drawWidth, drawHeight);
        const rgba = ctx.getImageData(0, 0, inputW, inputH).data;
        const planeSize = inputW * inputH;
        const tensor = new Float32Array(planeSize * 3);
        for (let px = 0, i = 0; px < planeSize; px++, i += 4) {
            tensor[px] = rgba[i] / 255;
            tensor[px + planeSize] = rgba[i + 1] / 255;
            tensor[px + 2 * planeSize] = rgba[i + 2] / 255;
        }
        return {
            tensor,
            paddingX: offsetX,
            paddingY: offsetY,
            scaleX: imgWidth / drawWidth,
            scaleY: imgHeight / drawHeight,
        };
    }
    /**
     * Convert raw detector output into person boxes in source-image pixels.
     *
     * The output is read as rows of six floats:
     * `x1, y1, x2, y2, confidence, classId`. Rows below `config.detConfidence`
     * or with a class other than 0 are skipped (class 0 is presumably "person" -
     * confirm against the model's label map). Boxes are clamped to the image
     * and dropped when nothing is left after clamping, e.g. detections that lie
     * entirely inside the letterbox padding.
     *
     * @returns {Array<object>} Boxes `{x1, y1, x2, y2, confidence}` after NMS.
     */
    postprocessYOLO(output, numDetections, imgWidth, imgHeight, paddingX, paddingY, scaleX, scaleY) {
        const clamp = (value, upper) => Math.min(upper, Math.max(0, value));
        const detections = [];
        for (let i = 0; i < numDetections; i++) {
            const idx = i * 6;
            const confidence = output[idx + 4];
            const classId = Math.round(output[idx + 5]);
            if (confidence < this.config.detConfidence || classId !== 0) {
                continue;
            }
            // Undo the letterbox: remove padding, then scale back to source pixels
            const x1 = clamp((output[idx] - paddingX) * scaleX, imgWidth);
            const y1 = clamp((output[idx + 1] - paddingY) * scaleY, imgHeight);
            const x2 = clamp((output[idx + 2] - paddingX) * scaleX, imgWidth);
            const y2 = clamp((output[idx + 3] - paddingY) * scaleY, imgHeight);
            if (x2 <= x1 || y2 <= y1) {
                // Empty box: it would give a zero-size pose crop
                continue;
            }
            detections.push({ x1, y1, x2, y2, confidence });
        }
        return this.applyNMS(detections, this.config.nmsThreshold);
    }
    /**
     * Crop the person box (padded by 1.25 and expanded to the model aspect
     * ratio), resize it to the pose input size and normalize it into a planar
     * (CHW) float tensor.
     *
     * The returned tensor is the shared `poseTensorBuffer`; it is overwritten by
     * the next call, so it must be consumed before preprocessing another box.
     *
     * @param {Uint8Array} imageData - RGBA bytes of the full image.
     * @param {number} imgWidth - Image width.
     * @param {number} imgHeight - Image height.
     * @param {object} bbox - Person box `{x1, y1, x2, y2}`.
     * @param {number[]} inputSize - Pose input as [width, height].
     * @returns {object} `{tensor, center, scale}` where `center` is the box
     *   centre and `scale` the crop size [w, h], both in image pixels.
     */
    preprocessPose(imageData, imgWidth, imgHeight, bbox, inputSize) {
        const [inputW, inputH] = inputSize;
        const bboxWidth = bbox.x2 - bbox.x1;
        const bboxHeight = bbox.y2 - bbox.y1;
        const center = [
            bbox.x1 + bboxWidth / 2,
            bbox.y1 + bboxHeight / 2,
        ];
        // Pad the box by 1.25 (source comment: same as Python bbox_xyxy2cs)
        let scaleW = bboxWidth * 1.25;
        let scaleH = bboxHeight * 1.25;
        // Grow one side so the crop has the model's aspect ratio
        const modelAspectRatio = inputW / inputH;
        if (scaleW / scaleH > modelAspectRatio) {
            scaleH = scaleW / modelAspectRatio;
        }
        else {
            scaleW = scaleH * modelAspectRatio;
        }
        const scale = [scaleW, scaleH];
        const contextOptions = { willReadFrequently: true, alpha: false };
        // Reuse pose canvas
        if (!this.poseCanvas || !this.poseCtx) {
            this.poseCanvas = document.createElement('canvas');
            this.poseCanvas.width = inputW;
            this.poseCanvas.height = inputH;
            this.poseCtx = this.poseCanvas.getContext('2d', contextOptions);
        }
        // Stage the full image once; later boxes of the same image reuse it
        if (!this.srcPoseCanvas || !this.srcPoseCtx) {
            this.srcPoseCanvas = document.createElement('canvas');
            this.srcPoseCanvas.width = imgWidth;
            this.srcPoseCanvas.height = imgHeight;
            this.srcPoseCtx = this.srcPoseCanvas.getContext('2d', contextOptions);
            const srcImageData = this.srcPoseCtx.createImageData(imgWidth, imgHeight);
            srcImageData.data.set(imageData);
            this.srcPoseCtx.putImageData(srcImageData, 0, 0);
        }
        const ctx = this.poseCtx;
        ctx.clearRect(0, 0, inputW, inputH);
        // Crop and resize in a single drawImage call
        const srcX = center[0] - scaleW / 2;
        const srcY = center[1] - scaleH / 2;
        ctx.drawImage(this.srcPoseCanvas, srcX, srcY, scaleW, scaleH, 0, 0, inputW, inputH);
        const rgba = ctx.getImageData(0, 0, inputW, inputH).data;
        const planeSize = inputW * inputH;
        // Re-allocate when the buffer is missing or was sized for another input
        if (!this.poseTensorBuffer || this.poseTensorBuffer.length !== 3 * planeSize) {
            this.poseTensorBuffer = new Float32Array(3 * planeSize);
        }
        const tensor = this.poseTensorBuffer;
        // Per-channel normalization: (value - mean) / std, for R, G, B
        const mean0 = 123.675;
        const mean1 = 116.28;
        const mean2 = 103.53;
        const stdInv0 = 1 / 58.395;
        const stdInv1 = 1 / 57.12;
        const stdInv2 = 1 / 57.375;
        // One pixel per iteration, so any pixel count is handled exactly
        for (let px = 0, i = 0; px < planeSize; px++, i += 4) {
            tensor[px] = (rgba[i] - mean0) * stdInv0;
            tensor[px + planeSize] = (rgba[i + 1] - mean1) * stdInv1;
            tensor[px + planeSize * 2] = (rgba[i + 2] - mean2) * stdInv2;
        }
        return { tensor, center, scale };
    }
    /**
     * Decode SimCC outputs into keypoints.
     *
     * For every keypoint the argmax bin along X, Y and Z is normalized by the
     * bin count to [0, 1), then mapped to:
     *  - `keypoints`: [x, y, z] with x, y in [-1, 1) and z scaled by `config.zRange`;
     *  - `keypointsSimcc`: the raw normalized [x, y, z];
     *  - `keypoints2d`: [x, y] in source-image pixels, clamped to the image;
     *  - `scores`: the larger of the X and Y peak values.
     *
     * @param {ArrayLike<number>} simccX - Flattened X logits, `wx` per keypoint.
     * @param {ArrayLike<number>} simccY - Flattened Y logits, `wy` per keypoint.
     * @param {ArrayLike<number>} simccZ - Flattened Z logits, `wz` per keypoint.
     * @param {number[]} shapeX - Dims of X; [1] is the keypoint count, [2] is `wx`.
     * @param {number[]} shapeY - Dims of Y; [2] is `wy`.
     * @param {number[]} shapeZ - Dims of Z; [2] is `wz`.
     * @param {number[]} center - Crop centre [x, y] in image pixels.
     * @param {number[]} scale - Crop size [w, h] in image pixels.
     * @param {number} imgWidth - Image width, used for clamping.
     * @param {number} imgHeight - Image height, used for clamping.
     * @returns {object} `{keypoints, scores, keypointsSimcc, keypoints2d}`.
     */
    postprocessPose3D(simccX, simccY, simccZ, shapeX, shapeY, shapeZ, center, scale, imgWidth, imgHeight) {
        const numKeypoints = shapeX[1];
        const wx = shapeX[2];
        const wy = shapeY[2];
        const wz = shapeZ[2];
        // Peak value and position inside one keypoint's row; first maximum wins
        const peakOf = (data, row, width) => {
            const base = row * width;
            let best = -Infinity;
            let bestIdx = 0;
            for (let i = 0; i < width; i++) {
                const val = data[base + i];
                if (val > best) {
                    best = val;
                    bestIdx = i;
                }
            }
            return [best, bestIdx];
        };
        const keypoints = [];
        const scores = [];
        const keypointsSimcc = [];
        const keypoints2d = [];
        for (let k = 0; k < numKeypoints; k++) {
            const [maxX, argmaxX] = peakOf(simccX, k, wx);
            const [maxY, argmaxY] = peakOf(simccY, k, wy);
            const [, argmaxZ] = peakOf(simccZ, k, wz);
            // NOTE(review): takes the larger of the two peaks; confirm against the
            // reference implementation whether the smaller one is intended.
            const score = maxX > maxY ? maxX : maxY;
            // Normalize to [0, 1]
            const normX = argmaxX / wx;
            const normY = argmaxY / wy;
            const normZ = argmaxZ / wz;
            // 3D coordinates in model space
            keypoints.push([
                (normX - 0.5) * 2.0,
                (normY - 0.5) * 2.0,
                (normZ - 0.5) * this.config.zRange * 2,
            ]);
            keypointsSimcc.push([normX, normY, normZ]);
            // 2D coordinates in image space: top-left corner of the crop plus the
            // normalized offset times the crop size
            const kpt2dX = normX * scale[0] + center[0] - 0.5 * scale[0];
            const kpt2dY = normY * scale[1] + center[1] - 0.5 * scale[1];
            keypoints2d.push([
                Math.max(0, Math.min(imgWidth, kpt2dX)),
                Math.max(0, Math.min(imgHeight, kpt2dY)),
            ]);
            scores.push(score);
        }
        return { keypoints, scores, keypointsSimcc, keypoints2d };
    }
    /**
     * Greedy non-maximum suppression.
     *
     * Boxes are visited from highest to lowest confidence; a box is kept unless
     * its IoU with an already kept box exceeds `iouThreshold`. The input array
     * is not modified.
     *
     * @param {Array<object>} detections - Boxes `{x1, y1, x2, y2, confidence}`.
     * @param {number} iouThreshold - IoU above which the weaker box is removed.
     * @returns {Array<object>} Kept boxes, ordered by descending confidence.
     */
    applyNMS(detections, iouThreshold) {
        if (detections.length === 0) {
            return [];
        }
        const ordered = [...detections].sort((a, b) => b.confidence - a.confidence);
        const selected = [];
        const suppressed = new Set();
        for (let i = 0; i < ordered.length; i++) {
            if (suppressed.has(i)) {
                continue;
            }
            selected.push(ordered[i]);
            for (let j = i + 1; j < ordered.length; j++) {
                if (!suppressed.has(j) && this.calculateIoU(ordered[i], ordered[j]) > iouThreshold) {
                    suppressed.add(j);
                }
            }
        }
        return selected;
    }
    /**
     * Intersection over union of two axis-aligned boxes.
     *
     * @param {object} box1 - `{x1, y1, x2, y2}`.
     * @param {object} box2 - `{x1, y1, x2, y2}`.
     * @returns {number} IoU in [0, 1]; 0 when the boxes do not overlap.
     */
    calculateIoU(box1, box2) {
        const x1 = Math.max(box1.x1, box2.x1);
        const y1 = Math.max(box1.y1, box2.y1);
        const x2 = Math.min(box1.x2, box2.x2);
        const y2 = Math.min(box1.y2, box2.y2);
        if (x2 <= x1 || y2 <= y1) {
            return 0;
        }
        const intersection = (x2 - x1) * (y2 - y1);
        const area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
        const area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
        return intersection / (area1 + area2 - intersection);
    }
}