rtmlib-ts 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. package/.gitattributes +1 -0
  2. package/README.md +202 -0
  3. package/dist/core/base.d.ts +20 -0
  4. package/dist/core/base.d.ts.map +1 -0
  5. package/dist/core/base.js +40 -0
  6. package/dist/core/file.d.ts +11 -0
  7. package/dist/core/file.d.ts.map +1 -0
  8. package/dist/core/file.js +111 -0
  9. package/dist/core/modelCache.d.ts +35 -0
  10. package/dist/core/modelCache.d.ts.map +1 -0
  11. package/dist/core/modelCache.js +161 -0
  12. package/dist/core/posePostprocessing.d.ts +12 -0
  13. package/dist/core/posePostprocessing.d.ts.map +1 -0
  14. package/dist/core/posePostprocessing.js +76 -0
  15. package/dist/core/postprocessing.d.ts +10 -0
  16. package/dist/core/postprocessing.d.ts.map +1 -0
  17. package/dist/core/postprocessing.js +70 -0
  18. package/dist/core/preprocessing.d.ts +14 -0
  19. package/dist/core/preprocessing.d.ts.map +1 -0
  20. package/dist/core/preprocessing.js +79 -0
  21. package/dist/index.d.ts +27 -0
  22. package/dist/index.d.ts.map +1 -0
  23. package/dist/index.js +31 -0
  24. package/dist/models/rtmpose.d.ts +25 -0
  25. package/dist/models/rtmpose.d.ts.map +1 -0
  26. package/dist/models/rtmpose.js +185 -0
  27. package/dist/models/rtmpose3d.d.ts +28 -0
  28. package/dist/models/rtmpose3d.d.ts.map +1 -0
  29. package/dist/models/rtmpose3d.js +184 -0
  30. package/dist/models/yolo12.d.ts +23 -0
  31. package/dist/models/yolo12.d.ts.map +1 -0
  32. package/dist/models/yolo12.js +165 -0
  33. package/dist/models/yolox.d.ts +18 -0
  34. package/dist/models/yolox.d.ts.map +1 -0
  35. package/dist/models/yolox.js +167 -0
  36. package/dist/solution/animalDetector.d.ts +229 -0
  37. package/dist/solution/animalDetector.d.ts.map +1 -0
  38. package/dist/solution/animalDetector.js +663 -0
  39. package/dist/solution/body.d.ts +16 -0
  40. package/dist/solution/body.d.ts.map +1 -0
  41. package/dist/solution/body.js +52 -0
  42. package/dist/solution/bodyWithFeet.d.ts +16 -0
  43. package/dist/solution/bodyWithFeet.d.ts.map +1 -0
  44. package/dist/solution/bodyWithFeet.js +52 -0
  45. package/dist/solution/customDetector.d.ts +137 -0
  46. package/dist/solution/customDetector.d.ts.map +1 -0
  47. package/dist/solution/customDetector.js +342 -0
  48. package/dist/solution/hand.d.ts +14 -0
  49. package/dist/solution/hand.d.ts.map +1 -0
  50. package/dist/solution/hand.js +20 -0
  51. package/dist/solution/index.d.ts +10 -0
  52. package/dist/solution/index.d.ts.map +1 -0
  53. package/dist/solution/index.js +9 -0
  54. package/dist/solution/objectDetector.d.ts +172 -0
  55. package/dist/solution/objectDetector.d.ts.map +1 -0
  56. package/dist/solution/objectDetector.js +606 -0
  57. package/dist/solution/pose3dDetector.d.ts +145 -0
  58. package/dist/solution/pose3dDetector.d.ts.map +1 -0
  59. package/dist/solution/pose3dDetector.js +611 -0
  60. package/dist/solution/poseDetector.d.ts +198 -0
  61. package/dist/solution/poseDetector.d.ts.map +1 -0
  62. package/dist/solution/poseDetector.js +622 -0
  63. package/dist/solution/poseTracker.d.ts +22 -0
  64. package/dist/solution/poseTracker.d.ts.map +1 -0
  65. package/dist/solution/poseTracker.js +106 -0
  66. package/dist/solution/wholebody.d.ts +19 -0
  67. package/dist/solution/wholebody.d.ts.map +1 -0
  68. package/dist/solution/wholebody.js +82 -0
  69. package/dist/solution/wholebody3d.d.ts +22 -0
  70. package/dist/solution/wholebody3d.d.ts.map +1 -0
  71. package/dist/solution/wholebody3d.js +75 -0
  72. package/dist/types/index.d.ts +52 -0
  73. package/dist/types/index.d.ts.map +1 -0
  74. package/dist/types/index.js +5 -0
  75. package/dist/visualization/draw.d.ts +57 -0
  76. package/dist/visualization/draw.d.ts.map +1 -0
  77. package/dist/visualization/draw.js +400 -0
  78. package/dist/visualization/skeleton/coco133.d.ts +350 -0
  79. package/dist/visualization/skeleton/coco133.d.ts.map +1 -0
  80. package/dist/visualization/skeleton/coco133.js +120 -0
  81. package/dist/visualization/skeleton/coco17.d.ts +180 -0
  82. package/dist/visualization/skeleton/coco17.d.ts.map +1 -0
  83. package/dist/visualization/skeleton/coco17.js +48 -0
  84. package/dist/visualization/skeleton/halpe26.d.ts +278 -0
  85. package/dist/visualization/skeleton/halpe26.d.ts.map +1 -0
  86. package/dist/visualization/skeleton/halpe26.js +70 -0
  87. package/dist/visualization/skeleton/hand21.d.ts +196 -0
  88. package/dist/visualization/skeleton/hand21.d.ts.map +1 -0
  89. package/dist/visualization/skeleton/hand21.js +51 -0
  90. package/dist/visualization/skeleton/index.d.ts +10 -0
  91. package/dist/visualization/skeleton/index.d.ts.map +1 -0
  92. package/dist/visualization/skeleton/index.js +9 -0
  93. package/dist/visualization/skeleton/openpose134.d.ts +357 -0
  94. package/dist/visualization/skeleton/openpose134.d.ts.map +1 -0
  95. package/dist/visualization/skeleton/openpose134.js +116 -0
  96. package/dist/visualization/skeleton/openpose18.d.ts +177 -0
  97. package/dist/visualization/skeleton/openpose18.d.ts.map +1 -0
  98. package/dist/visualization/skeleton/openpose18.js +47 -0
  99. package/docs/ANIMAL_DETECTOR.md +450 -0
  100. package/docs/CUSTOM_DETECTOR.md +568 -0
  101. package/docs/OBJECT_DETECTOR.md +373 -0
  102. package/docs/POSE3D_DETECTOR.md +458 -0
  103. package/docs/POSE_DETECTOR.md +442 -0
  104. package/examples/README.md +119 -0
  105. package/examples/index.html +746 -0
  106. package/package.json +51 -0
  107. package/playground/README.md +114 -0
  108. package/playground/app/favicon.ico +0 -0
  109. package/playground/app/globals.css +17 -0
  110. package/playground/app/layout.tsx +19 -0
  111. package/playground/app/page.tsx +1338 -0
  112. package/playground/eslint.config.mjs +18 -0
  113. package/playground/next.config.ts +34 -0
  114. package/playground/package-lock.json +6723 -0
  115. package/playground/package.json +27 -0
  116. package/playground/postcss.config.mjs +7 -0
  117. package/playground/tsconfig.json +34 -0
  118. package/src/core/base.ts +66 -0
  119. package/src/core/file.ts +141 -0
  120. package/src/core/modelCache.ts +189 -0
  121. package/src/core/posePostprocessing.ts +91 -0
  122. package/src/core/postprocessing.ts +93 -0
  123. package/src/core/preprocessing.ts +127 -0
  124. package/src/index.ts +69 -0
  125. package/src/models/rtmpose.ts +265 -0
  126. package/src/models/rtmpose3d.ts +289 -0
  127. package/src/models/yolo12.ts +220 -0
  128. package/src/models/yolox.ts +214 -0
  129. package/src/solution/animalDetector.ts +955 -0
  130. package/src/solution/body.ts +89 -0
  131. package/src/solution/bodyWithFeet.ts +89 -0
  132. package/src/solution/customDetector.ts +474 -0
  133. package/src/solution/hand.ts +52 -0
  134. package/src/solution/index.ts +10 -0
  135. package/src/solution/objectDetector.ts +816 -0
  136. package/src/solution/pose3dDetector.ts +890 -0
  137. package/src/solution/poseDetector.ts +892 -0
  138. package/src/solution/poseTracker.ts +172 -0
  139. package/src/solution/wholebody.ts +130 -0
  140. package/src/solution/wholebody3d.ts +125 -0
  141. package/src/types/index.ts +62 -0
  142. package/src/visualization/draw.ts +543 -0
  143. package/src/visualization/skeleton/coco133.ts +131 -0
  144. package/src/visualization/skeleton/coco17.ts +49 -0
  145. package/src/visualization/skeleton/halpe26.ts +71 -0
  146. package/src/visualization/skeleton/hand21.ts +52 -0
  147. package/src/visualization/skeleton/index.ts +10 -0
  148. package/src/visualization/skeleton/openpose134.ts +125 -0
  149. package/src/visualization/skeleton/openpose18.ts +48 -0
  150. package/tsconfig.json +32 -0
package/src/solution/poseDetector.ts
@@ -0,0 +1,892 @@
+ /**
+  * PoseDetector - Unified API for person detection and pose estimation
+  * Combines YOLO12 detector with RTMW pose model in a single optimized interface
+  *
+  * @example
+  * ```typescript
+  * // Initialize with default models (from HuggingFace)
+  * const detector = new PoseDetector();
+  * await detector.init();
+  *
+  * // Or with custom models
+  * const detector = new PoseDetector({
+  *   detModel: 'models/yolov12n.onnx',
+  *   poseModel: 'models/rtmlib/end2end.onnx',
+  * });
+  * await detector.init();
+  *
+  * // From canvas
+  * const results = await detector.detectFromCanvas(canvas);
+  *
+  * // From video element
+  * const results = await detector.detectFromVideo(videoElement);
+  *
+  * // From raw image data
+  * const results = await detector.detect(imageData, width, height);
+  * ```
+  */
+
+ import * as ort from 'onnxruntime-web';
+ import { BBox, Detection } from '../types/index';
+ import { getCachedModel, isModelCached } from '../core/modelCache';
+
+ // Configure ONNX Runtime Web
+ ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.0/dist/';
+ ort.env.wasm.simd = true;
+ ort.env.wasm.proxy = false;
+
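The jsDelivr URL above pins the WASM binaries that back the 'wasm' execution provider. For offline or CSP-restricted deployments, onnxruntime-web allows this path to be overwritten after import and before the first session is created; a minimal sketch, assuming the .wasm artifacts are copied to a hypothetical /ort/ directory on the app's own origin:

    import * as ort from 'onnxruntime-web';

    // Serve onnxruntime-web's dist/*.wasm files yourself and point the runtime at them.
    // '/ort/' is an assumed path, not part of this package.
    ort.env.wasm.wasmPaths = '/ort/';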
+ /**
+  * Configuration options for PoseDetector
+  */
+ export interface PoseDetectorConfig {
+   /** Path to YOLO12 detection model (optional - uses default from HuggingFace if not specified) */
+   detModel?: string;
+   /** Path to RTMW pose estimation model (optional - uses default from HuggingFace if not specified) */
+   poseModel?: string;
+   /** Detection input size as [height, width] (default: [416, 416]) */
+   detInputSize?: [number, number];
+   /** Pose input size as [height, width] (default: [384, 288]) */
+   poseInputSize?: [number, number];
+   /** Detection confidence threshold (default: 0.5) */
+   detConfidence?: number;
+   /** NMS IoU threshold (default: 0.45) */
+   nmsThreshold?: number;
+   /** Pose keypoint confidence threshold (default: 0.3) */
+   poseConfidence?: number;
+   /** Execution backend (default: 'webgpu') */
+   backend?: 'wasm' | 'webgpu';
+   /** Enable model caching (default: true) */
+   cache?: boolean;
+ }
+
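Note that `backend` feeds `executionProviders` directly and nothing falls back automatically when WebGPU is unavailable. A small sketch of feature-detecting at the call site, using the standard `navigator.gpu` probe:

    // Prefer WebGPU where the browser exposes it; otherwise use the WASM provider.
    const backend = 'gpu' in navigator ? 'webgpu' : 'wasm';
    const detector = new PoseDetector({ backend });
    await detector.init();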
+ /**
+  * Detected person with bounding box and keypoints
+  */
+ export interface Person {
+   /** Bounding box coordinates */
+   bbox: {
+     x1: number;
+     y1: number;
+     x2: number;
+     y2: number;
+     confidence: number;
+   };
+   /** Coordinates of the 17 COCO keypoints */
+   keypoints: Keypoint[];
+   /** Keypoint scores (0-1) */
+   scores: number[];
+ }
+
+ /**
+  * Single keypoint with coordinates and visibility
+  */
+ export interface Keypoint {
+   x: number;
+   y: number;
+   score: number;
+   visible: boolean;
+   name: string;
+ }
+
+ /**
+  * Detection statistics
+  */
+ export interface PoseStats {
+   /** Number of detected people */
+   personCount: number;
+   /** Detection inference time (ms) */
+   detTime: number;
+   /** Pose estimation time (ms) */
+   poseTime: number;
+   /** Total processing time (ms) */
+   totalTime: number;
+ }
+
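Since `visible` already folds in the `poseConfidence` threshold, consumers can filter on it directly. A small sketch of indexing a `Person`'s keypoints by name, using only the interfaces above:

    // Build a name -> keypoint lookup, keeping confident joints only.
    function keypointMap(person: Person): Map<string, Keypoint> {
      const map = new Map<string, Keypoint>();
      for (const kp of person.keypoints) {
        if (kp.visible) map.set(kp.name, kp);
      }
      return map;
    }

    // e.g. shoulder width in pixels, when both shoulders were detected:
    // const kps = keypointMap(people[0]);
    // const l = kps.get('left_shoulder'), r = kps.get('right_shoulder');
    // if (l && r) console.log(Math.hypot(l.x - r.x, l.y - r.y));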
+ /**
+  * COCO17 keypoint names
+  */
+ const KEYPOINT_NAMES = [
+   'nose',
+   'left_eye',
+   'right_eye',
+   'left_ear',
+   'right_ear',
+   'left_shoulder',
+   'right_shoulder',
+   'left_elbow',
+   'right_elbow',
+   'left_wrist',
+   'right_wrist',
+   'left_hip',
+   'right_hip',
+   'left_knee',
+   'right_knee',
+   'left_ankle',
+   'right_ankle',
+ ];
+
+ /**
+  * Default configuration
+  */
+ const DEFAULT_CONFIG: Required<PoseDetectorConfig> = {
+   detModel: 'https://huggingface.co/demon2233/rtmlib-ts/resolve/main/yolo/yolov12n.onnx',
+   poseModel: 'https://huggingface.co/demon2233/rtmlib-ts/resolve/main/rtmpose/end2end.onnx',
+   detInputSize: [416, 416], // Faster detection
+   poseInputSize: [384, 288], // Required by the model ([height, width])
+   detConfidence: 0.5,
+   nmsThreshold: 0.45,
+   poseConfidence: 0.3,
+   backend: 'webgpu', // Default to WebGPU for better performance
+   cache: true,
+ };
+
+ export class PoseDetector {
+   private config: Required<PoseDetectorConfig>;
+   private detSession: ort.InferenceSession | null = null;
+   private poseSession: ort.InferenceSession | null = null;
+   private initialized = false;
+
+   // Pre-allocated buffers for maximum performance
+   private canvas: HTMLCanvasElement | null = null;
+   private ctx: CanvasRenderingContext2D | null = null;
+   private poseCanvas: HTMLCanvasElement | null = null;
+   private poseCtx: CanvasRenderingContext2D | null = null;
+   private poseTensorBuffer: Float32Array | null = null;
+   private detInputSize: [number, number] = [416, 416];
+   private poseInputSize: [number, number] = [384, 288];
+
+   constructor(config: PoseDetectorConfig = {}) {
+     this.config = { ...DEFAULT_CONFIG, ...config };
+   }
+
+   /**
+    * Initialize both detection and pose models with pre-allocated resources
+    */
+   async init(): Promise<void> {
+     if (this.initialized) return;
+
+     try {
+       // Load detection model
+       console.log(`[PoseDetector] Loading detection model from: ${this.config.detModel}`);
+       let detBuffer: ArrayBuffer;
+
+       if (this.config.cache) {
+         const detCached = await isModelCached(this.config.detModel);
+         console.log(`[PoseDetector] Det model cache ${detCached ? 'hit' : 'miss'}`);
+         detBuffer = await getCachedModel(this.config.detModel);
+       } else {
+         const detResponse = await fetch(this.config.detModel);
+         if (!detResponse.ok) {
+           throw new Error(`Failed to fetch det model: HTTP ${detResponse.status}`);
+         }
+         detBuffer = await detResponse.arrayBuffer();
+       }
+
+       this.detSession = await ort.InferenceSession.create(detBuffer, {
+         executionProviders: [this.config.backend],
+         graphOptimizationLevel: 'all',
+       });
+       console.log(`[PoseDetector] Detection model loaded, size: ${(detBuffer.byteLength / 1024 / 1024).toFixed(2)} MB`);
+
+       // Load pose model
+       console.log(`[PoseDetector] Loading pose model from: ${this.config.poseModel}`);
+       let poseBuffer: ArrayBuffer;
+
+       if (this.config.cache) {
+         const poseCached = await isModelCached(this.config.poseModel);
+         console.log(`[PoseDetector] Pose model cache ${poseCached ? 'hit' : 'miss'}`);
+         poseBuffer = await getCachedModel(this.config.poseModel);
+       } else {
+         const poseResponse = await fetch(this.config.poseModel);
+         if (!poseResponse.ok) {
+           throw new Error(`Failed to fetch pose model: HTTP ${poseResponse.status}`);
+         }
+         poseBuffer = await poseResponse.arrayBuffer();
+       }
+
+       this.poseSession = await ort.InferenceSession.create(poseBuffer, {
+         executionProviders: [this.config.backend],
+         graphOptimizationLevel: 'all',
+       });
+       console.log(`[PoseDetector] Pose model loaded, size: ${(poseBuffer.byteLength / 1024 / 1024).toFixed(2)} MB`);
+
+       // Pre-allocate all resources (input sizes are [height, width])
+       const [detH, detW] = this.config.detInputSize;
+       this.detInputSize = [detH, detW];
+
+       const [poseH, poseW] = this.config.poseInputSize;
+       this.poseInputSize = [poseH, poseW];
+
+       // Main canvas for detection
+       this.canvas = document.createElement('canvas');
+       this.canvas.width = detW;
+       this.canvas.height = detH;
+       this.ctx = this.canvas.getContext('2d', {
+         willReadFrequently: true,
+         alpha: false
+       })!;
+
+       // Pose crop canvas (reused for each person)
+       this.poseCanvas = document.createElement('canvas');
+       this.poseCanvas.width = poseW;
+       this.poseCanvas.height = poseH;
+       this.poseCtx = this.poseCanvas.getContext('2d', {
+         willReadFrequently: true,
+         alpha: false
+       })!;
+
+       // Pre-allocate pose tensor buffer
+       this.poseTensorBuffer = new Float32Array(3 * poseW * poseH);
+
+       this.initialized = true;
+       console.log(`[PoseDetector] ✅ Initialized (det:${detW}x${detH}, pose:${poseW}x${poseH})`);
+     } catch (error) {
+       console.error('[PoseDetector] ❌ Initialization failed:', error);
+       throw error;
+     }
+   }
+
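Because `init()` downloads and compiles both models, applications usually call it once during startup rather than lazily on the first frame. A minimal sketch of eager warm-up with basic error reporting:

    const detector = new PoseDetector();

    async function warmUp(): Promise<void> {
      const t0 = performance.now();
      await detector.init(); // fetches (or reads cached) models and builds both sessions
      console.log(`[app] detector ready in ${Math.round(performance.now() - t0)} ms`);
    }

    warmUp().catch(err => console.error('[app] model load failed', err));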
+   /**
+    * Detect poses from HTMLCanvasElement
+    * @param canvas - Canvas element containing the image
+    * @returns Array of detected people with keypoints
+    */
+   async detectFromCanvas(canvas: HTMLCanvasElement): Promise<Person[]> {
+     const ctx = canvas.getContext('2d');
+     if (!ctx) {
+       throw new Error('Could not get 2D context from canvas');
+     }
+
+     const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
+     return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
+   }
+
+   /**
+    * Detect poses from HTMLVideoElement
+    * @param video - Video element to capture frame from
+    * @param targetCanvas - Optional canvas for frame extraction (creates one if not provided)
+    * @returns Array of detected people with keypoints
+    */
+   async detectFromVideo(
+     video: HTMLVideoElement,
+     targetCanvas?: HTMLCanvasElement
+   ): Promise<Person[]> {
+     if (video.readyState < 2) {
+       throw new Error('Video not ready. Ensure video is loaded and playing.');
+     }
+
+     const canvas = targetCanvas || document.createElement('canvas');
+     canvas.width = video.videoWidth;
+     canvas.height = video.videoHeight;
+
+     const ctx = canvas.getContext('2d');
+     if (!ctx) {
+       throw new Error('Could not get 2D context from canvas');
+     }
+
+     ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
+     const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
+
+     return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
+   }
+
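For live video, the usual pattern is to drive `detectFromVideo` from `requestAnimationFrame` and hand it one reusable scratch canvas so nothing is allocated per frame. A minimal sketch, assuming `video` is a playing `<video>` element; awaiting inside the callback keeps inference serialized, so a slow frame lowers the frame rate instead of queueing work:

    const scratch = document.createElement('canvas');
    let running = true;

    async function loop(): Promise<void> {
      if (!running) return;
      const people = await detector.detectFromVideo(video, scratch);
      // ...draw `people` onto an overlay canvas here...
      requestAnimationFrame(loop);
    }
    requestAnimationFrame(loop);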
+   /**
+    * Detect poses from HTMLImageElement
+    * @param image - Image element to process
+    * @param targetCanvas - Optional canvas for image extraction (creates one if not provided)
+    * @returns Array of detected people with keypoints
+    */
+   async detectFromImage(
+     image: HTMLImageElement,
+     targetCanvas?: HTMLCanvasElement
+   ): Promise<Person[]> {
+     if (!image.complete || !image.naturalWidth) {
+       throw new Error('Image not loaded. Ensure image is fully loaded.');
+     }
+
+     const canvas = targetCanvas || document.createElement('canvas');
+     canvas.width = image.naturalWidth;
+     canvas.height = image.naturalHeight;
+
+     const ctx = canvas.getContext('2d');
+     if (!ctx) {
+       throw new Error('Could not get 2D context from canvas');
+     }
+
+     ctx.drawImage(image, 0, 0);
+     const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
+
+     return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
+   }
+
+   /**
+    * Detect poses from ImageBitmap (efficient for blob/file uploads)
+    * @param bitmap - ImageBitmap to process
+    * @param targetCanvas - Optional canvas for bitmap extraction (creates one if not provided)
+    * @returns Array of detected people with keypoints
+    */
+   async detectFromBitmap(
+     bitmap: ImageBitmap,
+     targetCanvas?: HTMLCanvasElement
+   ): Promise<Person[]> {
+     const canvas = targetCanvas || document.createElement('canvas');
+     canvas.width = bitmap.width;
+     canvas.height = bitmap.height;
+
+     const ctx = canvas.getContext('2d');
+     if (!ctx) {
+       throw new Error('Could not get 2D context from canvas');
+     }
+
+     ctx.drawImage(bitmap, 0, 0);
+     const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
+
+     return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
+   }
+
+   /**
+    * Detect poses from File (for file input uploads)
+    * @param file - File object from input element
+    * @param targetCanvas - Optional canvas for image extraction (creates one if not provided)
+    * @returns Array of detected people with keypoints
+    */
+   async detectFromFile(
+     file: File,
+     targetCanvas?: HTMLCanvasElement
+   ): Promise<Person[]> {
+     return new Promise((resolve, reject) => {
+       const img = new Image();
+       const url = URL.createObjectURL(file);
+       img.onload = async () => {
+         URL.revokeObjectURL(url); // avoid leaking the object URL
+         try {
+           const results = await this.detectFromImage(img, targetCanvas);
+           resolve(results);
+         } catch (error) {
+           reject(error);
+         }
+       };
+       img.onerror = () => {
+         URL.revokeObjectURL(url);
+         reject(new Error('Failed to load image from file'));
+       };
+       img.src = url;
+     });
+   }
+
+   /**
+    * Detect poses from Blob (for camera capture or downloads)
+    * @param blob - Blob object to process
+    * @param targetCanvas - Optional canvas for image extraction (creates one if not provided)
+    * @returns Array of detected people with keypoints
+    */
+   async detectFromBlob(
+     blob: Blob,
+     targetCanvas?: HTMLCanvasElement
+   ): Promise<Person[]> {
+     const bitmap = await createImageBitmap(blob);
+     const results = await this.detectFromBitmap(bitmap, targetCanvas);
+     bitmap.close();
+     return results;
+   }
+
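Wiring `detectFromFile` to a standard file input is direct; `detectFromBlob` covers the same ground for camera captures and fetched images. A small sketch (the `#photo` selector is hypothetical):

    const input = document.querySelector<HTMLInputElement>('#photo')!;
    input.addEventListener('change', async () => {
      const file = input.files?.[0];
      if (!file) return;
      const people = await detector.detectFromFile(file);
      console.log(`found ${people.length} people`);
    });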
+   /**
+    * Detect people and estimate poses in a single call
+    * @param imageData - Image data (RGBA bytes, as returned by getImageData)
+    * @param width - Image width
+    * @param height - Image height
+    * @returns Array of detected people with keypoints
+    */
+   async detect(
+     imageData: Uint8Array,
+     width: number,
+     height: number
+   ): Promise<Person[]> {
+     if (!this.initialized) {
+       await this.init();
+     }
+
+     const startTime = performance.now();
+
+     // Step 1: Detect people
+     const detStart = performance.now();
+     const bboxes = await this.detectPeople(imageData, width, height);
+     const detTime = performance.now() - detStart;
+
+     // Step 2: Estimate poses for each person
+     const poseStart = performance.now();
+     const people: Person[] = [];
+
+     for (const bbox of bboxes) {
+       const keypoints = await this.estimatePose(imageData, width, height, bbox);
+       people.push({
+         bbox: {
+           x1: bbox.x1,
+           y1: bbox.y1,
+           x2: bbox.x2,
+           y2: bbox.y2,
+           confidence: bbox.confidence,
+         },
+         keypoints,
+         scores: keypoints.map(k => k.score),
+       });
+     }
+
+     const poseTime = performance.now() - poseStart;
+     const totalTime = performance.now() - startTime;
+
+     // Attach stats to the returned array (for debugging)
+     (people as any).stats = {
+       personCount: people.length,
+       detTime: Math.round(detTime),
+       poseTime: Math.round(poseTime),
+       totalTime: Math.round(totalTime),
+     } as PoseStats;
+
+     return people;
+   }
+
+   /**
+    * Statistics are attached to the array returned by detect() rather than
+    * tracked on the instance, so this always returns null.
+    */
+   getStats(): PoseStats | null {
+     return null; // Stats are attached to the results array
+   }
+
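Because the timing stats ride along on the returned array instead of coming back through `getStats()`, reading them takes a cast. A small sketch under that assumption:

    const people = await detector.detect(imageData, width, height);
    // `stats` is a non-standard property attached to the result array by detect().
    const stats = (people as unknown as { stats: PoseStats }).stats;
    console.log(`${stats.personCount} people, det ${stats.detTime} ms, pose ${stats.poseTime} ms`);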
+   /**
+    * Detect people using YOLO12
+    */
+   private async detectPeople(
+     imageData: Uint8Array,
+     width: number,
+     height: number
+   ): Promise<Array<{ x1: number; y1: number; x2: number; y2: number; confidence: number }>> {
+     const [inputH, inputW] = this.config.detInputSize;
+
+     // Preprocess
+     const { tensor, paddingX, paddingY, scaleX, scaleY } = this.preprocessYOLO(
+       imageData,
+       width,
+       height,
+       [inputW, inputH]
+     );
+
+     // Inference - use dynamic input name
+     const inputTensor = new ort.Tensor('float32', tensor, [1, 3, inputH, inputW]);
+     const inputName = this.detSession!.inputNames[0]; // Dynamic: 'images' or 'pixel_values'
+
+     const feeds: Record<string, ort.Tensor> = {};
+     feeds[inputName] = inputTensor;
+
+     const results = await this.detSession!.run(feeds);
+     const output = results[this.detSession!.outputNames[0]];
+
+     // Postprocess
+     return this.postprocessYOLO(
+       output.data as Float32Array,
+       output.dims[1],
+       width,
+       height,
+       paddingX,
+       paddingY,
+       scaleX,
+       scaleY
+     );
+   }
+
+   /**
+    * Estimate pose for a single person
+    */
+   private async estimatePose(
+     imageData: Uint8Array,
+     imgWidth: number,
+     imgHeight: number,
+     bbox: { x1: number; y1: number; x2: number; y2: number; confidence: number }
+   ): Promise<Keypoint[]> {
+     const [inputH, inputW] = this.config.poseInputSize;
+
+     // Preprocess
+     const { tensor, center, scale } = this.preprocessPose(
+       imageData,
+       imgWidth,
+       imgHeight,
+       bbox,
+       [inputW, inputH]
+     );
+
+     // Inference ('input', 'simcc_x' and 'simcc_y' are the names used by the RTMPose end2end export)
+     const inputTensor = new ort.Tensor('float32', tensor, [1, 3, inputH, inputW] as number[]);
+     const results = await this.poseSession!.run({ input: inputTensor });
+
+     // Postprocess
+     return this.postprocessPose(
+       results.simcc_x.data as Float32Array,
+       results.simcc_y.data as Float32Array,
+       results.simcc_x.dims as number[],
+       results.simcc_y.dims as number[],
+       center,
+       scale
+     );
+   }
+
+   /**
+    * YOLO preprocessing with letterbox
+    */
+   private preprocessYOLO(
+     imageData: Uint8Array,
+     imgWidth: number,
+     imgHeight: number,
+     inputSize: [number, number]
+   ): {
+     tensor: Float32Array;
+     paddingX: number;
+     paddingY: number;
+     scaleX: number;
+     scaleY: number;
+   } {
+     const [inputW, inputH] = inputSize;
+
+     // Reuse canvas
+     if (!this.canvas || !this.ctx) {
+       this.canvas = document.createElement('canvas');
+       this.ctx = this.canvas.getContext('2d', { willReadFrequently: true })!;
+     }
+
+     this.canvas.width = inputW;
+     this.canvas.height = inputH;
+     const ctx = this.ctx;
+
+     // Black background
+     ctx.fillStyle = '#000000';
+     ctx.fillRect(0, 0, inputW, inputH);
+
+     // Calculate letterbox
+     const aspectRatio = imgWidth / imgHeight;
+     const targetAspectRatio = inputW / inputH;
+
+     let drawWidth: number, drawHeight: number, offsetX: number, offsetY: number;
+
+     if (aspectRatio > targetAspectRatio) {
+       drawWidth = inputW;
+       drawHeight = Math.floor(inputW / aspectRatio);
+       offsetX = 0;
+       offsetY = Math.floor((inputH - drawHeight) / 2);
+     } else {
+       drawHeight = inputH;
+       drawWidth = Math.floor(inputH * aspectRatio);
+       offsetX = Math.floor((inputW - drawWidth) / 2);
+       offsetY = 0;
+     }
+
+     // Create source canvas
+     const srcCanvas = document.createElement('canvas');
+     const srcCtx = srcCanvas.getContext('2d')!;
+     srcCanvas.width = imgWidth;
+     srcCanvas.height = imgHeight;
+
+     const srcImageData = srcCtx.createImageData(imgWidth, imgHeight);
+     srcImageData.data.set(imageData);
+     srcCtx.putImageData(srcImageData, 0, 0);
+
+     // Draw
+     ctx.drawImage(srcCanvas, 0, 0, imgWidth, imgHeight, offsetX, offsetY, drawWidth, drawHeight);
+
+     const paddedData = ctx.getImageData(0, 0, inputW, inputH);
+
+     // Normalize to [0, 1] and convert to CHW
+     const tensor = new Float32Array(inputW * inputH * 3);
+     for (let i = 0; i < paddedData.data.length; i += 4) {
+       const pixelIdx = i / 4;
+       tensor[pixelIdx] = paddedData.data[i] / 255;
+       tensor[pixelIdx + inputW * inputH] = paddedData.data[i + 1] / 255;
+       tensor[pixelIdx + 2 * inputW * inputH] = paddedData.data[i + 2] / 255;
+     }
+
+     const scaleX = imgWidth / drawWidth;
+     const scaleY = imgHeight / drawHeight;
+
+     return {
+       tensor,
+       paddingX: offsetX,
+       paddingY: offsetY,
+       scaleX,
+       scaleY,
+     };
+   }
+
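To sanity-check the letterbox math: a 1920x1080 frame into a 416x416 input has aspect ratio 1.78 > 1.0, so drawWidth = 416, drawHeight = 234, offsetY = 91, and scaleY = 1080 / 234. A tiny standalone round-trip of the inverse mapping used by `postprocessYOLO`:

    // Model-space y -> original-frame y for the example above.
    const offsetY = 91, scaleY = 1080 / 234;
    const yModel = 208;                          // vertical center of the 416px input
    const yFrame = (yModel - offsetY) * scaleY;
    console.log(yFrame);                         // 540 — the vertical center of the 1080p frame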
+   /**
+    * YOLO postprocessing with NMS
+    */
+   private postprocessYOLO(
+     output: Float32Array,
+     numDetections: number,
+     imgWidth: number,
+     imgHeight: number,
+     paddingX: number,
+     paddingY: number,
+     scaleX: number,
+     scaleY: number
+   ): Array<{ x1: number; y1: number; x2: number; y2: number; confidence: number }> {
+     const detections: Array<{ x1: number; y1: number; x2: number; y2: number; confidence: number }> = [];
+
+     // Output rows are (x1, y1, x2, y2, confidence, classId) in model input space
+     for (let i = 0; i < numDetections; i++) {
+       const idx = i * 6;
+       const x1 = output[idx];
+       const y1 = output[idx + 1];
+       const x2 = output[idx + 2];
+       const y2 = output[idx + 3];
+       const confidence = output[idx + 4];
+       const classId = Math.round(output[idx + 5]);
+
+       // Keep confident 'person' (class 0) detections only
+       if (confidence < this.config.detConfidence || classId !== 0) continue;
+
+       // Transform coordinates back to the original image
+       const tx1 = (x1 - paddingX) * scaleX;
+       const ty1 = (y1 - paddingY) * scaleY;
+       const tx2 = (x2 - paddingX) * scaleX;
+       const ty2 = (y2 - paddingY) * scaleY;
+
+       detections.push({
+         x1: Math.max(0, tx1),
+         y1: Math.max(0, ty1),
+         x2: Math.min(imgWidth, tx2),
+         y2: Math.min(imgHeight, ty2),
+         confidence,
+       });
+     }
+
+     // NMS
+     return this.applyNMS(detections, this.config.nmsThreshold);
+   }
+
+   /**
+    * Pose preprocessing with affine crop
+    */
+   private preprocessPose(
+     imageData: Uint8Array,
+     imgWidth: number,
+     imgHeight: number,
+     bbox: { x1: number; y1: number; x2: number; y2: number; confidence: number },
+     inputSize: [number, number]
+   ): { tensor: Float32Array; center: [number, number]; scale: [number, number] } {
+     const [inputW, inputH] = inputSize;
+
+     const bboxWidth = bbox.x2 - bbox.x1;
+     const bboxHeight = bbox.y2 - bbox.y1;
+
+     const center: [number, number] = [
+       bbox.x1 + bboxWidth / 2,
+       bbox.y1 + bboxHeight / 2,
+     ];
+
+     // Aspect ratio preservation (1.25 pads the bbox before cropping)
+     const bboxAspectRatio = bboxWidth / bboxHeight;
+     const modelAspectRatio = inputW / inputH;
+
+     let scaleW: number, scaleH: number;
+     if (bboxAspectRatio > modelAspectRatio) {
+       scaleW = bboxWidth * 1.25;
+       scaleH = scaleW / modelAspectRatio;
+     } else {
+       scaleH = bboxHeight * 1.25;
+       scaleW = scaleH * modelAspectRatio;
+     }
+
+     const scale: [number, number] = [scaleW, scaleH];
+
+     // Reuse pre-allocated pose canvas
+     if (!this.poseCanvas || !this.poseCtx) {
+       this.poseCanvas = document.createElement('canvas');
+       this.poseCanvas.width = inputW;
+       this.poseCanvas.height = inputH;
+       this.poseCtx = this.poseCanvas.getContext('2d', {
+         willReadFrequently: true,
+         alpha: false
+       })!;
+       this.poseTensorBuffer = new Float32Array(3 * inputW * inputH);
+     }
+
+     const ctx = this.poseCtx;
+
+     // Fast clear
+     ctx.clearRect(0, 0, inputW, inputH);
+
+     // Create source
+     const srcCanvas = document.createElement('canvas');
+     const srcCtx = srcCanvas.getContext('2d')!;
+     srcCanvas.width = imgWidth;
+     srcCanvas.height = imgHeight;
+
+     const srcImageData = srcCtx.createImageData(imgWidth, imgHeight);
+     srcImageData.data.set(imageData);
+     srcCtx.putImageData(srcImageData, 0, 0);
+
+     // Crop and scale
+     const srcX = center[0] - scaleW / 2;
+     const srcY = center[1] - scaleH / 2;
+     ctx.drawImage(srcCanvas, srcX, srcY, scaleW, scaleH, 0, 0, inputW, inputH);
+
+     const croppedData = ctx.getImageData(0, 0, inputW, inputH);
+
+     // Optimized normalization with precomputed constants
+     const tensor = this.poseTensorBuffer!;
+     const len = croppedData.data.length;
+     const planeSize = inputW * inputH;
+
+     // Precompute normalization constants (ImageNet mean/std)
+     const mean0 = 123.675, mean1 = 116.28, mean2 = 103.53;
+     const stdInv0 = 1 / 58.395, stdInv1 = 1 / 57.12, stdInv2 = 1 / 57.375;
+
+     // Unrolled loop (4 pixels at once)
+     for (let i = 0; i < len; i += 16) {
+       const p1 = i / 4, p2 = p1 + 1, p3 = p1 + 2, p4 = p1 + 3;
+
+       // R channel
+       tensor[p1] = (croppedData.data[i] - mean0) * stdInv0;
+       tensor[p2] = (croppedData.data[i + 4] - mean0) * stdInv0;
+       tensor[p3] = (croppedData.data[i + 8] - mean0) * stdInv0;
+       tensor[p4] = (croppedData.data[i + 12] - mean0) * stdInv0;
+
+       // G channel
+       tensor[p1 + planeSize] = (croppedData.data[i + 1] - mean1) * stdInv1;
+       tensor[p2 + planeSize] = (croppedData.data[i + 5] - mean1) * stdInv1;
+       tensor[p3 + planeSize] = (croppedData.data[i + 9] - mean1) * stdInv1;
+       tensor[p4 + planeSize] = (croppedData.data[i + 13] - mean1) * stdInv1;
+
+       // B channel
+       tensor[p1 + planeSize * 2] = (croppedData.data[i + 2] - mean2) * stdInv2;
+       tensor[p2 + planeSize * 2] = (croppedData.data[i + 6] - mean2) * stdInv2;
+       tensor[p3 + planeSize * 2] = (croppedData.data[i + 10] - mean2) * stdInv2;
+       tensor[p4 + planeSize * 2] = (croppedData.data[i + 14] - mean2) * stdInv2;
+     }
+
+     return { tensor, center, scale };
+   }
+
+   /**
+    * Pose postprocessing with SimCC decoding
+    */
+   private postprocessPose(
+     simccX: Float32Array,
+     simccY: Float32Array,
+     shapeX: number[],
+     shapeY: number[],
+     center: [number, number],
+     scale: [number, number]
+   ): Keypoint[] {
+     const numKeypoints = shapeX[1];
+     const wx = shapeX[2];
+     const wy = shapeY[2];
+
+     const keypoints: Keypoint[] = [];
+
+     for (let k = 0; k < numKeypoints; k++) {
+       // Argmax X
+       let maxX = -Infinity;
+       let argmaxX = 0;
+       for (let i = 0; i < wx; i++) {
+         const val = simccX[k * wx + i];
+         if (val > maxX) {
+           maxX = val;
+           argmaxX = i;
+         }
+       }
+
+       // Argmax Y
+       let maxY = -Infinity;
+       let argmaxY = 0;
+       for (let i = 0; i < wy; i++) {
+         const val = simccY[k * wy + i];
+         if (val > maxY) {
+           maxY = val;
+           argmaxY = i;
+         }
+       }
+
+       const score = 0.5 * (maxX + maxY);
+       const visible = score > this.config.poseConfidence;
+
+       // Transform to original coordinates
+       const normX = argmaxX / wx;
+       const normY = argmaxY / wy;
+
+       const x = (normX - 0.5) * scale[0] + center[0];
+       const y = (normY - 0.5) * scale[1] + center[1];
+
+       keypoints.push({
+         x,
+         y,
+         score,
+         visible,
+         name: KEYPOINT_NAMES[k] || `keypoint_${k}`,
+       });
+     }
+
+     return keypoints;
+   }
+
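The decode above is symmetric around the crop center: normX = argmaxX / wx, so a peak in the middle bin lands exactly on the bbox center regardless of the model's SimCC bin count. A numeric check with hypothetical values:

    // Crop centered at x = 500 and 200 px wide; peak at the middle bin.
    const wx = 576, argmaxX = 288;               // bin count and argmax are hypothetical
    const centerX = 500, scaleW = 200;
    const x = (argmaxX / wx - 0.5) * scaleW + centerX;
    console.log(x);                              // 500 — the center of the crop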
+   /**
+    * Non-Maximum Suppression
+    */
+   private applyNMS(
+     detections: Array<{ x1: number; y1: number; x2: number; y2: number; confidence: number }>,
+     iouThreshold: number
+   ): typeof detections {
+     if (detections.length === 0) return [];
+
+     detections.sort((a, b) => b.confidence - a.confidence);
+
+     const selected: typeof detections = [];
+     const used = new Set<number>();
+
+     for (let i = 0; i < detections.length; i++) {
+       if (used.has(i)) continue;
+
+       selected.push(detections[i]);
+       used.add(i);
+
+       for (let j = i + 1; j < detections.length; j++) {
+         if (used.has(j)) continue;
+
+         const iou = this.calculateIoU(detections[i], detections[j]);
+         if (iou > iouThreshold) {
+           used.add(j);
+         }
+       }
+     }
+
+     return selected;
+   }
+
+   /**
+    * Calculate IoU between two boxes
+    */
+   private calculateIoU(
+     box1: { x1: number; y1: number; x2: number; y2: number },
+     box2: { x1: number; y1: number; x2: number; y2: number }
+   ): number {
+     const x1 = Math.max(box1.x1, box2.x1);
+     const y1 = Math.max(box1.y1, box2.y1);
+     const x2 = Math.min(box1.x2, box2.x2);
+     const y2 = Math.min(box1.y2, box2.y2);
+
+     if (x2 <= x1 || y2 <= y1) return 0;
+
+     const intersection = (x2 - x1) * (y2 - y1);
+     const area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
+     const area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
+     const union = area1 + area2 - intersection;
+
+     return intersection / union;
+   }
+
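A quick numeric check of the IoU and how it interacts with the default `nmsThreshold` of 0.45: two 100x100 boxes offset by 50 px overlap in a 50x100 strip.

    const a = { x1: 0, y1: 0, x2: 100, y2: 100 };
    const b = { x1: 50, y1: 0, x2: 150, y2: 100 };
    // intersection = 50 * 100 = 5000
    // union        = 10000 + 10000 - 5000 = 15000
    // IoU          = 5000 / 15000 ≈ 0.33 < 0.45, so applyNMS keeps both boxes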
+   /**
+    * Dispose resources
+    */
+   dispose(): void {
+     if (this.detSession) {
+       this.detSession.release();
+       this.detSession = null;
+     }
+     if (this.poseSession) {
+       this.poseSession.release();
+       this.poseSession = null;
+     }
+     this.initialized = false;
+   }
+ }
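Each session holds WASM or GPU memory until released, so long-lived pages should dispose the detector when it is no longer needed. A minimal lifecycle sketch tying the pieces together:

    const detector = new PoseDetector();
    await detector.init();
    try {
      const people = await detector.detectFromCanvas(canvas);
      console.log(`${people.length} people detected`);
    } finally {
      detector.dispose(); // releases both ONNX Runtime sessions
    }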