rtmlib-ts 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/.gitattributes +1 -0
  2. package/README.md +202 -0
  3. package/dist/core/base.d.ts +20 -0
  4. package/dist/core/base.d.ts.map +1 -0
  5. package/dist/core/base.js +40 -0
  6. package/dist/core/file.d.ts +11 -0
  7. package/dist/core/file.d.ts.map +1 -0
  8. package/dist/core/file.js +111 -0
  9. package/dist/core/modelCache.d.ts +35 -0
  10. package/dist/core/modelCache.d.ts.map +1 -0
  11. package/dist/core/modelCache.js +161 -0
  12. package/dist/core/posePostprocessing.d.ts +12 -0
  13. package/dist/core/posePostprocessing.d.ts.map +1 -0
  14. package/dist/core/posePostprocessing.js +76 -0
  15. package/dist/core/postprocessing.d.ts +10 -0
  16. package/dist/core/postprocessing.d.ts.map +1 -0
  17. package/dist/core/postprocessing.js +70 -0
  18. package/dist/core/preprocessing.d.ts +14 -0
  19. package/dist/core/preprocessing.d.ts.map +1 -0
  20. package/dist/core/preprocessing.js +79 -0
  21. package/dist/index.d.ts +27 -0
  22. package/dist/index.d.ts.map +1 -0
  23. package/dist/index.js +31 -0
  24. package/dist/models/rtmpose.d.ts +25 -0
  25. package/dist/models/rtmpose.d.ts.map +1 -0
  26. package/dist/models/rtmpose.js +185 -0
  27. package/dist/models/rtmpose3d.d.ts +28 -0
  28. package/dist/models/rtmpose3d.d.ts.map +1 -0
  29. package/dist/models/rtmpose3d.js +184 -0
  30. package/dist/models/yolo12.d.ts +23 -0
  31. package/dist/models/yolo12.d.ts.map +1 -0
  32. package/dist/models/yolo12.js +165 -0
  33. package/dist/models/yolox.d.ts +18 -0
  34. package/dist/models/yolox.d.ts.map +1 -0
  35. package/dist/models/yolox.js +167 -0
  36. package/dist/solution/animalDetector.d.ts +229 -0
  37. package/dist/solution/animalDetector.d.ts.map +1 -0
  38. package/dist/solution/animalDetector.js +663 -0
  39. package/dist/solution/body.d.ts +16 -0
  40. package/dist/solution/body.d.ts.map +1 -0
  41. package/dist/solution/body.js +52 -0
  42. package/dist/solution/bodyWithFeet.d.ts +16 -0
  43. package/dist/solution/bodyWithFeet.d.ts.map +1 -0
  44. package/dist/solution/bodyWithFeet.js +52 -0
  45. package/dist/solution/customDetector.d.ts +137 -0
  46. package/dist/solution/customDetector.d.ts.map +1 -0
  47. package/dist/solution/customDetector.js +342 -0
  48. package/dist/solution/hand.d.ts +14 -0
  49. package/dist/solution/hand.d.ts.map +1 -0
  50. package/dist/solution/hand.js +20 -0
  51. package/dist/solution/index.d.ts +10 -0
  52. package/dist/solution/index.d.ts.map +1 -0
  53. package/dist/solution/index.js +9 -0
  54. package/dist/solution/objectDetector.d.ts +172 -0
  55. package/dist/solution/objectDetector.d.ts.map +1 -0
  56. package/dist/solution/objectDetector.js +606 -0
  57. package/dist/solution/pose3dDetector.d.ts +145 -0
  58. package/dist/solution/pose3dDetector.d.ts.map +1 -0
  59. package/dist/solution/pose3dDetector.js +611 -0
  60. package/dist/solution/poseDetector.d.ts +198 -0
  61. package/dist/solution/poseDetector.d.ts.map +1 -0
  62. package/dist/solution/poseDetector.js +622 -0
  63. package/dist/solution/poseTracker.d.ts +22 -0
  64. package/dist/solution/poseTracker.d.ts.map +1 -0
  65. package/dist/solution/poseTracker.js +106 -0
  66. package/dist/solution/wholebody.d.ts +19 -0
  67. package/dist/solution/wholebody.d.ts.map +1 -0
  68. package/dist/solution/wholebody.js +82 -0
  69. package/dist/solution/wholebody3d.d.ts +22 -0
  70. package/dist/solution/wholebody3d.d.ts.map +1 -0
  71. package/dist/solution/wholebody3d.js +75 -0
  72. package/dist/types/index.d.ts +52 -0
  73. package/dist/types/index.d.ts.map +1 -0
  74. package/dist/types/index.js +5 -0
  75. package/dist/visualization/draw.d.ts +57 -0
  76. package/dist/visualization/draw.d.ts.map +1 -0
  77. package/dist/visualization/draw.js +400 -0
  78. package/dist/visualization/skeleton/coco133.d.ts +350 -0
  79. package/dist/visualization/skeleton/coco133.d.ts.map +1 -0
  80. package/dist/visualization/skeleton/coco133.js +120 -0
  81. package/dist/visualization/skeleton/coco17.d.ts +180 -0
  82. package/dist/visualization/skeleton/coco17.d.ts.map +1 -0
  83. package/dist/visualization/skeleton/coco17.js +48 -0
  84. package/dist/visualization/skeleton/halpe26.d.ts +278 -0
  85. package/dist/visualization/skeleton/halpe26.d.ts.map +1 -0
  86. package/dist/visualization/skeleton/halpe26.js +70 -0
  87. package/dist/visualization/skeleton/hand21.d.ts +196 -0
  88. package/dist/visualization/skeleton/hand21.d.ts.map +1 -0
  89. package/dist/visualization/skeleton/hand21.js +51 -0
  90. package/dist/visualization/skeleton/index.d.ts +10 -0
  91. package/dist/visualization/skeleton/index.d.ts.map +1 -0
  92. package/dist/visualization/skeleton/index.js +9 -0
  93. package/dist/visualization/skeleton/openpose134.d.ts +357 -0
  94. package/dist/visualization/skeleton/openpose134.d.ts.map +1 -0
  95. package/dist/visualization/skeleton/openpose134.js +116 -0
  96. package/dist/visualization/skeleton/openpose18.d.ts +177 -0
  97. package/dist/visualization/skeleton/openpose18.d.ts.map +1 -0
  98. package/dist/visualization/skeleton/openpose18.js +47 -0
  99. package/docs/ANIMAL_DETECTOR.md +450 -0
  100. package/docs/CUSTOM_DETECTOR.md +568 -0
  101. package/docs/OBJECT_DETECTOR.md +373 -0
  102. package/docs/POSE3D_DETECTOR.md +458 -0
  103. package/docs/POSE_DETECTOR.md +442 -0
  104. package/examples/README.md +119 -0
  105. package/examples/index.html +746 -0
  106. package/package.json +51 -0
  107. package/playground/README.md +114 -0
  108. package/playground/app/favicon.ico +0 -0
  109. package/playground/app/globals.css +17 -0
  110. package/playground/app/layout.tsx +19 -0
  111. package/playground/app/page.tsx +1338 -0
  112. package/playground/eslint.config.mjs +18 -0
  113. package/playground/next.config.ts +34 -0
  114. package/playground/package-lock.json +6723 -0
  115. package/playground/package.json +27 -0
  116. package/playground/postcss.config.mjs +7 -0
  117. package/playground/tsconfig.json +34 -0
  118. package/src/core/base.ts +66 -0
  119. package/src/core/file.ts +141 -0
  120. package/src/core/modelCache.ts +189 -0
  121. package/src/core/posePostprocessing.ts +91 -0
  122. package/src/core/postprocessing.ts +93 -0
  123. package/src/core/preprocessing.ts +127 -0
  124. package/src/index.ts +69 -0
  125. package/src/models/rtmpose.ts +265 -0
  126. package/src/models/rtmpose3d.ts +289 -0
  127. package/src/models/yolo12.ts +220 -0
  128. package/src/models/yolox.ts +214 -0
  129. package/src/solution/animalDetector.ts +955 -0
  130. package/src/solution/body.ts +89 -0
  131. package/src/solution/bodyWithFeet.ts +89 -0
  132. package/src/solution/customDetector.ts +474 -0
  133. package/src/solution/hand.ts +52 -0
  134. package/src/solution/index.ts +10 -0
  135. package/src/solution/objectDetector.ts +816 -0
  136. package/src/solution/pose3dDetector.ts +890 -0
  137. package/src/solution/poseDetector.ts +892 -0
  138. package/src/solution/poseTracker.ts +172 -0
  139. package/src/solution/wholebody.ts +130 -0
  140. package/src/solution/wholebody3d.ts +125 -0
  141. package/src/types/index.ts +62 -0
  142. package/src/visualization/draw.ts +543 -0
  143. package/src/visualization/skeleton/coco133.ts +131 -0
  144. package/src/visualization/skeleton/coco17.ts +49 -0
  145. package/src/visualization/skeleton/halpe26.ts +71 -0
  146. package/src/visualization/skeleton/hand21.ts +52 -0
  147. package/src/visualization/skeleton/index.ts +10 -0
  148. package/src/visualization/skeleton/openpose134.ts +125 -0
  149. package/src/visualization/skeleton/openpose18.ts +48 -0
  150. package/tsconfig.json +32 -0
@@ -0,0 +1,816 @@
1
+ /**
2
+ * ObjectDetector - Universal object detection API
3
+ * Supports YOLO12 and other YOLO models for multi-class detection
4
+ *
5
+ * @example
6
+ * ```typescript
7
+ * // Initialize with default model (YOLOv12n from HuggingFace)
8
+ * const detector = new ObjectDetector({
9
+ * classes: ['person', 'car', 'dog'], // Filter specific classes
10
+ * });
11
+ * await detector.init();
12
+ *
13
+ * // Or with custom model
14
+ * const detector = new ObjectDetector({
15
+ * model: 'models/yolov12n.onnx',
16
+ * classes: ['person'],
17
+ * });
18
+ * await detector.init();
19
+ *
20
+ * // Detect from canvas
21
+ * const objects = await detector.detectFromCanvas(canvas);
22
+ *
23
+ * // Detect all classes
24
+ * detector.setClasses(null); const allObjects = await detector.detectFromCanvas(canvas);
25
+ * ```
26
+ */
27
+
28
+ import * as ort from 'onnxruntime-web';
29
+ import { getCachedModel, isModelCached } from '../core/modelCache';
30
+
31
// Configure ONNX Runtime Web (module-level side effects, applied once at import time)
ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.0/dist/';
ort.env.wasm.simd = true;   // enable SIMD-accelerated WASM kernels where the browser supports them
ort.env.wasm.proxy = false; // run inference on the calling thread (no web-worker proxy)
35
+
36
/**
 * COCO 80-class names.
 * The array index is the YOLO class id (0-79); postprocessing maps
 * `classId` -> `COCO_CLASSES[classId]` and the class filter resolves
 * names back to ids via `indexOf`.
 */
export const COCO_CLASSES: string[] = [
  'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
  'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
  'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
  'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
  'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
  'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
  'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
  'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
  'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
  'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
];
51
+
52
/**
 * Configuration options for ObjectDetector.
 * All fields are optional; missing values are filled from DEFAULT_CONFIG,
 * then (for inputSize/confidence only) from the selected mode preset.
 */
export interface ObjectDetectorConfig {
  /** Path to YOLO detection model (optional - uses default YOLOv12n from HuggingFace if not specified) */
  model?: string;
  /** Input size as [width, height] (default: [416, 416] for speed) */
  inputSize?: [number, number];
  /** Confidence threshold (default: 0.5) */
  confidence?: number;
  /** NMS IoU threshold (default: 0.45) */
  nmsThreshold?: number;
  /** Classes to detect (null = all, default: ['person']); names must match COCO_CLASSES entries */
  classes?: string[] | null;
  /** Execution backend (default: 'webgpu' per DEFAULT_CONFIG; use 'wasm' for broader browser support) */
  backend?: 'wasm' | 'webgpu';
  /** Performance mode (default: 'balanced'); presets only apply when inputSize/confidence are not set explicitly */
  mode?: 'performance' | 'balanced' | 'lightweight';
  /** Device type (for future use — currently not read anywhere in this module) */
  device?: 'cpu' | 'gpu';
  /** Enable model caching via the shared modelCache (default: true) */
  cache?: boolean;
}
75
+
76
/**
 * Detected object with bounding box and class.
 * Coordinates are in original image pixel space (letterbox padding and
 * scaling have already been undone by postprocessing).
 */
export interface DetectedObject {
  /** Bounding box corners, clamped to the image bounds */
  bbox: {
    x1: number;
    y1: number;
    x2: number;
    y2: number;
    /** Same value as the top-level `confidence` (kept for convenience) */
    confidence: number;
  };
  /** Class ID (0-79 for COCO) */
  classId: number;
  /** Class name resolved from COCO_CLASSES (or `class_<id>` for unknown ids) */
  className: string;
  /** Detection confidence (0-1) */
  confidence: number;
}
95
+
96
/**
 * Detection statistics computed per detect() call.
 */
export interface DetectionStats {
  /** Total number of detections (after filtering and NMS) */
  totalCount: number;
  /** Detections per class name */
  classCounts: Record<string, number>;
  /** Wall-clock time for preprocess + inference + postprocess, rounded to whole ms */
  inferenceTime: number;
}
107
+
108
/**
 * Default configuration applied before any user overrides or mode presets.
 * NOTE(review): backend defaults to 'webgpu' here even though browsers without
 * WebGPU will fail session creation — callers should pass { backend: 'wasm' }
 * for maximum compatibility.
 */
const DEFAULT_CONFIG: Required<ObjectDetectorConfig> = {
  model: 'https://huggingface.co/demon2233/rtmlib-ts/resolve/main/yolo/yolov12n.onnx',
  inputSize: [416, 416], // Faster default; interpreted as [width, height]
  confidence: 0.5,
  nmsThreshold: 0.45,
  classes: ['person'],
  backend: 'webgpu', // Default to WebGPU for better performance
  mode: 'balanced',
  device: 'cpu',
  cache: true,
};
122
+
123
// Performance presets — chosen mode overrides inputSize/confidence only when
// the caller did not set those fields explicitly (see constructor).
const MODE_PRESETS: Record<string, { inputSize: [number, number]; confidence: number }> = {
  performance: { inputSize: [640, 640], confidence: 0.3 }, // High accuracy
  balanced: { inputSize: [416, 416], confidence: 0.5 }, // Balanced
  lightweight: { inputSize: [320, 320], confidence: 0.6 }, // Fastest
};
129
+
130
+ export class ObjectDetector {
131
+ private config: Required<ObjectDetectorConfig>;
132
+ private session: ort.InferenceSession | null = null;
133
+ private initialized = false;
134
+ private classFilter: Set<number> | null = null;
135
+
136
+ // Pre-allocated reusable resources for performance
137
+ private canvas: HTMLCanvasElement | null = null;
138
+ private ctx: CanvasRenderingContext2D | null = null;
139
+ private tensorBuffer: Float32Array | null = null;
140
+ private inputSize: [number, number] = [416, 416];
141
+
142
+ constructor(config: ObjectDetectorConfig) {
143
+ // Apply mode preset if specified
144
+ let finalConfig = { ...DEFAULT_CONFIG, ...config };
145
+
146
+ // Apply mode preset if specified
147
+ if (config.mode && MODE_PRESETS[config.mode]) {
148
+ const preset = MODE_PRESETS[config.mode];
149
+ // Only override if not explicitly set
150
+ if (!config.inputSize) finalConfig.inputSize = preset.inputSize;
151
+ if (!config.confidence) finalConfig.confidence = preset.confidence;
152
+ }
153
+
154
+ this.config = finalConfig;
155
+ this.updateClassFilter();
156
+
157
+ console.log(`[ObjectDetector] Initialized with mode: ${config.mode || 'balanced'}, input: ${this.config.inputSize[0]}x${this.config.inputSize[1]}`);
158
+ }
159
+
160
+ /**
161
+ * Update class filter based on config
162
+ */
163
+ private updateClassFilter(): void {
164
+ if (!this.config.classes) {
165
+ this.classFilter = null;
166
+ return;
167
+ }
168
+
169
+ this.classFilter = new Set<number>();
170
+ this.config.classes.forEach((className) => {
171
+ const classId = COCO_CLASSES.indexOf(className.toLowerCase());
172
+ if (classId !== -1) {
173
+ this.classFilter!.add(classId);
174
+ } else {
175
+ console.warn(`[ObjectDetector] Unknown class: ${className}`);
176
+ }
177
+ });
178
+ }
179
+
180
+ /**
181
+ * Set which classes to detect
182
+ * @param classes - Array of class names or null for all classes
183
+ */
184
+ setClasses(classes: string[] | null): void {
185
+ this.config.classes = classes;
186
+ this.updateClassFilter();
187
+ }
188
+
189
+ /**
190
+ * Get list of available COCO classes
191
+ */
192
+ getAvailableClasses(): string[] {
193
+ return [...COCO_CLASSES];
194
+ }
195
+
196
+ /**
197
+ * Get currently filtered classes
198
+ */
199
+ getFilteredClasses(): string[] | null {
200
+ return this.config.classes;
201
+ }
202
+
203
+ /**
204
+ * Initialize detection model and pre-allocate resources
205
+ */
206
+ async init(): Promise<void> {
207
+ if (this.initialized) return;
208
+
209
+ try {
210
+ console.log(`[ObjectDetector] Loading model from: ${this.config.model}`);
211
+
212
+ let modelBuffer: ArrayBuffer;
213
+
214
+ // Use cached model if caching is enabled
215
+ if (this.config.cache) {
216
+ const isCached = await isModelCached(this.config.model);
217
+ console.log(`[ObjectDetector] Cache ${isCached ? 'hit' : 'miss'} for model`);
218
+ modelBuffer = await getCachedModel(this.config.model);
219
+ } else {
220
+ console.log(`[ObjectDetector] Caching disabled, fetching from network`);
221
+ const response = await fetch(this.config.model);
222
+ if (!response.ok) {
223
+ throw new Error(`Failed to fetch model: HTTP ${response.status} ${response.statusText}`);
224
+ }
225
+ modelBuffer = await response.arrayBuffer();
226
+ }
227
+
228
+ console.log(`[ObjectDetector] Model loaded, size: ${(modelBuffer.byteLength / 1024 / 1024).toFixed(2)} MB`);
229
+
230
+ this.session = await ort.InferenceSession.create(modelBuffer, {
231
+ executionProviders: [this.config.backend],
232
+ graphOptimizationLevel: 'all',
233
+ });
234
+
235
+ // Pre-allocate canvas and tensor buffer for performance
236
+ const [w, h] = this.config.inputSize;
237
+ this.inputSize = [w, h];
238
+
239
+ this.canvas = document.createElement('canvas');
240
+ this.canvas.width = w;
241
+ this.canvas.height = h;
242
+ this.ctx = this.canvas.getContext('2d', {
243
+ willReadFrequently: true,
244
+ alpha: false // Faster, no transparency
245
+ })!;
246
+
247
+ // Pre-allocate tensor buffer (3 channels * width * height)
248
+ this.tensorBuffer = new Float32Array(3 * w * h);
249
+
250
+ this.initialized = true;
251
+ console.log(`[ObjectDetector] ✅ Initialized (${w}x${h}, ${this.config.backend})`);
252
+ } catch (error) {
253
+ console.error('[ObjectDetector] ❌ Initialization failed:', error);
254
+ throw error;
255
+ }
256
+ }
257
+
258
+ /**
259
+ * Detect objects from HTMLCanvasElement
260
+ */
261
+ async detectFromCanvas(canvas: HTMLCanvasElement): Promise<DetectedObject[]> {
262
+ const ctx = canvas.getContext('2d');
263
+ if (!ctx) {
264
+ throw new Error('Could not get 2D context from canvas');
265
+ }
266
+
267
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
268
+ return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
269
+ }
270
+
271
+ /**
272
+ * Detect objects from HTMLVideoElement
273
+ */
274
+ async detectFromVideo(
275
+ video: HTMLVideoElement,
276
+ targetCanvas?: HTMLCanvasElement
277
+ ): Promise<DetectedObject[]> {
278
+ if (video.readyState < 2) {
279
+ throw new Error('Video not ready. Ensure video is loaded and playing.');
280
+ }
281
+
282
+ const canvas = targetCanvas || document.createElement('canvas');
283
+ canvas.width = video.videoWidth;
284
+ canvas.height = video.videoHeight;
285
+
286
+ const ctx = canvas.getContext('2d');
287
+ if (!ctx) {
288
+ throw new Error('Could not get 2D context from canvas');
289
+ }
290
+
291
+ ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
292
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
293
+
294
+ return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
295
+ }
296
+
297
+ /**
298
+ * Detect objects from HTMLImageElement
299
+ */
300
+ async detectFromImage(
301
+ image: HTMLImageElement,
302
+ targetCanvas?: HTMLCanvasElement
303
+ ): Promise<DetectedObject[]> {
304
+ if (!image.complete || !image.naturalWidth) {
305
+ throw new Error('Image not loaded. Ensure image is fully loaded.');
306
+ }
307
+
308
+ const canvas = targetCanvas || document.createElement('canvas');
309
+ canvas.width = image.naturalWidth;
310
+ canvas.height = image.naturalHeight;
311
+
312
+ const ctx = canvas.getContext('2d');
313
+ if (!ctx) {
314
+ throw new Error('Could not get 2D context from canvas');
315
+ }
316
+
317
+ ctx.drawImage(image, 0, 0);
318
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
319
+
320
+ return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
321
+ }
322
+
323
+ /**
324
+ * Detect objects from ImageBitmap
325
+ */
326
+ async detectFromBitmap(
327
+ bitmap: ImageBitmap,
328
+ targetCanvas?: HTMLCanvasElement
329
+ ): Promise<DetectedObject[]> {
330
+ const canvas = targetCanvas || document.createElement('canvas');
331
+ canvas.width = bitmap.width;
332
+ canvas.height = bitmap.height;
333
+
334
+ const ctx = canvas.getContext('2d');
335
+ if (!ctx) {
336
+ throw new Error('Could not get 2D context from canvas');
337
+ }
338
+
339
+ ctx.drawImage(bitmap, 0, 0);
340
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height);
341
+
342
+ return this.detect(new Uint8Array(imageData.data.buffer), canvas.width, canvas.height);
343
+ }
344
+
345
+ /**
346
+ * Detect objects from File
347
+ */
348
+ async detectFromFile(
349
+ file: File,
350
+ targetCanvas?: HTMLCanvasElement
351
+ ): Promise<DetectedObject[]> {
352
+ return new Promise((resolve, reject) => {
353
+ const img = new Image();
354
+ img.onload = async () => {
355
+ try {
356
+ const results = await this.detectFromImage(img, targetCanvas);
357
+ resolve(results);
358
+ } catch (error) {
359
+ reject(error);
360
+ }
361
+ };
362
+ img.onerror = () => reject(new Error('Failed to load image from file'));
363
+ img.src = URL.createObjectURL(file);
364
+ });
365
+ }
366
+
367
+ /**
368
+ * Detect objects from Blob
369
+ */
370
+ async detectFromBlob(
371
+ blob: Blob,
372
+ targetCanvas?: HTMLCanvasElement
373
+ ): Promise<DetectedObject[]> {
374
+ const bitmap = await createImageBitmap(blob);
375
+ const results = await this.detectFromBitmap(bitmap, targetCanvas);
376
+ bitmap.close();
377
+ return results;
378
+ }
379
+
380
+ /**
381
+ * Detect objects from raw image data
382
+ */
383
+ async detect(
384
+ imageData: Uint8Array,
385
+ width: number,
386
+ height: number
387
+ ): Promise<DetectedObject[]> {
388
+ if (!this.initialized) {
389
+ await this.init();
390
+ }
391
+
392
+ const startTime = performance.now();
393
+
394
+ const [inputH, inputW] = this.config.inputSize;
395
+
396
+ // Preprocess
397
+ const { tensor, paddingX, paddingY, scaleX, scaleY } = this.preprocess(
398
+ imageData,
399
+ width,
400
+ height,
401
+ [inputW, inputH]
402
+ );
403
+
404
+ // Inference - use dynamic input name
405
+ const inputTensor = new ort.Tensor('float32', tensor, [1, 3, inputH, inputW]);
406
+ const inputName = this.session!.inputNames[0]; // Dynamic: 'images' or 'pixel_values'
407
+
408
+ console.log(`[ObjectDetector] Using input name: ${inputName}`);
409
+ console.log(`[ObjectDetector] Input shape: [1, 3, ${inputH}, ${inputW}]`);
410
+
411
+ const feeds: Record<string, ort.Tensor> = {};
412
+ feeds[inputName] = inputTensor;
413
+
414
+ const results = await this.session!.run(feeds);
415
+ const output = results[this.session!.outputNames[0]];
416
+
417
+ console.log(`[ObjectDetector] Output shape: [${output.dims}]`);
418
+ console.log(`[ObjectDetector] Output type: ${output.type}`);
419
+
420
+ // Postprocess
421
+ const detections = this.postprocess(
422
+ output.data as Float32Array,
423
+ output.dims[1],
424
+ output.dims as number[],
425
+ width,
426
+ height,
427
+ paddingX,
428
+ paddingY,
429
+ scaleX,
430
+ scaleY
431
+ );
432
+
433
+ const inferenceTime = performance.now() - startTime;
434
+
435
+ // Attach stats
436
+ (detections as any).stats = this.calculateStats(detections, inferenceTime);
437
+
438
+ return detections;
439
+ }
440
+
441
+ /**
442
+ * Optimized preprocess with resource reuse
443
+ */
444
+ private preprocess(
445
+ imageData: Uint8Array,
446
+ imgWidth: number,
447
+ imgHeight: number,
448
+ inputSize: [number, number]
449
+ ): {
450
+ tensor: Float32Array;
451
+ paddingX: number;
452
+ paddingY: number;
453
+ scaleX: number;
454
+ scaleY: number;
455
+ } {
456
+ const [inputW, inputH] = inputSize;
457
+
458
+ // Reuse pre-allocated canvas
459
+ if (!this.canvas || !this.ctx) {
460
+ this.canvas = document.createElement('canvas');
461
+ this.canvas.width = inputW;
462
+ this.canvas.height = inputH;
463
+ this.ctx = this.canvas.getContext('2d', {
464
+ willReadFrequently: true,
465
+ alpha: false
466
+ })!;
467
+ this.tensorBuffer = new Float32Array(3 * inputW * inputH);
468
+ }
469
+
470
+ const ctx = this.ctx;
471
+
472
+ // Fast clear
473
+ ctx.clearRect(0, 0, inputW, inputH);
474
+
475
+ // Calculate letterbox
476
+ const aspectRatio = imgWidth / imgHeight;
477
+ const targetAspectRatio = inputW / inputH;
478
+
479
+ let drawWidth: number, drawHeight: number, offsetX: number, offsetY: number;
480
+
481
+ if (aspectRatio > targetAspectRatio) {
482
+ drawWidth = inputW;
483
+ drawHeight = (inputW / aspectRatio) | 0; // Faster than Math.floor
484
+ offsetX = 0;
485
+ offsetY = ((inputH - drawHeight) / 2) | 0;
486
+ } else {
487
+ drawHeight = inputH;
488
+ drawWidth = (inputH * aspectRatio) | 0;
489
+ offsetX = ((inputW - drawWidth) / 2) | 0;
490
+ offsetY = 0;
491
+ }
492
+
493
+ // Draw directly without intermediate canvas (faster)
494
+ const srcCanvas = document.createElement('canvas');
495
+ srcCanvas.width = imgWidth;
496
+ srcCanvas.height = imgHeight;
497
+ const srcCtx = srcCanvas.getContext('2d')!;
498
+
499
+ const srcImageData = srcCtx.createImageData(imgWidth, imgHeight);
500
+ srcImageData.data.set(imageData);
501
+ srcCtx.putImageData(srcImageData, 0, 0);
502
+
503
+ // Draw with letterbox
504
+ ctx.drawImage(srcCanvas as CanvasImageSource, 0, 0, imgWidth, imgHeight, offsetX, offsetY, drawWidth, drawHeight);
505
+
506
+ const paddedData = ctx.getImageData(0, 0, inputW, inputH);
507
+
508
+ // Optimized normalization loop (reuse buffer)
509
+ const tensor = this.tensorBuffer!;
510
+ const len = paddedData.data.length;
511
+ const planeSize = inputW * inputH;
512
+
513
+ // Unroll loop for speed (process 4 pixels at once)
514
+ for (let i = 0; i < len; i += 16) {
515
+ const i1 = i, i2 = i + 4, i3 = i + 8, i4 = i + 12;
516
+ const p1 = i1 / 4, p2 = i2 / 4, p3 = i3 / 4, p4 = i4 / 4;
517
+
518
+ // R channel
519
+ tensor[p1] = paddedData.data[i1] * 0.003921569; // / 255
520
+ tensor[p2] = paddedData.data[i2] * 0.003921569;
521
+ tensor[p3] = paddedData.data[i3] * 0.003921569;
522
+ tensor[p4] = paddedData.data[i4] * 0.003921569;
523
+
524
+ // G channel
525
+ tensor[p1 + planeSize] = paddedData.data[i1 + 1] * 0.003921569;
526
+ tensor[p2 + planeSize] = paddedData.data[i2 + 1] * 0.003921569;
527
+ tensor[p3 + planeSize] = paddedData.data[i3 + 1] * 0.003921569;
528
+ tensor[p4 + planeSize] = paddedData.data[i4 + 1] * 0.003921569;
529
+
530
+ // B channel
531
+ tensor[p1 + planeSize * 2] = paddedData.data[i1 + 2] * 0.003921569;
532
+ tensor[p2 + planeSize * 2] = paddedData.data[i2 + 2] * 0.003921569;
533
+ tensor[p3 + planeSize * 2] = paddedData.data[i3 + 2] * 0.003921569;
534
+ tensor[p4 + planeSize * 2] = paddedData.data[i4 + 2] * 0.003921569;
535
+ }
536
+
537
+ const scaleX = imgWidth / drawWidth;
538
+ const scaleY = imgHeight / drawHeight;
539
+
540
+ return {
541
+ tensor,
542
+ paddingX: offsetX,
543
+ paddingY: offsetY,
544
+ scaleX,
545
+ scaleY,
546
+ };
547
+ }
548
+
549
+ /**
550
+ * Postprocess YOLO output - supports multiple output formats
551
+ */
552
+ private postprocess(
553
+ output: Float32Array,
554
+ numDetections: number,
555
+ outputShape: number[],
556
+ imgWidth: number,
557
+ imgHeight: number,
558
+ paddingX: number,
559
+ paddingY: number,
560
+ scaleX: number,
561
+ scaleY: number
562
+ ): DetectedObject[] {
563
+ const detections: DetectedObject[] = [];
564
+
565
+ // Format 1: [batch, boxes, 6] - [x1, y1, x2, y2, conf, class]
566
+ if (outputShape.length === 3 && outputShape[2] === 6) {
567
+ for (let i = 0; i < numDetections; i++) {
568
+ const idx = i * 6;
569
+ const x1 = output[idx];
570
+ const y1 = output[idx + 1];
571
+ const x2 = output[idx + 2];
572
+ const y2 = output[idx + 3];
573
+ const confidence = output[idx + 4];
574
+ const classId = Math.round(output[idx + 5]);
575
+
576
+ if (confidence < this.config.confidence) continue;
577
+ if (this.classFilter && !this.classFilter.has(classId)) continue;
578
+ if (x2 <= x1 || y2 <= y1) continue;
579
+
580
+ const tx1 = (x1 - paddingX) * scaleX;
581
+ const ty1 = (y1 - paddingY) * scaleY;
582
+ const tx2 = (x2 - paddingX) * scaleX;
583
+ const ty2 = (y2 - paddingY) * scaleY;
584
+
585
+ detections.push({
586
+ bbox: {
587
+ x1: Math.max(0, tx1),
588
+ y1: Math.max(0, ty1),
589
+ x2: Math.min(imgWidth, tx2),
590
+ y2: Math.min(imgHeight, ty2),
591
+ confidence,
592
+ },
593
+ classId,
594
+ className: COCO_CLASSES[classId] || `class_${classId}`,
595
+ confidence,
596
+ });
597
+ }
598
+ }
599
+ // Format 2: [batch, boxes, 80+] - YOLOv26 style
600
+ // Format: [class_scores..., cx, cy, w, h] - center format with width/height
601
+ else if (outputShape.length === 3 && outputShape[2] >= 80) {
602
+ const numClasses = outputShape[2] - 4;
603
+ const [inputH, inputW] = this.config.inputSize;
604
+
605
+ console.log(`[ObjectDetector] Trying YOLOv26 format (center format) with ${numClasses} classes`);
606
+
607
+ for (let i = 0; i < numDetections; i++) {
608
+ const baseIdx = i * outputShape[2];
609
+
610
+ // Raw bbox values - try direct interpretation first
611
+ // YOLOv26 may output already decoded coordinates
612
+ let x1 = output[baseIdx + numClasses];
613
+ let y1 = output[baseIdx + numClasses + 1];
614
+ let x2 = output[baseIdx + numClasses + 2];
615
+ let y2 = output[baseIdx + numClasses + 3];
616
+
617
+ // If values are very small (< 1), they might be logits - apply sigmoid
618
+ if (Math.abs(x1) < 1 && Math.abs(y1) < 1) {
619
+ // Apply sigmoid and scale
620
+ x1 = (1 / (1 + Math.exp(-x1))) * inputW;
621
+ y1 = (1 / (1 + Math.exp(-y1))) * inputH;
622
+ x2 = (1 / (1 + Math.exp(-x2))) * inputW;
623
+ y2 = (1 / (1 + Math.exp(-y2))) * inputH;
624
+ }
625
+ // If values are negative but large, apply sigmoid only
626
+ else if (x1 < 0 || y1 < 0) {
627
+ x1 = (1 / (1 + Math.exp(-x1))) * inputW;
628
+ y1 = (1 / (1 + Math.exp(-y1))) * inputH;
629
+ x2 = (1 / (1 + Math.exp(-x2))) * inputW;
630
+ y2 = (1 / (1 + Math.exp(-y2))) * inputH;
631
+ }
632
+ // Otherwise use as-is (already decoded)
633
+
634
+ // Debug first detection
635
+ if (i === 0) {
636
+ console.log(`[ObjectDetector] Raw bbox: [${output[baseIdx + numClasses]}, ${output[baseIdx + numClasses + 1]}, ${output[baseIdx + numClasses + 2]}, ${output[baseIdx + numClasses + 3]}]`);
637
+ console.log(`[ObjectDetector] Decoded bbox: [${x1.toFixed(1)}, ${y1.toFixed(1)}, ${x2.toFixed(1)}, ${y2.toFixed(1)}]`);
638
+ }
639
+
640
+ // Find best class and confidence
641
+ let bestClass = 0;
642
+ let bestScore = -Infinity;
643
+
644
+ for (let c = 0; c < numClasses; c++) {
645
+ const score = output[baseIdx + c];
646
+ if (score > bestScore) {
647
+ bestScore = score;
648
+ bestClass = c;
649
+ }
650
+ }
651
+
652
+ // Apply sigmoid to class score
653
+ const confidence = 1 / (1 + Math.exp(-bestScore));
654
+
655
+ // Debug first few detections
656
+ if (i < 5 && confidence > 0.05) {
657
+ console.log(`[ObjectDetector] Box ${i}: [${x1.toFixed(1)}, ${y1.toFixed(1)}, ${x2.toFixed(1)}, ${y2.toFixed(1)}]`);
658
+ console.log(`[ObjectDetector] -> class=${bestClass} (${COCO_CLASSES[bestClass] || 'unknown'}), confidence=${(confidence * 100).toFixed(1)}%`);
659
+ }
660
+
661
+ if (confidence < this.config.confidence) continue;
662
+ if (this.classFilter && !this.classFilter.has(bestClass)) continue;
663
+ if (x2 <= x1 || y2 <= y1) continue;
664
+ if (x1 < 0 && x2 < 0) continue;
665
+ if (y1 < 0 && y2 < 0) continue;
666
+
667
+ // Transform to original image space
668
+ const tx1 = (x1 - paddingX) * scaleX;
669
+ const ty1 = (y1 - paddingY) * scaleY;
670
+ const tx2 = (x2 - paddingX) * scaleX;
671
+ const ty2 = (y2 - paddingY) * scaleY;
672
+
673
+ detections.push({
674
+ bbox: {
675
+ x1: Math.max(0, tx1),
676
+ y1: Math.max(0, ty1),
677
+ x2: Math.min(imgWidth, tx2),
678
+ y2: Math.min(imgHeight, ty2),
679
+ confidence,
680
+ },
681
+ classId: bestClass,
682
+ className: COCO_CLASSES[bestClass] || `class_${bestClass}`,
683
+ confidence,
684
+ });
685
+ }
686
+ }
687
+
688
+ // Debug logging
689
+ if (detections.length > 0) {
690
+ console.log(`[ObjectDetector] ✅ Found ${detections.length} detections`);
691
+ console.log(`[ObjectDetector] First:`, detections[0]);
692
+ } else {
693
+ console.log(`[ObjectDetector] ❌ No detections above threshold ${this.config.confidence}`);
694
+ // Log top 3 scores for debugging
695
+ const topScores: number[] = [];
696
+ const numClasses = outputShape.length === 3 ? outputShape[2] - 4 : 80;
697
+ for (let i = 0; i < Math.min(3, numDetections); i++) {
698
+ const baseIdx = i * outputShape[2];
699
+ let bestScore = -Infinity;
700
+ for (let c = 0; c < numClasses; c++) {
701
+ const score = output[baseIdx + c];
702
+ if (score > bestScore) bestScore = score;
703
+ }
704
+ const confidence = bestScore > 0 && bestScore <= 1 ? bestScore : 1 / (1 + Math.exp(-bestScore));
705
+ topScores.push(confidence);
706
+ }
707
+ console.log(`[ObjectDetector] Top 3 confidences: ${topScores.map(s => (s * 100).toFixed(1) + '%').join(', ')}`);
708
+ }
709
+
710
+ // NMS
711
+ return this.applyMultiClassNMS(detections, this.config.nmsThreshold);
712
+ }
713
+
714
+ /**
715
+ * Multi-class Non-Maximum Suppression
716
+ */
717
+ private applyMultiClassNMS(
718
+ detections: DetectedObject[],
719
+ iouThreshold: number
720
+ ): DetectedObject[] {
721
+ if (detections.length === 0) return [];
722
+
723
+ // Group by class
724
+ const byClass = new Map<number, DetectedObject[]>();
725
+ detections.forEach((det) => {
726
+ const classDets = byClass.get(det.classId) || [];
727
+ classDets.push(det);
728
+ byClass.set(det.classId, classDets);
729
+ });
730
+
731
+ // Apply NMS per class
732
+ const selected: DetectedObject[] = [];
733
+ byClass.forEach((classDets) => {
734
+ classDets.sort((a, b) => b.confidence - a.confidence);
735
+
736
+ const used = new Set<number>();
737
+ for (let i = 0; i < classDets.length; i++) {
738
+ if (used.has(i)) continue;
739
+
740
+ selected.push(classDets[i]);
741
+ used.add(i);
742
+
743
+ for (let j = i + 1; j < classDets.length; j++) {
744
+ if (used.has(j)) continue;
745
+
746
+ const iou = this.calculateIoU(classDets[i].bbox, classDets[j].bbox);
747
+ if (iou > iouThreshold) {
748
+ used.add(j);
749
+ }
750
+ }
751
+ }
752
+ });
753
+
754
+ return selected;
755
+ }
756
+
757
+ /**
758
+ * Calculate IoU between two boxes
759
+ */
760
+ private calculateIoU(
761
+ box1: { x1: number; y1: number; x2: number; y2: number },
762
+ box2: { x1: number; y1: number; x2: number; y2: number }
763
+ ): number {
764
+ const x1 = Math.max(box1.x1, box2.x1);
765
+ const y1 = Math.max(box1.y1, box2.y1);
766
+ const x2 = Math.min(box1.x2, box2.x2);
767
+ const y2 = Math.min(box1.y2, box2.y2);
768
+
769
+ if (x2 <= x1 || y2 <= y1) return 0;
770
+
771
+ const intersection = (x2 - x1) * (y2 - y1);
772
+ const area1 = (box1.x2 - box1.x1) * (box1.y2 - box1.y1);
773
+ const area2 = (box2.x2 - box2.x1) * (box2.y2 - box2.y1);
774
+ const union = area1 + area2 - intersection;
775
+
776
+ return intersection / union;
777
+ }
778
+
779
+ /**
780
+ * Calculate detection statistics
781
+ */
782
+ private calculateStats(
783
+ detections: DetectedObject[],
784
+ inferenceTime: number
785
+ ): DetectionStats {
786
+ const classCounts: Record<string, number> = {};
787
+
788
+ detections.forEach((det) => {
789
+ classCounts[det.className] = (classCounts[det.className] || 0) + 1;
790
+ });
791
+
792
+ return {
793
+ totalCount: detections.length,
794
+ classCounts,
795
+ inferenceTime: Math.round(inferenceTime),
796
+ };
797
+ }
798
+
799
+ /**
800
+ * Get statistics from last detection
801
+ */
802
+ getStats(): DetectionStats | null {
803
+ return null;
804
+ }
805
+
806
+ /**
807
+ * Dispose resources
808
+ */
809
+ dispose(): void {
810
+ if (this.session) {
811
+ this.session.release();
812
+ this.session = null;
813
+ }
814
+ this.initialized = false;
815
+ }
816
+ }