kinemotion-0.67.0-py3-none-any.whl → kinemotion-0.68.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of kinemotion might be problematic.

@@ -0,0 +1,626 @@
+"""Optimized CPU RTMPose tracker with ONNX Runtime performance tuning.
+
+This tracker applies specific optimizations for AMD Ryzen CPUs:
+- Tuned threading for Ryzen 7 7800X3D (8 cores/16 threads)
+- Sequential execution mode for better cache locality
+- Optional input size reduction for faster inference
+
+Landmark accuracy: 9-12px mean difference from MediaPipe
+- CMJ: 9.7px → 1-3% metric accuracy
+- Drop Jump: 11.5px → 2-5% metric accuracy
+
+Performance: 40-68 FPS (vs MediaPipe's 48 FPS)
+"""
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from pathlib import Path
+
+import cv2
+import numpy as np
+import onnxruntime as ort
+
+from kinemotion.core.timing import NULL_TIMER, Timer
+
+# Halpe-26 to kinemotion landmark mapping
+HALPE_TO_KINEMOTION = {
+    0: "nose",
+    5: "left_shoulder",
+    6: "right_shoulder",
+    11: "left_hip",
+    12: "right_hip",
+    13: "left_knee",
+    14: "right_knee",
+    15: "left_ankle",
+    16: "right_ankle",
+    20: "left_foot_index",
+    21: "right_foot_index",
+    24: "left_heel",
+    25: "right_heel",
+}
+
+
+class OptimizedCPUTracker:
+    """RTMPose tracker with CPU-optimized ONNX Runtime configuration.
+
+    Optimized for AMD Ryzen 7 7800X3D with:
+    - intra_op_num_threads=8 (physical cores)
+    - inter_op_num_threads=1 (avoid oversubscription)
+    - ORT_SEQUENTIAL execution mode
+    - Optional reduced input size for faster inference
+
+    Attributes:
+        timer: Optional Timer for measuring operations
+        mode: RTMLib mode ('lightweight', 'balanced', 'performance')
+        input_size: Pose input size as (height, width) tuple
+        intra_threads: Number of intra-op threads
+        inter_threads: Number of inter-op threads
+    """
+
+    def __init__(
+        self,
+        timer: Timer | None = None,
+        mode: str = "lightweight",
+        input_size: tuple[int, int] = (192, 256),
+        intra_threads: int = 8,
+        inter_threads: int = 1,
+        verbose: bool = False,
+    ) -> None:
+        """Initialize the optimized CPU tracker.
+
+        Args:
+            timer: Optional Timer for measuring operations
+            mode: RTMLib performance mode
+            input_size: Pose model input size as (height, width)
+            intra_threads: Number of intra-op threads (default: 8 for Ryzen 7 7800X3D)
+            inter_threads: Number of inter-op threads (default: 1 to avoid oversubscription)
+            verbose: Print debug information
+        """
+        self.timer = timer or NULL_TIMER
+        self.mode = mode
+        self.input_size = input_size
+        self.intra_threads = intra_threads
+        self.inter_threads = inter_threads
+        self.verbose = verbose
+
+        with self.timer.measure("optimized_cpu_initialization"):
+            self._init_models()
+
+    def _init_models(self) -> None:
+        """Initialize ONNX Runtime models with optimized CPU configuration."""
+        from importlib.resources import files
+
+        # Use bundled models from package
+        models_dir = Path(files("kinemotion") / "models")  # type: ignore[arg-type]
+
+        # Model paths for RTMPose lightweight (RTMPose-s with Halpe-26)
+        det_model_path = models_dir / "yolox_tiny_8xb8-300e_humanart-6f3252f9.onnx"
+        pose_model_path = (
+            models_dir
+            / "rtmpose-s_simcc-body7_pt-body7-halpe26_700e-256x192-7f134165_20230605.onnx"
+        )
+
+        if not det_model_path.exists():
+            raise FileNotFoundError(
+                f"Detector model not found: {det_model_path}\n"
+                f"Please ensure RTMPose models are installed in {models_dir}"
+            )
+        if not pose_model_path.exists():
+            raise FileNotFoundError(
+                f"Pose model not found: {pose_model_path}\n"
+                f"Please ensure RTMPose models are installed in {models_dir}"
+            )
+
+        # Configure execution providers - CPU only with optimizations
+        providers = ["CPUExecutionProvider"]
+
+        # Configure session options for optimal CPU performance
+        so = ort.SessionOptions()
+        so.intra_op_num_threads = self.intra_threads
+        so.inter_op_num_threads = self.inter_threads
+        so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+        so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+        # Enable thread spinning for lower latency
+        so.add_session_config_entry("session.intra_op.allow_spinning", "1")
+        so.add_session_config_entry("session.inter_op.allow_spinning", "0")
+
+        if self.verbose:
+            print("Optimized CPU Configuration:")
+            print(f" intra_op_num_threads: {self.intra_threads}")
+            print(f" inter_op_num_threads: {self.inter_threads}")
+            print(" execution_mode: ORT_SEQUENTIAL")
+            print(f" input_size: {self.input_size}")
+            print(f"Available providers: {ort.get_available_providers()}")
+
+        # Load detection model
+        if self.verbose:
+            print(f"Loading detection model from {det_model_path}")
+        self.det_session = ort.InferenceSession(
+            str(det_model_path),
+            sess_options=so,
+            providers=providers,
+        )
+
+        if self.verbose:
+            print(f"Detection model providers: {self.det_session.get_providers()}")
+
+        # Load pose model
+        if self.verbose:
+            print(f"Loading pose model from {pose_model_path}")
+        self.pose_session = ort.InferenceSession(
+            str(pose_model_path),
+            sess_options=so,
+            providers=providers,
+        )
+
+        if self.verbose:
+            print(f"Pose model providers: {self.pose_session.get_providers()}")
+
+        # Get input/output info
+        self.det_input_name = self.det_session.get_inputs()[0].name
+        self.det_output_names = [o.name for o in self.det_session.get_outputs()]
+
+        self.pose_input_name = self.pose_session.get_inputs()[0].name
+        self.pose_output_names = [o.name for o in self.pose_session.get_outputs()]
+
+        # Store input sizes
+        self.det_input_size = (416, 416)  # YOLOX-tiny default
+        # Note: input_size is (height, width), but pose model expects (width, height)
+        self.pose_input_size = (self.input_size[1], self.input_size[0])  # Swap to (width, height)
+
+    def _preprocess_det(self, img: np.ndarray) -> dict:
+        """Preprocess image for detection following rtmlib's YOLOX preprocessing.
+
+        IMPORTANT: YOLOX expects float32 in [0, 255] range, NOT normalized to [0, 1]!
+
+        Returns dict with 'input' (tensor) and 'ratio' (scale factor).
+        """
+        # Create padded image with 114 (YOLOX default padding color)
+        h, w = img.shape[:2]
+        target_h, target_w = self.det_input_size
+
+        # Calculate ratio
+        ratio = min(target_h / h, target_w / w)
+
+        # Resize image
+        resized = cv2.resize(
+            img,
+            (int(w * ratio), int(h * ratio)),
+            interpolation=cv2.INTER_LINEAR,
+        ).astype(np.uint8)
+
+        # Create padded image
+        padded = np.ones((target_h, target_w, 3), dtype=np.uint8) * 114
+        padded[: resized.shape[0], : resized.shape[1]] = resized
+
+        # Convert BGR to RGB (YOLOX expects RGB)
+        rgb = cv2.cvtColor(padded, cv2.COLOR_BGR2RGB)
+
+        # Convert to float32 in [0, 255] range - DO NOT NORMALIZE!
+        # rtmlib: img = np.ascontiguousarray(img, dtype=np.float32)
+        rgb_float = np.ascontiguousarray(rgb, dtype=np.float32)
+
+        # Transpose to (3, H, W) and add batch dimension
+        transposed = rgb_float.transpose(2, 0, 1)
+
+        return {
+            "input": transposed[np.newaxis, :],
+            "ratio": ratio,
+        }
+
+    def _preprocess_pose(self, img: np.ndarray, bbox: list) -> np.ndarray:
+        """Preprocess image region for pose estimation using RTMPose preprocessing.
+
+        This follows RTMLib's preprocessing which uses affine transform and
+        specific normalization (mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]).
+
+        Args:
+            img: Original BGR image
+            bbox: Bounding box [x1, y1, x2, y2]
+
+        Returns:
+            Preprocessed image tensor ready for ONNX inference
+        """
+        # Convert bbox to numpy array
+        bbox_array = np.array(bbox, dtype=np.float32)
+
+        # Get center and scale using rtmlib's bbox_xyxy2cs logic
+        # padding=1.25 is the default used by RTMPose
+        x1, y1, x2, y2 = bbox_array
+        center = np.array([x1 + x2, y1 + y2]) * 0.5
+        scale = np.array([x2 - x1, y2 - y1]) * 1.25
+
+        # Reshape bbox to fixed aspect ratio using rtmlib's logic
+        model_h, model_w = self.pose_input_size
+        aspect_ratio = model_w / model_h
+
+        b_w = scale[0]
+        b_h = scale[1]
+
+        # Reshape to maintain aspect ratio
+        if b_w > b_h * aspect_ratio:
+            scale = np.array([b_w, b_w / aspect_ratio], dtype=np.float32)
+        else:
+            scale = np.array([b_h * aspect_ratio, b_h], dtype=np.float32)
+
+        # Get affine transformation matrix
+        warp_mat = self._get_warp_matrix(center, scale, 0, (model_w, model_h))
+
+        # Do affine transform
+        warped = cv2.warpAffine(img, warp_mat, (model_w, model_h), flags=cv2.INTER_LINEAR)
+
+        # Normalize using RTMPose's mean and std
+        mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
+        std = np.array([58.395, 57.12, 57.375], dtype=np.float32)
+
+        normalized = (warped - mean) / std
+
+        # Convert to RGB and transpose to NCHW format
+        rgb = cv2.cvtColor(normalized.astype(np.float32), cv2.COLOR_BGR2RGB)
+        transposed = rgb.transpose(2, 0, 1)
+
+        return transposed[np.newaxis, :]
+
+    def _get_warp_matrix(
+        self,
+        center: np.ndarray,
+        scale: np.ndarray,
+        rot: float,
+        output_size: tuple[int, int],
+    ) -> np.ndarray:
+        """Calculate the affine transformation matrix for pose preprocessing.
+
+        This follows rtmlib's get_warp_matrix logic.
+
+        Args:
+            center: Center of the bounding box (x, y)
+            scale: Scale of the bounding box [w, h]
+            rot: Rotation angle (degrees) - usually 0
+            output_size: Destination size (width, height)
+
+        Returns:
+            2x3 affine transformation matrix
+        """
+        import math
+
+        dst_w, dst_h = output_size
+
+        # Compute transformation matrix
+        rot_rad = math.radians(rot)
+
+        # Source direction vector (rotated)
+        src_dir = self._rotate_point(np.array([0.0, scale[0] * -0.5]), rot_rad)
+        dst_dir = np.array([0.0, dst_w * -0.5])
+
+        # Get three corners of the src rectangle
+        src: np.ndarray = np.zeros((3, 2), dtype=np.float32)
+        src[0, :] = center
+        src[1, :] = center + src_dir
+        src[2, :] = self._get_3rd_point(src[0, :], src[1, :])
+
+        # Get three corners of the dst rectangle
+        dst: np.ndarray = np.zeros((3, 2), dtype=np.float32)
+        dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
+        dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
+        dst[2, :] = self._get_3rd_point(dst[0, :], dst[1, :])
+
+        # Get affine transform matrix
+        warp_mat = cv2.getAffineTransform(src, dst)
+
+        return warp_mat
+
+    def _rotate_point(self, pt: np.ndarray, angle_rad: float) -> np.ndarray:
+        """Rotate a point by an angle.
+
+        Args:
+            pt: 2D point coordinates (x, y)
+            angle_rad: Rotation angle in radians
+
+        Returns:
+            Rotated point
+        """
+        import math
+
+        sn, cs = math.sin(angle_rad), math.cos(angle_rad)
+        rot_mat = np.array([[cs, -sn], [sn, cs]])
+        return rot_mat @ pt
+
+    def _get_3rd_point(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
+        """Calculate the 3rd point for affine transform matrix.
+
+        The 3rd point is defined by rotating vector a - b by 90 degrees
+        anticlockwise, using b as the rotation center.
+
+        Args:
+            a: The 1st point (x, y)
+            b: The 2nd point (x, y)
+
+        Returns:
+            The 3rd point
+        """
+        direction = a - b
+        c = b + np.array([-direction[1], direction[0]])
+        return c
+
+    def _postprocess_det(
+        self,
+        outputs: tuple[np.ndarray, ...] | Sequence[np.ndarray],
+        orig_h: int,
+        orig_w: int,
+        preprocess_info: dict,
+    ) -> list:
+        """Postprocess detection outputs.
+
+        YOLOX ONNX model has NMS built-in, output shape is (1, N, 5) where:
+        - 5 values: [x1, y1, x2, y2, score]
+        - N is the number of detections after NMS
+        """
+        detections = outputs[0]  # (1, N, 5)
+
+        if self.verbose:
+            print(f"DEBUG: detections shape: {detections.shape}")
+            print(f"DEBUG: detections[0]: {detections[0]}")
+
+        # Extract boxes and scores
+        # detections[0] because we have batch dimension
+        boxes = detections[0, :, :4]  # (N, 4)
+        scores = detections[0, :, 4]  # (N,)
+
+        # Filter by confidence
+        conf_threshold = 0.3
+
+        if self.verbose:
+            print(f"DEBUG: scores before filter: {scores}")
+            print(f"DEBUG: conf_threshold: {conf_threshold}")
+
+        mask = scores > conf_threshold
+
+        if not np.any(mask):
+            if self.verbose:
+                print("DEBUG: No detections passed threshold")
+            return []
+
+        boxes = boxes[mask]
+        scores = scores[mask]
+
+        # Sort by score
+        indices = np.argsort(scores)[::-1]
+        boxes = boxes[indices]
+        scores = scores[indices]
+
+        # Scale boxes back to original image using ratio
+        ratio = preprocess_info["ratio"]
+        boxes = boxes / ratio  # Scale back to original image size
+
+        results = []
+        for box, score in zip(boxes[:1], scores[:1], strict=True):  # Only top 1 person
+            x1, y1, x2, y2 = box
+
+            # Clip to image bounds
+            x1 = max(0, min(x1, orig_w))
+            y1 = max(0, min(y1, orig_h))
+            x2 = max(0, min(x2, orig_w))
+            y2 = max(0, min(y2, orig_h))
+
+            results.append([float(x1), float(y1), float(x2), float(y2), float(score)])
+
+        return results
+
+    def process_frame(
+        self, frame: np.ndarray, timestamp_ms: int = 0
+    ) -> dict[str, tuple[float, float, float]] | None:
+        """Process a single frame and extract pose landmarks.
+
+        Args:
+            frame: BGR image frame (OpenCV format)
+            timestamp_ms: Frame timestamp in milliseconds (unused, for API compatibility)
+
+        Returns:
+            Dictionary mapping landmark names to (x, y, visibility) tuples,
+            or None if no pose detected.
+        """
+        if frame.size == 0:
+            return None
+
+        height, width = frame.shape[:2]
+
+        try:
+            # Detection
+            with self.timer.measure("optimized_cpu_detection"):
+                if self.verbose:
+                    print("DEBUG: Starting detection...")
+
+                det_input = self._preprocess_det(frame)
+
+                if self.verbose:
+                    print(f"DEBUG: det_input input shape: {det_input['input'].shape}")
+
+                det_outputs = self.det_session.run(
+                    self.det_output_names,
+                    {self.det_input_name: det_input["input"]},
+                )
+
+                if self.verbose:
+                    det_output_0: np.ndarray = det_outputs[0]  # type: ignore[index]
+                    print(f"DEBUG: det_outputs[0] shape: {det_output_0.shape}")
+
+                detections = self._postprocess_det(list(det_outputs), height, width, det_input)  # type: ignore[arg-type]
+
+                if self.verbose:
+                    print(f"DEBUG: detections count: {len(detections)}")
+
+            if not detections:
+                return None
+
+            # Get the highest confidence person bbox
+            x1, y1, x2, y2, _score = detections[0]
+
+            # Pose estimation
+            with self.timer.measure("optimized_cpu_pose_inference"):
+                try:
+                    pose_input = self._preprocess_pose(frame, [x1, y1, x2, y2])
+                except Exception as e:
+                    if self.verbose:
+                        print(f"Preprocessing error: {e}")
+                        import traceback
+
+                        traceback.print_exc()
+                    return None
+
+                try:
+                    pose_outputs_any = self.pose_session.run(
+                        self.pose_output_names,
+                        {self.pose_input_name: pose_input},
+                    )
+                    # ONNX Runtime returns tuple[Any, ...], but we know these are ndarrays
+                    pose_outputs: tuple[np.ndarray, ...] = tuple(
+                        o if isinstance(o, np.ndarray) else np.array(o) for o in pose_outputs_any
+                    )
+                except Exception as e:
+                    if self.verbose:
+                        print(f"ONNX inference error: {e}")
+                        import traceback
+
+                        traceback.print_exc()
+                    return None
+
+            # Extract landmarks from output
+            with self.timer.measure("landmark_extraction"):
+                landmarks = self._extract_landmarks_from_output(
+                    pose_outputs, [x1, y1, x2, y2], width, height
+                )
+
+            return landmarks
+
+        except Exception as e:
+            if self.verbose:
+                print(f"Optimized CPU tracker error: {e}")
+            return None
+
+    def _extract_landmarks_from_output(
+        self,
+        outputs: tuple[np.ndarray, ...] | Sequence[np.ndarray],
+        bbox: list,
+        img_width: int,
+        img_height: int,
+    ) -> dict[str, tuple[float, float, float]]:
+        """Extract landmarks from RTMPose SIMCC output.
+
+        RTMPose uses SIMCC (SimCC-based coordinate representation) which outputs
+        dual 1-D heatmaps for horizontal and vertical coordinates.
+
+        Args:
+            outputs: ONNX model outputs [simcc_x, simcc_y]
+                - simcc_x: (1, 26, Wx) horizontal heatmap
+                - simcc_y: (1, 26, Wy) vertical heatmap
+            bbox: Bounding box [x1, y1, x2, y2]
+            img_width: Original image width
+            img_height: Original image height
+
+        Returns:
+            Dictionary mapping landmark names to normalized (x, y, visibility) tuples
+        """
+
+        # Extract SIMCC outputs
+        simcc_x = outputs[0]  # Expected: (1, 26, Wx)
+        simcc_y = outputs[1]  # Expected: (1, 26, Wy)
+
+        if self.verbose:
+            print(f"DEBUG: simcc_x shape = {simcc_x.shape}")
+            print(f"DEBUG: simcc_y shape = {simcc_y.shape}")
+
+        # Decode SIMCC to get keypoint locations - MATCHING RTMLIB EXACTLY
+        # simcc_x shape: (1, 26, wx) where wx = input_width * 2
+        # simcc_y shape: (1, 26, wy) where wy = input_height * 2
+        n, k, _wx = simcc_x.shape
+
+        # CRITICAL: Reshape to (n*k, -1) before argmax like rtmlib does
+        simcc_x_flat = simcc_x.reshape(n * k, -1)
+        simcc_y_flat = simcc_y.reshape(n * k, -1)
+
+        # Get maximum value locations (argmax along last axis of flat arrays)
+        x_locs_flat = np.argmax(simcc_x_flat, axis=1)
+        y_locs_flat = np.argmax(simcc_y_flat, axis=1)
+
+        # Get maximum values
+        max_val_x = np.amax(simcc_x_flat, axis=1)
+        max_val_y = np.amax(simcc_y_flat, axis=1)
+
+        # Combine x and y confidence (average of both axes)
+        vals_flat = 0.5 * (max_val_x + max_val_y)
+
+        # Stack locations
+        locs_flat = np.stack((x_locs_flat, y_locs_flat), axis=-1).astype(np.float32)
+
+        # CRITICAL: Mask invalid locations (where confidence <= 0)
+        locs_flat[vals_flat <= 0.0] = -1
+
+        # Reshape back to (n, k, 2) and (n, k)
+        locs = locs_flat.reshape(n, k, 2)
+        scores = vals_flat.reshape(n, k)
+
+        # Extract first person's keypoints
+        keypoints = locs[0]  # (26, 2)
+        confidences = scores[0]  # (26,)
+
+        # SIMCC split ratio (default is 2.0 - resolution is 2x input size)
+        simcc_split_ratio = 2.0
+        keypoints = keypoints / simcc_split_ratio  # Now in model input size coordinates
+
+        # Convert from model input size back to original image coordinates
+        # We need the bbox center and scale for rescaling
+        x1, y1, x2, y2 = bbox
+
+        # Calculate center and scale (from bbox_xyxy2cs with padding=1.25)
+        center = np.array([x1 + x2, y1 + y2]) * 0.5
+        scale = np.array([x2 - x1, y2 - y1]) * 1.25
+
+        # CRITICAL: Apply the same aspect ratio adjustment used in _preprocess_pose!
+        # This matches rtmlib's top_down_affine logic
+        model_h, model_w = self.pose_input_size
+        aspect_ratio = model_w / model_h
+
+        b_w = scale[0]
+        b_h = scale[1]
+
+        # Reshape to maintain aspect ratio (must match _preprocess_pose)
+        if b_w > b_h * aspect_ratio:
+            scale = np.array([b_w, b_w / aspect_ratio], dtype=np.float32)
+        else:
+            scale = np.array([b_h * aspect_ratio, b_h], dtype=np.float32)
+
+        # Rescale keypoints to original image
+        # Following RTMPose.postprocess() logic:
+        # keypoints = keypoints / model_input_size * scale
+        # keypoints = keypoints + center - scale / 2
+        model_h, model_w = self.pose_input_size
+        keypoints = keypoints / np.array([model_w, model_h]) * scale
+        keypoints = keypoints + center - scale / 2
+
+        # Build landmark dictionary with Halpe-26 to kinemotion mapping
+        landmarks = {}
+
+        for halpe_idx, name in HALPE_TO_KINEMOTION.items():
+            if halpe_idx >= keypoints.shape[0]:
+                continue
+
+            x_pixel, y_pixel = keypoints[halpe_idx]
+            confidence = float(confidences[halpe_idx])
+
+            # Normalize to [0, 1] like MediaPipe
+            x_norm = float(x_pixel / img_width)
+            y_norm = float(y_pixel / img_height)
+
+            # Clamp to valid range
+            x_norm = max(0.0, min(1.0, x_norm))
+            y_norm = max(0.0, min(1.0, y_norm))
+
+            # Use confidence as visibility (MediaPipe compatibility)
+            landmarks[name] = (x_norm, y_norm, confidence)
+
+        return landmarks
+
+    def close(self) -> None:
+        """Release resources."""
+        pass
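
The SIMCC decode in _extract_landmarks_from_output is the least familiar step in the new file, so the short sketch below replays the same argmax-based decoding on synthetic arrays (random values, not real model output) with the shapes this tracker expects for a 192x256 input and a split ratio of 2.0.

import numpy as np

# Synthetic SIMCC outputs: one detection, 26 Halpe keypoints,
# Wx = 192 * 2 = 384 horizontal bins, Wy = 256 * 2 = 512 vertical bins.
rng = np.random.default_rng(0)
simcc_x = rng.random((1, 26, 384), dtype=np.float32)
simcc_y = rng.random((1, 26, 512), dtype=np.float32)

n, k, _ = simcc_x.shape
x_flat = simcc_x.reshape(n * k, -1)
y_flat = simcc_y.reshape(n * k, -1)

# Argmax over each 1-D heatmap gives the most likely bin per keypoint and axis,
# and the peak heights serve as a per-keypoint confidence.
x_locs = np.argmax(x_flat, axis=1)
y_locs = np.argmax(y_flat, axis=1)
conf = 0.5 * (np.amax(x_flat, axis=1) + np.amax(y_flat, axis=1))

# Dividing bin indices by the split ratio maps them back to model-input pixels;
# the tracker then rescales these into original-image coordinates via the bbox.
keypoints = np.stack((x_locs, y_locs), axis=-1).astype(np.float32) / 2.0
print(keypoints.shape, conf.shape)  # (26, 2) (26,)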
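
For reviewers who want to exercise the new class, here is a minimal usage sketch. It relies only on the constructor, process_frame, and close signatures shown in the diff above; the import path and the video filename are placeholders, since the diff does not show where this module lives inside the kinemotion package.

import cv2

# Hypothetical module path -- adjust to wherever this file is installed in the package.
from kinemotion.optimized_cpu_tracker import OptimizedCPUTracker

tracker = OptimizedCPUTracker(mode="lightweight", verbose=False)
cap = cv2.VideoCapture("jump.mp4")  # placeholder input video

while True:
    ok, frame = cap.read()
    if not ok:
        break
    # Returns {landmark_name: (x, y, visibility)} with x/y normalized to [0, 1], or None.
    landmarks = tracker.process_frame(frame)
    if landmarks is not None:
        x, y, vis = landmarks["left_ankle"]
        print(f"left_ankle: ({x:.3f}, {y:.3f}) visibility={vis:.2f}")

cap.release()
tracker.close()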