PyPI - mediapipe-nightly - Versions diffs - 0.10.21.post20250114__cp39-cp39-manylinux_2_28_x86_64.whl - Mend

mediapipe-nightly 0.10.21.post20250114__cp39-cp39-manylinux_2_28_x86_64.whl

Files changed (593) hide show

mediapipe/python/solutions/face_mesh_test.py ADDED Viewed

@@ -0,0 +1,170 @@
+# Copyright 2020 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for mediapipe.python.solutions.face_mesh."""
+import os
+import tempfile  # pylint: disable=unused-import
+from typing import NamedTuple
+from absl.testing import absltest
+from absl.testing import parameterized
+import cv2
+import numpy as np
+import numpy.testing as npt
+# resources dependency
+# undeclared dependency
+from mediapipe.python.solutions import drawing_styles
+from mediapipe.python.solutions import drawing_utils as mp_drawing
+from mediapipe.python.solutions import face_mesh as mp_faces
+# NOTE(review): presumably the repository-relative test data directory; the
+# tests visible in this file build their paths from os.path.dirname(__file__)
+# and never reference this constant - confirm whether it is still needed.
+TEST_IMAGE_PATH = 'mediapipe/python/solutions/testdata'
+# Exclusive upper bound on the absolute per-coordinate difference between a
+# predicted landmark position and its expected position.
+DIFF_THRESHOLD = 5  # pixels
+# Expected [x, y] position of selected landmarks, keyed by landmark index.
+# test_face compares each entry against (landmark.x * cols, landmark.y * rows)
+# computed from the portrait test image.
+EYE_INDICES_TO_LANDMARKS = {
+    33: [345, 178],
+    7: [348, 179],
+    163: [352, 178],
+    144: [357, 179],
+    145: [365, 179],
+    153: [371, 179],
+    154: [378, 178],
+    155: [381, 177],
+    133: [383, 177],
+    246: [347, 175],
+    161: [350, 174],
+    160: [355, 172],
+    159: [362, 170],
+    158: [368, 171],
+    157: [375, 172],
+    173: [380, 175],
+    263: [467, 176],
+    249: [464, 177],
+    390: [460, 177],
+    373: [455, 178],
+    374: [448, 179],
+    380: [441, 179],
+    381: [435, 178],
+    382: [432, 177],
+    362: [430, 177],
+    466: [465, 175],
+    388: [462, 173],
+    387: [457, 171],
+    386: [450, 170],
+    385: [444, 171],
+    384: [437, 172],
+    398: [432, 175]
+}
+# Same layout as the table above (landmark index -> expected [x, y]).
+# test_face only checks these entries when refine_landmarks is True.
+IRIS_INDICES_TO_LANDMARKS = {
+    468: [362, 175],
+    469: [371, 175],
+    470: [362, 167],
+    471: [354, 175],
+    472: [363, 182],
+    473: [449, 174],
+    474: [458, 174],
+    475: [449, 167],
+    476: [440, 174],
+    477: [449, 181]
+}
+class FaceMeshTest(parameterized.TestCase):
+  """Tests FaceMesh on an invalid image, a blank image and a portrait."""
+  def _annotate(self, frame: np.ndarray, results: NamedTuple, idx: int,
+                draw_iris: bool):
+    """Draws the detected face meshes on `frame` and writes it as a PNG.
+    The image is written to the system temp directory under a name made of
+    the current test method name followed by '_frame_<idx>.png'.
+    Args:
+      frame: Image to draw on; it is modified in place.
+      results: Output of FaceMesh.process().
+      idx: Frame index used in the output file name.
+      draw_iris: Whether to also draw the iris connections.
+    """
+    # multi_face_landmarks is None when nothing was detected (see
+    # test_blank_image). Fall back to an empty tuple so the undecorated frame
+    # is still written for debugging instead of raising a TypeError here.
+    for face_landmarks in results.multi_face_landmarks or ():
+      mp_drawing.draw_landmarks(
+          frame,
+          face_landmarks,
+          mp_faces.FACEMESH_TESSELATION,
+          landmark_drawing_spec=None,
+          connection_drawing_spec=drawing_styles
+          .get_default_face_mesh_tesselation_style())
+      mp_drawing.draw_landmarks(
+          frame,
+          face_landmarks,
+          mp_faces.FACEMESH_CONTOURS,
+          landmark_drawing_spec=None,
+          connection_drawing_spec=drawing_styles
+          .get_default_face_mesh_contours_style())
+      if draw_iris:
+        mp_drawing.draw_landmarks(
+            frame,
+            face_landmarks,
+            mp_faces.FACEMESH_IRISES,
+            landmark_drawing_spec=None,
+            connection_drawing_spec=drawing_styles
+            .get_default_face_mesh_iris_connections_style())
+    path = os.path.join(tempfile.gettempdir(), self.id().split('.')[-1] +
+                                              '_frame_{}.png'.format(idx))
+    cv2.imwrite(path, frame)
+  def _assert_landmarks_close(self, predicted, expected_by_index):
+    """Asserts each expected landmark is within DIFF_THRESHOLD of `predicted`.
+    Args:
+      predicted: Array of per-landmark (x, y) positions, indexed by landmark
+        index.
+      expected_by_index: Mapping from landmark index to its expected [x, y].
+    """
+    for landmark_idx, expected_xy in expected_by_index.items():
+      prediction_error = np.abs(
+          np.asarray(predicted[landmark_idx]) - np.asarray(expected_xy))
+      npt.assert_array_less(prediction_error, DIFF_THRESHOLD)
+  def test_invalid_image_shape(self):
+    """A four-channel image must be rejected with a ValueError."""
+    with mp_faces.FaceMesh() as faces:
+      with self.assertRaisesRegex(
+          ValueError, 'Input image must contain three channel rgb data.'):
+        faces.process(np.arange(36, dtype=np.uint8).reshape(3, 3, 4))
+  def test_blank_image(self):
+    """An all-white image must yield no face landmarks."""
+    with mp_faces.FaceMesh() as faces:
+      image = np.zeros([100, 100, 3], dtype=np.uint8)
+      image.fill(255)
+      results = faces.process(image)
+      self.assertIsNone(results.multi_face_landmarks)
+  @parameterized.named_parameters(
+      ('static_image_mode_no_attention', True, False, 5),
+      ('static_image_mode_with_attention', True, True, 5),
+      ('streaming_mode_no_attention', False, False, 10),
+      ('streaming_mode_with_attention', False, True, 10))
+  def test_face(self, static_image_mode: bool, refine_landmarks: bool,
+                num_frames: int):
+    """Runs FaceMesh repeatedly on the portrait and checks eye/iris positions.
+    Args:
+      static_image_mode: Forwarded to FaceMesh.
+      refine_landmarks: Forwarded to FaceMesh; also enables the iris checks.
+      num_frames: Number of times the same image is processed.
+    """
+    image_path = os.path.join(os.path.dirname(__file__),
+                              'testdata/portrait.jpg')
+    image = cv2.imread(image_path)
+    # cv2.imread signals failure by returning None rather than raising; fail
+    # with the offending path instead of an AttributeError on `.shape`.
+    self.assertIsNotNone(image, 'Failed to read test image: ' + image_path)
+    rows, cols, _ = image.shape
+    expected_num_landmarks = (
+        mp_faces.FACEMESH_NUM_LANDMARKS_WITH_IRISES
+        if refine_landmarks else mp_faces.FACEMESH_NUM_LANDMARKS)
+    with mp_faces.FaceMesh(
+        static_image_mode=static_image_mode,
+        refine_landmarks=refine_landmarks,
+        min_detection_confidence=0.5) as faces:
+      for idx in range(num_frames):
+        results = faces.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+        # Dump the frame first so it is available even when a check fails.
+        self._annotate(image.copy(), results, idx, refine_landmarks)
+        self.assertIsNotNone(
+            results.multi_face_landmarks,
+            'No face detected in frame {}'.format(idx))
+        multi_face_landmarks = []
+        for landmarks in results.multi_face_landmarks:
+          self.assertLen(landmarks.landmark, expected_num_landmarks)
+          x = [landmark.x * cols for landmark in landmarks.landmark]
+          y = [landmark.y * rows for landmark in landmarks.landmark]
+          multi_face_landmarks.append(np.column_stack((x, y)))
+        self.assertLen(multi_face_landmarks, 1)
+        # Verify the eye landmarks are correct as sanity check.
+        self._assert_landmarks_close(multi_face_landmarks[0],
+                                     EYE_INDICES_TO_LANDMARKS)
+        if refine_landmarks:
+          self._assert_landmarks_close(multi_face_landmarks[0],
+                                       IRIS_INDICES_TO_LANDMARKS)
+# When executed as a script, hand control to absl's test runner.
+if __name__ == '__main__':
+  absltest.main()

mediapipe/python/solutions/hands.py ADDED Viewed

@@ -0,0 +1,153 @@
+# Copyright 2020 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MediaPipe Hands."""
+import enum
+from typing import NamedTuple
+import numpy as np
+# pylint: disable=unused-import
+from mediapipe.calculators.core import constant_side_packet_calculator_pb2
+from mediapipe.calculators.core import gate_calculator_pb2
+from mediapipe.calculators.core import split_vector_calculator_pb2
+from mediapipe.calculators.tensor import image_to_tensor_calculator_pb2
+from mediapipe.calculators.tensor import inference_calculator_pb2
+from mediapipe.calculators.tensor import tensors_to_classification_calculator_pb2
+from mediapipe.calculators.tensor import tensors_to_detections_calculator_pb2
+from mediapipe.calculators.tensor import tensors_to_landmarks_calculator_pb2
+from mediapipe.calculators.tflite import ssd_anchors_calculator_pb2
+from mediapipe.calculators.util import association_calculator_pb2
+from mediapipe.calculators.util import detections_to_rects_calculator_pb2
+from mediapipe.calculators.util import logic_calculator_pb2
+from mediapipe.calculators.util import non_max_suppression_calculator_pb2
+from mediapipe.calculators.util import rect_transformation_calculator_pb2
+from mediapipe.calculators.util import thresholding_calculator_pb2
+# pylint: enable=unused-import
+from mediapipe.python.solution_base import SolutionBase
+# pylint: disable=unused-import
+from mediapipe.python.solutions.hands_connections import HAND_CONNECTIONS
+# pylint: enable=unused-import
+# Integer identifiers for the hand landmarks, numbered consecutively from
+# 0 (WRIST) to 20 (PINKY_TIP); each finger occupies four consecutive values.
+# NOTE(review): CMC/MCP/IP/PIP/DIP presumably abbreviate the anatomical joint
+# names - confirm against the model documentation.
+class HandLandmark(enum.IntEnum):
+  """The 21 hand landmarks."""
+  WRIST = 0
+  # Thumb: 1-4.
+  THUMB_CMC = 1
+  THUMB_MCP = 2
+  THUMB_IP = 3
+  THUMB_TIP = 4
+  # Index finger: 5-8.
+  INDEX_FINGER_MCP = 5
+  INDEX_FINGER_PIP = 6
+  INDEX_FINGER_DIP = 7
+  INDEX_FINGER_TIP = 8
+  # Middle finger: 9-12.
+  MIDDLE_FINGER_MCP = 9
+  MIDDLE_FINGER_PIP = 10
+  MIDDLE_FINGER_DIP = 11
+  MIDDLE_FINGER_TIP = 12
+  # Ring finger: 13-16.
+  RING_FINGER_MCP = 13
+  RING_FINGER_PIP = 14
+  RING_FINGER_DIP = 15
+  RING_FINGER_TIP = 16
+  # Pinky: 17-20.
+  PINKY_MCP = 17
+  PINKY_PIP = 18
+  PINKY_DIP = 19
+  PINKY_TIP = 20
+# Path of the serialized graph handed to SolutionBase as `binary_graph_path`.
+_BINARYPB_FILE_PATH = 'mediapipe/modules/hand_landmark/hand_landmark_tracking_cpu.binarypb'
+class Hands(SolutionBase):
+  """MediaPipe Hands solution.
+  Takes an RGB image and reports, for every detected hand, its landmarks and
+  its handedness (left v.s. right hand).
+  Handedness is determined under the assumption that the input image is
+  mirrored, i.e., taken with a front-facing/selfie camera (
+  https://en.wikipedia.org/wiki/Front-facing_camera) with images flipped
+  horizontally. When that does not hold, flip the image first, for instance
+  with cv2.flip(image, 1), to obtain a correct handedness output.
+  Usage examples are available at
+  https://solutions.mediapipe.dev/hands#python-solution-api.
+  """
+  def __init__(self,
+               static_image_mode=False,
+               max_num_hands=2,
+               model_complexity=1,
+               min_detection_confidence=0.5,
+               min_tracking_confidence=0.5):
+    """Creates a MediaPipe Hands object.
+    Args:
+      static_image_mode: Whether the inputs are a batch of static, possibly
+        unrelated images rather than a video stream. See details in
+        https://solutions.mediapipe.dev/hands#static_image_mode.
+      max_num_hands: Maximum number of hands to detect. See details in
+        https://solutions.mediapipe.dev/hands#max_num_hands.
+      model_complexity: Complexity of the hand landmark model: 0 or 1.
+        Landmark accuracy as well as inference latency generally go up with the
+        model complexity. See details in
+        https://solutions.mediapipe.dev/hands#model_complexity.
+      min_detection_confidence: Minimum confidence value ([0.0, 1.0]) for a
+        hand detection to count as successful. See details in
+        https://solutions.mediapipe.dev/hands#min_detection_confidence.
+      min_tracking_confidence: Minimum confidence value ([0.0, 1.0]) for the
+        hand landmarks to count as successfully tracked. See details in
+        https://solutions.mediapipe.dev/hands#min_tracking_confidence.
+    """
+    # Side packets for the graph. Previous-frame landmarks are only reused
+    # when the input is not treated as independent static images.
+    graph_side_inputs = {
+        'model_complexity': model_complexity,
+        'num_hands': max_num_hands,
+        'use_prev_landmarks': not static_image_mode,
+    }
+    # The two confidence arguments are forwarded as calculator options.
+    graph_calculator_params = {
+        'palmdetectioncpu__TensorsToDetectionsCalculator.min_score_thresh':
+            min_detection_confidence,
+        'handlandmarkcpu__ThresholdingCalculator.threshold':
+            min_tracking_confidence,
+    }
+    # Output streams that become the fields of the process() result.
+    graph_outputs = [
+        'multi_hand_landmarks',
+        'multi_hand_world_landmarks',
+        'multi_handedness',
+    ]
+    super().__init__(
+        binary_graph_path=_BINARYPB_FILE_PATH,
+        side_inputs=graph_side_inputs,
+        calculator_params=graph_calculator_params,
+        outputs=graph_outputs)
+  def process(self, image: np.ndarray) -> NamedTuple:
+    """Runs hand tracking on one RGB image.
+    Args:
+      image: An RGB image represented as a numpy ndarray.
+    Raises:
+      RuntimeError: If the underlying graph throws any error.
+      ValueError: If the input image is not three channel RGB.
+    Returns:
+      A NamedTuple object with the following fields:
+        1) "multi_hand_landmarks": the hand landmarks on each detected hand.
+        2) "multi_hand_world_landmarks": the hand landmarks on each detected
+           hand in real-world 3D coordinates that are in meters with the
+           origin at the hand's approximate geometric center.
+        3) "multi_handedness": the handedness (left v.s. right hand) of the
+           detected hand.
+    """
+    graph_inputs = {'image': image}
+    return super().process(input_data=graph_inputs)

mediapipe/python/solutions/hands_connections.py ADDED Viewed

@@ -0,0 +1,32 @@
+# Copyright 2021 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MediaPipe Hands connections."""
+# Each group below is a tuple of (start, end) integer pairs.
+# NOTE(review): the integers presumably index the 21 hand landmarks - confirm.
+HAND_PALM_CONNECTIONS = (
+    (0, 1),
+    (0, 5),
+    (9, 13),
+    (13, 17),
+    (5, 9),
+    (0, 17),
+)
+HAND_THUMB_CONNECTIONS = (
+    (1, 2),
+    (2, 3),
+    (3, 4),
+)
+HAND_INDEX_FINGER_CONNECTIONS = (
+    (5, 6),
+    (6, 7),
+    (7, 8),
+)
+HAND_MIDDLE_FINGER_CONNECTIONS = (
+    (9, 10),
+    (10, 11),
+    (11, 12),
+)
+HAND_RING_FINGER_CONNECTIONS = (
+    (13, 14),
+    (14, 15),
+    (15, 16),
+)
+HAND_PINKY_FINGER_CONNECTIONS = (
+    (17, 18),
+    (18, 19),
+    (19, 20),
+)
+# Union of every group above as one immutable set of pairs.
+HAND_CONNECTIONS = frozenset(
+    HAND_PALM_CONNECTIONS
+    + HAND_THUMB_CONNECTIONS
+    + HAND_INDEX_FINGER_CONNECTIONS
+    + HAND_MIDDLE_FINGER_CONNECTIONS
+    + HAND_RING_FINGER_CONNECTIONS
+    + HAND_PINKY_FINGER_CONNECTIONS
+)

mediapipe/python/solutions/hands_test.py ADDED Viewed

@@ -0,0 +1,219 @@
+# Copyright 2020 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for mediapipe.python.solutions.hands."""
+import json
+import os
+import tempfile  # pylint: disable=unused-import
+from typing import NamedTuple
+from absl.testing import absltest
+from absl.testing import parameterized
+import cv2
+import numpy as np
+import numpy.testing as npt
+# resources dependency
+# undeclared dependency
+from mediapipe.python.solutions import drawing_styles
+from mediapipe.python.solutions import drawing_utils as mp_drawing
+from mediapipe.python.solutions import hands as mp_hands
+# NOTE(review): presumably the repository-relative test data directory; the
+# tests visible in this file build their paths from os.path.dirname(__file__)
+# and never reference this constant - confirm whether it is still needed.
+TEST_IMAGE_PATH = 'mediapipe/python/solutions/testdata'
+# Exclusive upper bounds on the absolute per-coordinate prediction error in
+# test_multi_hands: the first applies when model_complexity == 0, the second
+# otherwise.
+LITE_MODEL_DIFF_THRESHOLD = 25  # pixels
+FULL_MODEL_DIFF_THRESHOLD = 20  # pixels
+# Expected [x, y] positions for two hands with 21 landmarks each. In
+# test_multi_hands they are compared element-wise against
+# (landmark.x * cols, landmark.y * rows) of the hands in the order returned
+# by the model.
+EXPECTED_HAND_COORDINATES_PREDICTION = [[[580, 34], [504, 50], [459, 94],
+                                         [429, 146], [397, 182], [507, 167],
+                                         [479, 245], [469, 292], [464, 330],
+                                         [545, 180], [534, 265], [533, 319],
+                                         [536, 360], [581, 172], [587, 252],
+                                         [593, 304], [599, 346], [615, 168],
+                                         [628, 223], [638, 258], [648, 288]],
+                                        [[138, 343], [211, 330], [257, 286],
+                                         [289, 237], [322, 203], [219, 216],
+                                         [238, 138], [249, 90], [253, 51],
+                                         [177, 204], [184, 115], [187, 60],
+                                         [185, 19], [138, 208], [131, 127],
+                                         [124, 77], [117, 36], [106, 222],
+                                         [92, 159], [79, 124], [68, 93]]]
+class HandsTest(parameterized.TestCase):
+  """Tests Hands on invalid, blank, still-image and video inputs."""
+  def _get_output_path(self, name):
+    """Returns a temp-dir path: current test method name followed by `name`."""
+    return os.path.join(tempfile.gettempdir(), self.id().split('.')[-1] + name)
+  def _landmarks_list_to_array(self, landmark_list, image_shape):
+    """Returns the landmarks as an array of image-scaled (x, y, z) rows.
+    x and z are multiplied by the image width, y by the image height.
+    """
+    rows, cols, _ = image_shape
+    return np.asarray([(lmk.x * cols, lmk.y * rows, lmk.z * cols)
+                       for lmk in landmark_list.landmark])
+  def _world_landmarks_list_to_array(self, landmark_list):
+    """Returns the landmarks as an array of unscaled (x, y, z) rows."""
+    return np.asarray([(lmk.x, lmk.y, lmk.z)
+                       for lmk in landmark_list.landmark])
+  def _assert_diff_less(self, array1, array2, threshold):
+    """Asserts that |array1 - array2| is element-wise below `threshold`."""
+    npt.assert_array_less(np.abs(array1 - array2), threshold)
+  def _annotate(self, frame: np.ndarray, results: NamedTuple, idx: int):
+    """Draws the detected hands on `frame` and writes it as a PNG.
+    Args:
+      frame: Image to draw on; it is modified in place.
+      results: Output of Hands.process().
+      idx: Frame index used in the output file name.
+    """
+    # multi_hand_landmarks is None when nothing was detected (see
+    # test_blank_image). Fall back to an empty tuple so the undecorated frame
+    # is still written for debugging instead of raising a TypeError here.
+    for hand_landmarks in results.multi_hand_landmarks or ():
+      mp_drawing.draw_landmarks(
+          frame, hand_landmarks, mp_hands.HAND_CONNECTIONS,
+          drawing_styles.get_default_hand_landmarks_style(),
+          drawing_styles.get_default_hand_connections_style())
+    cv2.imwrite(self._get_output_path('_frame_{}.png'.format(idx)), frame)
+  def test_invalid_image_shape(self):
+    """A four-channel image must be rejected with a ValueError."""
+    with mp_hands.Hands() as hands:
+      with self.assertRaisesRegex(
+          ValueError, 'Input image must contain three channel rgb data.'):
+        hands.process(np.arange(36, dtype=np.uint8).reshape(3, 3, 4))
+  def test_blank_image(self):
+    """An all-white image must yield neither landmarks nor handedness."""
+    with mp_hands.Hands() as hands:
+      image = np.zeros([100, 100, 3], dtype=np.uint8)
+      image.fill(255)
+      results = hands.process(image)
+      self.assertIsNone(results.multi_hand_landmarks)
+      self.assertIsNone(results.multi_handedness)
+  @parameterized.named_parameters(
+      ('static_image_mode_with_lite_model', True, 0, 5),
+      ('video_mode_with_lite_model', False, 0, 10),
+      ('static_image_mode_with_full_model', True, 1, 5),
+      ('video_mode_with_full_model', False, 1, 10))
+  def test_multi_hands(self, static_image_mode, model_complexity, num_frames):
+    """Processes the two-hand image repeatedly and checks landmark positions.
+    Args:
+      static_image_mode: Forwarded to Hands.
+      model_complexity: Forwarded to Hands; 0 selects the lite threshold.
+      num_frames: Number of times the same image is processed.
+    """
+    image_path = os.path.join(os.path.dirname(__file__), 'testdata/hands.jpg')
+    image = cv2.imread(image_path)
+    # cv2.imread signals failure by returning None rather than raising; fail
+    # with the offending path instead of an AttributeError on `.shape`.
+    self.assertIsNotNone(image, 'Failed to read test image: ' + image_path)
+    rows, cols, _ = image.shape
+    diff_threshold = (
+        LITE_MODEL_DIFF_THRESHOLD
+        if model_complexity == 0 else FULL_MODEL_DIFF_THRESHOLD)
+    with mp_hands.Hands(
+        static_image_mode=static_image_mode,
+        max_num_hands=2,
+        model_complexity=model_complexity,
+        min_detection_confidence=0.5) as hands:
+      for idx in range(num_frames):
+        results = hands.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
+        # Dump the frame first so it is available even when a check fails.
+        self._annotate(image.copy(), results, idx)
+        self.assertIsNotNone(
+            results.multi_handedness,
+            'No handedness returned for frame {}'.format(idx))
+        self.assertIsNotNone(
+            results.multi_hand_landmarks,
+            'No hand landmarks returned for frame {}'.format(idx))
+        handedness = [
+            hand.classification[0].label for hand in results.multi_handedness
+        ]
+        multi_hand_coordinates = []
+        for landmarks in results.multi_hand_landmarks:
+          self.assertLen(landmarks.landmark, 21)
+          x = [landmark.x * cols for landmark in landmarks.landmark]
+          y = [landmark.y * rows for landmark in landmarks.landmark]
+          multi_hand_coordinates.append(np.column_stack((x, y)))
+        self.assertLen(handedness, 2)
+        self.assertLen(multi_hand_coordinates, 2)
+        prediction_error = np.abs(
+            np.asarray(multi_hand_coordinates) -
+            np.asarray(EXPECTED_HAND_COORDINATES_PREDICTION))
+        npt.assert_array_less(prediction_error, diff_threshold)
+  def _process_video(self, model_complexity, video_path,
+                     max_num_hands=1,
+                     num_landmarks=21,
+                     num_dimensions=3):
+    """Runs hand tracking on every frame of a video.
+    Args:
+      model_complexity: Forwarded to Hands.
+      video_path: Path of the video file to read.
+      max_num_hands: Forwarded to Hands; also the per-frame array size.
+      num_landmarks: Number of landmarks stored per hand.
+      num_dimensions: Number of coordinates stored per landmark.
+    Returns:
+      A (landmarks, world_landmarks) pair of arrays, each holding one
+      [max_num_hands, num_landmarks, num_dimensions] entry per frame. Slots
+      for which no hand was returned stay NaN.
+    """
+    frame_array_shape = [max_num_hands, num_landmarks, num_dimensions]
+    landmarks_per_frame = []
+    w_landmarks_per_frame = []
+    video_cap = cv2.VideoCapture(video_path)
+    try:
+      # An unopened capture would otherwise just yield zero frames and show up
+      # later as a confusing shape mismatch.
+      self.assertTrue(video_cap.isOpened(),
+                      'Failed to open test video: ' + video_path)
+      with mp_hands.Hands(
+          static_image_mode=False,
+          max_num_hands=max_num_hands,
+          model_complexity=model_complexity,
+          min_detection_confidence=0.5) as hands:
+        while True:
+          # Get next frame of the video.
+          success, input_frame = video_cap.read()
+          if not success:
+            break
+          # Run hand tracker.
+          input_frame = cv2.cvtColor(input_frame, cv2.COLOR_BGR2RGB)
+          frame_shape = input_frame.shape
+          result = hands.process(image=input_frame)
+          frame_landmarks = np.full(frame_array_shape, np.nan)
+          frame_w_landmarks = np.full(frame_array_shape, np.nan)
+          if result.multi_hand_landmarks:
+            for idx, landmarks in enumerate(result.multi_hand_landmarks):
+              frame_landmarks[idx] = self._landmarks_list_to_array(
+                  landmarks, frame_shape)
+          if result.multi_hand_world_landmarks:
+            for idx, w_landmarks in enumerate(
+                result.multi_hand_world_landmarks):
+              frame_w_landmarks[idx] = self._world_landmarks_list_to_array(
+                  w_landmarks)
+          landmarks_per_frame.append(frame_landmarks)
+          w_landmarks_per_frame.append(frame_w_landmarks)
+    finally:
+      # Always free the capture, including when a frame fails to process.
+      video_cap.release()
+    return (np.array(landmarks_per_frame), np.array(w_landmarks_per_frame))
+  @parameterized.named_parameters(
+      ('full', 1, 'asl_hand.full.npz'))
+  def test_on_video(self, model_complexity, expected_name):
+    """Tests hand models on a video.
+    The actual predictions are also dumped as .npz and .json files in the
+    temp directory before being compared with the stored expectations.
+    Args:
+      model_complexity: Forwarded to Hands.
+      expected_name: File name, under testdata/, of the expected .npz.
+    """
+    video_path = os.path.join(os.path.dirname(__file__),
+                              'testdata/asl_hand.25fps.mp4')
+    expected_path = os.path.join(os.path.dirname(__file__),
+                                 'testdata/{}'.format(expected_name))
+    actual, actual_world = self._process_video(model_complexity, video_path)
+    # Dump actual .npz.
+    npz_path = self._get_output_path(expected_name)
+    np.savez(npz_path, predictions=actual, w_predictions=actual_world)
+    # Dump actual JSON.
+    json_path = self._get_output_path(expected_name.replace('.npz', '.json'))
+    dump_data = {
+        'predictions': np.around(actual, 3).tolist(),
+        'predictions_world': np.around(actual_world, 3).tolist(),
+    }
+    with open(json_path, 'w') as fl:
+      json.dump(dump_data, fl, indent=2, separators=(',', ': '))
+    # Load both expected arrays in one pass and close the archive afterwards.
+    with np.load(expected_path) as expected_data:
+      expected = expected_data['predictions']
+      expected_world = expected_data['w_predictions']
+    # Validate actual vs. expected landmarks. assertEqual is used instead of a
+    # bare assert so the check also runs under `python -O`.
+    self.assertEqual(
+        actual.shape, expected.shape,
+        'Unexpected shape of predictions: {} instead of {}'.format(
+            actual.shape, expected.shape))
+    # large values, use relative tolerance for testing.
+    npt.assert_allclose(actual[..., :2], expected[..., :2], rtol=0.1)
+    # Validate actual vs. expected world landmarks.
+    self.assertEqual(
+        actual_world.shape, expected_world.shape,
+        'Unexpected shape of world predictions: {} instead of {}'.format(
+            actual_world.shape, expected_world.shape))
+    # small values, use absolute tolerance for testing.
+    npt.assert_array_almost_equal(actual_world, expected_world, decimal=1)
+# When executed as a script, hand control to absl's test runner.
+if __name__ == '__main__':
+  absltest.main()