PyPI - mediapipe-nightly - Versions diffs - 0.10.21.post20250114__cp311-cp311-manylinux_2_28_x86_64.whl - Mend

mediapipe-nightly 0.10.21.post20250114__cp311-cp311-manylinux_2_28_x86_64.whl

Files changed (593) hide show

mediapipe/tasks/python/__init__.py ADDED Viewed

@@ -0,0 +1,27 @@
+# Copyright 2022 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MediaPipe Tasks API."""
+from . import audio
+from . import components
+from . import core
+from . import genai
+from . import text
+from . import vision
+BaseOptions = core.base_options.BaseOptions
+# Remove unnecessary modules to avoid duplication in API docs.
+del core

mediapipe/tasks/python/audio/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+# Copyright 2022 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MediaPipe Tasks Audio API."""
+import mediapipe.tasks.python.audio.core
+import mediapipe.tasks.python.audio.audio_classifier
+import mediapipe.tasks.python.audio.audio_embedder
+AudioClassifier = audio_classifier.AudioClassifier
+AudioClassifierOptions = audio_classifier.AudioClassifierOptions
+AudioClassifierResult = audio_classifier.AudioClassifierResult
+AudioEmbedder = audio_embedder.AudioEmbedder
+AudioEmbedderOptions = audio_embedder.AudioEmbedderOptions
+AudioEmbedderResult = audio_embedder.AudioEmbedderResult
+RunningMode = core.audio_task_running_mode.AudioTaskRunningMode
+# Remove unnecessary modules to avoid duplication in API docs.
+del audio_classifier
+del audio_embedder
+del core
+del mediapipe

mediapipe/tasks/python/audio/audio_classifier.py ADDED Viewed

@@ -0,0 +1,324 @@
+# Copyright 2022 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MediaPipe audio classifier task."""
+import dataclasses
+from typing import Callable, Mapping, List, Optional
+from mediapipe.python import packet_creator
+from mediapipe.python import packet_getter
+from mediapipe.python._framework_bindings import packet
+from mediapipe.tasks.cc.audio.audio_classifier.proto import audio_classifier_graph_options_pb2
+from mediapipe.tasks.cc.components.containers.proto import classifications_pb2
+from mediapipe.tasks.cc.components.processors.proto import classifier_options_pb2
+from mediapipe.tasks.python.audio.core import audio_task_running_mode as running_mode_module
+from mediapipe.tasks.python.audio.core import base_audio_task_api
+from mediapipe.tasks.python.components.containers import audio_data as audio_data_module
+from mediapipe.tasks.python.components.containers import classification_result as classification_result_module
+from mediapipe.tasks.python.core import base_options as base_options_module
+from mediapipe.tasks.python.core import task_info as task_info_module
+from mediapipe.tasks.python.core.optional_dependencies import doc_controls
+AudioClassifierResult = classification_result_module.ClassificationResult
+_AudioClassifierGraphOptionsProto = audio_classifier_graph_options_pb2.AudioClassifierGraphOptions
+_AudioData = audio_data_module.AudioData
+_BaseOptions = base_options_module.BaseOptions
+_ClassifierOptionsProto = classifier_options_pb2.ClassifierOptions
+_RunningMode = running_mode_module.AudioTaskRunningMode
+_TaskInfo = task_info_module.TaskInfo
+_AUDIO_IN_STREAM_NAME = 'audio_in'
+_AUDIO_TAG = 'AUDIO'
+_CLASSIFICATIONS_STREAM_NAME = 'classifications_out'
+_CLASSIFICATIONS_TAG = 'CLASSIFICATIONS'
+_SAMPLE_RATE_IN_STREAM_NAME = 'sample_rate_in'
+_SAMPLE_RATE_TAG = 'SAMPLE_RATE'
+_TASK_GRAPH_NAME = 'mediapipe.tasks.audio.audio_classifier.AudioClassifierGraph'
+_TIMESTAMPED_CLASSIFICATIONS_STREAM_NAME = 'timestamped_classifications_out'
+_TIMESTAMPED_CLASSIFICATIONS_TAG = 'TIMESTAMPED_CLASSIFICATIONS'
+_MICRO_SECONDS_PER_MILLISECOND = 1000
+@dataclasses.dataclass
+class AudioClassifierOptions:
+  """Options for the audio classifier task.
+  Attributes:
+    base_options: Base options for the audio classifier task.
+    running_mode: The running mode of the task. Defaults to the audio clips
+      mode. The audio classifier task has two running modes: 1) The audio clips
+      mode for running classification on independent audio clips. 2) The audio
+      stream mode for running classification on an audio stream, such as from
+      a microphone. In this mode, the "result_callback" below must be specified
+      to receive the classification results asynchronously.
+    display_names_locale: The locale to use for display names specified through
+      the TFLite Model Metadata.
+    max_results: The maximum number of top-scored classification results to
+      return.
+    score_threshold: Overrides the ones provided in the model metadata. Results
+      below this value are rejected.
+    category_allowlist: Allowlist of category names. If non-empty,
+      classification results whose category name is not in this set will be
+      filtered out. Duplicate or unknown category names are ignored. Mutually
+      exclusive with `category_denylist`.
+    category_denylist: Denylist of category names. If non-empty, classification
+      results whose category name is in this set will be filtered out. Duplicate
+      or unknown category names are ignored. Mutually exclusive with
+      `category_allowlist`.
+    result_callback: The user-defined result callback for processing audio
+      stream data. The result callback should only be specified when the running
+      mode is set to the audio stream mode.
+  """
+  base_options: _BaseOptions
+  running_mode: _RunningMode = _RunningMode.AUDIO_CLIPS
+  display_names_locale: Optional[str] = None
+  max_results: Optional[int] = None
+  score_threshold: Optional[float] = None
+  category_allowlist: Optional[List[str]] = None
+  category_denylist: Optional[List[str]] = None
+  result_callback: Optional[Callable[[AudioClassifierResult, int], None]] = None
+  @doc_controls.do_not_generate_docs
+  def to_pb2(self) -> _AudioClassifierGraphOptionsProto:
+    """Generates an AudioClassifierOptions protobuf object."""
+    base_options_proto = self.base_options.to_pb2()
+    base_options_proto.use_stream_mode = False if self.running_mode == _RunningMode.AUDIO_CLIPS else True
+    classifier_options_proto = _ClassifierOptionsProto(
+        score_threshold=self.score_threshold,
+        category_allowlist=self.category_allowlist,
+        category_denylist=self.category_denylist,
+        display_names_locale=self.display_names_locale,
+        max_results=self.max_results)
+    return _AudioClassifierGraphOptionsProto(
+        base_options=base_options_proto,
+        classifier_options=classifier_options_proto)
+class AudioClassifier(base_audio_task_api.BaseAudioTaskApi):
+  """Class that performs audio classification on audio data.
+  This API expects a TFLite model with mandatory TFLite Model Metadata that
+  contains the mandatory AudioProperties of the solo input audio tensor and the
+  optional (but recommended) category labels as AssociatedFiles with type
+  TENSOR_AXIS_LABELS per output classification tensor.
+  Input tensor:
+    (kTfLiteFloat32)
+    - input audio buffer of size `[batch * samples]`.
+    - batch inference is not supported (`batch` is required to be 1).
+    - for multi-channel models, the channels must be interleaved.
+  At least one output tensor with:
+    (kTfLiteFloat32)
+    - `[1 x N]` array with `N` represents the number of categories.
+    - optional (but recommended) category labels as AssociatedFiles with type
+      TENSOR_AXIS_LABELS, containing one label per line. The first such
+      AssociatedFile (if any) is used to fill the `category_name` field of the
+      results. The `display_name` field is filled from the AssociatedFile (if
+      any) whose locale matches the `display_names_locale` field of the
+      `AudioClassifierOptions` used at creation time ("en" by default, i.e.
+      English). If none of these are available, only the `index` field of the
+      results will be filled.
+  """
+  @classmethod
+  def create_from_model_path(cls, model_path: str) -> 'AudioClassifier':
+    """Creates an `AudioClassifier` object from a TensorFlow Lite model and the default `AudioClassifierOptions`.
+    Note that the created `AudioClassifier` instance is in audio clips mode, for
+    classifying on independent audio clips.
+    Args:
+      model_path: Path to the model.
+    Returns:
+      `AudioClassifier` object that's created from the model file and the
+      default `AudioClassifierOptions`.
+    Raises:
+      ValueError: If failed to create `AudioClassifier` object from the provided
+        file such as invalid file path.
+      RuntimeError: If other types of error occurred.
+    """
+    base_options = _BaseOptions(model_asset_path=model_path)
+    options = AudioClassifierOptions(
+        base_options=base_options, running_mode=_RunningMode.AUDIO_CLIPS)
+    return cls.create_from_options(options)
+  @classmethod
+  def create_from_options(cls,
+                          options: AudioClassifierOptions) -> 'AudioClassifier':
+    """Creates the `AudioClassifier` object from audio classifier options.
+    Args:
+      options: Options for the audio classifier task.
+    Returns:
+      `AudioClassifier` object that's created from `options`.
+    Raises:
+      ValueError: If failed to create `AudioClassifier` object from
+        `AudioClassifierOptions` such as missing the model.
+      RuntimeError: If other types of error occurred.
+    """
+    def packets_callback(output_packets: Mapping[str, packet.Packet]):
+      timestamp_ms = output_packets[
+          _CLASSIFICATIONS_STREAM_NAME].timestamp.value // _MICRO_SECONDS_PER_MILLISECOND
+      if output_packets[_CLASSIFICATIONS_STREAM_NAME].is_empty():
+        options.result_callback(
+            AudioClassifierResult(classifications=[]), timestamp_ms)
+        return
+      classification_result_proto = classifications_pb2.ClassificationResult()
+      classification_result_proto.CopyFrom(
+          packet_getter.get_proto(output_packets[_CLASSIFICATIONS_STREAM_NAME]))
+      options.result_callback(
+          AudioClassifierResult.create_from_pb2(classification_result_proto),
+          timestamp_ms)
+    task_info = _TaskInfo(
+        task_graph=_TASK_GRAPH_NAME,
+        input_streams=[
+            ':'.join([_AUDIO_TAG, _AUDIO_IN_STREAM_NAME]),
+            ':'.join([_SAMPLE_RATE_TAG, _SAMPLE_RATE_IN_STREAM_NAME])
+        ],
+        output_streams=[
+            ':'.join([_CLASSIFICATIONS_TAG, _CLASSIFICATIONS_STREAM_NAME]),
+            ':'.join([
+                _TIMESTAMPED_CLASSIFICATIONS_TAG,
+                _TIMESTAMPED_CLASSIFICATIONS_STREAM_NAME
+            ])
+        ],
+        task_options=options)
+    return cls(
+        # Audio tasks should not drop input audio due to flow limiting, which
+        # may cause data inconsistency.
+        task_info.generate_graph_config(enable_flow_limiting=False),
+        options.running_mode,
+        packets_callback if options.result_callback else None)
+  def classify(self, audio_clip: _AudioData) -> List[AudioClassifierResult]:
+    """Performs audio classification on the provided audio clip.
+    The audio clip is represented as a MediaPipe AudioData. The method accepts
+    audio clips with various length and audio sample rate. It's required to
+    provide the corresponding audio sample rate within the `AudioData` object.
+    The input audio clip may be longer than what the model is able to process
+    in a single inference. When this occurs, the input audio clip is split into
+    multiple chunks starting at different timestamps. For this reason, this
+    function returns a list of ClassificationResult objects, each associated
+    with a timestamp corresponding to the start (in milliseconds) of the chunk
+    data that was classified, e.g:
+    ClassificationResult #0 (first chunk of data):
+      timestamp_ms: 0 (starts at 0ms)
+      classifications #0 (single head model):
+        category #0:
+          category_name: "Speech"
+          score: 0.6
+        category #1:
+          category_name: "Music"
+          score: 0.2
+    ClassificationResult #1 (second chunk of data):
+      timestamp_ms: 800 (starts at 800ms)
+      classifications #0 (single head model):
+        category #0:
+          category_name: "Speech"
+          score: 0.5
+        category #1:
+          category_name: "Silence"
+          score: 0.1
+    Args:
+      audio_clip: MediaPipe AudioData.
+    Returns:
+      A list of `AudioClassifierResult` objects, each associated with a
+      timestamp corresponding to the start (in milliseconds) of the chunk data
+      that was classified.
+    Raises:
+      ValueError: If any of the input arguments is invalid, such as the sample
+        rate is not provided in the `AudioData` object.
+      RuntimeError: If audio classification failed to run.
+    """
+    if not audio_clip.audio_format.sample_rate:
+      raise ValueError('Must provide the audio sample rate in audio data.')
+    output_packets = self._process_audio_clip({
+        _AUDIO_IN_STREAM_NAME:
+            packet_creator.create_matrix(audio_clip.buffer, transpose=True),
+        _SAMPLE_RATE_IN_STREAM_NAME:
+            packet_creator.create_double(audio_clip.audio_format.sample_rate)
+    })
+    output_list = []
+    classification_result_proto_list = packet_getter.get_proto_list(
+        output_packets[_TIMESTAMPED_CLASSIFICATIONS_STREAM_NAME])
+    for proto in classification_result_proto_list:
+      classification_result_proto = classifications_pb2.ClassificationResult()
+      classification_result_proto.CopyFrom(proto)
+      output_list.append(
+          AudioClassifierResult.create_from_pb2(classification_result_proto))
+    return output_list
+  def classify_async(self, audio_block: _AudioData, timestamp_ms: int) -> None:
+    """Sends audio data (a block in a continuous audio stream) to perform audio classification.
+    Only use this method when the AudioClassifier is created with the audio
+    stream running mode. The input timestamps should be monotonically increasing
+    for adjacent calls of this method. This method will return immediately after
+    the input audio data is accepted. The results will be available via the
+    `result_callback` provided in the `AudioClassifierOptions`. The
+    `classify_async` method is designed to process audio stream data such as
+    microphone input.
+    The input audio data may be longer than what the model is able to process
+    in a single inference. When this occurs, the input audio block is split
+    into multiple chunks. For this reason, the callback may be called multiple
+    times (once per chunk) for each call to this function.
+    The `result_callback` provides:
+      - An `AudioClassifierResult` object that contains a list of
+        classifications.
+      - The input timestamp in milliseconds.
+    Args:
+      audio_block: MediaPipe AudioData.
+      timestamp_ms: The timestamp of the input audio data in milliseconds.
+    Raises:
+      ValueError: If any of the following occurs:
+        1) The sample rate is not provided in the `AudioData` object or the
+        provided sample rate is inconsistent with the previously received.
+        2) The current input timestamp is smaller than what the audio
+        classifier has already processed.
+    """
+    if not audio_block.audio_format.sample_rate:
+      raise ValueError('Must provide the audio sample rate in audio data.')
+    if not self._default_sample_rate:
+      self._default_sample_rate = audio_block.audio_format.sample_rate
+      self._set_sample_rate(_SAMPLE_RATE_IN_STREAM_NAME,
+                            self._default_sample_rate)
+    elif audio_block.audio_format.sample_rate != self._default_sample_rate:
+      raise ValueError(
+          f'The audio sample rate provided in audio data: '
+          f'{audio_block.audio_format.sample_rate} is inconsistent with '
+          f'the previously received: {self._default_sample_rate}.')
+    self._send_audio_stream_data({
+        _AUDIO_IN_STREAM_NAME:
+            packet_creator.create_matrix(audio_block.buffer, transpose=True).at(
+                timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND)
+    })

mediapipe/tasks/python/audio/audio_embedder.py ADDED Viewed

@@ -0,0 +1,285 @@
+# Copyright 2022 The MediaPipe Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MediaPipe audio embedder task."""
+import dataclasses
+from typing import Callable, Mapping, List, Optional
+from mediapipe.python import packet_creator
+from mediapipe.python import packet_getter
+from mediapipe.python._framework_bindings import packet
+from mediapipe.tasks.cc.audio.audio_embedder.proto import audio_embedder_graph_options_pb2
+from mediapipe.tasks.cc.components.containers.proto import embeddings_pb2
+from mediapipe.tasks.cc.components.processors.proto import embedder_options_pb2
+from mediapipe.tasks.python.audio.core import audio_task_running_mode as running_mode_module
+from mediapipe.tasks.python.audio.core import base_audio_task_api
+from mediapipe.tasks.python.components.containers import audio_data as audio_data_module
+from mediapipe.tasks.python.components.containers import embedding_result as embedding_result_module
+from mediapipe.tasks.python.core import base_options as base_options_module
+from mediapipe.tasks.python.core import task_info as task_info_module
+from mediapipe.tasks.python.core.optional_dependencies import doc_controls
+AudioEmbedderResult = embedding_result_module.EmbeddingResult
+_AudioEmbedderGraphOptionsProto = audio_embedder_graph_options_pb2.AudioEmbedderGraphOptions
+_AudioData = audio_data_module.AudioData
+_BaseOptions = base_options_module.BaseOptions
+_EmbedderOptionsProto = embedder_options_pb2.EmbedderOptions
+_RunningMode = running_mode_module.AudioTaskRunningMode
+_TaskInfo = task_info_module.TaskInfo
+_AUDIO_IN_STREAM_NAME = 'audio_in'
+_AUDIO_TAG = 'AUDIO'
+_EMBEDDINGS_STREAM_NAME = 'embeddings_out'
+_EMBEDDINGS_TAG = 'EMBEDDINGS'
+_SAMPLE_RATE_IN_STREAM_NAME = 'sample_rate_in'
+_SAMPLE_RATE_TAG = 'SAMPLE_RATE'
+_TASK_GRAPH_NAME = 'mediapipe.tasks.audio.audio_embedder.AudioEmbedderGraph'
+_TIMESTAMPTED_EMBEDDINGS_STREAM_NAME = 'timestamped_embeddings_out'
+_TIMESTAMPTED_EMBEDDINGS_TAG = 'TIMESTAMPED_EMBEDDINGS'
+_MICRO_SECONDS_PER_MILLISECOND = 1000
+@dataclasses.dataclass
+class AudioEmbedderOptions:
+  """Options for the audio embedder task.
+  Attributes:
+    base_options: Base options for the audio embedder task.
+    running_mode: The running mode of the task. Defaults to the audio clips
+      mode. The audio embedder task has two running modes: 1) The audio clips
+      mode for running embedding extraction on independent audio clips. 2) The
+      audio stream mode for running embedding extraction on an audio stream,
+      such as from a microphone. In this mode, the "result_callback" below must
+      be specified to receive the embedding results asynchronously.
+    l2_normalize: Whether to normalize the returned feature vector with L2 norm.
+      Use this option only if the model does not already contain a native
+      L2_NORMALIZATION TF Lite Op. In most cases, this is already the case and
+      L2 norm is thus achieved through TF Lite inference.
+    quantize: Whether the returned embedding should be quantized to bytes via
+      scalar quantization. Embeddings are implicitly assumed to be unit-norm and
+      therefore any dimension is guaranteed to have a value in [-1.0, 1.0]. Use
+      the l2_normalize option if this is not the case.
+    result_callback: The user-defined result callback for processing audio
+      stream data. The result callback should only be specified when the running
+      mode is set to the audio stream mode.
+  """
+  base_options: _BaseOptions
+  running_mode: _RunningMode = _RunningMode.AUDIO_CLIPS
+  l2_normalize: Optional[bool] = None
+  quantize: Optional[bool] = None
+  result_callback: Optional[Callable[[AudioEmbedderResult, int], None]] = None
+  @doc_controls.do_not_generate_docs
+  def to_pb2(self) -> _AudioEmbedderGraphOptionsProto:
+    """Generates an AudioEmbedderOptions protobuf object."""
+    base_options_proto = self.base_options.to_pb2()
+    base_options_proto.use_stream_mode = False if self.running_mode == _RunningMode.AUDIO_CLIPS else True
+    embedder_options_proto = _EmbedderOptionsProto(
+        l2_normalize=self.l2_normalize, quantize=self.quantize)
+    return _AudioEmbedderGraphOptionsProto(
+        base_options=base_options_proto,
+        embedder_options=embedder_options_proto)
+class AudioEmbedder(base_audio_task_api.BaseAudioTaskApi):
+  """Class that performs embedding extraction on audio clips or audio stream.
+  This API expects a TFLite model with mandatory TFLite Model Metadata that
+  contains the mandatory AudioProperties of the solo input audio tensor and the
+  optional (but recommended) label items as AssociatedFiles with type
+  TENSOR_AXIS_LABELS per output embedding tensor.
+  Input tensor:
+    (kTfLiteFloat32)
+    - input audio buffer of size `[batch * samples]`.
+    - batch inference is not supported (`batch` is required to be 1).
+    - for multi-channel models, the channels must be interleaved.
+  At least one output tensor with:
+    (kTfLiteUInt8/kTfLiteFloat32)
+    - `N` components corresponding to the `N` dimensions of the returned
+    feature vector for this output layer.
+    - Either 2 or 4 dimensions, i.e. `[1 x N]` or `[1 x 1 x 1 x N]`.
+  """
+  @classmethod
+  def create_from_model_path(cls, model_path: str) -> 'AudioEmbedder':
+    """Creates an `AudioEmbedder` object from a TensorFlow Lite model and the default `AudioEmbedderOptions`.
+    Note that the created `AudioEmbedder` instance is in audio clips mode, for
+    embedding extraction on the independent audio clips.
+    Args:
+      model_path: Path to the model.
+    Returns:
+      `AudioEmbedder` object that's created from the model file and the
+      default `AudioEmbedderOptions`.
+    Raises:
+      ValueError: If failed to create `AudioEmbedder` object from the provided
+        file such as invalid file path.
+      RuntimeError: If other types of error occurred.
+    """
+    base_options = _BaseOptions(model_asset_path=model_path)
+    options = AudioEmbedderOptions(
+        base_options=base_options, running_mode=_RunningMode.AUDIO_CLIPS)
+    return cls.create_from_options(options)
+  @classmethod
+  def create_from_options(cls,
+                          options: AudioEmbedderOptions) -> 'AudioEmbedder':
+    """Creates the `AudioEmbedder` object from audio embedder options.
+    Args:
+      options: Options for the audio embedder task.
+    Returns:
+      `AudioEmbedder` object that's created from `options`.
+    Raises:
+      ValueError: If failed to create `AudioEmbedder` object from
+        `AudioEmbedderOptions` such as missing the model.
+      RuntimeError: If other types of error occurred.
+    """
+    def packets_callback(output_packets: Mapping[str, packet.Packet]):
+      timestamp_ms = output_packets[
+          _EMBEDDINGS_STREAM_NAME].timestamp.value // _MICRO_SECONDS_PER_MILLISECOND
+      if output_packets[_EMBEDDINGS_STREAM_NAME].is_empty():
+        options.result_callback(
+            AudioEmbedderResult(embeddings=[]), timestamp_ms)
+        return
+      embedding_result_proto = embeddings_pb2.EmbeddingResult()
+      embedding_result_proto.CopyFrom(
+          packet_getter.get_proto(output_packets[_EMBEDDINGS_STREAM_NAME]))
+      options.result_callback(
+          AudioEmbedderResult.create_from_pb2(embedding_result_proto),
+          timestamp_ms)
+    task_info = _TaskInfo(
+        task_graph=_TASK_GRAPH_NAME,
+        input_streams=[
+            ':'.join([_AUDIO_TAG, _AUDIO_IN_STREAM_NAME]),
+            ':'.join([_SAMPLE_RATE_TAG, _SAMPLE_RATE_IN_STREAM_NAME])
+        ],
+        output_streams=[
+            ':'.join([_EMBEDDINGS_TAG, _EMBEDDINGS_STREAM_NAME]), ':'.join([
+                _TIMESTAMPTED_EMBEDDINGS_TAG,
+                _TIMESTAMPTED_EMBEDDINGS_STREAM_NAME
+            ])
+        ],
+        task_options=options)
+    return cls(
+        # Audio tasks should not drop input audio due to flow limiting, which
+        # may cause data inconsistency.
+        task_info.generate_graph_config(enable_flow_limiting=False),
+        options.running_mode,
+        packets_callback if options.result_callback else None)
+  def embed(self, audio_clip: _AudioData) -> List[AudioEmbedderResult]:
+    """Performs embedding extraction on the provided audio clips.
+    The audio clip is represented as a MediaPipe AudioData. The method accepts
+    audio clips with various length and audio sample rate. It's required to
+    provide the corresponding audio sample rate within the `AudioData` object.
+    The input audio clip may be longer than what the model is able to process
+    in a single inference. When this occurs, the input audio clip is split into
+    multiple chunks starting at different timestamps. For this reason, this
+    function returns a list of EmbeddingResult objects, each associated
+    with a timestamp corresponding to the start (in milliseconds) of the chunk
+    data on which embedding extraction was carried out.
+    Args:
+      audio_clip: MediaPipe AudioData.
+    Returns:
+      A list of `AudioEmbedderResult` objects, each associated with a
+      timestamp corresponding to the start (in milliseconds) of the chunk data
+      on which embedding extraction was carried out.
+    Raises:
+      ValueError: If any of the input arguments is invalid, such as the sample
+        rate is not provided in the `AudioData` object.
+      RuntimeError: If audio embedding extraction failed to run.
+    """
+    if not audio_clip.audio_format.sample_rate:
+      raise ValueError('Must provide the audio sample rate in audio data.')
+    output_packets = self._process_audio_clip({
+        _AUDIO_IN_STREAM_NAME:
+            packet_creator.create_matrix(audio_clip.buffer, transpose=True),
+        _SAMPLE_RATE_IN_STREAM_NAME:
+            packet_creator.create_double(audio_clip.audio_format.sample_rate)
+    })
+    output_list = []
+    embeddings_proto_list = packet_getter.get_proto_list(
+        output_packets[_TIMESTAMPTED_EMBEDDINGS_STREAM_NAME])
+    for proto in embeddings_proto_list:
+      embedding_result_proto = embeddings_pb2.EmbeddingResult()
+      embedding_result_proto.CopyFrom(proto)
+      output_list.append(
+          AudioEmbedderResult.create_from_pb2(embedding_result_proto))
+    return output_list
+  def embed_async(self, audio_block: _AudioData, timestamp_ms: int) -> None:
+    """Sends audio data (a block in a continuous audio stream) to perform audio embedding extraction.
+    Only use this method when the AudioEmbedder is created with the audio
+    stream running mode. The input timestamps should be monotonically increasing
+    for adjacent calls of this method. This method will return immediately after
+    the input audio data is accepted. The results will be available via the
+    `result_callback` provided in the `AudioEmbedderOptions`. The
+    `embed_async` method is designed to process audio stream data such as
+    microphone input.
+    The input audio data may be longer than what the model is able to process
+    in a single inference. When this occurs, the input audio block is split
+    into multiple chunks. For this reason, the callback may be called multiple
+    times (once per chunk) for each call to this function.
+    The `result_callback` provides:
+      - An `AudioEmbedderResult` object that contains a list of
+        embeddings.
+      - The input timestamp in milliseconds.
+    Args:
+      audio_block: MediaPipe AudioData.
+      timestamp_ms: The timestamp of the input audio data in milliseconds.
+    Raises:
+      ValueError: If any of the following occurs:
+        1) The sample rate is not provided in the `AudioData` object or the
+        provided sample rate is inconsistent with the previously received.
+        2) The current input timestamp is smaller than what the audio
+        embedder has already processed.
+    """
+    if not audio_block.audio_format.sample_rate:
+      raise ValueError('Must provide the audio sample rate in audio data.')
+    if not self._default_sample_rate:
+      self._default_sample_rate = audio_block.audio_format.sample_rate
+      self._set_sample_rate(_SAMPLE_RATE_IN_STREAM_NAME,
+                            self._default_sample_rate)
+    elif audio_block.audio_format.sample_rate != self._default_sample_rate:
+      raise ValueError(
+          f'The audio sample rate provided in audio data: '
+          f'{audio_block.audio_format.sample_rate} is inconsistent with '
+          f'the previously received: {self._default_sample_rate}.')
+    self._send_audio_stream_data({
+        _AUDIO_IN_STREAM_NAME:
+            packet_creator.create_matrix(audio_block.buffer, transpose=True).at(
+                timestamp_ms * _MICRO_SECONDS_PER_MILLISECOND)
+    })

mediapipe/tasks/python/audio/core/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Copyright 2022 The MediaPipe Authors.
+All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""