PyPI - esphome - Versions diffs - 2024.6.6__py3-none-any.whl → 2024.7.0__py3-none-any.whl - Mend

esphome 2024.6.6__py3-none-any.whl → 2024.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

esphome/components/aht10/aht10.cpp +4 -2
esphome/components/climate/climate.cpp +10 -6
esphome/components/climate/climate_traits.h +3 -3
esphome/components/cover/cover.h +2 -2
esphome/components/esp32_camera/__init__.py +6 -3
esphome/components/esp32_can/canbus.py +3 -0
esphome/components/ethernet/ethernet_component.cpp +8 -3
esphome/components/font/__init__.py +2 -28
esphome/components/gree/climate.py +1 -0
esphome/components/gree/gree.cpp +11 -3
esphome/components/gree/gree.h +5 -1
esphome/components/haier/binary_sensor/__init__.py +4 -4
esphome/components/haier/button/__init__.py +1 -1
esphome/components/haier/climate.py +43 -9
esphome/components/haier/haier_base.cpp +4 -0
esphome/components/haier/haier_base.h +11 -1
esphome/components/haier/hon_climate.cpp +109 -55
esphome/components/haier/hon_climate.h +7 -1
esphome/components/haier/hon_packet.h +5 -0
esphome/components/haier/sensor/__init__.py +5 -5
esphome/components/haier/smartair2_climate.cpp +1 -0
esphome/components/haier/text_sensor/__init__.py +4 -4
esphome/components/heatpumpir/climate.py +12 -5
esphome/components/heatpumpir/heatpumpir.cpp +11 -0
esphome/components/heatpumpir/heatpumpir.h +11 -0
esphome/components/http_request/http_request_arduino.cpp +7 -2
esphome/components/http_request/update/http_request_update.cpp +6 -7
esphome/components/http_request/update/http_request_update.h +0 -3
esphome/components/i2s_audio/__init__.py +10 -0
esphome/components/i2s_audio/microphone/__init__.py +7 -0
esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp +2 -3
esphome/components/i2s_audio/microphone/i2s_audio_microphone.h +3 -0
esphome/components/image/__init__.py +2 -29
esphome/components/improv_serial/improv_serial_component.cpp +9 -8
esphome/components/ltr390/ltr390.cpp +44 -29
esphome/components/ltr390/ltr390.h +9 -5
esphome/components/ltr390/sensor.py +35 -5
esphome/components/mdns/__init__.py +3 -3
esphome/components/mdns/mdns_component.cpp +3 -1
esphome/components/mdns/mdns_component.h +3 -1
esphome/components/mdns/mdns_esp32.cpp +2 -1
esphome/components/mdns/mdns_esp8266.cpp +2 -1
esphome/components/mdns/mdns_host.cpp +2 -1
esphome/components/mdns/mdns_libretiny.cpp +2 -1
esphome/components/mdns/mdns_rp2040.cpp +2 -1
esphome/components/micro_wake_word/__init__.py +205 -56
esphome/components/micro_wake_word/micro_wake_word.cpp +225 -275
esphome/components/micro_wake_word/micro_wake_word.h +77 -107
esphome/components/micro_wake_word/preprocessor_settings.h +20 -0
esphome/components/micro_wake_word/streaming_model.cpp +189 -0
esphome/components/micro_wake_word/streaming_model.h +84 -0
esphome/components/mitsubishi/mitsubishi.cpp +1 -0
esphome/components/modbus_controller/text_sensor/__init__.py +2 -1
esphome/components/modbus_controller/text_sensor/modbus_textsensor.cpp +4 -1
esphome/components/modbus_controller/text_sensor/modbus_textsensor.h +1 -1
esphome/components/number/__init__.py +2 -0
esphome/components/ota/ota_backend_arduino_esp32.cpp +22 -7
esphome/components/ota/ota_backend_arduino_esp8266.cpp +23 -8
esphome/components/ota/ota_backend_arduino_libretiny.cpp +22 -7
esphome/components/ota/ota_backend_arduino_rp2040.cpp +22 -7
esphome/components/pmsa003i/pmsa003i.cpp +9 -0
esphome/components/qspi_amoled/display.py +16 -4
esphome/components/qspi_amoled/qspi_amoled.cpp +16 -0
esphome/components/qspi_amoled/qspi_amoled.h +0 -3
esphome/components/remote_base/dooya_protocol.cpp +4 -4
esphome/components/remote_base/rc_switch_protocol.cpp +1 -1
esphome/components/restart/button/__init__.py +2 -0
esphome/components/script/__init__.py +1 -1
esphome/components/sensor/__init__.py +2 -0
esphome/components/tuya/tuya.cpp +8 -2
esphome/components/tuya/tuya.h +3 -1
esphome/components/uart/__init__.py +72 -9
esphome/components/uart/uart_component_esp32_arduino.cpp +18 -4
esphome/components/uart/uart_component_esp_idf.cpp +22 -2
esphome/components/uart/uart_component_host.cpp +295 -0
esphome/components/uart/uart_component_host.h +38 -0
esphome/components/uptime/sensor.py +44 -11
esphome/components/uptime/{uptime_sensor.cpp → uptime_seconds_sensor.cpp} +11 -7
esphome/components/uptime/{uptime_sensor.h → uptime_seconds_sensor.h} +2 -2
esphome/components/uptime/uptime_timestamp_sensor.cpp +39 -0
esphome/components/uptime/uptime_timestamp_sensor.h +30 -0
esphome/components/veml7700/veml7700.cpp +1 -1
esphome/components/veml7700/veml7700.h +5 -5
esphome/components/voice_assistant/voice_assistant.cpp +4 -2
esphome/components/web_server/server_index_v2.h +42 -41
esphome/components/web_server/server_index_v3.h +368 -367
esphome/components/wifi/wifi_component_esp_idf.cpp +1 -1
esphome/components/wifi/wifi_component_pico_w.cpp +18 -2
esphome/components/wireguard/__init__.py +1 -1
esphome/components/x9c/output.py +7 -1
esphome/const.py +2 -1
esphome/core/defines.h +1 -0
esphome/core/helpers.cpp +2 -2
esphome/core/helpers.h +1 -1
esphome/external_files.py +26 -0
{esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/METADATA +1 -1
{esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/RECORD +101 -95
esphome/components/micro_wake_word/audio_preprocessor_int8_model_data.h +0 -493
{esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/LICENSE +0 -0
{esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/WHEEL +0 -0
{esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/entry_points.txt +0 -0
{esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/top_level.txt +0 -0

esphome/components/micro_wake_word/micro_wake_word.cpp CHANGED Viewed

@@ -1,12 +1,5 @@
 #include "micro_wake_word.h"
-/**
- * This is a workaround until we can figure out a way to get
- * the tflite-micro idf component code available in CI
- *
- * */
-//
-#ifndef CLANG_TIDY
+#include "streaming_model.h"
 #ifdef USE_ESP_IDF
@@ -14,13 +7,13 @@
 #include "esphome/core/helpers.h"
 #include "esphome/core/log.h"
-#include "audio_preprocessor_int8_model_data.h"
+#include <frontend.h>
+#include <frontend_util.h>
 #include <tensorflow/lite/core/c/common.h>
 #include <tensorflow/lite/micro/micro_interpreter.h>
 #include <tensorflow/lite/micro/micro_mutable_op_resolver.h>
-#include <cinttypes>
 #include <cmath>
 namespace esphome {
@@ -29,9 +22,9 @@ namespace micro_wake_word {
 static const char *const TAG = "micro_wake_word";
 static const size_t SAMPLE_RATE_HZ = 16000;  // 16 kHz
-static const size_t BUFFER_LENGTH = 500;     // 0.5 seconds
+static const size_t BUFFER_LENGTH = 64;      // 0.064 seconds
 static const size_t BUFFER_SIZE = SAMPLE_RATE_HZ / 1000 * BUFFER_LENGTH;
-static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000;  // 32ms * 16kHz / 1000ms
+static const size_t INPUT_BUFFER_SIZE = 16 * SAMPLE_RATE_HZ / 1000;  // 16ms * 16kHz / 1000ms
 float MicroWakeWord::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
@@ -56,57 +49,55 @@ static const LogString *micro_wake_word_state_to_string(State state) {
 void MicroWakeWord::dump_config() {
   ESP_LOGCONFIG(TAG, "microWakeWord:");
-  ESP_LOGCONFIG(TAG, "  Wake Word: %s", this->get_wake_word().c_str());
-  ESP_LOGCONFIG(TAG, "  Probability cutoff: %.3f", this->probability_cutoff_);
-  ESP_LOGCONFIG(TAG, "  Sliding window size: %d", this->sliding_window_average_size_);
+  ESP_LOGCONFIG(TAG, "  models:");
+  for (auto &model : this->wake_word_models_) {
+    model.log_model_config();
+  }
+#ifdef USE_MICRO_WAKE_WORD_VAD
+  this->vad_model_->log_model_config();
+#endif
 }
 void MicroWakeWord::setup() {
   ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
-  if (!this->initialize_models()) {
-    ESP_LOGE(TAG, "Failed to initialize models");
-    this->mark_failed();
-    return;
-  }
-  ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
-  this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE * sizeof(int16_t));
-  if (this->input_buffer_ == nullptr) {
-    ESP_LOGW(TAG, "Could not allocate input buffer");
-    this->mark_failed();
-    return;
-  }
-  this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
-  if (this->ring_buffer_ == nullptr) {
-    ESP_LOGW(TAG, "Could not allocate ring buffer");
+  if (!this->register_streaming_ops_(this->streaming_op_resolver_)) {
     this->mark_failed();
     return;
   }
   ESP_LOGCONFIG(TAG, "Micro Wake Word initialized");
-}
-int MicroWakeWord::read_microphone_() {
-  size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
-  if (bytes_read == 0) {
-    return 0;
-  }
-  size_t bytes_free = this->ring_buffer_->free();
-  if (bytes_free < bytes_read) {
-    ESP_LOGW(TAG,
-             "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
-             "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
-             bytes_free, bytes_read);
+  this->frontend_config_.window.size_ms = FEATURE_DURATION_MS;
+  this->frontend_config_.window.step_size_ms = this->features_step_size_;
+  this->frontend_config_.filterbank.num_channels = PREPROCESSOR_FEATURE_SIZE;
+  this->frontend_config_.filterbank.lower_band_limit = 125.0;
+  this->frontend_config_.filterbank.upper_band_limit = 7500.0;
+  this->frontend_config_.noise_reduction.smoothing_bits = 10;
+  this->frontend_config_.noise_reduction.even_smoothing = 0.025;
+  this->frontend_config_.noise_reduction.odd_smoothing = 0.06;
+  this->frontend_config_.noise_reduction.min_signal_remaining = 0.05;
+  this->frontend_config_.pcan_gain_control.enable_pcan = 1;
+  this->frontend_config_.pcan_gain_control.strength = 0.95;
+  this->frontend_config_.pcan_gain_control.offset = 80.0;
+  this->frontend_config_.pcan_gain_control.gain_bits = 21;
+  this->frontend_config_.log_scale.enable_log = 1;
+  this->frontend_config_.log_scale.scale_shift = 6;
+}
-    this->ring_buffer_->reset();
-  }
+void MicroWakeWord::add_wake_word_model(const uint8_t *model_start, float probability_cutoff,
+                                        size_t sliding_window_average_size, const std::string &wake_word,
+                                        size_t tensor_arena_size) {
+  this->wake_word_models_.emplace_back(model_start, probability_cutoff, sliding_window_average_size, wake_word,
+                                       tensor_arena_size);
+}
-  return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
+#ifdef USE_MICRO_WAKE_WORD_VAD
+void MicroWakeWord::add_vad_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size,
+                                  size_t tensor_arena_size) {
+  this->vad_model_ = make_unique<VADModel>(model_start, probability_cutoff, sliding_window_size, tensor_arena_size);
 }
+#endif
 void MicroWakeWord::loop() {
   switch (this->state_) {
@@ -124,9 +115,12 @@ void MicroWakeWord::loop() {
       }
       break;
     case State::DETECTING_WAKE_WORD:
-      this->read_microphone_();
-      if (this->detect_wake_word_()) {
-        ESP_LOGD(TAG, "Wake Word Detected");
+      while (!this->has_enough_samples_()) {
+        this->read_microphone_();
+      }
+      this->update_model_probabilities_();
+      if (this->detect_wake_words_()) {
+        ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
         this->detected_ = true;
         this->set_state_(State::STOP_MICROPHONE);
       }
@@ -136,13 +130,16 @@ void MicroWakeWord::loop() {
       this->microphone_->stop();
       this->set_state_(State::STOPPING_MICROPHONE);
       this->high_freq_.stop();
+      this->unload_models_();
+      this->deallocate_buffers_();
       break;
     case State::STOPPING_MICROPHONE:
       if (this->microphone_->is_stopped()) {
         this->set_state_(State::IDLE);
         if (this->detected_) {
+          this->wake_word_detected_trigger_->trigger(this->detected_wake_word_);
           this->detected_ = false;
-          this->wake_word_detected_trigger_->trigger(this->wake_word_);
+          this->detected_wake_word_ = "";
         }
       }
       break;
@@ -150,14 +147,34 @@ void MicroWakeWord::loop() {
 }
 void MicroWakeWord::start() {
+  if (!this->is_ready()) {
+    ESP_LOGW(TAG, "Wake word detection can't start as the component hasn't been setup yet");
+    return;
+  }
   if (this->is_failed()) {
     ESP_LOGW(TAG, "Wake word component is marked as failed. Please check setup logs");
     return;
   }
+  if (!this->load_models_() || !this->allocate_buffers_()) {
+    ESP_LOGE(TAG, "Failed to load the wake word model(s) or allocate buffers");
+    this->status_set_error();
+  } else {
+    this->status_clear_error();
+  }
+  if (this->status_has_error()) {
+    ESP_LOGW(TAG, "Wake word component has an error. Please check logs");
+    return;
+  }
   if (this->state_ != State::IDLE) {
     ESP_LOGW(TAG, "Wake word is already running");
     return;
   }
+  this->reset_states_();
   this->set_state_(State::START_MICROPHONE);
 }
@@ -179,289 +196,218 @@ void MicroWakeWord::set_state_(State state) {
   this->state_ = state;
 }
-bool MicroWakeWord::initialize_models() {
-  ExternalRAMAllocator<uint8_t> arena_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
-  ExternalRAMAllocator<int8_t> features_allocator(ExternalRAMAllocator<int8_t>::ALLOW_FAILURE);
-  ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
-  this->streaming_tensor_arena_ = arena_allocator.allocate(STREAMING_MODEL_ARENA_SIZE);
-  if (this->streaming_tensor_arena_ == nullptr) {
-    ESP_LOGE(TAG, "Could not allocate the streaming model's tensor arena.");
-    return false;
-  }
-  this->streaming_var_arena_ = arena_allocator.allocate(STREAMING_MODEL_VARIABLE_ARENA_SIZE);
-  if (this->streaming_var_arena_ == nullptr) {
-    ESP_LOGE(TAG, "Could not allocate the streaming model variable's tensor arena.");
-    return false;
-  }
-  this->preprocessor_tensor_arena_ = arena_allocator.allocate(PREPROCESSOR_ARENA_SIZE);
-  if (this->preprocessor_tensor_arena_ == nullptr) {
-    ESP_LOGE(TAG, "Could not allocate the audio preprocessor model's tensor arena.");
-    return false;
-  }
-  this->new_features_data_ = features_allocator.allocate(PREPROCESSOR_FEATURE_SIZE);
-  if (this->new_features_data_ == nullptr) {
-    ESP_LOGE(TAG, "Could not allocate the audio features buffer.");
-    return false;
+size_t MicroWakeWord::read_microphone_() {
+  size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
+  if (bytes_read == 0) {
+    return 0;
   }
-  this->preprocessor_audio_buffer_ = audio_samples_allocator.allocate(SAMPLE_DURATION_COUNT);
-  if (this->preprocessor_audio_buffer_ == nullptr) {
-    ESP_LOGE(TAG, "Could not allocate the audio preprocessor's buffer.");
-    return false;
-  }
+  size_t bytes_free = this->ring_buffer_->free();
-  this->preprocessor_model_ = tflite::GetModel(G_AUDIO_PREPROCESSOR_INT8_TFLITE);
-  if (this->preprocessor_model_->version() != TFLITE_SCHEMA_VERSION) {
-    ESP_LOGE(TAG, "Wake word's audio preprocessor model's schema is not supported");
-    return false;
-  }
+  if (bytes_free < bytes_read) {
+    ESP_LOGW(TAG,
+             "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
+             "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
+             bytes_free, bytes_read);
-  this->streaming_model_ = tflite::GetModel(this->model_start_);
-  if (this->streaming_model_->version() != TFLITE_SCHEMA_VERSION) {
-    ESP_LOGE(TAG, "Wake word's streaming model's schema is not supported");
-    return false;
+    this->ring_buffer_->reset();
   }
-  static tflite::MicroMutableOpResolver<18> preprocessor_op_resolver;
-  static tflite::MicroMutableOpResolver<17> streaming_op_resolver;
-  if (!this->register_preprocessor_ops_(preprocessor_op_resolver))
-    return false;
-  if (!this->register_streaming_ops_(streaming_op_resolver))
-    return false;
-  tflite::MicroAllocator *ma =
-      tflite::MicroAllocator::Create(this->streaming_var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
-  this->mrv_ = tflite::MicroResourceVariables::Create(ma, 15);
-  static tflite::MicroInterpreter static_preprocessor_interpreter(
-      this->preprocessor_model_, preprocessor_op_resolver, this->preprocessor_tensor_arena_, PREPROCESSOR_ARENA_SIZE);
-  static tflite::MicroInterpreter static_streaming_interpreter(this->streaming_model_, streaming_op_resolver,
-                                                               this->streaming_tensor_arena_,
-                                                               STREAMING_MODEL_ARENA_SIZE, this->mrv_);
-  this->preprocessor_interperter_ = &static_preprocessor_interpreter;
-  this->streaming_interpreter_ = &static_streaming_interpreter;
-  // Allocate tensors for each models.
-  if (this->preprocessor_interperter_->AllocateTensors() != kTfLiteOk) {
-    ESP_LOGE(TAG, "Failed to allocate tensors for the audio preprocessor");
-    return false;
-  }
-  if (this->streaming_interpreter_->AllocateTensors() != kTfLiteOk) {
-    ESP_LOGE(TAG, "Failed to allocate tensors for the streaming model");
-    return false;
-  }
+  return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
+}
-  // Verify input tensor matches expected values
-  TfLiteTensor *input = this->streaming_interpreter_->input(0);
-  if ((input->dims->size != 3) || (input->dims->data[0] != 1) || (input->dims->data[0] != 1) ||
-      (input->dims->data[1] != 1) || (input->dims->data[2] != PREPROCESSOR_FEATURE_SIZE)) {
-    ESP_LOGE(TAG, "Wake word detection model tensor input dimensions is not 1x1x%u", input->dims->data[2]);
-    return false;
-  }
+bool MicroWakeWord::allocate_buffers_() {
+  ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
-  if (input->type != kTfLiteInt8) {
-    ESP_LOGE(TAG, "Wake word detection model tensor input is not int8.");
-    return false;
+  if (this->input_buffer_ == nullptr) {
+    this->input_buffer_ = audio_samples_allocator.allocate(INPUT_BUFFER_SIZE * sizeof(int16_t));
+    if (this->input_buffer_ == nullptr) {
+      ESP_LOGE(TAG, "Could not allocate input buffer");
+      return false;
+    }
   }
-  // Verify output tensor matches expected values
-  TfLiteTensor *output = this->streaming_interpreter_->output(0);
-  if ((output->dims->size != 2) || (output->dims->data[0] != 1) || (output->dims->data[1] != 1)) {
-    ESP_LOGE(TAG, "Wake word detection model tensor output dimensions is not 1x1.");
+  if (this->preprocessor_audio_buffer_ == nullptr) {
+    this->preprocessor_audio_buffer_ = audio_samples_allocator.allocate(this->new_samples_to_get_());
+    if (this->preprocessor_audio_buffer_ == nullptr) {
+      ESP_LOGE(TAG, "Could not allocate the audio preprocessor's buffer.");
+      return false;
+    }
   }
-  if (output->type != kTfLiteUInt8) {
-    ESP_LOGE(TAG, "Wake word detection model tensor input is not uint8.");
-    return false;
+  if (this->ring_buffer_ == nullptr) {
+    this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
+    if (this->ring_buffer_ == nullptr) {
+      ESP_LOGE(TAG, "Could not allocate ring buffer");
+      return false;
+    }
   }
-  this->recent_streaming_probabilities_.resize(this->sliding_window_average_size_, 0.0);
   return true;
 }
-bool MicroWakeWord::update_features_() {
-  // Retrieve strided audio samples
-  int16_t *audio_samples = nullptr;
-  if (!this->stride_audio_samples_(&audio_samples)) {
+void MicroWakeWord::deallocate_buffers_() {
+  ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
+  audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
+  this->input_buffer_ = nullptr;
+  audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
+  this->preprocessor_audio_buffer_ = nullptr;
+}
+bool MicroWakeWord::load_models_() {
+  // Setup preprocesor feature generator
+  if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, AUDIO_SAMPLE_FREQUENCY)) {
+    ESP_LOGD(TAG, "Failed to populate frontend state");
+    FrontendFreeStateContents(&this->frontend_state_);
     return false;
   }
-  // Compute the features for the newest audio samples
-  if (!this->generate_single_feature_(audio_samples, SAMPLE_DURATION_COUNT, this->new_features_data_)) {
+  // Setup streaming models
+  for (auto &model : this->wake_word_models_) {
+    if (!model.load_model(this->streaming_op_resolver_)) {
+      ESP_LOGE(TAG, "Failed to initialize a wake word model.");
+      return false;
+    }
+  }
+#ifdef USE_MICRO_WAKE_WORD_VAD
+  if (!this->vad_model_->load_model(this->streaming_op_resolver_)) {
+    ESP_LOGE(TAG, "Failed to initialize VAD model.");
     return false;
   }
+#endif
   return true;
 }
-float MicroWakeWord::perform_streaming_inference_() {
-  TfLiteTensor *input = this->streaming_interpreter_->input(0);
+void MicroWakeWord::unload_models_() {
+  FrontendFreeStateContents(&this->frontend_state_);
-  size_t bytes_to_copy = input->bytes;
-  memcpy((void *) (tflite::GetTensorData<int8_t>(input)), (const void *) (this->new_features_data_), bytes_to_copy);
-  uint32_t prior_invoke = millis();
-  TfLiteStatus invoke_status = this->streaming_interpreter_->Invoke();
-  if (invoke_status != kTfLiteOk) {
-    ESP_LOGW(TAG, "Streaming Interpreter Invoke failed");
-    return false;
+  for (auto &model : this->wake_word_models_) {
+    model.unload_model();
   }
-  ESP_LOGV(TAG, "Streaming Inference Latency=%" PRIu32 " ms", (millis() - prior_invoke));
-  TfLiteTensor *output = this->streaming_interpreter_->output(0);
-  return static_cast<float>(output->data.uint8[0]) / 255.0;
+#ifdef USE_MICRO_WAKE_WORD_VAD
+  this->vad_model_->unload_model();
+#endif
 }
-bool MicroWakeWord::detect_wake_word_() {
-  // Preprocess the newest audio samples into features
-  if (!this->update_features_()) {
-    return false;
-  }
+void MicroWakeWord::update_model_probabilities_() {
+  int8_t audio_features[PREPROCESSOR_FEATURE_SIZE];
-  // Perform inference
-  float streaming_prob = this->perform_streaming_inference_();
+  if (!this->generate_features_for_window_(audio_features)) {
+    return;
+  }
-  // Add the most recent probability to the sliding window
-  this->recent_streaming_probabilities_[this->last_n_index_] = streaming_prob;
-  ++this->last_n_index_;
-  if (this->last_n_index_ == this->sliding_window_average_size_)
-    this->last_n_index_ = 0;
+  // Increase the counter since the last positive detection
+  this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
-  float sum = 0.0;
-  for (auto &prob : this->recent_streaming_probabilities_) {
-    sum += prob;
+  for (auto &model : this->wake_word_models_) {
+    // Perform inference
+    model.perform_streaming_inference(audio_features);
   }
+#ifdef USE_MICRO_WAKE_WORD_VAD
+  this->vad_model_->perform_streaming_inference(audio_features);
+#endif
+}
-  float sliding_window_average = sum / static_cast<float>(this->sliding_window_average_size_);
-  // Ensure we have enough samples since the last positive detection
-  this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
+bool MicroWakeWord::detect_wake_words_() {
+  // Verify we have processed samples since the last positive detection
   if (this->ignore_windows_ < 0) {
     return false;
   }
-  // Detect the wake word if the sliding window average is above the cutoff
-  if (sliding_window_average > this->probability_cutoff_) {
-    this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
-    for (auto &prob : this->recent_streaming_probabilities_) {
-      prob = 0;
-    }
+#ifdef USE_MICRO_WAKE_WORD_VAD
+  bool vad_state = this->vad_model_->determine_detected();
+#endif
-    ESP_LOGD(TAG, "Wake word sliding average probability is %.3f and most recent probability is %.3f",
-             sliding_window_average, streaming_prob);
-    return true;
+  for (auto &model : this->wake_word_models_) {
+    if (model.determine_detected()) {
+#ifdef USE_MICRO_WAKE_WORD_VAD
+      if (vad_state) {
+#endif
+        this->detected_wake_word_ = model.get_wake_word();
+        return true;
+#ifdef USE_MICRO_WAKE_WORD_VAD
+      } else {
+        ESP_LOGD(TAG, "Wake word model predicts %s, but VAD model doesn't.", model.get_wake_word().c_str());
+      }
+#endif
+    }
   }
   return false;
 }
-void MicroWakeWord::set_sliding_window_average_size(size_t size) {
-  this->sliding_window_average_size_ = size;
-  this->recent_streaming_probabilities_.resize(this->sliding_window_average_size_, 0.0);
-}
-bool MicroWakeWord::slice_available_() {
-  size_t available = this->ring_buffer_->available();
-  return available > (NEW_SAMPLES_TO_GET * sizeof(int16_t));
+bool MicroWakeWord::has_enough_samples_() {
+  return this->ring_buffer_->available() >=
+         (this->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000)) * sizeof(int16_t);
 }
-bool MicroWakeWord::stride_audio_samples_(int16_t **audio_samples) {
-  if (!this->slice_available_()) {
+bool MicroWakeWord::generate_features_for_window_(int8_t features[PREPROCESSOR_FEATURE_SIZE]) {
+  // Ensure we have enough new audio samples in the ring buffer for a full window
+  if (!this->has_enough_samples_()) {
     return false;
   }
-  // Copy the last 320 bytes (160 samples over 10 ms) from the audio buffer to the start of the audio buffer
-  memcpy((void *) (this->preprocessor_audio_buffer_), (void *) (this->preprocessor_audio_buffer_ + NEW_SAMPLES_TO_GET),
-         HISTORY_SAMPLES_TO_KEEP * sizeof(int16_t));
-  // Copy 640 bytes (320 samples over 20 ms) from the ring buffer into the audio buffer offset 320 bytes (160 samples
-  // over 10 ms)
-  size_t bytes_read = this->ring_buffer_->read((void *) (this->preprocessor_audio_buffer_ + HISTORY_SAMPLES_TO_KEEP),
-                                               NEW_SAMPLES_TO_GET * sizeof(int16_t), pdMS_TO_TICKS(200));
+  size_t bytes_read = this->ring_buffer_->read((void *) (this->preprocessor_audio_buffer_),
+                                               this->new_samples_to_get_() * sizeof(int16_t), pdMS_TO_TICKS(200));
   if (bytes_read == 0) {
     ESP_LOGE(TAG, "Could not read data from Ring Buffer");
-  } else if (bytes_read < NEW_SAMPLES_TO_GET * sizeof(int16_t)) {
+  } else if (bytes_read < this->new_samples_to_get_() * sizeof(int16_t)) {
     ESP_LOGD(TAG, "Partial Read of Data by Model");
     ESP_LOGD(TAG, "Could only read %d bytes when required %d bytes ", bytes_read,
-             (int) (NEW_SAMPLES_TO_GET * sizeof(int16_t)));
-    return false;
+             (int) (this->new_samples_to_get_() * sizeof(int16_t)));
+    return false;
+  }
+  size_t num_samples_read;
+  struct FrontendOutput frontend_output = FrontendProcessSamples(
+      &this->frontend_state_, this->preprocessor_audio_buffer_, this->new_samples_to_get_(), &num_samples_read);
+  for (size_t i = 0; i < frontend_output.size; ++i) {
+    // These scaling values are set to match the TFLite audio frontend int8 output.
+    // The feature pipeline outputs 16-bit signed integers in roughly a 0 to 670
+    // range. In training, these are then arbitrarily divided by 25.6 to get
+    // float values in the rough range of 0.0 to 26.0. This scaling is performed
+    // for historical reasons, to match up with the output of other feature
+    // generators.
+    // The process is then further complicated when we quantize the model. This
+    // means we have to scale the 0.0 to 26.0 real values to the -128 to 127
+    // signed integer numbers.
+    // All this means that to get matching values from our integer feature
+    // output into the tensor input, we have to perform:
+    // input = (((feature / 25.6) / 26.0) * 256) - 128
+    // To simplify this and perform it in 32-bit integer math, we rearrange to:
+    // input = (feature * 256) / (25.6 * 26.0) - 128
+    constexpr int32_t value_scale = 256;
+    constexpr int32_t value_div = 666;  // 666 = 25.6 * 26.0 after rounding
+    int32_t value = ((frontend_output.values[i] * value_scale) + (value_div / 2)) / value_div;
+    value -= 128;
+    if (value < -128) {
+      value = -128;
+    }
+    if (value > 127) {
+      value = 127;
+    }
+    features[i] = value;
   }
-  *audio_samples = this->preprocessor_audio_buffer_;
   return true;
 }
-bool MicroWakeWord::generate_single_feature_(const int16_t *audio_data, const int audio_data_size,
-                                             int8_t feature_output[PREPROCESSOR_FEATURE_SIZE]) {
-  TfLiteTensor *input = this->preprocessor_interperter_->input(0);
-  TfLiteTensor *output = this->preprocessor_interperter_->output(0);
-  std::copy_n(audio_data, audio_data_size, tflite::GetTensorData<int16_t>(input));
-  if (this->preprocessor_interperter_->Invoke() != kTfLiteOk) {
-    ESP_LOGE(TAG, "Failed to preprocess audio for local wake word.");
-    return false;
+void MicroWakeWord::reset_states_() {
+  ESP_LOGD(TAG, "Resetting buffers and probabilities");
+  this->ring_buffer_->reset();
+  this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
+  for (auto &model : this->wake_word_models_) {
+    model.reset_probabilities();
   }
-  std::memcpy(feature_output, tflite::GetTensorData<int8_t>(output), PREPROCESSOR_FEATURE_SIZE * sizeof(int8_t));
-  return true;
-}
-bool MicroWakeWord::register_preprocessor_ops_(tflite::MicroMutableOpResolver<18> &op_resolver) {
-  if (op_resolver.AddReshape() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddCast() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddStridedSlice() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddConcatenation() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddMul() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddAdd() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddDiv() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddMinimum() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddMaximum() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddWindow() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddFftAutoScale() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddRfft() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddEnergy() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddFilterBank() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddFilterBankSquareRoot() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddFilterBankSpectralSubtraction() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddPCAN() != kTfLiteOk)
-    return false;
-  if (op_resolver.AddFilterBankLog() != kTfLiteOk)
-    return false;
-  return true;
+#ifdef USE_MICRO_WAKE_WORD_VAD
+  this->vad_model_->reset_probabilities();
+#endif
 }
-bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &op_resolver) {
+bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<20> &op_resolver) {
   if (op_resolver.AddCallOnce() != kTfLiteOk)
     return false;
   if (op_resolver.AddVarHandle() != kTfLiteOk)
@@ -496,6 +442,12 @@ bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &
     return false;
   if (op_resolver.AddMaxPool2D() != kTfLiteOk)
     return false;
+  if (op_resolver.AddPad() != kTfLiteOk)
+    return false;
+  if (op_resolver.AddPack() != kTfLiteOk)
+    return false;
+  if (op_resolver.AddSplitV() != kTfLiteOk)
+    return false;
   return true;
 }
@@ -504,5 +456,3 @@ bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &
 }  // namespace esphome
 #endif  // USE_ESP_IDF
-#endif  // CLANG_TIDY

esphome 2024.6.6__py3-none-any.whl → 2024.7.0__py3-none-any.whl

esphome 2024.6.6__py3-none-any.whl → 2024.7.0__py3-none-any.whl