esphome 2024.6.6__py3-none-any.whl → 2024.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. esphome/components/aht10/aht10.cpp +4 -2
  2. esphome/components/climate/climate.cpp +10 -6
  3. esphome/components/climate/climate_traits.h +3 -3
  4. esphome/components/cover/cover.h +2 -2
  5. esphome/components/esp32_camera/__init__.py +6 -3
  6. esphome/components/esp32_can/canbus.py +3 -0
  7. esphome/components/ethernet/ethernet_component.cpp +8 -3
  8. esphome/components/font/__init__.py +2 -28
  9. esphome/components/gree/climate.py +1 -0
  10. esphome/components/gree/gree.cpp +11 -3
  11. esphome/components/gree/gree.h +5 -1
  12. esphome/components/haier/binary_sensor/__init__.py +4 -4
  13. esphome/components/haier/button/__init__.py +1 -1
  14. esphome/components/haier/climate.py +43 -9
  15. esphome/components/haier/haier_base.cpp +4 -0
  16. esphome/components/haier/haier_base.h +11 -1
  17. esphome/components/haier/hon_climate.cpp +109 -55
  18. esphome/components/haier/hon_climate.h +7 -1
  19. esphome/components/haier/hon_packet.h +5 -0
  20. esphome/components/haier/sensor/__init__.py +5 -5
  21. esphome/components/haier/smartair2_climate.cpp +1 -0
  22. esphome/components/haier/text_sensor/__init__.py +4 -4
  23. esphome/components/heatpumpir/climate.py +12 -5
  24. esphome/components/heatpumpir/heatpumpir.cpp +11 -0
  25. esphome/components/heatpumpir/heatpumpir.h +11 -0
  26. esphome/components/http_request/http_request_arduino.cpp +7 -2
  27. esphome/components/http_request/update/http_request_update.cpp +6 -7
  28. esphome/components/http_request/update/http_request_update.h +0 -3
  29. esphome/components/i2s_audio/__init__.py +10 -0
  30. esphome/components/i2s_audio/microphone/__init__.py +7 -0
  31. esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp +2 -3
  32. esphome/components/i2s_audio/microphone/i2s_audio_microphone.h +3 -0
  33. esphome/components/image/__init__.py +2 -29
  34. esphome/components/improv_serial/improv_serial_component.cpp +9 -8
  35. esphome/components/ltr390/ltr390.cpp +44 -29
  36. esphome/components/ltr390/ltr390.h +9 -5
  37. esphome/components/ltr390/sensor.py +35 -5
  38. esphome/components/mdns/__init__.py +3 -3
  39. esphome/components/mdns/mdns_component.cpp +3 -1
  40. esphome/components/mdns/mdns_component.h +3 -1
  41. esphome/components/mdns/mdns_esp32.cpp +2 -1
  42. esphome/components/mdns/mdns_esp8266.cpp +2 -1
  43. esphome/components/mdns/mdns_host.cpp +2 -1
  44. esphome/components/mdns/mdns_libretiny.cpp +2 -1
  45. esphome/components/mdns/mdns_rp2040.cpp +2 -1
  46. esphome/components/micro_wake_word/__init__.py +205 -56
  47. esphome/components/micro_wake_word/micro_wake_word.cpp +225 -275
  48. esphome/components/micro_wake_word/micro_wake_word.h +77 -107
  49. esphome/components/micro_wake_word/preprocessor_settings.h +20 -0
  50. esphome/components/micro_wake_word/streaming_model.cpp +189 -0
  51. esphome/components/micro_wake_word/streaming_model.h +84 -0
  52. esphome/components/mitsubishi/mitsubishi.cpp +1 -0
  53. esphome/components/modbus_controller/text_sensor/__init__.py +2 -1
  54. esphome/components/modbus_controller/text_sensor/modbus_textsensor.cpp +4 -1
  55. esphome/components/modbus_controller/text_sensor/modbus_textsensor.h +1 -1
  56. esphome/components/number/__init__.py +2 -0
  57. esphome/components/ota/ota_backend_arduino_esp32.cpp +22 -7
  58. esphome/components/ota/ota_backend_arduino_esp8266.cpp +23 -8
  59. esphome/components/ota/ota_backend_arduino_libretiny.cpp +22 -7
  60. esphome/components/ota/ota_backend_arduino_rp2040.cpp +22 -7
  61. esphome/components/pmsa003i/pmsa003i.cpp +9 -0
  62. esphome/components/qspi_amoled/display.py +16 -4
  63. esphome/components/qspi_amoled/qspi_amoled.cpp +16 -0
  64. esphome/components/qspi_amoled/qspi_amoled.h +0 -3
  65. esphome/components/remote_base/dooya_protocol.cpp +4 -4
  66. esphome/components/remote_base/rc_switch_protocol.cpp +1 -1
  67. esphome/components/restart/button/__init__.py +2 -0
  68. esphome/components/script/__init__.py +1 -1
  69. esphome/components/sensor/__init__.py +2 -0
  70. esphome/components/tuya/tuya.cpp +8 -2
  71. esphome/components/tuya/tuya.h +3 -1
  72. esphome/components/uart/__init__.py +72 -9
  73. esphome/components/uart/uart_component_esp32_arduino.cpp +18 -4
  74. esphome/components/uart/uart_component_esp_idf.cpp +22 -2
  75. esphome/components/uart/uart_component_host.cpp +295 -0
  76. esphome/components/uart/uart_component_host.h +38 -0
  77. esphome/components/uptime/sensor.py +44 -11
  78. esphome/components/uptime/{uptime_sensor.cpp → uptime_seconds_sensor.cpp} +11 -7
  79. esphome/components/uptime/{uptime_sensor.h → uptime_seconds_sensor.h} +2 -2
  80. esphome/components/uptime/uptime_timestamp_sensor.cpp +39 -0
  81. esphome/components/uptime/uptime_timestamp_sensor.h +30 -0
  82. esphome/components/veml7700/veml7700.cpp +1 -1
  83. esphome/components/veml7700/veml7700.h +5 -5
  84. esphome/components/voice_assistant/voice_assistant.cpp +4 -2
  85. esphome/components/web_server/server_index_v2.h +42 -41
  86. esphome/components/web_server/server_index_v3.h +368 -367
  87. esphome/components/wifi/wifi_component_esp_idf.cpp +1 -1
  88. esphome/components/wifi/wifi_component_pico_w.cpp +18 -2
  89. esphome/components/wireguard/__init__.py +1 -1
  90. esphome/components/x9c/output.py +7 -1
  91. esphome/const.py +2 -1
  92. esphome/core/defines.h +1 -0
  93. esphome/core/helpers.cpp +2 -2
  94. esphome/core/helpers.h +1 -1
  95. esphome/external_files.py +26 -0
  96. {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/METADATA +1 -1
  97. {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/RECORD +101 -95
  98. esphome/components/micro_wake_word/audio_preprocessor_int8_model_data.h +0 -493
  99. {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/LICENSE +0 -0
  100. {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/WHEEL +0 -0
  101. {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/entry_points.txt +0 -0
  102. {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,5 @@
1
1
  #include "micro_wake_word.h"
2
-
3
- /**
4
- * This is a workaround until we can figure out a way to get
5
- * the tflite-micro idf component code available in CI
6
- *
7
- * */
8
- //
9
- #ifndef CLANG_TIDY
2
+ #include "streaming_model.h"
10
3
 
11
4
  #ifdef USE_ESP_IDF
12
5
 
@@ -14,13 +7,13 @@
14
7
  #include "esphome/core/helpers.h"
15
8
  #include "esphome/core/log.h"
16
9
 
17
- #include "audio_preprocessor_int8_model_data.h"
10
+ #include <frontend.h>
11
+ #include <frontend_util.h>
18
12
 
19
13
  #include <tensorflow/lite/core/c/common.h>
20
14
  #include <tensorflow/lite/micro/micro_interpreter.h>
21
15
  #include <tensorflow/lite/micro/micro_mutable_op_resolver.h>
22
16
 
23
- #include <cinttypes>
24
17
  #include <cmath>
25
18
 
26
19
  namespace esphome {
@@ -29,9 +22,9 @@ namespace micro_wake_word {
29
22
  static const char *const TAG = "micro_wake_word";
30
23
 
31
24
  static const size_t SAMPLE_RATE_HZ = 16000; // 16 kHz
32
- static const size_t BUFFER_LENGTH = 500; // 0.5 seconds
25
+ static const size_t BUFFER_LENGTH = 64; // 0.064 seconds
33
26
  static const size_t BUFFER_SIZE = SAMPLE_RATE_HZ / 1000 * BUFFER_LENGTH;
34
- static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms
27
+ static const size_t INPUT_BUFFER_SIZE = 16 * SAMPLE_RATE_HZ / 1000; // 16ms * 16kHz / 1000ms
35
28
 
36
29
  float MicroWakeWord::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
37
30
 
@@ -56,57 +49,55 @@ static const LogString *micro_wake_word_state_to_string(State state) {
56
49
 
57
50
  void MicroWakeWord::dump_config() {
58
51
  ESP_LOGCONFIG(TAG, "microWakeWord:");
59
- ESP_LOGCONFIG(TAG, " Wake Word: %s", this->get_wake_word().c_str());
60
- ESP_LOGCONFIG(TAG, " Probability cutoff: %.3f", this->probability_cutoff_);
61
- ESP_LOGCONFIG(TAG, " Sliding window size: %d", this->sliding_window_average_size_);
52
+ ESP_LOGCONFIG(TAG, " models:");
53
+ for (auto &model : this->wake_word_models_) {
54
+ model.log_model_config();
55
+ }
56
+ #ifdef USE_MICRO_WAKE_WORD_VAD
57
+ this->vad_model_->log_model_config();
58
+ #endif
62
59
  }
63
60
 
64
61
  void MicroWakeWord::setup() {
65
62
  ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
66
63
 
67
- if (!this->initialize_models()) {
68
- ESP_LOGE(TAG, "Failed to initialize models");
69
- this->mark_failed();
70
- return;
71
- }
72
-
73
- ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
74
- this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE * sizeof(int16_t));
75
- if (this->input_buffer_ == nullptr) {
76
- ESP_LOGW(TAG, "Could not allocate input buffer");
77
- this->mark_failed();
78
- return;
79
- }
80
-
81
- this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
82
- if (this->ring_buffer_ == nullptr) {
83
- ESP_LOGW(TAG, "Could not allocate ring buffer");
64
+ if (!this->register_streaming_ops_(this->streaming_op_resolver_)) {
84
65
  this->mark_failed();
85
66
  return;
86
67
  }
87
68
 
88
69
  ESP_LOGCONFIG(TAG, "Micro Wake Word initialized");
89
- }
90
-
91
- int MicroWakeWord::read_microphone_() {
92
- size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
93
- if (bytes_read == 0) {
94
- return 0;
95
- }
96
-
97
- size_t bytes_free = this->ring_buffer_->free();
98
70
 
99
- if (bytes_free < bytes_read) {
100
- ESP_LOGW(TAG,
101
- "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
102
- "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
103
- bytes_free, bytes_read);
71
+ this->frontend_config_.window.size_ms = FEATURE_DURATION_MS;
72
+ this->frontend_config_.window.step_size_ms = this->features_step_size_;
73
+ this->frontend_config_.filterbank.num_channels = PREPROCESSOR_FEATURE_SIZE;
74
+ this->frontend_config_.filterbank.lower_band_limit = 125.0;
75
+ this->frontend_config_.filterbank.upper_band_limit = 7500.0;
76
+ this->frontend_config_.noise_reduction.smoothing_bits = 10;
77
+ this->frontend_config_.noise_reduction.even_smoothing = 0.025;
78
+ this->frontend_config_.noise_reduction.odd_smoothing = 0.06;
79
+ this->frontend_config_.noise_reduction.min_signal_remaining = 0.05;
80
+ this->frontend_config_.pcan_gain_control.enable_pcan = 1;
81
+ this->frontend_config_.pcan_gain_control.strength = 0.95;
82
+ this->frontend_config_.pcan_gain_control.offset = 80.0;
83
+ this->frontend_config_.pcan_gain_control.gain_bits = 21;
84
+ this->frontend_config_.log_scale.enable_log = 1;
85
+ this->frontend_config_.log_scale.scale_shift = 6;
86
+ }
104
87
 
105
- this->ring_buffer_->reset();
106
- }
88
+ void MicroWakeWord::add_wake_word_model(const uint8_t *model_start, float probability_cutoff,
89
+ size_t sliding_window_average_size, const std::string &wake_word,
90
+ size_t tensor_arena_size) {
91
+ this->wake_word_models_.emplace_back(model_start, probability_cutoff, sliding_window_average_size, wake_word,
92
+ tensor_arena_size);
93
+ }
107
94
 
108
- return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
95
+ #ifdef USE_MICRO_WAKE_WORD_VAD
96
+ void MicroWakeWord::add_vad_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size,
97
+ size_t tensor_arena_size) {
98
+ this->vad_model_ = make_unique<VADModel>(model_start, probability_cutoff, sliding_window_size, tensor_arena_size);
109
99
  }
100
+ #endif
110
101
 
111
102
  void MicroWakeWord::loop() {
112
103
  switch (this->state_) {
@@ -124,9 +115,12 @@ void MicroWakeWord::loop() {
124
115
  }
125
116
  break;
126
117
  case State::DETECTING_WAKE_WORD:
127
- this->read_microphone_();
128
- if (this->detect_wake_word_()) {
129
- ESP_LOGD(TAG, "Wake Word Detected");
118
+ while (!this->has_enough_samples_()) {
119
+ this->read_microphone_();
120
+ }
121
+ this->update_model_probabilities_();
122
+ if (this->detect_wake_words_()) {
123
+ ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
130
124
  this->detected_ = true;
131
125
  this->set_state_(State::STOP_MICROPHONE);
132
126
  }
@@ -136,13 +130,16 @@ void MicroWakeWord::loop() {
136
130
  this->microphone_->stop();
137
131
  this->set_state_(State::STOPPING_MICROPHONE);
138
132
  this->high_freq_.stop();
133
+ this->unload_models_();
134
+ this->deallocate_buffers_();
139
135
  break;
140
136
  case State::STOPPING_MICROPHONE:
141
137
  if (this->microphone_->is_stopped()) {
142
138
  this->set_state_(State::IDLE);
143
139
  if (this->detected_) {
140
+ this->wake_word_detected_trigger_->trigger(this->detected_wake_word_);
144
141
  this->detected_ = false;
145
- this->wake_word_detected_trigger_->trigger(this->wake_word_);
142
+ this->detected_wake_word_ = "";
146
143
  }
147
144
  }
148
145
  break;
@@ -150,14 +147,34 @@ void MicroWakeWord::loop() {
150
147
  }
151
148
 
152
149
  void MicroWakeWord::start() {
150
+ if (!this->is_ready()) {
151
+ ESP_LOGW(TAG, "Wake word detection can't start as the component hasn't been setup yet");
152
+ return;
153
+ }
154
+
153
155
  if (this->is_failed()) {
154
156
  ESP_LOGW(TAG, "Wake word component is marked as failed. Please check setup logs");
155
157
  return;
156
158
  }
159
+
160
+ if (!this->load_models_() || !this->allocate_buffers_()) {
161
+ ESP_LOGE(TAG, "Failed to load the wake word model(s) or allocate buffers");
162
+ this->status_set_error();
163
+ } else {
164
+ this->status_clear_error();
165
+ }
166
+
167
+ if (this->status_has_error()) {
168
+ ESP_LOGW(TAG, "Wake word component has an error. Please check logs");
169
+ return;
170
+ }
171
+
157
172
  if (this->state_ != State::IDLE) {
158
173
  ESP_LOGW(TAG, "Wake word is already running");
159
174
  return;
160
175
  }
176
+
177
+ this->reset_states_();
161
178
  this->set_state_(State::START_MICROPHONE);
162
179
  }
163
180
 
@@ -179,289 +196,218 @@ void MicroWakeWord::set_state_(State state) {
179
196
  this->state_ = state;
180
197
  }
181
198
 
182
- bool MicroWakeWord::initialize_models() {
183
- ExternalRAMAllocator<uint8_t> arena_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
184
- ExternalRAMAllocator<int8_t> features_allocator(ExternalRAMAllocator<int8_t>::ALLOW_FAILURE);
185
- ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
186
-
187
- this->streaming_tensor_arena_ = arena_allocator.allocate(STREAMING_MODEL_ARENA_SIZE);
188
- if (this->streaming_tensor_arena_ == nullptr) {
189
- ESP_LOGE(TAG, "Could not allocate the streaming model's tensor arena.");
190
- return false;
191
- }
192
-
193
- this->streaming_var_arena_ = arena_allocator.allocate(STREAMING_MODEL_VARIABLE_ARENA_SIZE);
194
- if (this->streaming_var_arena_ == nullptr) {
195
- ESP_LOGE(TAG, "Could not allocate the streaming model variable's tensor arena.");
196
- return false;
197
- }
198
-
199
- this->preprocessor_tensor_arena_ = arena_allocator.allocate(PREPROCESSOR_ARENA_SIZE);
200
- if (this->preprocessor_tensor_arena_ == nullptr) {
201
- ESP_LOGE(TAG, "Could not allocate the audio preprocessor model's tensor arena.");
202
- return false;
203
- }
204
-
205
- this->new_features_data_ = features_allocator.allocate(PREPROCESSOR_FEATURE_SIZE);
206
- if (this->new_features_data_ == nullptr) {
207
- ESP_LOGE(TAG, "Could not allocate the audio features buffer.");
208
- return false;
199
+ size_t MicroWakeWord::read_microphone_() {
200
+ size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
201
+ if (bytes_read == 0) {
202
+ return 0;
209
203
  }
210
204
 
211
- this->preprocessor_audio_buffer_ = audio_samples_allocator.allocate(SAMPLE_DURATION_COUNT);
212
- if (this->preprocessor_audio_buffer_ == nullptr) {
213
- ESP_LOGE(TAG, "Could not allocate the audio preprocessor's buffer.");
214
- return false;
215
- }
205
+ size_t bytes_free = this->ring_buffer_->free();
216
206
 
217
- this->preprocessor_model_ = tflite::GetModel(G_AUDIO_PREPROCESSOR_INT8_TFLITE);
218
- if (this->preprocessor_model_->version() != TFLITE_SCHEMA_VERSION) {
219
- ESP_LOGE(TAG, "Wake word's audio preprocessor model's schema is not supported");
220
- return false;
221
- }
207
+ if (bytes_free < bytes_read) {
208
+ ESP_LOGW(TAG,
209
+ "Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
210
+ "Resetting the ring buffer. Wake word detection accuracy will be reduced.",
211
+ bytes_free, bytes_read);
222
212
 
223
- this->streaming_model_ = tflite::GetModel(this->model_start_);
224
- if (this->streaming_model_->version() != TFLITE_SCHEMA_VERSION) {
225
- ESP_LOGE(TAG, "Wake word's streaming model's schema is not supported");
226
- return false;
213
+ this->ring_buffer_->reset();
227
214
  }
228
215
 
229
- static tflite::MicroMutableOpResolver<18> preprocessor_op_resolver;
230
- static tflite::MicroMutableOpResolver<17> streaming_op_resolver;
231
-
232
- if (!this->register_preprocessor_ops_(preprocessor_op_resolver))
233
- return false;
234
- if (!this->register_streaming_ops_(streaming_op_resolver))
235
- return false;
236
-
237
- tflite::MicroAllocator *ma =
238
- tflite::MicroAllocator::Create(this->streaming_var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
239
- this->mrv_ = tflite::MicroResourceVariables::Create(ma, 15);
240
-
241
- static tflite::MicroInterpreter static_preprocessor_interpreter(
242
- this->preprocessor_model_, preprocessor_op_resolver, this->preprocessor_tensor_arena_, PREPROCESSOR_ARENA_SIZE);
243
-
244
- static tflite::MicroInterpreter static_streaming_interpreter(this->streaming_model_, streaming_op_resolver,
245
- this->streaming_tensor_arena_,
246
- STREAMING_MODEL_ARENA_SIZE, this->mrv_);
247
-
248
- this->preprocessor_interperter_ = &static_preprocessor_interpreter;
249
- this->streaming_interpreter_ = &static_streaming_interpreter;
250
-
251
- // Allocate tensors for each models.
252
- if (this->preprocessor_interperter_->AllocateTensors() != kTfLiteOk) {
253
- ESP_LOGE(TAG, "Failed to allocate tensors for the audio preprocessor");
254
- return false;
255
- }
256
- if (this->streaming_interpreter_->AllocateTensors() != kTfLiteOk) {
257
- ESP_LOGE(TAG, "Failed to allocate tensors for the streaming model");
258
- return false;
259
- }
216
+ return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
217
+ }
260
218
 
261
- // Verify input tensor matches expected values
262
- TfLiteTensor *input = this->streaming_interpreter_->input(0);
263
- if ((input->dims->size != 3) || (input->dims->data[0] != 1) || (input->dims->data[0] != 1) ||
264
- (input->dims->data[1] != 1) || (input->dims->data[2] != PREPROCESSOR_FEATURE_SIZE)) {
265
- ESP_LOGE(TAG, "Wake word detection model tensor input dimensions is not 1x1x%u", input->dims->data[2]);
266
- return false;
267
- }
219
+ bool MicroWakeWord::allocate_buffers_() {
220
+ ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
268
221
 
269
- if (input->type != kTfLiteInt8) {
270
- ESP_LOGE(TAG, "Wake word detection model tensor input is not int8.");
271
- return false;
222
+ if (this->input_buffer_ == nullptr) {
223
+ this->input_buffer_ = audio_samples_allocator.allocate(INPUT_BUFFER_SIZE * sizeof(int16_t));
224
+ if (this->input_buffer_ == nullptr) {
225
+ ESP_LOGE(TAG, "Could not allocate input buffer");
226
+ return false;
227
+ }
272
228
  }
273
229
 
274
- // Verify output tensor matches expected values
275
- TfLiteTensor *output = this->streaming_interpreter_->output(0);
276
- if ((output->dims->size != 2) || (output->dims->data[0] != 1) || (output->dims->data[1] != 1)) {
277
- ESP_LOGE(TAG, "Wake word detection model tensor output dimensions is not 1x1.");
230
+ if (this->preprocessor_audio_buffer_ == nullptr) {
231
+ this->preprocessor_audio_buffer_ = audio_samples_allocator.allocate(this->new_samples_to_get_());
232
+ if (this->preprocessor_audio_buffer_ == nullptr) {
233
+ ESP_LOGE(TAG, "Could not allocate the audio preprocessor's buffer.");
234
+ return false;
235
+ }
278
236
  }
279
237
 
280
- if (output->type != kTfLiteUInt8) {
281
- ESP_LOGE(TAG, "Wake word detection model tensor input is not uint8.");
282
- return false;
238
+ if (this->ring_buffer_ == nullptr) {
239
+ this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
240
+ if (this->ring_buffer_ == nullptr) {
241
+ ESP_LOGE(TAG, "Could not allocate ring buffer");
242
+ return false;
243
+ }
283
244
  }
284
245
 
285
- this->recent_streaming_probabilities_.resize(this->sliding_window_average_size_, 0.0);
286
-
287
246
  return true;
288
247
  }
289
248
 
290
- bool MicroWakeWord::update_features_() {
291
- // Retrieve strided audio samples
292
- int16_t *audio_samples = nullptr;
293
- if (!this->stride_audio_samples_(&audio_samples)) {
249
+ void MicroWakeWord::deallocate_buffers_() {
250
+ ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
251
+ audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
252
+ this->input_buffer_ = nullptr;
253
+ audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
254
+ this->preprocessor_audio_buffer_ = nullptr;
255
+ }
256
+
257
+ bool MicroWakeWord::load_models_() {
258
+ // Setup preprocesor feature generator
259
+ if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, AUDIO_SAMPLE_FREQUENCY)) {
260
+ ESP_LOGD(TAG, "Failed to populate frontend state");
261
+ FrontendFreeStateContents(&this->frontend_state_);
294
262
  return false;
295
263
  }
296
264
 
297
- // Compute the features for the newest audio samples
298
- if (!this->generate_single_feature_(audio_samples, SAMPLE_DURATION_COUNT, this->new_features_data_)) {
265
+ // Setup streaming models
266
+ for (auto &model : this->wake_word_models_) {
267
+ if (!model.load_model(this->streaming_op_resolver_)) {
268
+ ESP_LOGE(TAG, "Failed to initialize a wake word model.");
269
+ return false;
270
+ }
271
+ }
272
+ #ifdef USE_MICRO_WAKE_WORD_VAD
273
+ if (!this->vad_model_->load_model(this->streaming_op_resolver_)) {
274
+ ESP_LOGE(TAG, "Failed to initialize VAD model.");
299
275
  return false;
300
276
  }
277
+ #endif
301
278
 
302
279
  return true;
303
280
  }
304
281
 
305
- float MicroWakeWord::perform_streaming_inference_() {
306
- TfLiteTensor *input = this->streaming_interpreter_->input(0);
282
+ void MicroWakeWord::unload_models_() {
283
+ FrontendFreeStateContents(&this->frontend_state_);
307
284
 
308
- size_t bytes_to_copy = input->bytes;
309
-
310
- memcpy((void *) (tflite::GetTensorData<int8_t>(input)), (const void *) (this->new_features_data_), bytes_to_copy);
311
-
312
- uint32_t prior_invoke = millis();
313
-
314
- TfLiteStatus invoke_status = this->streaming_interpreter_->Invoke();
315
- if (invoke_status != kTfLiteOk) {
316
- ESP_LOGW(TAG, "Streaming Interpreter Invoke failed");
317
- return false;
285
+ for (auto &model : this->wake_word_models_) {
286
+ model.unload_model();
318
287
  }
319
-
320
- ESP_LOGV(TAG, "Streaming Inference Latency=%" PRIu32 " ms", (millis() - prior_invoke));
321
-
322
- TfLiteTensor *output = this->streaming_interpreter_->output(0);
323
-
324
- return static_cast<float>(output->data.uint8[0]) / 255.0;
288
+ #ifdef USE_MICRO_WAKE_WORD_VAD
289
+ this->vad_model_->unload_model();
290
+ #endif
325
291
  }
326
292
 
327
- bool MicroWakeWord::detect_wake_word_() {
328
- // Preprocess the newest audio samples into features
329
- if (!this->update_features_()) {
330
- return false;
331
- }
293
+ void MicroWakeWord::update_model_probabilities_() {
294
+ int8_t audio_features[PREPROCESSOR_FEATURE_SIZE];
332
295
 
333
- // Perform inference
334
- float streaming_prob = this->perform_streaming_inference_();
296
+ if (!this->generate_features_for_window_(audio_features)) {
297
+ return;
298
+ }
335
299
 
336
- // Add the most recent probability to the sliding window
337
- this->recent_streaming_probabilities_[this->last_n_index_] = streaming_prob;
338
- ++this->last_n_index_;
339
- if (this->last_n_index_ == this->sliding_window_average_size_)
340
- this->last_n_index_ = 0;
300
+ // Increase the counter since the last positive detection
301
+ this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
341
302
 
342
- float sum = 0.0;
343
- for (auto &prob : this->recent_streaming_probabilities_) {
344
- sum += prob;
303
+ for (auto &model : this->wake_word_models_) {
304
+ // Perform inference
305
+ model.perform_streaming_inference(audio_features);
345
306
  }
307
+ #ifdef USE_MICRO_WAKE_WORD_VAD
308
+ this->vad_model_->perform_streaming_inference(audio_features);
309
+ #endif
310
+ }
346
311
 
347
- float sliding_window_average = sum / static_cast<float>(this->sliding_window_average_size_);
348
-
349
- // Ensure we have enough samples since the last positive detection
350
- this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
312
+ bool MicroWakeWord::detect_wake_words_() {
313
+ // Verify we have processed samples since the last positive detection
351
314
  if (this->ignore_windows_ < 0) {
352
315
  return false;
353
316
  }
354
317
 
355
- // Detect the wake word if the sliding window average is above the cutoff
356
- if (sliding_window_average > this->probability_cutoff_) {
357
- this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
358
- for (auto &prob : this->recent_streaming_probabilities_) {
359
- prob = 0;
360
- }
318
+ #ifdef USE_MICRO_WAKE_WORD_VAD
319
+ bool vad_state = this->vad_model_->determine_detected();
320
+ #endif
361
321
 
362
- ESP_LOGD(TAG, "Wake word sliding average probability is %.3f and most recent probability is %.3f",
363
- sliding_window_average, streaming_prob);
364
- return true;
322
+ for (auto &model : this->wake_word_models_) {
323
+ if (model.determine_detected()) {
324
+ #ifdef USE_MICRO_WAKE_WORD_VAD
325
+ if (vad_state) {
326
+ #endif
327
+ this->detected_wake_word_ = model.get_wake_word();
328
+ return true;
329
+ #ifdef USE_MICRO_WAKE_WORD_VAD
330
+ } else {
331
+ ESP_LOGD(TAG, "Wake word model predicts %s, but VAD model doesn't.", model.get_wake_word().c_str());
332
+ }
333
+ #endif
334
+ }
365
335
  }
366
336
 
367
337
  return false;
368
338
  }
369
339
 
370
- void MicroWakeWord::set_sliding_window_average_size(size_t size) {
371
- this->sliding_window_average_size_ = size;
372
- this->recent_streaming_probabilities_.resize(this->sliding_window_average_size_, 0.0);
373
- }
374
-
375
- bool MicroWakeWord::slice_available_() {
376
- size_t available = this->ring_buffer_->available();
377
-
378
- return available > (NEW_SAMPLES_TO_GET * sizeof(int16_t));
340
+ bool MicroWakeWord::has_enough_samples_() {
341
+ return this->ring_buffer_->available() >=
342
+ (this->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000)) * sizeof(int16_t);
379
343
  }
380
344
 
381
- bool MicroWakeWord::stride_audio_samples_(int16_t **audio_samples) {
382
- if (!this->slice_available_()) {
345
+ bool MicroWakeWord::generate_features_for_window_(int8_t features[PREPROCESSOR_FEATURE_SIZE]) {
346
+ // Ensure we have enough new audio samples in the ring buffer for a full window
347
+ if (!this->has_enough_samples_()) {
383
348
  return false;
384
349
  }
385
350
 
386
- // Copy the last 320 bytes (160 samples over 10 ms) from the audio buffer to the start of the audio buffer
387
- memcpy((void *) (this->preprocessor_audio_buffer_), (void *) (this->preprocessor_audio_buffer_ + NEW_SAMPLES_TO_GET),
388
- HISTORY_SAMPLES_TO_KEEP * sizeof(int16_t));
389
-
390
- // Copy 640 bytes (320 samples over 20 ms) from the ring buffer into the audio buffer offset 320 bytes (160 samples
391
- // over 10 ms)
392
- size_t bytes_read = this->ring_buffer_->read((void *) (this->preprocessor_audio_buffer_ + HISTORY_SAMPLES_TO_KEEP),
393
- NEW_SAMPLES_TO_GET * sizeof(int16_t), pdMS_TO_TICKS(200));
351
+ size_t bytes_read = this->ring_buffer_->read((void *) (this->preprocessor_audio_buffer_),
352
+ this->new_samples_to_get_() * sizeof(int16_t), pdMS_TO_TICKS(200));
394
353
 
395
354
  if (bytes_read == 0) {
396
355
  ESP_LOGE(TAG, "Could not read data from Ring Buffer");
397
- } else if (bytes_read < NEW_SAMPLES_TO_GET * sizeof(int16_t)) {
356
+ } else if (bytes_read < this->new_samples_to_get_() * sizeof(int16_t)) {
398
357
  ESP_LOGD(TAG, "Partial Read of Data by Model");
399
358
  ESP_LOGD(TAG, "Could only read %d bytes when required %d bytes ", bytes_read,
400
- (int) (NEW_SAMPLES_TO_GET * sizeof(int16_t)));
401
- return false;
359
+ (int) (this->new_samples_to_get_() * sizeof(int16_t)));
360
+ return false;
361
+ }
362
+
363
+ size_t num_samples_read;
364
+ struct FrontendOutput frontend_output = FrontendProcessSamples(
365
+ &this->frontend_state_, this->preprocessor_audio_buffer_, this->new_samples_to_get_(), &num_samples_read);
366
+
367
+ for (size_t i = 0; i < frontend_output.size; ++i) {
368
+ // These scaling values are set to match the TFLite audio frontend int8 output.
369
+ // The feature pipeline outputs 16-bit signed integers in roughly a 0 to 670
370
+ // range. In training, these are then arbitrarily divided by 25.6 to get
371
+ // float values in the rough range of 0.0 to 26.0. This scaling is performed
372
+ // for historical reasons, to match up with the output of other feature
373
+ // generators.
374
+ // The process is then further complicated when we quantize the model. This
375
+ // means we have to scale the 0.0 to 26.0 real values to the -128 to 127
376
+ // signed integer numbers.
377
+ // All this means that to get matching values from our integer feature
378
+ // output into the tensor input, we have to perform:
379
+ // input = (((feature / 25.6) / 26.0) * 256) - 128
380
+ // To simplify this and perform it in 32-bit integer math, we rearrange to:
381
+ // input = (feature * 256) / (25.6 * 26.0) - 128
382
+ constexpr int32_t value_scale = 256;
383
+ constexpr int32_t value_div = 666; // 666 = 25.6 * 26.0 after rounding
384
+ int32_t value = ((frontend_output.values[i] * value_scale) + (value_div / 2)) / value_div;
385
+ value -= 128;
386
+ if (value < -128) {
387
+ value = -128;
388
+ }
389
+ if (value > 127) {
390
+ value = 127;
391
+ }
392
+ features[i] = value;
402
393
  }
403
394
 
404
- *audio_samples = this->preprocessor_audio_buffer_;
405
395
  return true;
406
396
  }
407
397
 
408
- bool MicroWakeWord::generate_single_feature_(const int16_t *audio_data, const int audio_data_size,
409
- int8_t feature_output[PREPROCESSOR_FEATURE_SIZE]) {
410
- TfLiteTensor *input = this->preprocessor_interperter_->input(0);
411
- TfLiteTensor *output = this->preprocessor_interperter_->output(0);
412
- std::copy_n(audio_data, audio_data_size, tflite::GetTensorData<int16_t>(input));
413
-
414
- if (this->preprocessor_interperter_->Invoke() != kTfLiteOk) {
415
- ESP_LOGE(TAG, "Failed to preprocess audio for local wake word.");
416
- return false;
398
+ void MicroWakeWord::reset_states_() {
399
+ ESP_LOGD(TAG, "Resetting buffers and probabilities");
400
+ this->ring_buffer_->reset();
401
+ this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
402
+ for (auto &model : this->wake_word_models_) {
403
+ model.reset_probabilities();
417
404
  }
418
- std::memcpy(feature_output, tflite::GetTensorData<int8_t>(output), PREPROCESSOR_FEATURE_SIZE * sizeof(int8_t));
419
-
420
- return true;
421
- }
422
-
423
- bool MicroWakeWord::register_preprocessor_ops_(tflite::MicroMutableOpResolver<18> &op_resolver) {
424
- if (op_resolver.AddReshape() != kTfLiteOk)
425
- return false;
426
- if (op_resolver.AddCast() != kTfLiteOk)
427
- return false;
428
- if (op_resolver.AddStridedSlice() != kTfLiteOk)
429
- return false;
430
- if (op_resolver.AddConcatenation() != kTfLiteOk)
431
- return false;
432
- if (op_resolver.AddMul() != kTfLiteOk)
433
- return false;
434
- if (op_resolver.AddAdd() != kTfLiteOk)
435
- return false;
436
- if (op_resolver.AddDiv() != kTfLiteOk)
437
- return false;
438
- if (op_resolver.AddMinimum() != kTfLiteOk)
439
- return false;
440
- if (op_resolver.AddMaximum() != kTfLiteOk)
441
- return false;
442
- if (op_resolver.AddWindow() != kTfLiteOk)
443
- return false;
444
- if (op_resolver.AddFftAutoScale() != kTfLiteOk)
445
- return false;
446
- if (op_resolver.AddRfft() != kTfLiteOk)
447
- return false;
448
- if (op_resolver.AddEnergy() != kTfLiteOk)
449
- return false;
450
- if (op_resolver.AddFilterBank() != kTfLiteOk)
451
- return false;
452
- if (op_resolver.AddFilterBankSquareRoot() != kTfLiteOk)
453
- return false;
454
- if (op_resolver.AddFilterBankSpectralSubtraction() != kTfLiteOk)
455
- return false;
456
- if (op_resolver.AddPCAN() != kTfLiteOk)
457
- return false;
458
- if (op_resolver.AddFilterBankLog() != kTfLiteOk)
459
- return false;
460
-
461
- return true;
405
+ #ifdef USE_MICRO_WAKE_WORD_VAD
406
+ this->vad_model_->reset_probabilities();
407
+ #endif
462
408
  }
463
409
 
464
- bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &op_resolver) {
410
+ bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<20> &op_resolver) {
465
411
  if (op_resolver.AddCallOnce() != kTfLiteOk)
466
412
  return false;
467
413
  if (op_resolver.AddVarHandle() != kTfLiteOk)
@@ -496,6 +442,12 @@ bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &
496
442
  return false;
497
443
  if (op_resolver.AddMaxPool2D() != kTfLiteOk)
498
444
  return false;
445
+ if (op_resolver.AddPad() != kTfLiteOk)
446
+ return false;
447
+ if (op_resolver.AddPack() != kTfLiteOk)
448
+ return false;
449
+ if (op_resolver.AddSplitV() != kTfLiteOk)
450
+ return false;
499
451
 
500
452
  return true;
501
453
  }
@@ -504,5 +456,3 @@ bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &
504
456
  } // namespace esphome
505
457
 
506
458
  #endif // USE_ESP_IDF
507
-
508
- #endif // CLANG_TIDY