esphome 2024.6.6__py3-none-any.whl → 2024.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esphome/components/aht10/aht10.cpp +4 -2
- esphome/components/climate/climate.cpp +10 -6
- esphome/components/climate/climate_traits.h +3 -3
- esphome/components/cover/cover.h +2 -2
- esphome/components/esp32_camera/__init__.py +6 -3
- esphome/components/esp32_can/canbus.py +3 -0
- esphome/components/ethernet/ethernet_component.cpp +8 -3
- esphome/components/font/__init__.py +2 -28
- esphome/components/gree/climate.py +1 -0
- esphome/components/gree/gree.cpp +11 -3
- esphome/components/gree/gree.h +5 -1
- esphome/components/haier/binary_sensor/__init__.py +4 -4
- esphome/components/haier/button/__init__.py +1 -1
- esphome/components/haier/climate.py +43 -9
- esphome/components/haier/haier_base.cpp +4 -0
- esphome/components/haier/haier_base.h +11 -1
- esphome/components/haier/hon_climate.cpp +109 -55
- esphome/components/haier/hon_climate.h +7 -1
- esphome/components/haier/hon_packet.h +5 -0
- esphome/components/haier/sensor/__init__.py +5 -5
- esphome/components/haier/smartair2_climate.cpp +1 -0
- esphome/components/haier/text_sensor/__init__.py +4 -4
- esphome/components/heatpumpir/climate.py +12 -5
- esphome/components/heatpumpir/heatpumpir.cpp +11 -0
- esphome/components/heatpumpir/heatpumpir.h +11 -0
- esphome/components/http_request/http_request_arduino.cpp +7 -2
- esphome/components/http_request/update/http_request_update.cpp +6 -7
- esphome/components/http_request/update/http_request_update.h +0 -3
- esphome/components/i2s_audio/__init__.py +10 -0
- esphome/components/i2s_audio/microphone/__init__.py +7 -0
- esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp +2 -3
- esphome/components/i2s_audio/microphone/i2s_audio_microphone.h +3 -0
- esphome/components/image/__init__.py +2 -29
- esphome/components/improv_serial/improv_serial_component.cpp +9 -8
- esphome/components/ltr390/ltr390.cpp +44 -29
- esphome/components/ltr390/ltr390.h +9 -5
- esphome/components/ltr390/sensor.py +35 -5
- esphome/components/mdns/__init__.py +3 -3
- esphome/components/mdns/mdns_component.cpp +3 -1
- esphome/components/mdns/mdns_component.h +3 -1
- esphome/components/mdns/mdns_esp32.cpp +2 -1
- esphome/components/mdns/mdns_esp8266.cpp +2 -1
- esphome/components/mdns/mdns_host.cpp +2 -1
- esphome/components/mdns/mdns_libretiny.cpp +2 -1
- esphome/components/mdns/mdns_rp2040.cpp +2 -1
- esphome/components/micro_wake_word/__init__.py +205 -56
- esphome/components/micro_wake_word/micro_wake_word.cpp +225 -275
- esphome/components/micro_wake_word/micro_wake_word.h +77 -107
- esphome/components/micro_wake_word/preprocessor_settings.h +20 -0
- esphome/components/micro_wake_word/streaming_model.cpp +189 -0
- esphome/components/micro_wake_word/streaming_model.h +84 -0
- esphome/components/mitsubishi/mitsubishi.cpp +1 -0
- esphome/components/modbus_controller/text_sensor/__init__.py +2 -1
- esphome/components/modbus_controller/text_sensor/modbus_textsensor.cpp +4 -1
- esphome/components/modbus_controller/text_sensor/modbus_textsensor.h +1 -1
- esphome/components/number/__init__.py +2 -0
- esphome/components/ota/ota_backend_arduino_esp32.cpp +22 -7
- esphome/components/ota/ota_backend_arduino_esp8266.cpp +23 -8
- esphome/components/ota/ota_backend_arduino_libretiny.cpp +22 -7
- esphome/components/ota/ota_backend_arduino_rp2040.cpp +22 -7
- esphome/components/pmsa003i/pmsa003i.cpp +9 -0
- esphome/components/qspi_amoled/display.py +16 -4
- esphome/components/qspi_amoled/qspi_amoled.cpp +16 -0
- esphome/components/qspi_amoled/qspi_amoled.h +0 -3
- esphome/components/remote_base/dooya_protocol.cpp +4 -4
- esphome/components/remote_base/rc_switch_protocol.cpp +1 -1
- esphome/components/restart/button/__init__.py +2 -0
- esphome/components/script/__init__.py +1 -1
- esphome/components/sensor/__init__.py +2 -0
- esphome/components/tuya/tuya.cpp +8 -2
- esphome/components/tuya/tuya.h +3 -1
- esphome/components/uart/__init__.py +72 -9
- esphome/components/uart/uart_component_esp32_arduino.cpp +18 -4
- esphome/components/uart/uart_component_esp_idf.cpp +22 -2
- esphome/components/uart/uart_component_host.cpp +295 -0
- esphome/components/uart/uart_component_host.h +38 -0
- esphome/components/uptime/sensor.py +44 -11
- esphome/components/uptime/{uptime_sensor.cpp → uptime_seconds_sensor.cpp} +11 -7
- esphome/components/uptime/{uptime_sensor.h → uptime_seconds_sensor.h} +2 -2
- esphome/components/uptime/uptime_timestamp_sensor.cpp +39 -0
- esphome/components/uptime/uptime_timestamp_sensor.h +30 -0
- esphome/components/veml7700/veml7700.cpp +1 -1
- esphome/components/veml7700/veml7700.h +5 -5
- esphome/components/voice_assistant/voice_assistant.cpp +4 -2
- esphome/components/web_server/server_index_v2.h +42 -41
- esphome/components/web_server/server_index_v3.h +368 -367
- esphome/components/wifi/wifi_component_esp_idf.cpp +1 -1
- esphome/components/wifi/wifi_component_pico_w.cpp +18 -2
- esphome/components/wireguard/__init__.py +1 -1
- esphome/components/x9c/output.py +7 -1
- esphome/const.py +2 -1
- esphome/core/defines.h +1 -0
- esphome/core/helpers.cpp +2 -2
- esphome/core/helpers.h +1 -1
- esphome/external_files.py +26 -0
- {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/METADATA +1 -1
- {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/RECORD +101 -95
- esphome/components/micro_wake_word/audio_preprocessor_int8_model_data.h +0 -493
- {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/LICENSE +0 -0
- {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/WHEEL +0 -0
- {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/entry_points.txt +0 -0
- {esphome-2024.6.6.dist-info → esphome-2024.7.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,5 @@
|
|
1
1
|
#include "micro_wake_word.h"
|
2
|
-
|
3
|
-
/**
|
4
|
-
* This is a workaround until we can figure out a way to get
|
5
|
-
* the tflite-micro idf component code available in CI
|
6
|
-
*
|
7
|
-
* */
|
8
|
-
//
|
9
|
-
#ifndef CLANG_TIDY
|
2
|
+
#include "streaming_model.h"
|
10
3
|
|
11
4
|
#ifdef USE_ESP_IDF
|
12
5
|
|
@@ -14,13 +7,13 @@
|
|
14
7
|
#include "esphome/core/helpers.h"
|
15
8
|
#include "esphome/core/log.h"
|
16
9
|
|
17
|
-
#include
|
10
|
+
#include <frontend.h>
|
11
|
+
#include <frontend_util.h>
|
18
12
|
|
19
13
|
#include <tensorflow/lite/core/c/common.h>
|
20
14
|
#include <tensorflow/lite/micro/micro_interpreter.h>
|
21
15
|
#include <tensorflow/lite/micro/micro_mutable_op_resolver.h>
|
22
16
|
|
23
|
-
#include <cinttypes>
|
24
17
|
#include <cmath>
|
25
18
|
|
26
19
|
namespace esphome {
|
@@ -29,9 +22,9 @@ namespace micro_wake_word {
|
|
29
22
|
static const char *const TAG = "micro_wake_word";
|
30
23
|
|
31
24
|
static const size_t SAMPLE_RATE_HZ = 16000; // 16 kHz
|
32
|
-
static const size_t BUFFER_LENGTH =
|
25
|
+
static const size_t BUFFER_LENGTH = 64; // 0.064 seconds
|
33
26
|
static const size_t BUFFER_SIZE = SAMPLE_RATE_HZ / 1000 * BUFFER_LENGTH;
|
34
|
-
static const size_t INPUT_BUFFER_SIZE =
|
27
|
+
static const size_t INPUT_BUFFER_SIZE = 16 * SAMPLE_RATE_HZ / 1000; // 16ms * 16kHz / 1000ms
|
35
28
|
|
36
29
|
float MicroWakeWord::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }
|
37
30
|
|
@@ -56,57 +49,55 @@ static const LogString *micro_wake_word_state_to_string(State state) {
|
|
56
49
|
|
57
50
|
void MicroWakeWord::dump_config() {
|
58
51
|
ESP_LOGCONFIG(TAG, "microWakeWord:");
|
59
|
-
ESP_LOGCONFIG(TAG, "
|
60
|
-
|
61
|
-
|
52
|
+
ESP_LOGCONFIG(TAG, " models:");
|
53
|
+
for (auto &model : this->wake_word_models_) {
|
54
|
+
model.log_model_config();
|
55
|
+
}
|
56
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
57
|
+
this->vad_model_->log_model_config();
|
58
|
+
#endif
|
62
59
|
}
|
63
60
|
|
64
61
|
void MicroWakeWord::setup() {
|
65
62
|
ESP_LOGCONFIG(TAG, "Setting up microWakeWord...");
|
66
63
|
|
67
|
-
if (!this->
|
68
|
-
ESP_LOGE(TAG, "Failed to initialize models");
|
69
|
-
this->mark_failed();
|
70
|
-
return;
|
71
|
-
}
|
72
|
-
|
73
|
-
ExternalRAMAllocator<int16_t> allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
74
|
-
this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE * sizeof(int16_t));
|
75
|
-
if (this->input_buffer_ == nullptr) {
|
76
|
-
ESP_LOGW(TAG, "Could not allocate input buffer");
|
77
|
-
this->mark_failed();
|
78
|
-
return;
|
79
|
-
}
|
80
|
-
|
81
|
-
this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
|
82
|
-
if (this->ring_buffer_ == nullptr) {
|
83
|
-
ESP_LOGW(TAG, "Could not allocate ring buffer");
|
64
|
+
if (!this->register_streaming_ops_(this->streaming_op_resolver_)) {
|
84
65
|
this->mark_failed();
|
85
66
|
return;
|
86
67
|
}
|
87
68
|
|
88
69
|
ESP_LOGCONFIG(TAG, "Micro Wake Word initialized");
|
89
|
-
}
|
90
|
-
|
91
|
-
int MicroWakeWord::read_microphone_() {
|
92
|
-
size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
|
93
|
-
if (bytes_read == 0) {
|
94
|
-
return 0;
|
95
|
-
}
|
96
|
-
|
97
|
-
size_t bytes_free = this->ring_buffer_->free();
|
98
70
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
71
|
+
this->frontend_config_.window.size_ms = FEATURE_DURATION_MS;
|
72
|
+
this->frontend_config_.window.step_size_ms = this->features_step_size_;
|
73
|
+
this->frontend_config_.filterbank.num_channels = PREPROCESSOR_FEATURE_SIZE;
|
74
|
+
this->frontend_config_.filterbank.lower_band_limit = 125.0;
|
75
|
+
this->frontend_config_.filterbank.upper_band_limit = 7500.0;
|
76
|
+
this->frontend_config_.noise_reduction.smoothing_bits = 10;
|
77
|
+
this->frontend_config_.noise_reduction.even_smoothing = 0.025;
|
78
|
+
this->frontend_config_.noise_reduction.odd_smoothing = 0.06;
|
79
|
+
this->frontend_config_.noise_reduction.min_signal_remaining = 0.05;
|
80
|
+
this->frontend_config_.pcan_gain_control.enable_pcan = 1;
|
81
|
+
this->frontend_config_.pcan_gain_control.strength = 0.95;
|
82
|
+
this->frontend_config_.pcan_gain_control.offset = 80.0;
|
83
|
+
this->frontend_config_.pcan_gain_control.gain_bits = 21;
|
84
|
+
this->frontend_config_.log_scale.enable_log = 1;
|
85
|
+
this->frontend_config_.log_scale.scale_shift = 6;
|
86
|
+
}
|
104
87
|
|
105
|
-
|
106
|
-
|
88
|
+
void MicroWakeWord::add_wake_word_model(const uint8_t *model_start, float probability_cutoff,
|
89
|
+
size_t sliding_window_average_size, const std::string &wake_word,
|
90
|
+
size_t tensor_arena_size) {
|
91
|
+
this->wake_word_models_.emplace_back(model_start, probability_cutoff, sliding_window_average_size, wake_word,
|
92
|
+
tensor_arena_size);
|
93
|
+
}
|
107
94
|
|
108
|
-
|
95
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
96
|
+
void MicroWakeWord::add_vad_model(const uint8_t *model_start, float probability_cutoff, size_t sliding_window_size,
|
97
|
+
size_t tensor_arena_size) {
|
98
|
+
this->vad_model_ = make_unique<VADModel>(model_start, probability_cutoff, sliding_window_size, tensor_arena_size);
|
109
99
|
}
|
100
|
+
#endif
|
110
101
|
|
111
102
|
void MicroWakeWord::loop() {
|
112
103
|
switch (this->state_) {
|
@@ -124,9 +115,12 @@ void MicroWakeWord::loop() {
|
|
124
115
|
}
|
125
116
|
break;
|
126
117
|
case State::DETECTING_WAKE_WORD:
|
127
|
-
this->
|
128
|
-
|
129
|
-
|
118
|
+
while (!this->has_enough_samples_()) {
|
119
|
+
this->read_microphone_();
|
120
|
+
}
|
121
|
+
this->update_model_probabilities_();
|
122
|
+
if (this->detect_wake_words_()) {
|
123
|
+
ESP_LOGD(TAG, "Wake Word '%s' Detected", (this->detected_wake_word_).c_str());
|
130
124
|
this->detected_ = true;
|
131
125
|
this->set_state_(State::STOP_MICROPHONE);
|
132
126
|
}
|
@@ -136,13 +130,16 @@ void MicroWakeWord::loop() {
|
|
136
130
|
this->microphone_->stop();
|
137
131
|
this->set_state_(State::STOPPING_MICROPHONE);
|
138
132
|
this->high_freq_.stop();
|
133
|
+
this->unload_models_();
|
134
|
+
this->deallocate_buffers_();
|
139
135
|
break;
|
140
136
|
case State::STOPPING_MICROPHONE:
|
141
137
|
if (this->microphone_->is_stopped()) {
|
142
138
|
this->set_state_(State::IDLE);
|
143
139
|
if (this->detected_) {
|
140
|
+
this->wake_word_detected_trigger_->trigger(this->detected_wake_word_);
|
144
141
|
this->detected_ = false;
|
145
|
-
this->
|
142
|
+
this->detected_wake_word_ = "";
|
146
143
|
}
|
147
144
|
}
|
148
145
|
break;
|
@@ -150,14 +147,34 @@ void MicroWakeWord::loop() {
|
|
150
147
|
}
|
151
148
|
|
152
149
|
void MicroWakeWord::start() {
|
150
|
+
if (!this->is_ready()) {
|
151
|
+
ESP_LOGW(TAG, "Wake word detection can't start as the component hasn't been setup yet");
|
152
|
+
return;
|
153
|
+
}
|
154
|
+
|
153
155
|
if (this->is_failed()) {
|
154
156
|
ESP_LOGW(TAG, "Wake word component is marked as failed. Please check setup logs");
|
155
157
|
return;
|
156
158
|
}
|
159
|
+
|
160
|
+
if (!this->load_models_() || !this->allocate_buffers_()) {
|
161
|
+
ESP_LOGE(TAG, "Failed to load the wake word model(s) or allocate buffers");
|
162
|
+
this->status_set_error();
|
163
|
+
} else {
|
164
|
+
this->status_clear_error();
|
165
|
+
}
|
166
|
+
|
167
|
+
if (this->status_has_error()) {
|
168
|
+
ESP_LOGW(TAG, "Wake word component has an error. Please check logs");
|
169
|
+
return;
|
170
|
+
}
|
171
|
+
|
157
172
|
if (this->state_ != State::IDLE) {
|
158
173
|
ESP_LOGW(TAG, "Wake word is already running");
|
159
174
|
return;
|
160
175
|
}
|
176
|
+
|
177
|
+
this->reset_states_();
|
161
178
|
this->set_state_(State::START_MICROPHONE);
|
162
179
|
}
|
163
180
|
|
@@ -179,289 +196,218 @@ void MicroWakeWord::set_state_(State state) {
|
|
179
196
|
this->state_ = state;
|
180
197
|
}
|
181
198
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
this->streaming_tensor_arena_ = arena_allocator.allocate(STREAMING_MODEL_ARENA_SIZE);
|
188
|
-
if (this->streaming_tensor_arena_ == nullptr) {
|
189
|
-
ESP_LOGE(TAG, "Could not allocate the streaming model's tensor arena.");
|
190
|
-
return false;
|
191
|
-
}
|
192
|
-
|
193
|
-
this->streaming_var_arena_ = arena_allocator.allocate(STREAMING_MODEL_VARIABLE_ARENA_SIZE);
|
194
|
-
if (this->streaming_var_arena_ == nullptr) {
|
195
|
-
ESP_LOGE(TAG, "Could not allocate the streaming model variable's tensor arena.");
|
196
|
-
return false;
|
197
|
-
}
|
198
|
-
|
199
|
-
this->preprocessor_tensor_arena_ = arena_allocator.allocate(PREPROCESSOR_ARENA_SIZE);
|
200
|
-
if (this->preprocessor_tensor_arena_ == nullptr) {
|
201
|
-
ESP_LOGE(TAG, "Could not allocate the audio preprocessor model's tensor arena.");
|
202
|
-
return false;
|
203
|
-
}
|
204
|
-
|
205
|
-
this->new_features_data_ = features_allocator.allocate(PREPROCESSOR_FEATURE_SIZE);
|
206
|
-
if (this->new_features_data_ == nullptr) {
|
207
|
-
ESP_LOGE(TAG, "Could not allocate the audio features buffer.");
|
208
|
-
return false;
|
199
|
+
size_t MicroWakeWord::read_microphone_() {
|
200
|
+
size_t bytes_read = this->microphone_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
|
201
|
+
if (bytes_read == 0) {
|
202
|
+
return 0;
|
209
203
|
}
|
210
204
|
|
211
|
-
|
212
|
-
if (this->preprocessor_audio_buffer_ == nullptr) {
|
213
|
-
ESP_LOGE(TAG, "Could not allocate the audio preprocessor's buffer.");
|
214
|
-
return false;
|
215
|
-
}
|
205
|
+
size_t bytes_free = this->ring_buffer_->free();
|
216
206
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
207
|
+
if (bytes_free < bytes_read) {
|
208
|
+
ESP_LOGW(TAG,
|
209
|
+
"Not enough free bytes in ring buffer to store incoming audio data (free bytes=%d, incoming bytes=%d). "
|
210
|
+
"Resetting the ring buffer. Wake word detection accuracy will be reduced.",
|
211
|
+
bytes_free, bytes_read);
|
222
212
|
|
223
|
-
|
224
|
-
if (this->streaming_model_->version() != TFLITE_SCHEMA_VERSION) {
|
225
|
-
ESP_LOGE(TAG, "Wake word's streaming model's schema is not supported");
|
226
|
-
return false;
|
213
|
+
this->ring_buffer_->reset();
|
227
214
|
}
|
228
215
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
if (!this->register_preprocessor_ops_(preprocessor_op_resolver))
|
233
|
-
return false;
|
234
|
-
if (!this->register_streaming_ops_(streaming_op_resolver))
|
235
|
-
return false;
|
236
|
-
|
237
|
-
tflite::MicroAllocator *ma =
|
238
|
-
tflite::MicroAllocator::Create(this->streaming_var_arena_, STREAMING_MODEL_VARIABLE_ARENA_SIZE);
|
239
|
-
this->mrv_ = tflite::MicroResourceVariables::Create(ma, 15);
|
240
|
-
|
241
|
-
static tflite::MicroInterpreter static_preprocessor_interpreter(
|
242
|
-
this->preprocessor_model_, preprocessor_op_resolver, this->preprocessor_tensor_arena_, PREPROCESSOR_ARENA_SIZE);
|
243
|
-
|
244
|
-
static tflite::MicroInterpreter static_streaming_interpreter(this->streaming_model_, streaming_op_resolver,
|
245
|
-
this->streaming_tensor_arena_,
|
246
|
-
STREAMING_MODEL_ARENA_SIZE, this->mrv_);
|
247
|
-
|
248
|
-
this->preprocessor_interperter_ = &static_preprocessor_interpreter;
|
249
|
-
this->streaming_interpreter_ = &static_streaming_interpreter;
|
250
|
-
|
251
|
-
// Allocate tensors for each models.
|
252
|
-
if (this->preprocessor_interperter_->AllocateTensors() != kTfLiteOk) {
|
253
|
-
ESP_LOGE(TAG, "Failed to allocate tensors for the audio preprocessor");
|
254
|
-
return false;
|
255
|
-
}
|
256
|
-
if (this->streaming_interpreter_->AllocateTensors() != kTfLiteOk) {
|
257
|
-
ESP_LOGE(TAG, "Failed to allocate tensors for the streaming model");
|
258
|
-
return false;
|
259
|
-
}
|
216
|
+
return this->ring_buffer_->write((void *) this->input_buffer_, bytes_read);
|
217
|
+
}
|
260
218
|
|
261
|
-
|
262
|
-
|
263
|
-
if ((input->dims->size != 3) || (input->dims->data[0] != 1) || (input->dims->data[0] != 1) ||
|
264
|
-
(input->dims->data[1] != 1) || (input->dims->data[2] != PREPROCESSOR_FEATURE_SIZE)) {
|
265
|
-
ESP_LOGE(TAG, "Wake word detection model tensor input dimensions is not 1x1x%u", input->dims->data[2]);
|
266
|
-
return false;
|
267
|
-
}
|
219
|
+
bool MicroWakeWord::allocate_buffers_() {
|
220
|
+
ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
268
221
|
|
269
|
-
if (
|
270
|
-
|
271
|
-
|
222
|
+
if (this->input_buffer_ == nullptr) {
|
223
|
+
this->input_buffer_ = audio_samples_allocator.allocate(INPUT_BUFFER_SIZE * sizeof(int16_t));
|
224
|
+
if (this->input_buffer_ == nullptr) {
|
225
|
+
ESP_LOGE(TAG, "Could not allocate input buffer");
|
226
|
+
return false;
|
227
|
+
}
|
272
228
|
}
|
273
229
|
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
230
|
+
if (this->preprocessor_audio_buffer_ == nullptr) {
|
231
|
+
this->preprocessor_audio_buffer_ = audio_samples_allocator.allocate(this->new_samples_to_get_());
|
232
|
+
if (this->preprocessor_audio_buffer_ == nullptr) {
|
233
|
+
ESP_LOGE(TAG, "Could not allocate the audio preprocessor's buffer.");
|
234
|
+
return false;
|
235
|
+
}
|
278
236
|
}
|
279
237
|
|
280
|
-
if (
|
281
|
-
|
282
|
-
|
238
|
+
if (this->ring_buffer_ == nullptr) {
|
239
|
+
this->ring_buffer_ = RingBuffer::create(BUFFER_SIZE * sizeof(int16_t));
|
240
|
+
if (this->ring_buffer_ == nullptr) {
|
241
|
+
ESP_LOGE(TAG, "Could not allocate ring buffer");
|
242
|
+
return false;
|
243
|
+
}
|
283
244
|
}
|
284
245
|
|
285
|
-
this->recent_streaming_probabilities_.resize(this->sliding_window_average_size_, 0.0);
|
286
|
-
|
287
246
|
return true;
|
288
247
|
}
|
289
248
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
249
|
+
void MicroWakeWord::deallocate_buffers_() {
|
250
|
+
ExternalRAMAllocator<int16_t> audio_samples_allocator(ExternalRAMAllocator<int16_t>::ALLOW_FAILURE);
|
251
|
+
audio_samples_allocator.deallocate(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t));
|
252
|
+
this->input_buffer_ = nullptr;
|
253
|
+
audio_samples_allocator.deallocate(this->preprocessor_audio_buffer_, this->new_samples_to_get_());
|
254
|
+
this->preprocessor_audio_buffer_ = nullptr;
|
255
|
+
}
|
256
|
+
|
257
|
+
bool MicroWakeWord::load_models_() {
|
258
|
+
// Setup preprocesor feature generator
|
259
|
+
if (!FrontendPopulateState(&this->frontend_config_, &this->frontend_state_, AUDIO_SAMPLE_FREQUENCY)) {
|
260
|
+
ESP_LOGD(TAG, "Failed to populate frontend state");
|
261
|
+
FrontendFreeStateContents(&this->frontend_state_);
|
294
262
|
return false;
|
295
263
|
}
|
296
264
|
|
297
|
-
//
|
298
|
-
|
265
|
+
// Setup streaming models
|
266
|
+
for (auto &model : this->wake_word_models_) {
|
267
|
+
if (!model.load_model(this->streaming_op_resolver_)) {
|
268
|
+
ESP_LOGE(TAG, "Failed to initialize a wake word model.");
|
269
|
+
return false;
|
270
|
+
}
|
271
|
+
}
|
272
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
273
|
+
if (!this->vad_model_->load_model(this->streaming_op_resolver_)) {
|
274
|
+
ESP_LOGE(TAG, "Failed to initialize VAD model.");
|
299
275
|
return false;
|
300
276
|
}
|
277
|
+
#endif
|
301
278
|
|
302
279
|
return true;
|
303
280
|
}
|
304
281
|
|
305
|
-
|
306
|
-
|
282
|
+
void MicroWakeWord::unload_models_() {
|
283
|
+
FrontendFreeStateContents(&this->frontend_state_);
|
307
284
|
|
308
|
-
|
309
|
-
|
310
|
-
memcpy((void *) (tflite::GetTensorData<int8_t>(input)), (const void *) (this->new_features_data_), bytes_to_copy);
|
311
|
-
|
312
|
-
uint32_t prior_invoke = millis();
|
313
|
-
|
314
|
-
TfLiteStatus invoke_status = this->streaming_interpreter_->Invoke();
|
315
|
-
if (invoke_status != kTfLiteOk) {
|
316
|
-
ESP_LOGW(TAG, "Streaming Interpreter Invoke failed");
|
317
|
-
return false;
|
285
|
+
for (auto &model : this->wake_word_models_) {
|
286
|
+
model.unload_model();
|
318
287
|
}
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
TfLiteTensor *output = this->streaming_interpreter_->output(0);
|
323
|
-
|
324
|
-
return static_cast<float>(output->data.uint8[0]) / 255.0;
|
288
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
289
|
+
this->vad_model_->unload_model();
|
290
|
+
#endif
|
325
291
|
}
|
326
292
|
|
327
|
-
|
328
|
-
|
329
|
-
if (!this->update_features_()) {
|
330
|
-
return false;
|
331
|
-
}
|
293
|
+
void MicroWakeWord::update_model_probabilities_() {
|
294
|
+
int8_t audio_features[PREPROCESSOR_FEATURE_SIZE];
|
332
295
|
|
333
|
-
|
334
|
-
|
296
|
+
if (!this->generate_features_for_window_(audio_features)) {
|
297
|
+
return;
|
298
|
+
}
|
335
299
|
|
336
|
-
//
|
337
|
-
this->
|
338
|
-
++this->last_n_index_;
|
339
|
-
if (this->last_n_index_ == this->sliding_window_average_size_)
|
340
|
-
this->last_n_index_ = 0;
|
300
|
+
// Increase the counter since the last positive detection
|
301
|
+
this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
|
341
302
|
|
342
|
-
|
343
|
-
|
344
|
-
|
303
|
+
for (auto &model : this->wake_word_models_) {
|
304
|
+
// Perform inference
|
305
|
+
model.perform_streaming_inference(audio_features);
|
345
306
|
}
|
307
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
308
|
+
this->vad_model_->perform_streaming_inference(audio_features);
|
309
|
+
#endif
|
310
|
+
}
|
346
311
|
|
347
|
-
|
348
|
-
|
349
|
-
// Ensure we have enough samples since the last positive detection
|
350
|
-
this->ignore_windows_ = std::min(this->ignore_windows_ + 1, 0);
|
312
|
+
bool MicroWakeWord::detect_wake_words_() {
|
313
|
+
// Verify we have processed samples since the last positive detection
|
351
314
|
if (this->ignore_windows_ < 0) {
|
352
315
|
return false;
|
353
316
|
}
|
354
317
|
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
for (auto &prob : this->recent_streaming_probabilities_) {
|
359
|
-
prob = 0;
|
360
|
-
}
|
318
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
319
|
+
bool vad_state = this->vad_model_->determine_detected();
|
320
|
+
#endif
|
361
321
|
|
362
|
-
|
363
|
-
|
364
|
-
|
322
|
+
for (auto &model : this->wake_word_models_) {
|
323
|
+
if (model.determine_detected()) {
|
324
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
325
|
+
if (vad_state) {
|
326
|
+
#endif
|
327
|
+
this->detected_wake_word_ = model.get_wake_word();
|
328
|
+
return true;
|
329
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
330
|
+
} else {
|
331
|
+
ESP_LOGD(TAG, "Wake word model predicts %s, but VAD model doesn't.", model.get_wake_word().c_str());
|
332
|
+
}
|
333
|
+
#endif
|
334
|
+
}
|
365
335
|
}
|
366
336
|
|
367
337
|
return false;
|
368
338
|
}
|
369
339
|
|
370
|
-
|
371
|
-
this->
|
372
|
-
|
373
|
-
}
|
374
|
-
|
375
|
-
bool MicroWakeWord::slice_available_() {
|
376
|
-
size_t available = this->ring_buffer_->available();
|
377
|
-
|
378
|
-
return available > (NEW_SAMPLES_TO_GET * sizeof(int16_t));
|
340
|
+
bool MicroWakeWord::has_enough_samples_() {
|
341
|
+
return this->ring_buffer_->available() >=
|
342
|
+
(this->features_step_size_ * (AUDIO_SAMPLE_FREQUENCY / 1000)) * sizeof(int16_t);
|
379
343
|
}
|
380
344
|
|
381
|
-
bool MicroWakeWord::
|
382
|
-
|
345
|
+
bool MicroWakeWord::generate_features_for_window_(int8_t features[PREPROCESSOR_FEATURE_SIZE]) {
|
346
|
+
// Ensure we have enough new audio samples in the ring buffer for a full window
|
347
|
+
if (!this->has_enough_samples_()) {
|
383
348
|
return false;
|
384
349
|
}
|
385
350
|
|
386
|
-
|
387
|
-
|
388
|
-
HISTORY_SAMPLES_TO_KEEP * sizeof(int16_t));
|
389
|
-
|
390
|
-
// Copy 640 bytes (320 samples over 20 ms) from the ring buffer into the audio buffer offset 320 bytes (160 samples
|
391
|
-
// over 10 ms)
|
392
|
-
size_t bytes_read = this->ring_buffer_->read((void *) (this->preprocessor_audio_buffer_ + HISTORY_SAMPLES_TO_KEEP),
|
393
|
-
NEW_SAMPLES_TO_GET * sizeof(int16_t), pdMS_TO_TICKS(200));
|
351
|
+
size_t bytes_read = this->ring_buffer_->read((void *) (this->preprocessor_audio_buffer_),
|
352
|
+
this->new_samples_to_get_() * sizeof(int16_t), pdMS_TO_TICKS(200));
|
394
353
|
|
395
354
|
if (bytes_read == 0) {
|
396
355
|
ESP_LOGE(TAG, "Could not read data from Ring Buffer");
|
397
|
-
} else if (bytes_read <
|
356
|
+
} else if (bytes_read < this->new_samples_to_get_() * sizeof(int16_t)) {
|
398
357
|
ESP_LOGD(TAG, "Partial Read of Data by Model");
|
399
358
|
ESP_LOGD(TAG, "Could only read %d bytes when required %d bytes ", bytes_read,
|
400
|
-
(int) (
|
401
|
-
return false;
|
359
|
+
(int) (this->new_samples_to_get_() * sizeof(int16_t)));
|
360
|
+
return false;
|
361
|
+
}
|
362
|
+
|
363
|
+
size_t num_samples_read;
|
364
|
+
struct FrontendOutput frontend_output = FrontendProcessSamples(
|
365
|
+
&this->frontend_state_, this->preprocessor_audio_buffer_, this->new_samples_to_get_(), &num_samples_read);
|
366
|
+
|
367
|
+
for (size_t i = 0; i < frontend_output.size; ++i) {
|
368
|
+
// These scaling values are set to match the TFLite audio frontend int8 output.
|
369
|
+
// The feature pipeline outputs 16-bit signed integers in roughly a 0 to 670
|
370
|
+
// range. In training, these are then arbitrarily divided by 25.6 to get
|
371
|
+
// float values in the rough range of 0.0 to 26.0. This scaling is performed
|
372
|
+
// for historical reasons, to match up with the output of other feature
|
373
|
+
// generators.
|
374
|
+
// The process is then further complicated when we quantize the model. This
|
375
|
+
// means we have to scale the 0.0 to 26.0 real values to the -128 to 127
|
376
|
+
// signed integer numbers.
|
377
|
+
// All this means that to get matching values from our integer feature
|
378
|
+
// output into the tensor input, we have to perform:
|
379
|
+
// input = (((feature / 25.6) / 26.0) * 256) - 128
|
380
|
+
// To simplify this and perform it in 32-bit integer math, we rearrange to:
|
381
|
+
// input = (feature * 256) / (25.6 * 26.0) - 128
|
382
|
+
constexpr int32_t value_scale = 256;
|
383
|
+
constexpr int32_t value_div = 666; // 666 = 25.6 * 26.0 after rounding
|
384
|
+
int32_t value = ((frontend_output.values[i] * value_scale) + (value_div / 2)) / value_div;
|
385
|
+
value -= 128;
|
386
|
+
if (value < -128) {
|
387
|
+
value = -128;
|
388
|
+
}
|
389
|
+
if (value > 127) {
|
390
|
+
value = 127;
|
391
|
+
}
|
392
|
+
features[i] = value;
|
402
393
|
}
|
403
394
|
|
404
|
-
*audio_samples = this->preprocessor_audio_buffer_;
|
405
395
|
return true;
|
406
396
|
}
|
407
397
|
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
if (this->preprocessor_interperter_->Invoke() != kTfLiteOk) {
|
415
|
-
ESP_LOGE(TAG, "Failed to preprocess audio for local wake word.");
|
416
|
-
return false;
|
398
|
+
void MicroWakeWord::reset_states_() {
|
399
|
+
ESP_LOGD(TAG, "Resetting buffers and probabilities");
|
400
|
+
this->ring_buffer_->reset();
|
401
|
+
this->ignore_windows_ = -MIN_SLICES_BEFORE_DETECTION;
|
402
|
+
for (auto &model : this->wake_word_models_) {
|
403
|
+
model.reset_probabilities();
|
417
404
|
}
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
}
|
422
|
-
|
423
|
-
bool MicroWakeWord::register_preprocessor_ops_(tflite::MicroMutableOpResolver<18> &op_resolver) {
|
424
|
-
if (op_resolver.AddReshape() != kTfLiteOk)
|
425
|
-
return false;
|
426
|
-
if (op_resolver.AddCast() != kTfLiteOk)
|
427
|
-
return false;
|
428
|
-
if (op_resolver.AddStridedSlice() != kTfLiteOk)
|
429
|
-
return false;
|
430
|
-
if (op_resolver.AddConcatenation() != kTfLiteOk)
|
431
|
-
return false;
|
432
|
-
if (op_resolver.AddMul() != kTfLiteOk)
|
433
|
-
return false;
|
434
|
-
if (op_resolver.AddAdd() != kTfLiteOk)
|
435
|
-
return false;
|
436
|
-
if (op_resolver.AddDiv() != kTfLiteOk)
|
437
|
-
return false;
|
438
|
-
if (op_resolver.AddMinimum() != kTfLiteOk)
|
439
|
-
return false;
|
440
|
-
if (op_resolver.AddMaximum() != kTfLiteOk)
|
441
|
-
return false;
|
442
|
-
if (op_resolver.AddWindow() != kTfLiteOk)
|
443
|
-
return false;
|
444
|
-
if (op_resolver.AddFftAutoScale() != kTfLiteOk)
|
445
|
-
return false;
|
446
|
-
if (op_resolver.AddRfft() != kTfLiteOk)
|
447
|
-
return false;
|
448
|
-
if (op_resolver.AddEnergy() != kTfLiteOk)
|
449
|
-
return false;
|
450
|
-
if (op_resolver.AddFilterBank() != kTfLiteOk)
|
451
|
-
return false;
|
452
|
-
if (op_resolver.AddFilterBankSquareRoot() != kTfLiteOk)
|
453
|
-
return false;
|
454
|
-
if (op_resolver.AddFilterBankSpectralSubtraction() != kTfLiteOk)
|
455
|
-
return false;
|
456
|
-
if (op_resolver.AddPCAN() != kTfLiteOk)
|
457
|
-
return false;
|
458
|
-
if (op_resolver.AddFilterBankLog() != kTfLiteOk)
|
459
|
-
return false;
|
460
|
-
|
461
|
-
return true;
|
405
|
+
#ifdef USE_MICRO_WAKE_WORD_VAD
|
406
|
+
this->vad_model_->reset_probabilities();
|
407
|
+
#endif
|
462
408
|
}
|
463
409
|
|
464
|
-
bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<
|
410
|
+
bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<20> &op_resolver) {
|
465
411
|
if (op_resolver.AddCallOnce() != kTfLiteOk)
|
466
412
|
return false;
|
467
413
|
if (op_resolver.AddVarHandle() != kTfLiteOk)
|
@@ -496,6 +442,12 @@ bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &
|
|
496
442
|
return false;
|
497
443
|
if (op_resolver.AddMaxPool2D() != kTfLiteOk)
|
498
444
|
return false;
|
445
|
+
if (op_resolver.AddPad() != kTfLiteOk)
|
446
|
+
return false;
|
447
|
+
if (op_resolver.AddPack() != kTfLiteOk)
|
448
|
+
return false;
|
449
|
+
if (op_resolver.AddSplitV() != kTfLiteOk)
|
450
|
+
return false;
|
499
451
|
|
500
452
|
return true;
|
501
453
|
}
|
@@ -504,5 +456,3 @@ bool MicroWakeWord::register_streaming_ops_(tflite::MicroMutableOpResolver<17> &
|
|
504
456
|
} // namespace esphome
|
505
457
|
|
506
458
|
#endif // USE_ESP_IDF
|
507
|
-
|
508
|
-
#endif // CLANG_TIDY
|