react-native-executorch 0.9.0 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/libs/classes.jar +0 -0
- package/common/rnexecutorch/host_objects/JsiConversions.h +43 -0
- package/common/rnexecutorch/models/llm/LLM.cpp +55 -42
- package/common/rnexecutorch/models/llm/LLM.h +4 -3
- package/common/rnexecutorch/models/llm/Types.h +23 -0
- package/common/runner/base_llm_runner.cpp +10 -3
- package/common/runner/base_llm_runner.h +1 -0
- package/common/runner/constants.h +15 -1
- package/common/runner/encoders/audio_encoder.cpp +111 -0
- package/common/runner/encoders/audio_encoder.h +40 -0
- package/common/runner/encoders/vision_encoder.cpp +13 -5
- package/common/runner/encoders/vision_encoder.h +15 -2
- package/common/runner/irunner.h +5 -0
- package/common/runner/multimodal_decoder_runner.h +50 -1
- package/common/runner/multimodal_input.h +16 -1
- package/common/runner/multimodal_prefiller.cpp +374 -64
- package/common/runner/multimodal_prefiller.h +57 -6
- package/common/runner/multimodal_runner.cpp +19 -12
- package/common/runner/multimodal_runner.h +1 -1
- package/common/runner/sampler.cpp +126 -39
- package/common/runner/sampler.h +13 -5
- package/common/runner/text_decoder_runner.cpp +1 -4
- package/common/runner/text_decoder_runner.h +3 -2
- package/common/runner/text_prefiller.cpp +8 -8
- package/common/runner/text_prefiller.h +8 -1
- package/common/runner/text_runner.cpp +35 -9
- package/common/runner/text_token_generator.h +2 -3
- package/common/runner/util.h +0 -1
- package/lib/module/constants/llmDefaults.js +1 -1
- package/lib/module/constants/llmDefaults.js.map +1 -1
- package/lib/module/constants/modelRegistry.js +62 -3
- package/lib/module/constants/modelRegistry.js.map +1 -1
- package/lib/module/constants/modelUrls.js +62 -6
- package/lib/module/constants/modelUrls.js.map +1 -1
- package/lib/module/controllers/LLMController.js +69 -20
- package/lib/module/controllers/LLMController.js.map +1 -1
- package/lib/module/hooks/natural_language_processing/useLLM.js +1 -5
- package/lib/module/hooks/natural_language_processing/useLLM.js.map +1 -1
- package/lib/module/modules/computer_vision/PoseEstimationModule.js +13 -1
- package/lib/module/modules/computer_vision/PoseEstimationModule.js.map +1 -1
- package/lib/module/modules/natural_language_processing/LLMModule.js +12 -7
- package/lib/module/modules/natural_language_processing/LLMModule.js.map +1 -1
- package/lib/module/types/llm.js +11 -0
- package/lib/module/types/llm.js.map +1 -1
- package/lib/module/types/poseEstimation.js.map +1 -1
- package/lib/typescript/constants/llmDefaults.d.ts +1 -1
- package/lib/typescript/constants/llmDefaults.d.ts.map +1 -1
- package/lib/typescript/constants/modelRegistry.d.ts +38 -1
- package/lib/typescript/constants/modelRegistry.d.ts.map +1 -1
- package/lib/typescript/constants/modelUrls.d.ts +52 -12
- package/lib/typescript/constants/modelUrls.d.ts.map +1 -1
- package/lib/typescript/controllers/LLMController.d.ts +7 -9
- package/lib/typescript/controllers/LLMController.d.ts.map +1 -1
- package/lib/typescript/modules/computer_vision/PoseEstimationModule.d.ts +6 -0
- package/lib/typescript/modules/computer_vision/PoseEstimationModule.d.ts.map +1 -1
- package/lib/typescript/modules/natural_language_processing/LLMModule.d.ts +6 -3
- package/lib/typescript/modules/natural_language_processing/LLMModule.d.ts.map +1 -1
- package/lib/typescript/types/llm.d.ts +63 -36
- package/lib/typescript/types/llm.d.ts.map +1 -1
- package/lib/typescript/types/poseEstimation.d.ts +3 -0
- package/lib/typescript/types/poseEstimation.d.ts.map +1 -1
- package/package.json +1 -1
- package/react-native-executorch.podspec +6 -0
- package/src/constants/llmDefaults.ts +1 -1
- package/src/constants/modelRegistry.ts +62 -2
- package/src/constants/modelUrls.ts +69 -6
- package/src/controllers/LLMController.ts +89 -40
- package/src/hooks/natural_language_processing/useLLM.ts +5 -6
- package/src/modules/computer_vision/PoseEstimationModule.ts +12 -0
- package/src/modules/natural_language_processing/LLMModule.ts +19 -8
- package/src/types/llm.ts +64 -34
- package/src/types/poseEstimation.ts +10 -4
- package/third-party/android/libs/executorch/arm64-v8a/libexecutorch.so +0 -0
- package/third-party/android/libs/executorch/x86_64/libexecutorch.so +0 -0
- package/third-party/include/executorch/ExecuTorch.h +2 -0
- package/third-party/include/executorch/ExecuTorchModule.h +46 -0
- package/third-party/include/executorch/extension/data_loader/buffer_data_loader.h +4 -3
- package/third-party/include/executorch/extension/data_loader/mman.h +46 -0
- package/third-party/include/executorch/extension/data_loader/mmap_data_loader.h +4 -0
- package/third-party/include/executorch/extension/data_loader/shared_ptr_data_loader.h +7 -3
- package/third-party/include/executorch/extension/module/module.h +47 -8
- package/third-party/include/executorch/extension/tensor/tensor_ptr.h +17 -5
- package/third-party/include/executorch/kernels/optimized/Functions.h +12 -0
- package/third-party/include/executorch/kernels/optimized/NativeFunctions.h +4 -0
- package/third-party/include/executorch/kernels/portable/Functions.h +18 -0
- package/third-party/include/executorch/kernels/portable/NativeFunctions.h +6 -0
- package/third-party/include/executorch/runtime/backend/backend_options_map.h +37 -0
- package/third-party/include/executorch/runtime/core/array_ref.h +3 -1
- package/third-party/include/executorch/runtime/core/error.h +1 -0
- package/third-party/include/executorch/runtime/core/evalue.h +256 -9
- package/third-party/include/executorch/runtime/core/exec_aten/exec_aten.h +24 -0
- package/third-party/include/executorch/runtime/core/hierarchical_allocator.h +9 -6
- package/third-party/include/executorch/runtime/core/portable_type/device.h +3 -4
- package/third-party/include/executorch/runtime/core/portable_type/tensor_impl.h +31 -1
- package/third-party/include/executorch/runtime/executor/method.h +9 -3
- package/third-party/include/executorch/runtime/executor/method_meta.h +14 -0
- package/third-party/include/executorch/runtime/executor/platform_memory_allocator.h +12 -2
- package/third-party/include/executorch/runtime/executor/program.h +3 -1
- package/third-party/include/executorch/runtime/executor/tensor_parser.h +5 -1
- package/third-party/include/executorch/runtime/kernel/operator_registry.h +9 -0
- package/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/ExecutorchLib +0 -0
- package/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist +0 -0
- package/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/mlx.metallib +0 -0
- package/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/ExecutorchLib +0 -0
- package/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist +0 -0
- package/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/mlx.metallib +0 -0
|
@@ -20,6 +20,10 @@ struct ImagePath {
|
|
|
20
20
|
std::string path;
|
|
21
21
|
};
|
|
22
22
|
|
|
23
|
+
/// Audio payload alternative of MultimodalInput: a buffer of float samples.
struct AudioWaveform {
  // NOTE(review): sample rate / channel layout are not visible here;
  // presumably whatever the registered audio encoder expects — confirm.
  std::vector<float> samples;
};
|
|
26
|
+
|
|
23
27
|
class MultimodalInput {
|
|
24
28
|
public:
|
|
25
29
|
explicit MultimodalInput(std::string text) : data_(std::move(text)) {}
|
|
@@ -27,6 +31,7 @@ public:
|
|
|
27
31
|
: data_(std::move(tokens)) {}
|
|
28
32
|
explicit MultimodalInput(ImagePath image_path)
|
|
29
33
|
: data_(std::move(image_path)) {}
|
|
34
|
+
explicit MultimodalInput(AudioWaveform audio) : data_(std::move(audio)) {}
|
|
30
35
|
|
|
31
36
|
MultimodalInput(const MultimodalInput &) = default;
|
|
32
37
|
MultimodalInput &operator=(const MultimodalInput &) = default;
|
|
@@ -42,6 +47,9 @@ public:
|
|
|
42
47
|
bool is_image() const noexcept {
|
|
43
48
|
return std::holds_alternative<ImagePath>(data_);
|
|
44
49
|
}
|
|
50
|
+
bool is_audio() const noexcept {
|
|
51
|
+
return std::holds_alternative<AudioWaveform>(data_);
|
|
52
|
+
}
|
|
45
53
|
|
|
46
54
|
const std::string &get_text() const & { return std::get<std::string>(data_); }
|
|
47
55
|
const std::vector<uint64_t> &get_tokens() const & {
|
|
@@ -50,9 +58,13 @@ public:
|
|
|
50
58
|
const std::string &get_image_path() const & {
|
|
51
59
|
return std::get<ImagePath>(data_).path;
|
|
52
60
|
}
|
|
61
|
+
const AudioWaveform &get_audio() const & {
|
|
62
|
+
return std::get<AudioWaveform>(data_);
|
|
63
|
+
}
|
|
53
64
|
|
|
54
65
|
private:
|
|
55
|
-
std::variant<std::string, std::vector<uint64_t>, ImagePath>
|
|
66
|
+
std::variant<std::string, std::vector<uint64_t>, ImagePath, AudioWaveform>
|
|
67
|
+
data_;
|
|
56
68
|
};
|
|
57
69
|
|
|
58
70
|
inline MultimodalInput make_text_input(const std::string &text) noexcept {
|
|
@@ -64,5 +76,8 @@ inline MultimodalInput make_text_input(std::string &&text) noexcept {
|
|
|
64
76
|
/// Wraps a file path in an ImagePath and returns it as a MultimodalInput.
/// The path string is moved, not copied, into the result.
inline MultimodalInput make_image_input(std::string path) noexcept {
  ImagePath image{std::move(path)};
  return MultimodalInput(std::move(image));
}
|
|
79
|
+
/// Wraps a sample buffer in an AudioWaveform and returns it as a
/// MultimodalInput. The samples are moved, not copied, into the result.
inline MultimodalInput make_audio_input(std::vector<float> samples) noexcept {
  AudioWaveform waveform{std::move(samples)};
  return MultimodalInput(std::move(waveform));
}
|
|
67
82
|
|
|
68
83
|
} // namespace executorch::extension::llm
|
|
@@ -13,6 +13,9 @@
|
|
|
13
13
|
#include "constants.h"
|
|
14
14
|
#include "util.h"
|
|
15
15
|
#include <algorithm>
|
|
16
|
+
#include <cstring>
|
|
17
|
+
#include <rnexecutorch/Log.h>
|
|
18
|
+
#include <string>
|
|
16
19
|
|
|
17
20
|
namespace executorch::extension::llm {
|
|
18
21
|
|
|
@@ -23,91 +26,390 @@ using ::executorch::runtime::Result;
|
|
|
23
26
|
|
|
24
27
|
// Stores non-owning pointers to the module, decoder runner, tokenizer and the
// optional encoders; the caller must keep all of them alive for as long as
// this prefiller is used. `metadata` is taken by value and moved into the
// member so the map is not copied a second time.
MultimodalPrefiller::MultimodalPrefiller(
    Module &module, MultimodalDecoderRunner &decoder_runner,
    tokenizers::HFTokenizer &tokenizer,
    std::unordered_map<std::string, int64_t> metadata, IEncoder *image_encoder,
    IEncoder *audio_encoder)
    : module_(&module), decoder_runner_(&decoder_runner),
      tokenizer_(&tokenizer), metadata_(std::move(metadata)),
      image_encoder_(image_encoder), audio_encoder_(audio_encoder) {}
|
|
29
35
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
36
|
+
int64_t MultimodalPrefiller::get_max_seq_len() const {
|
|
37
|
+
auto r = module_->get(kMaxSeqLen);
|
|
38
|
+
if (r.error() != ::executorch::runtime::Error::Ok) {
|
|
39
|
+
return metadata_.at(kMaxSeqLen);
|
|
40
|
+
}
|
|
41
|
+
return r->toScalar().to<int64_t>();
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
int64_t MultimodalPrefiller::get_max_context_len() const {
|
|
45
|
+
auto r = module_->get(kMaxContextLen);
|
|
46
|
+
if (r.error() != ::executorch::runtime::Error::Ok) {
|
|
47
|
+
return metadata_.at(kMaxContextLen) || get_max_seq_len();
|
|
48
|
+
}
|
|
49
|
+
return r->toScalar().to<int64_t>();
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
bool MultimodalPrefiller::get_enable_dynamic_shape() const {
|
|
53
|
+
auto r = module_->get(kEnableDynamicShape);
|
|
54
|
+
if (r.error() != ::executorch::runtime::Error::Ok) {
|
|
55
|
+
return metadata_.at(kEnableDynamicShape);
|
|
56
|
+
}
|
|
57
|
+
return r->toScalar().to<bool>();
|
|
58
|
+
}
|
|
35
59
|
|
|
60
|
+
[[nodiscard]] auto MultimodalPrefiller::processMultimodalInput(
|
|
61
|
+
const MultimodalInput &input, std::vector<int64_t> &ids,
|
|
62
|
+
std::vector<Types::ImageSlot> &image_slots,
|
|
63
|
+
std::vector<Types::AudioSlot> &audio_slots) {
|
|
36
64
|
if (input.is_image()) {
|
|
37
65
|
ET_CHECK_OR_RETURN_ERROR(image_encoder_ != nullptr, InvalidState,
|
|
38
66
|
"No image encoder registered");
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
67
|
+
const int32_t num_visual = image_encoder_->encoderTokenCount();
|
|
68
|
+
ET_CHECK_OR_RETURN_ERROR(num_visual > 0, InvalidState,
|
|
69
|
+
"Image encoder reports 0 visual tokens");
|
|
70
|
+
image_slots.push_back(Types::ImageSlot{&input,
|
|
71
|
+
static_cast<int64_t>(ids.size()),
|
|
72
|
+
static_cast<int64_t>(num_visual)});
|
|
73
|
+
ids.insert(ids.end(), static_cast<size_t>(num_visual), 0);
|
|
74
|
+
} else if (input.is_audio()) {
|
|
75
|
+
ET_CHECK_OR_RETURN_ERROR(audio_encoder_ != nullptr, InvalidState,
|
|
76
|
+
"No audio encoder registered");
|
|
77
|
+
auto enc = audio_encoder_->encode(input);
|
|
78
|
+
ET_CHECK_OK_OR_RETURN_ERROR(enc.error(), "Audio encoding failed");
|
|
79
|
+
// Snapshot the encoder output NOW — see AudioSlot comment above for
|
|
80
|
+
// why the returned EValue's tensor metadata can't survive past the
|
|
81
|
+
// next module_->execute(). num_audio and audio_hidden are read from
|
|
82
|
+
// the tensor directly rather than from encoderTokenCount() so they
|
|
83
|
+
// are guaranteed to reflect THIS encode call.
|
|
84
|
+
auto audio_tensor = enc->toTensor();
|
|
85
|
+
ET_CHECK_OR_RETURN_ERROR(audio_tensor.dim() == 3, InvalidState,
|
|
86
|
+
"audio_encoder output rank=%zd, expected 3",
|
|
87
|
+
audio_tensor.dim());
|
|
88
|
+
const int64_t num_audio = static_cast<int64_t>(audio_tensor.size(1));
|
|
89
|
+
const int64_t audio_hidden = static_cast<int64_t>(audio_tensor.size(2));
|
|
90
|
+
ET_CHECK_OR_RETURN_ERROR(num_audio > 0, InvalidState,
|
|
91
|
+
"Audio encoder produced 0 tokens");
|
|
92
|
+
std::vector<uint8_t> bytes(audio_tensor.nbytes());
|
|
93
|
+
std::memcpy(bytes.data(), audio_tensor.const_data_ptr(),
|
|
94
|
+
audio_tensor.nbytes());
|
|
95
|
+
audio_slots.push_back(Types::AudioSlot{
|
|
96
|
+
std::move(bytes), audio_tensor.scalar_type(),
|
|
97
|
+
static_cast<int64_t>(ids.size()), num_audio, audio_hidden});
|
|
98
|
+
ids.insert(ids.end(), static_cast<size_t>(num_audio), 0);
|
|
99
|
+
} else if (input.is_text()) {
|
|
100
|
+
auto encode_result = tokenizer_->encode(input.get_text());
|
|
101
|
+
if (!encode_result.ok()) {
|
|
102
|
+
ET_LOG(Error, "Tokenizer encode error %d",
|
|
103
|
+
static_cast<uint32_t>(encode_result.error()));
|
|
104
|
+
return Error::InvalidArgument;
|
|
55
105
|
}
|
|
106
|
+
std::vector<uint64_t> tokens = std::move(*encode_result);
|
|
107
|
+
for (auto t : tokens) {
|
|
108
|
+
ids.push_back(static_cast<int64_t>(t));
|
|
109
|
+
}
|
|
110
|
+
} else if (input.is_tokens()) {
|
|
111
|
+
std::vector<uint64_t> tokens = input.get_tokens();
|
|
112
|
+
for (auto t : tokens) {
|
|
113
|
+
ids.push_back(static_cast<int64_t>(t));
|
|
114
|
+
}
|
|
115
|
+
} else {
|
|
116
|
+
ET_LOG(Error, "Unsupported MultimodalInput type");
|
|
117
|
+
return Error::NotSupported;
|
|
118
|
+
}
|
|
119
|
+
return ::executorch::runtime::Error::Ok;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
[[nodiscard]] auto MultimodalPrefiller::encodeAudio(
|
|
123
|
+
const Types::AudioSlot &slot, const auto hidden,
|
|
124
|
+
std::vector<uint8_t> &embeds_buf, const size_t embeds_elem_size,
|
|
125
|
+
const ::executorch::aten::ScalarType &embeds_dtype) {
|
|
126
|
+
ET_CHECK_OR_RETURN_ERROR(
|
|
127
|
+
slot.audio_hidden == static_cast<int64_t>(hidden), InvalidState,
|
|
128
|
+
"audio encoder hidden %lld != text_embed hidden %lld",
|
|
129
|
+
static_cast<long long>(slot.audio_hidden),
|
|
130
|
+
static_cast<long long>(hidden));
|
|
56
131
|
|
|
57
|
-
|
|
132
|
+
const auto audio_dtype = slot.dtype;
|
|
133
|
+
const size_t audio_elems =
|
|
134
|
+
static_cast<size_t>(slot.num_audio) * static_cast<size_t>(hidden);
|
|
135
|
+
const size_t audio_elem_size =
|
|
136
|
+
audio_elems > 0 ? slot.bytes.size() / audio_elems : 0;
|
|
137
|
+
ET_CHECK_OR_RETURN_ERROR(
|
|
138
|
+
audio_elem_size > 0 && audio_elem_size * audio_elems == slot.bytes.size(),
|
|
139
|
+
InvalidState,
|
|
140
|
+
"audio slot bytes %zu inconsistent with num_audio=%lld hidden=%lld",
|
|
141
|
+
slot.bytes.size(), static_cast<long long>(slot.num_audio),
|
|
142
|
+
static_cast<long long>(hidden));
|
|
58
143
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
144
|
+
uint8_t *dst = embeds_buf.data() + static_cast<size_t>(slot.slot_start) *
|
|
145
|
+
static_cast<size_t>(hidden) *
|
|
146
|
+
embeds_elem_size;
|
|
147
|
+
|
|
148
|
+
if (audio_dtype == embeds_dtype) {
|
|
149
|
+
std::memcpy(dst, slot.bytes.data(), audio_elems * embeds_elem_size);
|
|
150
|
+
} else if (audio_dtype == ::executorch::aten::ScalarType::Float &&
|
|
151
|
+
embeds_dtype == ::executorch::aten::ScalarType::Half) {
|
|
152
|
+
const float *src = reinterpret_cast<const float *>(slot.bytes.data());
|
|
153
|
+
auto *dst_h = reinterpret_cast<::executorch::aten::Half *>(dst);
|
|
154
|
+
for (size_t i = 0; i < audio_elems; ++i) {
|
|
155
|
+
dst_h[i] = ::executorch::aten::Half(src[i]);
|
|
156
|
+
}
|
|
157
|
+
} else if (audio_dtype == ::executorch::aten::ScalarType::Half &&
|
|
158
|
+
embeds_dtype == ::executorch::aten::ScalarType::Float) {
|
|
159
|
+
const auto *src =
|
|
160
|
+
reinterpret_cast<const ::executorch::aten::Half *>(slot.bytes.data());
|
|
161
|
+
auto *dst_f = reinterpret_cast<float *>(dst);
|
|
162
|
+
for (size_t i = 0; i < audio_elems; ++i) {
|
|
163
|
+
dst_f[i] = static_cast<float>(src[i]);
|
|
65
164
|
}
|
|
165
|
+
} else {
|
|
166
|
+
ET_CHECK_OR_RETURN_ERROR(
|
|
167
|
+
false, InvalidState,
|
|
168
|
+
"unsupported audio/text dtype pair: audio=%hhd text=%hhd",
|
|
169
|
+
static_cast<int8_t>(audio_dtype), static_cast<int8_t>(embeds_dtype));
|
|
170
|
+
}
|
|
171
|
+
return ::executorch::runtime::Error::Ok;
|
|
172
|
+
}
|
|
66
173
|
|
|
67
|
-
|
|
68
|
-
|
|
174
|
+
[[nodiscard]] auto MultimodalPrefiller::encodeImages(
|
|
175
|
+
const Types::ImageSlot &slot, const auto hidden,
|
|
176
|
+
std::vector<uint8_t> &embeds_buf, const size_t embeds_elem_size,
|
|
177
|
+
const ::executorch::aten::ScalarType &embeds_dtype) {
|
|
178
|
+
auto encode_result = image_encoder_->encode(*slot.input);
|
|
179
|
+
ET_CHECK_OK_OR_RETURN_ERROR(encode_result.error(), "Image encoding failed");
|
|
180
|
+
auto encoder_output = *encode_result;
|
|
181
|
+
auto vision_tensor = encoder_output.toTensor();
|
|
69
182
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
183
|
+
const auto vision_dtype = vision_tensor.scalar_type();
|
|
184
|
+
const size_t visual_elems =
|
|
185
|
+
static_cast<size_t>(slot.num_visual) * static_cast<size_t>(hidden);
|
|
186
|
+
uint8_t *dst = embeds_buf.data() + static_cast<size_t>(slot.slot_start) *
|
|
187
|
+
static_cast<size_t>(hidden) *
|
|
188
|
+
embeds_elem_size;
|
|
189
|
+
if (vision_dtype == embeds_dtype) {
|
|
190
|
+
const uint8_t *src =
|
|
191
|
+
static_cast<const uint8_t *>(vision_tensor.const_data_ptr());
|
|
192
|
+
std::memcpy(dst, src, visual_elems * embeds_elem_size);
|
|
193
|
+
} else if (vision_dtype == ::executorch::aten::ScalarType::Float &&
|
|
194
|
+
embeds_dtype == ::executorch::aten::ScalarType::Half) {
|
|
195
|
+
const float *src = vision_tensor.const_data_ptr<float>();
|
|
196
|
+
auto *dst_h = reinterpret_cast<::executorch::aten::Half *>(dst);
|
|
197
|
+
for (size_t i = 0; i < visual_elems; ++i) {
|
|
198
|
+
dst_h[i] = ::executorch::aten::Half(src[i]);
|
|
199
|
+
}
|
|
200
|
+
} else if (vision_dtype == ::executorch::aten::ScalarType::Half &&
|
|
201
|
+
embeds_dtype == ::executorch::aten::ScalarType::Float) {
|
|
202
|
+
const auto *src = vision_tensor.const_data_ptr<::executorch::aten::Half>();
|
|
203
|
+
auto *dst_f = reinterpret_cast<float *>(dst);
|
|
204
|
+
for (size_t i = 0; i < visual_elems; ++i) {
|
|
205
|
+
dst_f[i] = static_cast<float>(src[i]);
|
|
206
|
+
}
|
|
207
|
+
} else {
|
|
208
|
+
ET_CHECK_OR_RETURN_ERROR(
|
|
209
|
+
false, InvalidState,
|
|
210
|
+
"unsupported vision/text dtype pair: vision=%hhd text=%hhd",
|
|
211
|
+
static_cast<int8_t>(vision_dtype), static_cast<int8_t>(embeds_dtype));
|
|
212
|
+
}
|
|
213
|
+
return ::executorch::runtime::Error::Ok;
|
|
214
|
+
}
|
|
73
215
|
|
|
74
|
-
|
|
75
|
-
|
|
216
|
+
[[nodiscard]] auto
|
|
217
|
+
MultimodalPrefiller::initializePLE(auto &embed_outputs, auto total_len,
|
|
218
|
+
Types::PLEEmbeddings &ple_embeddings) {
|
|
219
|
+
auto full_ple_tok = embed_outputs[1].toTensor();
|
|
220
|
+
ple_embeddings.num_layers = static_cast<SizesType>(full_ple_tok.size(2));
|
|
221
|
+
ple_embeddings.ple_dim = static_cast<SizesType>(full_ple_tok.size(3));
|
|
222
|
+
ple_embeddings.ple_tok_dtype = full_ple_tok.scalar_type();
|
|
223
|
+
const size_t total_numel = static_cast<size_t>(full_ple_tok.numel());
|
|
224
|
+
const size_t total_bytes = full_ple_tok.nbytes();
|
|
225
|
+
ET_CHECK_OR_RETURN_ERROR(total_numel > 0, InvalidState,
|
|
226
|
+
"ple_tok has zero elements");
|
|
227
|
+
ple_embeddings.ple_elem_size = total_bytes / total_numel;
|
|
228
|
+
const size_t prefix_bytes = static_cast<size_t>(total_len) *
|
|
229
|
+
static_cast<size_t>(ple_embeddings.num_layers) *
|
|
230
|
+
static_cast<size_t>(ple_embeddings.ple_dim) *
|
|
231
|
+
ple_embeddings.ple_elem_size;
|
|
232
|
+
ple_embeddings.ple_tok_buf.resize(prefix_bytes);
|
|
233
|
+
std::memcpy(ple_embeddings.ple_tok_buf.data(),
|
|
234
|
+
full_ple_tok.mutable_data_ptr(), prefix_bytes);
|
|
235
|
+
return ::executorch::runtime::Error::Ok;
|
|
236
|
+
}
|
|
76
237
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
238
|
+
[[nodiscard]] auto MultimodalPrefiller::prefillChunk(
|
|
239
|
+
std::vector<EValue> &last_outs, std::vector<uint8_t> &embeds_buf,
|
|
240
|
+
auto chunk_start, auto chunk_len, auto hidden, auto embeds_elem_size,
|
|
241
|
+
auto embeds_dtype, Types::PLEEmbeddings &ple_embeddings,
|
|
242
|
+
std::vector<int64_t> &cache_positions) {
|
|
243
|
+
uint8_t *embeds_chunk_ptr =
|
|
244
|
+
embeds_buf.data() + static_cast<size_t>(chunk_start) *
|
|
245
|
+
static_cast<size_t>(hidden) * embeds_elem_size;
|
|
246
|
+
auto embeds_chunk = ::executorch::extension::from_blob(
|
|
247
|
+
embeds_chunk_ptr, {1, static_cast<SizesType>(chunk_len), hidden},
|
|
248
|
+
embeds_dtype);
|
|
83
249
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
250
|
+
TensorPtr ple_chunk;
|
|
251
|
+
if (decoder_runner_->has_ple()) {
|
|
252
|
+
uint8_t *ple_chunk_ptr =
|
|
253
|
+
ple_embeddings.ple_tok_buf.data() +
|
|
254
|
+
static_cast<size_t>(chunk_start) *
|
|
255
|
+
static_cast<size_t>(ple_embeddings.num_layers) *
|
|
256
|
+
static_cast<size_t>(ple_embeddings.ple_dim) *
|
|
257
|
+
ple_embeddings.ple_elem_size;
|
|
258
|
+
ple_chunk = ::executorch::extension::from_blob(
|
|
259
|
+
ple_chunk_ptr,
|
|
260
|
+
{1, static_cast<SizesType>(chunk_len), ple_embeddings.num_layers,
|
|
261
|
+
ple_embeddings.ple_dim},
|
|
262
|
+
ple_embeddings.ple_tok_dtype);
|
|
87
263
|
}
|
|
88
264
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
265
|
+
auto pos_chunk = ::executorch::extension::from_blob(
|
|
266
|
+
cache_positions.data() + chunk_start, {static_cast<SizesType>(chunk_len)},
|
|
267
|
+
::executorch::aten::ScalarType::Long);
|
|
268
|
+
|
|
269
|
+
auto res = decoder_runner_->has_ple()
|
|
270
|
+
? module_->execute(kTextModelMethod,
|
|
271
|
+
{EValue(*embeds_chunk), EValue(*ple_chunk),
|
|
272
|
+
EValue(*pos_chunk)})
|
|
273
|
+
: module_->execute(kTextModelMethod, {EValue(*embeds_chunk),
|
|
274
|
+
EValue(*pos_chunk)});
|
|
275
|
+
ET_CHECK_OK_OR_RETURN_ERROR(res.error());
|
|
276
|
+
last_outs = std::move(*res);
|
|
277
|
+
return ::executorch::runtime::Error::Ok;
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// Prefills the decoder with a list of multimodal inputs and returns the
// first predicted token.
//
// Steps:
//   1. Flatten every input into one id buffer; image/audio positions get
//      placeholder ids and are recorded as slots.
//   2. Run kTokenEmbeddingMethod once over the whole buffer and copy the
//      live prefix of the result into an owned buffer.
//   3. Overwrite the placeholder positions with image / audio encoder
//      output.
//   4. Feed the embeddings to the text decoder in chunks of at most
//      get_max_seq_len() tokens, with cache positions start_pos + i.
//
// On success `start_pos` is advanced by the number of prefilled tokens.
// Errors from any stage are propagated; on error `start_pos` is unchanged.
//
// NOTE(review): only the length of this call is compared against the cap;
// start_pos + total_len is not checked against get_max_context_len() here —
// presumably the caller enforces the overall KV budget; confirm.
Result<uint64_t>
MultimodalPrefiller::prefill(const std::vector<MultimodalInput> &inputs,
                             int64_t &start_pos) {
  const bool has_ple = decoder_runner_->has_ple();

  ET_CHECK_OR_RETURN_ERROR(!inputs.empty(), InvalidArgument,
                           "prefill: empty input list");

  // ------------------------------------------------------------
  // * get_max_seq_len     — text_decoder S cap. Max prefill chunk length
  //                         (<= get_max_context_len).
  // * get_max_context_len — total KV budget. Caps max context length for
  //                         multi-turn conversation.
  // ------------------------------------------------------------
  const int64_t max_seq_len = get_max_seq_len();
  const int64_t max_context_len = get_max_context_len();
  const bool enable_dynamic_shape = get_enable_dynamic_shape();
  const int64_t prefill_total_cap =
      enable_dynamic_shape ? max_context_len : max_seq_len;
  const int64_t decoder_chunk_size = max_seq_len;

  std::vector<int64_t> ids;
  // A non-positive cap would turn into a huge size_t; skip the reservation
  // and let the length check below report the problem instead.
  if (prefill_total_cap > 0) {
    ids.reserve(static_cast<size_t>(prefill_total_cap));
  }
  std::vector<Types::ImageSlot> image_slots;
  std::vector<Types::AudioSlot> audio_slots;

  // ------------------------------------------------------------
  // Pass 1: build the fused id buffer and collect image/audio slots.
  // ------------------------------------------------------------
  for (const auto &input : inputs) {
    auto res = processMultimodalInput(input, ids, image_slots, audio_slots);
    if (res != ::executorch::runtime::Error::Ok) {
      return res;
    }
  }

  const int64_t total_len = static_cast<int64_t>(ids.size());
  ET_CHECK_OR_RETURN_ERROR(total_len > 0, InvalidArgument,
                           "prefill produced zero tokens");

  ET_CHECK_OR_RETURN_ERROR(total_len <= prefill_total_cap, InvalidArgument,
                           "Prefill length %lld exceeds %s (%lld)",
                           static_cast<long long>(total_len),
                           enable_dynamic_shape ? "get_max_context_len"
                                                : "get_max_seq_len",
                           static_cast<long long>(prefill_total_cap));
  if (!enable_dynamic_shape) {
    // Static-shape export: pad the id buffer up to max_seq_len.
    ids.resize(static_cast<size_t>(max_seq_len), 0);
  }

  // ------------------------------------------------------------
  // Single token_embedding call over the fused id buffer.
  // ------------------------------------------------------------
  const int64_t tok_buf_len = static_cast<int64_t>(ids.size());
  auto token_tensor = ::executorch::extension::from_blob(
      ids.data(), {1, static_cast<SizesType>(tok_buf_len)},
      ::executorch::aten::ScalarType::Long);

  auto embed_result = module_->execute(kTokenEmbeddingMethod, token_tensor);
  ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
  auto &embed_outputs = *embed_result;
  ET_CHECK_OR_RETURN_ERROR(!embed_outputs.empty(), InvalidState,
                           "token_embedding returned no outputs");

  auto full_embed = embed_outputs[0].toTensor();
  // size(2) is read below, so the tensor needs rank >= 3.
  ET_CHECK_OR_RETURN_ERROR(full_embed.dim() >= 3, InvalidState,
                           "token_embedding output rank=%zd, expected 3",
                           full_embed.dim());
  const auto hidden = static_cast<SizesType>(full_embed.size(2));

  // Own the embeds for the live prefix — subsequent vision_encoder.execute
  // calls may reuse the token_embedding output buffer in the runtime.
  const ::executorch::aten::ScalarType embeds_dtype = full_embed.scalar_type();
  const size_t embeds_total_numel = static_cast<size_t>(full_embed.numel());
  ET_CHECK_OR_RETURN_ERROR(embeds_total_numel > 0, InvalidState,
                           "token_embedding returned zero elements");
  const size_t embeds_elem_size = full_embed.nbytes() / embeds_total_numel;
  const size_t embeds_prefix_bytes = static_cast<size_t>(total_len) *
                                     static_cast<size_t>(hidden) *
                                     embeds_elem_size;
  // Never read more than the embedding tensor actually holds.
  ET_CHECK_OR_RETURN_ERROR(
      embeds_prefix_bytes <= full_embed.nbytes(), InvalidState,
      "token_embedding output holds %zu bytes, prefix needs %zu",
      static_cast<size_t>(full_embed.nbytes()), embeds_prefix_bytes);
  std::vector<uint8_t> embeds_buf(embeds_prefix_bytes);
  std::memcpy(embeds_buf.data(), full_embed.const_data_ptr(),
              embeds_prefix_bytes);

  // ------------------------------------------------------------
  // Pass 2: encode images and splice their outputs into embeds_buf.
  // ------------------------------------------------------------
  for (const auto &slot : image_slots) {
    auto res =
        encodeImages(slot, hidden, embeds_buf, embeds_elem_size, embeds_dtype);
    if (res != ::executorch::runtime::Error::Ok) {
      return res;
    }
  }

  // ------------------------------------------------------------
  // Pass 2b: splice encoded audio tokens into embeds_buf. Reads from the
  // byte snapshot taken at encode time so post-encode execute() calls can't
  // invalidate slot state. Same dtype-conversion matrix as vision.
  // ------------------------------------------------------------
  for (const auto &slot : audio_slots) {
    auto res =
        encodeAudio(slot, hidden, embeds_buf, embeds_elem_size, embeds_dtype);
    if (res != ::executorch::runtime::Error::Ok) {
      return res;
    }
  }

  Types::PLEEmbeddings ple_embeddings;
  if (has_ple) {
    auto res = initializePLE(embed_outputs, total_len, ple_embeddings);
    if (res != ::executorch::runtime::Error::Ok) {
      return res;
    }
  }

  // ------------------------------------------------------------
  // Pass 3: run the text decoder chunk by chunk.
  // ------------------------------------------------------------
  std::vector<EValue> last_outs;
  const int64_t chunk_cap =
      decoder_chunk_size > 0 ? decoder_chunk_size : total_len;
  std::vector<int64_t> cache_positions(static_cast<size_t>(total_len));
  for (int64_t i = 0; i < total_len; ++i) {
    cache_positions[static_cast<size_t>(i)] = start_pos + i;
  }
  const int64_t num_chunks = (total_len + chunk_cap - 1) / chunk_cap;
  for (int64_t ci = 0; ci < num_chunks; ++ci) {
    const int64_t chunk_start = ci * chunk_cap;
    const int64_t chunk_end = std::min(chunk_start + chunk_cap, total_len);
    const int64_t chunk_len = chunk_end - chunk_start;
    auto res = prefillChunk(last_outs, embeds_buf, chunk_start, chunk_len,
                            hidden, embeds_elem_size, embeds_dtype,
                            ple_embeddings, cache_positions);
    if (res != ::executorch::runtime::Error::Ok) {
      return res;
    }
  }

  ET_CHECK_OR_RETURN_ERROR(!last_outs.empty(), InvalidState,
                           "text_decoder returned no outputs during prefill");

  auto logits = last_outs[0].toTensor();
  start_pos += total_len;

  return static_cast<uint64_t>(decoder_runner_->logits_to_token(logits));
}
|
|
@@ -127,6 +429,9 @@ Error MultimodalPrefiller::load() {
|
|
|
127
429
|
if (methods.find(kVisionEncoderMethod) != methods.end()) {
|
|
128
430
|
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
|
|
129
431
|
}
|
|
432
|
+
if (methods.find(kAudioEncoderMethod) != methods.end()) {
|
|
433
|
+
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kAudioEncoderMethod));
|
|
434
|
+
}
|
|
130
435
|
return Error::Ok;
|
|
131
436
|
}
|
|
132
437
|
|
|
@@ -140,8 +445,13 @@ bool MultimodalPrefiller::is_method_loaded() {
|
|
|
140
445
|
return false;
|
|
141
446
|
}
|
|
142
447
|
const auto &methods = *methods_res;
|
|
143
|
-
if (methods.find(kVisionEncoderMethod) != methods.end()
|
|
144
|
-
|
|
448
|
+
if (methods.find(kVisionEncoderMethod) != methods.end() &&
|
|
449
|
+
!module_->is_method_loaded(kVisionEncoderMethod)) {
|
|
450
|
+
return false;
|
|
451
|
+
}
|
|
452
|
+
if (methods.find(kAudioEncoderMethod) != methods.end() &&
|
|
453
|
+
!module_->is_method_loaded(kAudioEncoderMethod)) {
|
|
454
|
+
return false;
|
|
145
455
|
}
|
|
146
456
|
return true;
|
|
147
457
|
}
|
|
@@ -18,26 +18,77 @@
|
|
|
18
18
|
|
|
19
19
|
namespace executorch::extension::llm {
|
|
20
20
|
|
|
21
|
+
namespace Types {
|
|
22
|
+
struct ImageSlot {
|
|
23
|
+
const MultimodalInput *input; // non-owning, valid for duration of call
|
|
24
|
+
int64_t slot_start;
|
|
25
|
+
int64_t num_visual;
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
struct AudioSlot {
|
|
29
|
+
std::vector<uint8_t> bytes;
|
|
30
|
+
::executorch::aten::ScalarType dtype;
|
|
31
|
+
int64_t slot_start;
|
|
32
|
+
int64_t num_audio;
|
|
33
|
+
int64_t audio_hidden;
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
struct PLEEmbeddings {
|
|
37
|
+
std::vector<uint8_t> ple_tok_buf;
|
|
38
|
+
aten::SizesType num_layers = 0;
|
|
39
|
+
aten::SizesType ple_dim = 0;
|
|
40
|
+
size_t ple_elem_size = 0;
|
|
41
|
+
::executorch::aten::ScalarType ple_tok_dtype =
|
|
42
|
+
::executorch::aten::ScalarType::Half;
|
|
43
|
+
};
|
|
44
|
+
} // namespace Types
|
|
45
|
+
|
|
21
46
|
class MultimodalPrefiller {
|
|
22
47
|
public:
|
|
23
|
-
explicit MultimodalPrefiller(
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
48
|
+
explicit MultimodalPrefiller(
|
|
49
|
+
Module &module, MultimodalDecoderRunner &decoder_runner,
|
|
50
|
+
tokenizers::HFTokenizer &tokenizer,
|
|
51
|
+
std::unordered_map<std::string, int64_t> metadata,
|
|
52
|
+
IEncoder *image_encoder = nullptr, IEncoder *audio_encoder = nullptr);
|
|
27
53
|
|
|
28
54
|
// Prefill one input segment. Updates start_pos in-place.
|
|
29
55
|
// Returns the first predicted token after this segment.
|
|
30
|
-
::executorch::runtime::Result<uint64_t>
|
|
31
|
-
|
|
56
|
+
::executorch::runtime::Result<uint64_t>
|
|
57
|
+
prefill(const std::vector<MultimodalInput> &inputs, int64_t &start_pos);
|
|
32
58
|
|
|
59
|
+
auto processMultimodalInput(const MultimodalInput &input,
|
|
60
|
+
std::vector<int64_t> &ids,
|
|
61
|
+
std::vector<Types::ImageSlot> &image_slots,
|
|
62
|
+
std::vector<Types::AudioSlot> &audio_slots);
|
|
33
63
|
::executorch::runtime::Error load();
|
|
34
64
|
bool is_method_loaded();
|
|
65
|
+
int64_t get_max_seq_len() const;
|
|
66
|
+
int64_t get_max_context_len() const;
|
|
67
|
+
bool get_enable_dynamic_shape() const;
|
|
35
68
|
|
|
36
69
|
private:
|
|
70
|
+
auto encodeImages(const Types::ImageSlot &slot, const auto hidden,
|
|
71
|
+
std::vector<uint8_t> &embeds_buf,
|
|
72
|
+
const size_t embeds_elem_size,
|
|
73
|
+
const ::executorch::aten::ScalarType &embeds_dtype);
|
|
74
|
+
auto encodeAudio(const Types::AudioSlot &slot, const auto hidden,
|
|
75
|
+
std::vector<uint8_t> &embeds_buf,
|
|
76
|
+
const size_t embeds_elem_size,
|
|
77
|
+
const ::executorch::aten::ScalarType &embeds_dtype);
|
|
78
|
+
auto prefillChunk(std::vector<::executorch::runtime::EValue> &last_outs,
|
|
79
|
+
std::vector<uint8_t> &embeds_buf, auto chunk_start,
|
|
80
|
+
auto chunk_len, auto hidden, auto embeds_elem_size,
|
|
81
|
+
auto embeds_dtype, Types::PLEEmbeddings &ple_embeddings,
|
|
82
|
+
std::vector<int64_t> &cache_positions);
|
|
83
|
+
auto initializePLE(auto &embed_outputs, auto total_len,
|
|
84
|
+
Types::PLEEmbeddings &ple_embeddings);
|
|
85
|
+
|
|
37
86
|
Module *module_;
|
|
38
87
|
MultimodalDecoderRunner *decoder_runner_;
|
|
39
88
|
tokenizers::HFTokenizer *tokenizer_;
|
|
89
|
+
std::unordered_map<std::string, int64_t> metadata_;
|
|
40
90
|
IEncoder *image_encoder_;
|
|
91
|
+
IEncoder *audio_encoder_;
|
|
41
92
|
};
|
|
42
93
|
|
|
43
94
|
} // namespace executorch::extension::llm
|