npm - cui-llama.rn - Versions diffs - 1.7.3 → 1.7.6 - Mend

cui-llama.rn 1.7.3 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (276) hide show

package/cpp/tools/mtmd/clip.h CHANGED Viewed

@@ -4,6 +4,8 @@
 #include <stddef.h>
 #include <stdint.h>
+// !!! Internal header, to be used by mtmd only !!!
 struct clip_ctx;
 struct clip_image_size {
@@ -15,12 +17,22 @@ struct clip_image_f32;
 struct clip_image_u8_batch;
 struct clip_image_f32_batch;
+enum clip_modality {
+    CLIP_MODALITY_VISION,
+    CLIP_MODALITY_AUDIO,
+};
 struct clip_context_params {
     bool use_gpu;
     enum lm_ggml_log_level verbosity;
 };
-struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
+struct clip_init_result {
+    struct clip_ctx * ctx_v; // vision context
+    struct clip_ctx * ctx_a; // audio context
+};
+struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
 void clip_free(struct clip_ctx * ctx);
@@ -34,9 +46,6 @@ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
 // TODO: should be enum, not string
 const char * clip_patch_merge_type(const struct clip_ctx * ctx);
-const int32_t * clip_image_grid(const struct clip_ctx * ctx);
-size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
 // for M-RoPE, this will be the number of token positions in X and Y directions
@@ -99,3 +108,4 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
+bool clip_has_whisper_encoder(const struct clip_ctx * ctx);

package/cpp/tools/mtmd/mtmd-audio.cpp CHANGED Viewed

@@ -1,28 +1,5 @@
-// fix problem with std::min and std::max
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
-#endif
 #include "mtmd-audio.h"
-//#define MTMD_AUDIO_DEBUG
-#define MINIAUDIO_IMPLEMENTATION
-#ifndef MTMD_AUDIO_DEBUG
-#   define MA_NO_ENCODING
-#endif
-#define MA_NO_DEVICE_IO
-#define MA_NO_RESOURCE_MANAGER
-#define MA_NO_NODE_GRAPH
-#define MA_NO_ENGINE
-#define MA_NO_GENERATION
-#define MA_API static
-#include "miniaudio.h"
 #define _USE_MATH_DEFINES // for M_PI
 #include <cmath>
 #include <cstdint>
@@ -359,69 +336,6 @@ bool preprocess_audio(
 } // namespace whisper_preprocessor
-namespace audio_helpers {
-bool is_audio_file(const char * buf, size_t len) {
-    if (len < 12) {
-        return false;
-    }
-    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
-    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
-    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
-    bool is_mp3 = len >= 3 && (
-        memcmp(buf, "ID3", 3) == 0 ||
-        // Check for MPEG sync word (simplified check)
-        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
-    );
-    bool is_flac = memcmp(buf, "fLaC", 4) == 0;
-    return is_wav || is_mp3 || is_flac;
-}
-// returns true if the buffer is a valid audio file
-bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
-    ma_result result;
-    const int channels = 1;
-    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
-    ma_decoder decoder;
-    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
-    if (result != MA_SUCCESS) {
-        return false;
-    }
-    ma_uint64 frame_count;
-    ma_uint64 frames_read;
-    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
-    if (result != MA_SUCCESS) {
-        ma_decoder_uninit(&decoder);
-        return false;
-    }
-    pcmf32_mono.resize(frame_count);
-    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
-    if (result != MA_SUCCESS) {
-        ma_decoder_uninit(&decoder);
-        return false;
-    }
-#ifdef MTMD_AUDIO_DEBUG
-    // save audio to wav file
-    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
-    ma_encoder encoder;
-    ma_encoder_init_file("output.wav", &config, &encoder);
-    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
-    ma_encoder_uninit(&encoder);
-#endif
-    ma_decoder_uninit(&decoder);
-    return true;
-}
-} // namespace wav_utils
 // precalculated mel filter banks
 // values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
 //

package/cpp/tools/mtmd/mtmd-audio.h CHANGED Viewed

@@ -32,7 +32,7 @@ struct whisper_filters {
     std::vector<float> data;
 };
-extern bool preprocess_audio(
+bool preprocess_audio(
         const float * samples,
         size_t n_samples,
         const whisper_filters & filters,
@@ -40,23 +40,8 @@ extern bool preprocess_audio(
 } // namespace whisper_preprocessor
-// TODO @ngxson : move this helper to mtmd-helpers.cpp
-namespace audio_helpers {
-extern bool is_audio_file(const char * buf, size_t len);
-extern bool decode_audio_from_buf(
-        const unsigned char * buf_in,
-        size_t len,
-        int target_sampler_rate,
-        std::vector<float> & pcmf32_mono);
-} // namespace audio_helpers
 namespace whisper_precalc_filters {
-extern whisper_preprocessor::whisper_filters get_128_bins();
+whisper_preprocessor::whisper_filters get_128_bins();
 } // namespace whisper_precalc_filters

package/cpp/tools/mtmd/mtmd-helper.cpp CHANGED Viewed

@@ -1,10 +1,37 @@
+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
 #include "mtmd.h"
+#include "mtmd-helper.h"
 #include "llama.h"
 #include <algorithm>
 #include <cinttypes>
 #include <vector>
+//#define MTMD_AUDIO_DEBUG
+#define MINIAUDIO_IMPLEMENTATION
+#ifndef MTMD_AUDIO_DEBUG
+#   define MA_NO_ENCODING
+#endif
+#define MA_NO_DEVICE_IO
+#define MA_NO_RESOURCE_MANAGER
+#define MA_NO_NODE_GRAPH
+#define MA_NO_ENGINE
+#define MA_NO_GENERATION
+#define MA_API static
+#include "miniaudio/miniaudio.h"
+#define STB_IMAGE_IMPLEMENTATION
+#include "stb/stb_image.h"
 #define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
 #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
@@ -66,7 +93,8 @@ struct decode_embd_batch {
         }
     }
-    void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
+    // M-RoPE for image
+    void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
         LM_GGML_ASSERT(n_pos_per_embd == 4);
         seq_id_0[0] = seq_id;
         for (int y = 0; y < ny; y++) {
@@ -85,6 +113,23 @@ struct decode_embd_batch {
         }
     }
+    // M-RoPE for audio
+    void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
+        LM_GGML_ASSERT(n_pos_per_embd == 4);
+        seq_id_0[0] = seq_id;
+        for (int i = 0; i < batch.n_tokens; i++) {
+            pos[i                     ] = pos_0 + i;
+            pos[i + batch.n_tokens    ] = pos_0 + i;
+            pos[i + batch.n_tokens * 2] = pos_0 + i;
+            pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+        }
+        for (int i = 0; i < batch.n_tokens; i++) {
+            batch.n_seq_id[i] = 1;
+            batch.seq_id  [i] = seq_id_0.data();
+            batch.logits  [i] = false;
+        }
+    }
     llama_batch get_view(int offset, int n_tokens) {
         llama_pos * pos_ptr;
         pos_view.clear();
@@ -146,18 +191,20 @@ int32_t mtmd_helper_decode_image_chunk(
     decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
     if (mtmd_decode_use_mrope(ctx)) {
-        const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
-        if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
-            return -1;
-        }
-        if (!image_tokens) {
-            LOG_ERR("failed to decode chunk: image tokens are null\n");
-            return -1;
+        if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
+            if (!image_tokens) {
+                LOG_ERR("failed to decode chunk: image tokens are null\n");
+                return -1;
+            }
+            const int nx = mtmd_image_tokens_get_nx(image_tokens);
+            const int ny = mtmd_image_tokens_get_ny(image_tokens);
+            batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
+        } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+            batch_embd.set_position_mrope_1d(n_past, seq_id);
+        } else {
+            LM_GGML_ABORT("invalid chunk type for M-RoPE");
         }
-        const int nx = mtmd_image_tokens_get_nx(image_tokens);
-        const int ny = mtmd_image_tokens_get_ny(image_tokens);
-        batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
     } else {
         batch_embd.set_position_normal(n_past, seq_id);
     }
@@ -264,6 +311,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         LM_GGML_ABORT("chunk type not supported");
     }
+    llama_batch_free(text_batch);
     return 0;
 }
@@ -295,3 +343,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
     return 0;
 }
+namespace audio_helpers {
+static bool is_audio_file(const char * buf, size_t len) {
+    if (len < 12) {
+        return false;
+    }
+    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
+    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
+    bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
+    bool is_mp3 = len >= 3 && (
+        memcmp(buf, "ID3", 3) == 0 ||
+        // Check for MPEG sync word (simplified check)
+        ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
+    );
+    bool is_flac = memcmp(buf, "fLaC", 4) == 0;
+    return is_wav || is_mp3 || is_flac;
+}
+// returns true if the buffer is a valid audio file
+static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
+    ma_result result;
+    const int channels = 1;
+    ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
+    ma_decoder decoder;
+    result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
+    if (result != MA_SUCCESS) {
+        return false;
+    }
+    ma_uint64 frame_count;
+    ma_uint64 frames_read;
+    result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
+    if (result != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        return false;
+    }
+    pcmf32_mono.resize(frame_count);
+    result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
+    if (result != MA_SUCCESS) {
+        ma_decoder_uninit(&decoder);
+        return false;
+    }
+#ifdef MTMD_AUDIO_DEBUG
+    // save audio to wav file
+    ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
+    ma_encoder encoder;
+    ma_encoder_init_file("output.wav", &config, &encoder);
+    ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
+    ma_encoder_uninit(&encoder);
+#endif
+    ma_decoder_uninit(&decoder);
+    return true;
+}
+} // namespace audio_helpers
+mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
+    if (audio_helpers::is_audio_file((const char *)buf, len)) {
+        std::vector<float> pcmf32;
+        int bitrate = mtmd_get_audio_bitrate(ctx);
+        if (bitrate < 0) {
+            LOG_ERR("This model does not support audio input\n");
+            return nullptr;
+        }
+        if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
+            LOG_ERR("Unable to read WAV audio file from buffer\n");
+            return nullptr;
+        }
+        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
+    }
+    // otherwise, we assume it's an image
+    mtmd_bitmap * result = nullptr;
+    {
+        int nx, ny, nc;
+        auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
+        if (!data) {
+            LOG_ERR("%s: failed to decode image bytes\n", __func__);
+            return nullptr;
+        }
+        result = mtmd_bitmap_init(nx, ny, data);
+        stbi_image_free(data);
+    }
+    return result;
+}
+mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
+    std::vector<unsigned char> buf;
+    FILE * f = fopen(fname, "rb");
+    if (!f) {
+        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
+        return nullptr;
+    }
+    fseek(f, 0, SEEK_END);
+    long file_size = ftell(f);
+    fseek(f, 0, SEEK_SET);
+    buf.resize(file_size);
+    size_t n_read = fread(buf.data(), 1, file_size, f);
+    fclose(f);
+    if (n_read != (size_t)file_size) {
+        LOG_ERR("Failed to read entire file %s", fname);
+        return nullptr;
+    }
+    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
+}

package/cpp/tools/mtmd/mtmd-helper.h ADDED Viewed

@@ -0,0 +1,91 @@
+#ifndef MTMD_HELPER_H
+#define MTMD_HELPER_H
+#include "ggml.h"
+#include "llama.h"
+#include "mtmd.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+//
+// libmtmd helper functions
+//
+// Please note that these helpers are not guaranteed to be stable.
+// BREAKING CHANGES are expected.
+//
+// helper function to construct a mtmd_bitmap from a file
+// it calls mtmd_helper_bitmap_init_from_buf() internally
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+// helper function to construct a mtmd_bitmap from a buffer containing a file
+// supported formats:
+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+//     audio: formats supported by miniaudio: wav, mp3, flac
+// note: audio files will be auto-detected based on magic bytes
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
+MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
+MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
+// helper function that automatically:
+// 1. run llama_decode() on text chunks
+// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// otherwise, returns 0 on success
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+                                         struct llama_context * lctx,
+                                         const mtmd_input_chunks * chunks,
+                                         llama_pos n_past,
+                                         llama_seq_id seq_id,
+                                         int32_t n_batch,
+                                         bool logits_last,
+                                         llama_pos * new_n_past);
+// works like mtmd_helper_eval_chunks(), but only for a single chunk
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+                                               struct llama_context * lctx,
+                                               const mtmd_input_chunk * chunk,
+                                               llama_pos n_past,
+                                               llama_seq_id seq_id,
+                                               int32_t n_batch,
+                                               bool logits_last,
+                                               llama_pos * new_n_past);
+// helper function to decode an image whose embeddings have already been calculated
+// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
+// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                struct llama_context * lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                float * encoded_embd,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                llama_pos * new_n_past);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+//
+// C++ wrappers
+//
+#endif