cui-llama.rn 1.7.4 → 1.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (276)
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/tools/mtmd/mtmd.cpp +368 -248

@@ -95,15 +95,21 @@ mtmd_context_params mtmd_context_params_default() {
 }
 
 struct mtmd_context {
-    struct clip_ctx * ctx_clip;
+    struct clip_ctx * ctx_v; // vision
+    struct clip_ctx * ctx_a; // audio
     const struct llama_model * text_model;
     std::vector<float> image_embd_v; // image embedding vector
 
     bool print_timings;
     int n_threads;
     std::string media_marker;
-    bool has_vision;
-    bool has_audio;
+    const int n_embd_text;
+
+    // these are not token, but strings used to mark the beginning and end of image/audio embeddings
+    std::string img_beg;
+    std::string img_end;
+    std::string aud_beg;
+    std::string aud_end;
 
     // for llava-uhd style models, we need special tokens in-between slices
     // minicpmv calls them "slices", llama 4 calls them "tiles"
@@ -132,26 +138,61 @@ struct mtmd_context {
         text_model   (text_model),
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
-        media_marker (ctx_params.media_marker)
+        media_marker (ctx_params.media_marker),
+        n_embd_text  (llama_model_n_embd(text_model))
     {
         if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
         }
 
+        if (media_marker.empty()) {
+            throw std::runtime_error("media_marker must not be empty");
+        }
+
         clip_context_params ctx_clip_params;
         ctx_clip_params.use_gpu   = ctx_params.use_gpu;
         ctx_clip_params.verbosity = ctx_params.verbosity;
-        ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
-        if (!ctx_clip) {
+        auto res = clip_init(mmproj_fname, ctx_clip_params);
+        ctx_v = res.ctx_v;
+        ctx_a = res.ctx_a;
+        if (!ctx_v && !ctx_a) {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
 
-        has_vision = clip_has_vision_encoder(ctx_clip);
-        has_audio  = clip_has_audio_encoder(ctx_clip);
-        use_mrope  = clip_is_qwen2vl(ctx_clip);
+        // if both vision and audio mmproj are present, we need to validate their n_embd
+        if (ctx_v && ctx_a) {
+            int n_embd_v = clip_n_mmproj_embd(ctx_v);
+            int n_embd_a = clip_n_mmproj_embd(ctx_a);
+            if (n_embd_v != n_embd_a) {
+                throw std::runtime_error(string_format(
+                    "mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
+                    n_embd_v, n_embd_a));
+            }
+        }
+
+        // since we already validate n_embd of vision and audio mmproj,
+        // we can safely assume that they are the same
+        int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
+        if (n_embd_text != n_embd_clip) {
+            throw std::runtime_error(string_format(
+                "mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
+                "hint: you may be using wrong mmproj\n",
+                n_embd_text, n_embd_clip));
+        }
+        if (ctx_v) {
+            init_vision();
+        }
+        if (ctx_a) {
+            init_audio();
+        }
+    }
 
-        projector_type proj = clip_get_projector_type(ctx_clip);
-        int minicpmv_version = clip_is_minicpmv(ctx_clip);
+    void init_vision() {
+        LM_GGML_ASSERT(ctx_v != nullptr);
+        use_mrope = clip_is_qwen2vl(ctx_v);
+
+        projector_type proj = clip_get_projector_type(ctx_v);
+        int minicpmv_version = clip_is_minicpmv(ctx_v);
         if (minicpmv_version == 2) {
             // minicpmv 2.5 format:
             // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
@@ -196,24 +237,82 @@ struct mtmd_context {
            ov_img_first = false; // overview image is last
         }
 
-        if (proj == PROJECTOR_TYPE_ULTRAVOX) {
+        // set boi/eoi
+        if (proj == PROJECTOR_TYPE_GEMMA3) {
+            // <start_of_image> ... (image embeddings) ... <end_of_image>
+            img_beg = "<start_of_image>";
+            img_end = "<end_of_image>";
+
+        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
+            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
+            img_beg = "<fake_token_around_image><global-img>";
+            img_end = "<fake_token_around_image>";
+
+        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
+            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
+            img_end = "[IMG_END]";
+
+        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
+            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+            img_beg = "<|vision_start|>";
+            img_end = "<|vision_end|>";
+
+        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
+            // (more details in mtmd_context constructor)
+            img_beg = "<|image_start|>";
+            img_end = "<|image_end|>";
+            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+
+        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
+            // <img> ... (image embeddings) ... </img>
+            img_beg = "<img>";
+            img_end = "</img>";
+
+        }
+    }
+
+    void init_audio() {
+        LM_GGML_ASSERT(ctx_a != nullptr);
+        projector_type proj = clip_get_projector_type(ctx_a);
+
+        if (clip_has_whisper_encoder(ctx_a)) {
            // TODO @ngxson : check if model n_mel is 128 or 80
            w_filters = whisper_precalc_filters::get_128_bins();
         }
 
-        // warning messages
-        if (proj == PROJECTOR_TYPE_LLAMA4) {
-            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
-                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+        LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
+                "    https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
+
+        if (proj == PROJECTOR_TYPE_QWEN2A) {
+            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
+            aud_beg = "<|audio_bos|>";
+            aud_end = "<|audio_eos|>";
+
         }
-        if (has_audio) {
-            LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
-                    "    https://github.com/ggml-org/llama.cpp/pull/13623\n", __func__);
+    }
+
+    // get clip ctx based on chunk type
+    clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
+        if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+            return ctx_v;
+        } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
+            return ctx_a;
         }
+        LM_GGML_ABORT("unknown chunk type");
+    }
+
+    projector_type proj_type_v() const {
+        return ctx_v ? clip_get_projector_type(ctx_v) : PROJECTOR_TYPE_UNKNOWN;
+    }
+
+    projector_type proj_type_a() const {
+        return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN;
     }
 
     ~mtmd_context() {
-        clip_free(ctx_clip);
+        clip_free(ctx_a);
+        clip_free(ctx_v);
     }
 
 private:
@@ -260,162 +359,137 @@ void mtmd_free(mtmd_context * ctx) {
     }
 }
 
-// copied from common_tokenize
-static std::vector<llama_token> mtmd_tokenize_text_internal(
-        const struct llama_vocab * vocab,
-        const std::string & text,
-        bool add_special,
-        bool parse_special) {
-    // upper limit for the number of tokens
-    int n_tokens = text.length() + 2 * add_special;
-    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
-        LM_GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-    return result;
-}
+struct mtmd_tokenizer {
+    mtmd_context * ctx;
+    std::vector<const mtmd_bitmap *> bitmaps;
 
-int32_t mtmd_tokenize(mtmd_context * ctx,
-                      mtmd_input_chunks * output,
+    std::string input_text;
+    bool add_special;
+    bool parse_special;
+    const llama_vocab * vocab;
+
+    mtmd_input_chunks cur;
+
+    mtmd_tokenizer(mtmd_context * ctx,
                       const mtmd_input_text * text,
                       const mtmd_bitmap ** bitmaps,
-                      size_t n_bitmaps) {
-    auto vocab = llama_model_get_vocab(ctx->text_model);
-
-    std::string prompt_modified(text->text);
-    std::string marker_modified(ctx->media_marker);
-    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
-
-    // for compatibility, we convert image marker to media marker
-    string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
-
-    // a bit hacky here, but works for now
-    // for some models, we need to add prefix and suffix to the image embeddings
-    if (clip_is_gemma3(ctx->ctx_clip)) {
-        // gemma 3
-        // <start_of_image> ... (image embeddings) ... <end_of_image>
-        marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
-        // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
-        marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
-        // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
-        marker_modified = ctx->media_marker + "[IMG_END]";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
-        // <|vision_start|> ... (image embeddings) ... <|vision_end|>
-        marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
-        // (more details in mtmd_context constructor)
-        marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    } else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
-        // <img> ... (image embeddings) ... </img>
-        marker_modified = "<img>" + ctx->media_marker + "</img>";
-        string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
-
-    }
-
-    // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
-    // for glm-edge, BOI and EOI token's embeddings are not present in the text model
-
-    std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
-    output->entries.clear();
-    output->entries.reserve(parts.size());
-
-    size_t i_bm = 0;
-
-    // utility for adding raw tokens
-    auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
-        mtmd_input_chunk chunk{
-            MTMD_INPUT_CHUNK_TYPE_TEXT,
-            std::move(tokens),
-            nullptr, // image tokens
-            nullptr, // audio tokens
-        };
-        output->entries.emplace_back(std::move(chunk));
-    };
+                      size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
+        add_special   = text->add_special;
+        parse_special = text->parse_special;
+        input_text    = text->text;
+        vocab         = llama_model_get_vocab(ctx->text_model);
+
+        // for compatibility, we convert image marker to media marker
+        string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
+    }
 
-    // utility for splitting batch of multiple images into chunks of batch having single images
-    auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
-        std::vector<mtmd_input_chunk> chunks;
+    int32_t tokenize(mtmd_input_chunks * output) {
+        cur.entries.clear();
+        std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
+        size_t i_bm = 0; // index of the current bitmap
+        for (auto & part : parts) {
+            if (part == ctx->media_marker) {
+                // this is a marker, we should add the next bitmap
+                if (i_bm >= bitmaps.size()) {
+                    LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
+                            __func__, bitmaps.size(), parts.size() - 1);
+                    return 1;
+                }
+                const mtmd_bitmap * bitmap = bitmaps[i_bm++];
+                int32_t res = add_media(bitmap);
+                if (res != 0) {
+                    return res;
+                }
+            } else {
+                // this is a text part, we should add it as text
+                add_text(part, parse_special);
+            }
+        }
 
-        for (auto & entry : batch_f32.entries) {
-            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
-            image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
-            image_tokens->ny = 1;
-            image_tokens->batch_f32.entries.push_back(std::move(entry));
-            image_tokens->id = id;
+        if (add_special && llama_vocab_get_add_bos(vocab)) {
+            // if first chunk is text, we add BOS token to first text chunk
+            // otherwise, create a new text chunk with BOS token
+            if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                // add BOS token to the beginning of first text chunk
+                cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
+            } else {
+                // create a new text chunk with BOS token at the beginning
+                mtmd_input_chunk bos_chunk{
+                    MTMD_INPUT_CHUNK_TYPE_TEXT,
+                    {llama_vocab_bos(vocab)},
+                    nullptr, // image tokens
+                    nullptr, // audio tokens
+                };
+                cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
+            }
+        }
 
-            mtmd_input_chunk chunk{
-                MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                {}, // text tokens
-                std::move(image_tokens),
-                nullptr, // audio tokens
-            };
-            chunks.emplace_back(std::move(chunk));
+        if (add_special && llama_vocab_get_add_eos(vocab)) {
+            // if last chunk is text, we add EOS token to it
+            add_text({llama_vocab_eos(vocab)});
         }
 
-        return chunks;
-    };
+        if (i_bm != bitmaps.size()) {
+            LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
+                    __func__, bitmaps.size(), parts.size() - 1);
+            return 1;
+        }
+
+        *output = std::move(cur);
+
+        return 0;
+    }
+
+    void add_text(const std::string & txt, bool parse_special) {
+        LOG_DBG("%s: %s\n", __func__, txt.c_str());
+        auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
+        add_text(tokens);
+    }
 
-    for (const auto & part : parts) {
-        // printf("tokenizing part: %s\n", part.c_str());
-        bool add_bos = &parts.front() == &part;
-        auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
+    void add_text(const std::vector<llama_token> & tokens) {
        if (tokens.empty()) {
-            continue;
+            return;
        }
-        mtmd_input_chunk chunk{
-            MTMD_INPUT_CHUNK_TYPE_TEXT,
-            std::move(tokens),
-            nullptr, // image tokens
-            nullptr, // audio tokens
-        };
-        output->entries.emplace_back(std::move(chunk));
-
-        // only add image/audio tokens to middle of 2 parts
-        // therefore, we skip handling image/audio if this is the last part
-        if (&parts.back() == &part) {
-            continue;
+        // if last entry is also a text chunk, add tokens to it instead of creating new chunk
+        if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+            cur.entries.back().tokens_text.insert(
+                cur.entries.back().tokens_text.end(),
+                tokens.begin(),
+                tokens.end());
+        } else {
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_TEXT,
+                tokens,
+                nullptr, // image tokens
+                nullptr, // audio tokens
+            };
+            cur.entries.emplace_back(std::move(chunk));
        }
+    }
 
-        if (!bitmaps[i_bm]->is_audio) {
+    int32_t add_media(const mtmd_bitmap * bitmap) {
+        if (!bitmap->is_audio) {
            // handle image
 
-            if (i_bm >= n_bitmaps) {
-                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return 1;
-            }
-
-            if (!ctx->has_vision) {
+            if (!ctx->ctx_v) {
                LOG_ERR("%s: error: model does not support vision input\n", __func__);
                return 2;
            }
 
+            if (!ctx->img_beg.empty()) {
+                add_text(ctx->img_beg, true); // add image begin token
+            }
+
            // convert mtmd_bitmap to clip_image_u8
            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmaps[i_bm]->nx;
-            img_u8->ny = bitmaps[i_bm]->ny;
-            img_u8->buf.resize(bitmaps[i_bm]->data.size());
-            std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
+            img_u8->nx = bitmap->nx;
+            img_u8->ny = bitmap->ny;
+            img_u8->buf.resize(bitmap->data.size());
+            std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
 
            // preprocess image
            clip_image_f32_batch batch_f32;
-            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
+            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
            if (!ok) {
                LOG_ERR("Unable to preprocess image\n");
                return 2;
@@ -427,8 +501,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
                || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
                ) {
+                const int n_col = batch_f32.grid_x;
+                const int n_row = batch_f32.grid_y;
                // split batch into chunks of single images
-                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
+                // NOTE: batch_f32 will be invalidated after this call
+                auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
                LM_GGML_ASSERT(chunks.size() > 0);
 
                auto ov_chunk = std::move(chunks.front());
@@ -437,66 +514,65 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                // add overview image (first)
                if (ctx->ov_img_first) {
                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_start});
+                        add_text({ctx->tok_ov_img_start});
                    }
-                    output->entries.emplace_back(std::move(ov_chunk));
+                    cur.entries.emplace_back(std::move(ov_chunk));
                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_end});
+                        add_text({ctx->tok_ov_img_end});
                    }
                }
 
                // add slices (or tiles)
                if (!chunks.empty()) {
-                    const int n_col = batch_f32.grid_x;
-                    const int n_row = batch_f32.grid_y;
+                    LM_GGML_ASSERT((int)chunks.size() == n_row * n_col);
                    if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_slices_start});
+                        add_text({ctx->tok_slices_start});
                    }
                    for (int y = 0; y < n_row; y++) {
                        for (int x = 0; x < n_col; x++) {
                            const bool is_last_in_row = (x == n_col - 1);
                            if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_img_start});
+                                add_text({ctx->tok_sli_img_start});
                            }
-                            output->entries.emplace_back(std::move(chunks[y * n_col + x]));
+                            cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
                            if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_img_end});
+                                add_text({ctx->tok_sli_img_end});
                            }
                            if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
-                                add_text_chunk({ctx->tok_sli_img_mid});
+                                add_text({ctx->tok_sli_img_mid});
                            }
                        }
                        if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
-                            add_text_chunk({ctx->tok_row_end});
+                            add_text({ctx->tok_row_end});
                        }
                    }
                    if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_slices_end});
+                        add_text({ctx->tok_slices_end});
                    }
                }
 
                // add overview image (last)
                if (!ctx->ov_img_first) {
                    if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_start});
+                        add_text({ctx->tok_ov_img_start});
                    }
-                    output->entries.emplace_back(std::move(ov_chunk));
+                    cur.entries.emplace_back(std::move(ov_chunk));
                    if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
-                        add_text_chunk({ctx->tok_ov_img_end});
+                        add_text({ctx->tok_ov_img_end});
                    }
                }
 
            } else {
                size_t n_tokens = 0;
                for (const auto & entry : batch_f32.entries) {
-                    n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
+                    n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
                }
 
                mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
                if (ctx->use_mrope) {
                    // for Qwen2VL, we need this information for M-RoPE decoding positions
-                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
-                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
+                    image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
+                    image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
                    image_tokens->use_mrope_pos = true;
                } else {
                    // other models, we only need the total number of tokens
@@ -504,7 +580,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                    image_tokens->ny = 1;
                }
                image_tokens->batch_f32 = std::move(batch_f32);
-                image_tokens->id = bitmaps[i_bm]->id; // optional
+                image_tokens->id = bitmap->id; // optional
 
                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
@@ -516,35 +592,35 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                    std::move(image_tokens),
                    nullptr, // audio tokens
                };
-                output->entries.emplace_back(std::move(chunk));
+                cur.entries.emplace_back(std::move(chunk));
            }
 
-            i_bm++; // move to next image
-            continue;
+            if (!ctx->img_end.empty()) {
+                add_text(ctx->img_end, true); // add image end token
+            }
 
        } else {
            // handle audio
 
-            if (i_bm >= n_bitmaps) {
-                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return 1;
-            }
-
-            if (!ctx->has_audio) {
+            if (!ctx->ctx_a) {
                LOG_ERR("%s: error: model does not support audio input\n", __func__);
                return 2;
            }
 
-            if (bitmaps[i_bm]->data.size() == 0) {
+            if (bitmap->data.size() == 0) {
                LOG_ERR("%s: error: empty audio data\n", __func__);
                return 2;
            }
 
+            if (!ctx->aud_beg.empty()) {
+                add_text(ctx->aud_beg, true); // add audio begin token
+            }
+
            // preprocess audio
            LM_GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
            std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
-            const float * samples = (const float *)bitmaps[i_bm]->data.data();
-            size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
+            const float * samples = (const float *)bitmap->data.data();
+            size_t n_samples = bitmap->data.size() / sizeof(float);
            bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
            if (!ok) {
                LOG_ERR("Unable to preprocess audio\n");
@@ -558,7 +634,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                mel_f32->nx = mel_spec.n_len;
                mel_f32->ny = mel_spec.n_mel;
                mel_f32->buf = std::move(mel_spec.data);
-                size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
+                size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
 
                clip_image_f32_batch batch_f32;
                batch_f32.is_audio = true;
@@ -567,7 +643,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
                audio_tokens->n_tokens = n_tokens;
                audio_tokens->batch_f32 = std::move(batch_f32);
-                audio_tokens->id = bitmaps[i_bm]->id; // optional
+                audio_tokens->id = bitmap->id; // optional
 
                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
 
@@ -577,15 +653,88 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                    nullptr, // image tokens
                    std::move(audio_tokens),
                };
-                output->entries.emplace_back(std::move(chunk));
+                cur.entries.emplace_back(std::move(chunk));
            }
 
-            i_bm++;
-            continue;
+            if (!ctx->aud_end.empty()) {
+                add_text(ctx->aud_end, true); // add audio end token
+            }
        }
+
+        return 0;
    }
 
-    return 0;
+    std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
+        std::vector<mtmd_input_chunk> chunks;
+
+        for (auto & entry : batch_f32.entries) {
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
+            image_tokens->ny = 1;
+            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->id = id;
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {}, // text tokens
+                std::move(image_tokens),
+                nullptr, // audio tokens
+            };
+            chunks.emplace_back(std::move(chunk));
+        }
+
+        return chunks;
+    }
+
+    // for example: "a <__media__> b <__media__> c" --> "a", "<__media__>", "b", "<__media__>", "c"
+    static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
+        std::vector<std::string> result;
+        if (input.empty()) {
+            return result;
+        }
+        size_t start = 0;
+        size_t pos = 0;
+        while ((pos = input.find(delimiter, start)) != std::string::npos) {
+            if (pos > start) {
+                result.push_back(input.substr(start, pos - start));
+            }
+            result.push_back(delimiter);
+            start = pos + delimiter.length();
+        }
+        if (start < input.length()) {
+            result.push_back(input.substr(start));
+        }
+        return result;
+    }
+
+    // copied from common_tokenize
+    static std::vector<llama_token> mtmd_tokenize_text_internal(
+            const struct llama_vocab * vocab,
+            const std::string & text,
+            bool add_special,
+            bool parse_special) {
+        // upper limit for the number of tokens
+        int n_tokens = text.length() + 2 * add_special;
+        std::vector<llama_token> result(n_tokens);
+        n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        if (n_tokens < 0) {
+            result.resize(-n_tokens);
+            int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+            LM_GGML_ASSERT(check == -n_tokens);
+        } else {
+            result.resize(n_tokens);
+        }
+        return result;
+    }
+};
+
+int32_t mtmd_tokenize(mtmd_context * ctx,
+                      mtmd_input_chunks * output,
+                      const mtmd_input_text * text,
+                      const mtmd_bitmap ** bitmaps,
+                      size_t n_bitmaps) {
+    mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
+    return tokenizer.tokenize(output);
 }
 
 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
@@ -593,41 +742,54 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
        LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
        return 0;
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+        if (!ctx->ctx_v) {
+            LOG_ERR("%s: model does not support vision input\n", __func__);
+            return 1;
+        }
        return mtmd_encode(ctx, chunk->tokens_image.get());
    } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
-        int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+        if (!ctx->ctx_a) {
+            LOG_ERR("%s: model does not support audio input\n", __func__);
+            return 1;
+        }
+        int n_mmproj_embd = ctx->n_embd_text;
        ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
        bool ok = clip_image_batch_encode(
-            ctx->ctx_clip,
+            ctx->ctx_a,
            ctx->n_threads,
            &chunk->tokens_audio->batch_f32,
            ctx->image_embd_v.data());
        return ok ? 0 : 1;
    }
 
-    LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
+    LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
    return 1;
 }
 
 int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
-    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
+    clip_ctx * ctx_clip = ctx->ctx_v;
+    if (!ctx_clip) {
+        LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
+        return 1;
+    }
+    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
    bool ok = false;
 
-    if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
+    if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
        for (size_t i = 0; i < entries.size(); i++) {
-            int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
+            int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
            ok = clip_image_encode(
-                ctx->ctx_clip,
+                ctx_clip,
                ctx->n_threads,
                entries[i].get(),
                ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
        }
    } else {
        ok = clip_image_batch_encode(
-            ctx->ctx_clip,
+            ctx_clip,
            ctx->n_threads,
            &image_tokens->batch_f32,
            ctx->image_embd_v.data());
@@ -641,8 +803,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
 }
 
 bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
-    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
-    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+    if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
        return true;
    }
    return false;
@@ -653,60 +814,19 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
 }
 
 bool mtmd_support_vision(mtmd_context * ctx) {
-    return ctx->has_vision;
+    return ctx->ctx_v != nullptr;
 }
 
 bool mtmd_support_audio(mtmd_context * ctx) {
-    return ctx->has_audio;
-}
-
-// these 2 helpers below use internal clip_image_u8_ptr,
-// so unfortunately they cannot moved to mtmd-helper.h
-// however, in theory, user can decode image file to bitmap using
-// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
-
-mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
-    if (audio_helpers::is_audio_file((const char *)buf, len)) {
-        std::vector<float> pcmf32;
-        if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
-            LOG_ERR("Unable to read WAV audio file from buffer\n");
-            return nullptr;
-        }
-        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
-    }
-
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image from buffer\n");
-        return nullptr;
-    }
-    uint32_t nx, ny;
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
-    return mtmd_bitmap_init(nx, ny, data);
+    return ctx->ctx_a != nullptr;
 }
 
-mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
-    std::vector<unsigned char> buf;
-    FILE * f = fopen(fname, "rb");
-    if (!f) {
-        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
-        return nullptr;
+int mtmd_get_audio_bitrate(mtmd_context * ctx) {
+    if (!ctx->ctx_a) {
+        return -1;
    }
-
-    fseek(f, 0, SEEK_END);
-    long file_size = ftell(f);
-    fseek(f, 0, SEEK_SET);
-    buf.resize(file_size);
-
-    size_t n_read = fread(buf.data(), 1, file_size, f);
-    fclose(f);
-    if (n_read != (size_t)file_size) {
-        LOG_ERR("Failed to read entire file %s", fname);
-        return nullptr;
-    }
-
-    return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
+    // for now, we assume that all audio models have the same bitrate
+    return 16000; // 16kHz
 }
 
 //
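
Note on the rewritten tokenizer above: the new mtmd_tokenizer splits the prompt on the media marker, keeps each marker occurrence as its own part, and consumes one bitmap (image or audio) per marker, rather than the old approach of rewriting the prompt with string_replace_all. Below is a minimal, self-contained C++ sketch of that splitting step; it mirrors the contract of split_text() in the diff (default marker "<__media__>") but is an illustration, not code shipped in the package. The name split_keep_delim is our own.

#include <cstdio>
#include <string>
#include <vector>

// Split `input` on `delimiter`, keeping the delimiter itself as a part, e.g.
// "a <__media__> b" -> {"a ", "<__media__>", " b"} (same contract as split_text above).
static std::vector<std::string> split_keep_delim(const std::string & input, const std::string & delimiter) {
    std::vector<std::string> result;
    size_t start = 0;
    size_t pos   = 0;
    while ((pos = input.find(delimiter, start)) != std::string::npos) {
        if (pos > start) {
            result.push_back(input.substr(start, pos - start)); // text before the marker
        }
        result.push_back(delimiter); // the marker becomes its own part
        start = pos + delimiter.length();
    }
    if (start < input.length()) {
        result.push_back(input.substr(start)); // trailing text
    }
    return result;
}

int main() {
    const std::string marker = "<__media__>";
    // each marker part is where the tokenizer consumes the next bitmap;
    // every other part is tokenized as plain text
    for (const auto & part : split_keep_delim("describe <__media__> versus <__media__>", marker)) {
        std::printf("%s | %s\n", part == marker ? "media" : "text ", part.c_str());
    }
    return 0;
}

Because each marker part consumes exactly one bitmap in order, tokenize() can report an error whenever the number of bitmaps passed in does not match the number of markers found in the prompt.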